Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/async.c | 13
-rw-r--r--  kernel/audit_tree.c | 6
-rw-r--r--  kernel/cgroup.c | 17
-rw-r--r--  kernel/compat.c | 11
-rw-r--r--  kernel/cpuset.c | 260
-rw-r--r--  kernel/cred.c | 4
-rw-r--r--  kernel/exit.c | 316
-rw-r--r--  kernel/fork.c | 48
-rw-r--r--  kernel/futex.c | 1208
-rw-r--r--  kernel/gcov/Kconfig | 48
-rw-r--r--  kernel/gcov/Makefile | 3
-rw-r--r--  kernel/gcov/base.c | 148
-rw-r--r--  kernel/gcov/fs.c | 673
-rw-r--r--  kernel/gcov/gcc_3_4.c | 447
-rw-r--r--  kernel/gcov/gcov.h | 128
-rw-r--r--  kernel/groups.c | 288
-rw-r--r--  kernel/hrtimer.c | 60
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/handle.c | 74
-rw-r--r--  kernel/irq/internals.h | 5
-rw-r--r--  kernel/irq/manage.c | 17
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/numa_migrate.c | 38
-rw-r--r--  kernel/kallsyms.c | 134
-rw-r--r--  kernel/kexec.c | 16
-rw-r--r--  kernel/kfifo.c | 4
-rw-r--r--  kernel/kgdb.c | 4
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kthread.c | 87
-rw-r--r--  kernel/lockdep.c | 16
-rw-r--r--  kernel/lockdep_internals.h | 4
-rw-r--r--  kernel/module.c | 108
-rw-r--r--  kernel/mutex.c | 31
-rw-r--r--  kernel/nsproxy.c | 19
-rw-r--r--  kernel/panic.c | 35
-rw-r--r--  kernel/params.c | 46
-rw-r--r--  kernel/perf_counter.c | 4383
-rw-r--r--  kernel/pid.c | 17
-rw-r--r--  kernel/pid_namespace.c | 24
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/hibernate.c (renamed from kernel/power/disk.c) | 59
-rw-r--r--  kernel/power/hibernate_nvs.c | 135
-rw-r--r--  kernel/power/main.c | 526
-rw-r--r--  kernel/power/power.h | 25
-rw-r--r--  kernel/power/poweroff.c | 2
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 80
-rw-r--r--  kernel/power/suspend.c | 300
-rw-r--r--  kernel/power/suspend_test.c | 187
-rw-r--r--  kernel/power/swsusp.c | 198
-rw-r--r--  kernel/printk.c | 33
-rw-r--r--  kernel/profile.c | 14
-rw-r--r--  kernel/ptrace.c | 180
-rw-r--r--  kernel/rcupreempt.c | 8
-rw-r--r--  kernel/rcutree.c | 25
-rw-r--r--  kernel/rcutree_trace.c | 64
-rw-r--r--  kernel/res_counter.c | 12
-rw-r--r--  kernel/rtmutex.c | 250
-rw-r--r--  kernel/rtmutex_common.h | 8
-rw-r--r--  kernel/sched.c | 473
-rw-r--r--  kernel/sched_clock.c | 3
-rw-r--r--  kernel/sched_cpupri.c | 10
-rw-r--r--  kernel/sched_debug.c | 6
-rw-r--r--  kernel/sched_fair.c | 16
-rw-r--r--  kernel/sched_idletask.c | 3
-rw-r--r--  kernel/sched_rt.c | 2
-rw-r--r--  kernel/signal.c | 94
-rw-r--r--  kernel/slow-work.c | 27
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 21
-rw-r--r--  kernel/sys.c | 290
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 98
-rw-r--r--  kernel/time/clockevents.c | 14
-rw-r--r--  kernel/time/clocksource.c | 23
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 17
-rw-r--r--  kernel/time/tick-sched.c | 19
-rw-r--r--  kernel/time/timekeeping.c | 9
-rw-r--r--  kernel/timer.c | 141
-rw-r--r--  kernel/trace/Kconfig | 161
-rw-r--r--  kernel/trace/Makefile | 20
-rw-r--r--  kernel/trace/blktrace.c | 285
-rw-r--r--  kernel/trace/events.c | 14
-rw-r--r--  kernel/trace/ftrace.c | 808
-rw-r--r--  kernel/trace/kmemtrace.c | 12
-rw-r--r--  kernel/trace/ring_buffer.c | 1021
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 419
-rw-r--r--  kernel/trace/trace.c | 412
-rw-r--r--  kernel/trace/trace.h | 243
-rw-r--r--  kernel/trace/trace_boot.c | 5
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_event_profile.c | 24
-rw-r--r--  kernel/trace/trace_event_types.h | 12
-rw-r--r--  kernel/trace/trace_events.c | 839
-rw-r--r--  kernel/trace/trace_events_filter.c | 1203
-rw-r--r--  kernel/trace/trace_events_stage_1.h | 39
-rw-r--r--  kernel/trace/trace_events_stage_2.h | 176
-rw-r--r--  kernel/trace/trace_events_stage_3.h | 281
-rw-r--r--  kernel/trace/trace_export.c | 110
-rw-r--r--  kernel/trace/trace_functions.c | 8
-rw-r--r--  kernel/trace/trace_functions_graph.c | 67
-rw-r--r--  kernel/trace/trace_hw_branches.c | 203
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 6
-rw-r--r--  kernel/trace/trace_output.c | 240
-rw-r--r--  kernel/trace/trace_output.h | 34
-rw-r--r--  kernel/trace/trace_power.c | 8
-rw-r--r--  kernel/trace/trace_printk.c | 6
-rw-r--r--  kernel/trace/trace_sched_switch.c | 12
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 8
-rw-r--r--  kernel/trace/trace_selftest.c | 58
-rw-r--r--  kernel/trace/trace_stack.c | 15
-rw-r--r--  kernel/trace/trace_stat.c | 208
-rw-r--r--  kernel/trace/trace_stat.h | 2
-rw-r--r--  kernel/trace/trace_sysprof.c | 9
-rw-r--r--  kernel/trace/trace_workqueue.c | 25
-rw-r--r--  kernel/user.c | 67
-rw-r--r--  kernel/utsname.c | 13
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/workqueue.c | 11
123 files changed, 14701 insertions, 4562 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..0a32cb21ec97 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
13 async.o 13 async.o
14obj-y += groups.o
14 15
15ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
16# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
@@ -70,6 +71,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
70obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 71obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
71obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 72obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
72obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 73obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
74obj-$(CONFIG_GCOV_KERNEL) += gcov/
73obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 75obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
74obj-$(CONFIG_KPROBES) += kprobes.o 76obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_KGDB) += kgdb.o 77obj-$(CONFIG_KGDB) += kgdb.o
@@ -93,8 +95,10 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o 95obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 96obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 97obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 99obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 100obj-$(CONFIG_SLOW_WORK) += slow-work.o
101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
98 102
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 103ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 104# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/async.c b/kernel/async.c
index 968ef9457d4e..27235f5de198 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -92,19 +92,18 @@ extern int initcall_debug;
92static async_cookie_t __lowest_in_progress(struct list_head *running) 92static async_cookie_t __lowest_in_progress(struct list_head *running)
93{ 93{
94 struct async_entry *entry; 94 struct async_entry *entry;
95
95 if (!list_empty(running)) { 96 if (!list_empty(running)) {
96 entry = list_first_entry(running, 97 entry = list_first_entry(running,
97 struct async_entry, list); 98 struct async_entry, list);
98 return entry->cookie; 99 return entry->cookie;
99 } else if (!list_empty(&async_pending)) {
100 entry = list_first_entry(&async_pending,
101 struct async_entry, list);
102 return entry->cookie;
103 } else {
104 /* nothing in progress... next_cookie is "infinity" */
105 return next_cookie;
106 } 100 }
107 101
102 list_for_each_entry(entry, &async_pending, list)
103 if (entry->running == running)
104 return entry->cookie;
105
106 return next_cookie; /* "infinity" value */
108} 107}
109 108
110static async_cookie_t lowest_in_progress(struct list_head *running) 109static async_cookie_t lowest_in_progress(struct list_head *running)
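The rewritten __lowest_in_progress() above first looks at the given running list and, only if that is empty, scans the global async_pending list for entries destined for that same running list, instead of treating any pending entry as relevant. Below is a minimal userspace sketch of that lookup order, using simplified stand-in types and arrays rather than the kernel's list_head machinery; the names are illustrative, not the kernel's.

#include <stdio.h>

/* Simplified stand-ins for the kernel structures (illustrative only). */
struct entry {
        unsigned long long cookie;   /* monotonically increasing ticket */
        int domain;                  /* which "running" list it targets */
};

/* Lowest cookie still outstanding for one domain: the head of that
 * domain's running list if non-empty, otherwise the first pending
 * entry destined for the same domain; "infinity" (next_cookie) if none. */
static unsigned long long lowest_in_progress(struct entry *running, int nrun,
                                             struct entry *pending, int npend,
                                             int domain,
                                             unsigned long long next_cookie)
{
        int i;

        if (nrun > 0)
                return running[0].cookie;        /* oldest running entry */

        for (i = 0; i < npend; i++)              /* oldest pending entry */
                if (pending[i].domain == domain) /* ... for this domain  */
                        return pending[i].cookie;

        return next_cookie;                      /* nothing outstanding  */
}

int main(void)
{
        struct entry pending[] = { { 5, 1 }, { 6, 0 }, { 7, 1 } };

        /* domain 0 has nothing running; its oldest pending cookie is 6 */
        printf("%llu\n", lowest_in_progress(NULL, 0, pending, 3, 0, 8));
        /* domain 2 has nothing at all; report "infinity" (next_cookie) */
        printf("%llu\n", lowest_in_progress(NULL, 0, pending, 3, 2, 8));
        return 0;
}

The point of the change is visible in the second call: pending work bound for a different domain no longer holds back synchronization on this one.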
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a82..1f6396d76687 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -568,7 +568,7 @@ void audit_trim_trees(void)
568 if (err) 568 if (err)
569 goto skip_it; 569 goto skip_it;
570 570
571 root_mnt = collect_mounts(path.mnt, path.dentry); 571 root_mnt = collect_mounts(&path);
572 path_put(&path); 572 path_put(&path);
573 if (!root_mnt) 573 if (!root_mnt)
574 goto skip_it; 574 goto skip_it;
@@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
660 err = kern_path(tree->pathname, 0, &path); 660 err = kern_path(tree->pathname, 0, &path);
661 if (err) 661 if (err)
662 goto Err; 662 goto Err;
663 mnt = collect_mounts(path.mnt, path.dentry); 663 mnt = collect_mounts(&path);
664 path_put(&path); 664 path_put(&path);
665 if (!mnt) { 665 if (!mnt) {
666 err = -ENOMEM; 666 err = -ENOMEM;
@@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)
720 err = kern_path(new, 0, &path); 720 err = kern_path(new, 0, &path);
721 if (err) 721 if (err)
722 return err; 722 return err;
723 tagged = collect_mounts(path.mnt, path.dentry); 723 tagged = collect_mounts(&path);
724 path_put(&path); 724 path_put(&path);
725 if (!tagged) 725 if (!tagged)
726 return -ENOMEM; 726 return -ENOMEM;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd3765..3737a682cdf5 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h>
49 50
50#include <asm/atomic.h> 51#include <asm/atomic.h>
51 52
@@ -842,6 +843,11 @@ static int parse_cgroupfs_options(char *data,
842 struct cgroup_sb_opts *opts) 843 struct cgroup_sb_opts *opts)
843{ 844{
844 char *token, *o = data ?: "all"; 845 char *token, *o = data ?: "all";
846 unsigned long mask = (unsigned long)-1;
847
848#ifdef CONFIG_CPUSETS
849 mask = ~(1UL << cpuset_subsys_id);
850#endif
845 851
846 opts->subsys_bits = 0; 852 opts->subsys_bits = 0;
847 opts->flags = 0; 853 opts->flags = 0;
@@ -886,6 +892,15 @@ static int parse_cgroupfs_options(char *data,
886 } 892 }
887 } 893 }
888 894
895 /*
896 * Option noprefix was introduced just for backward compatibility
897 * with the old cpuset, so we allow noprefix only if mounting just
898 * the cpuset subsystem.
899 */
900 if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
901 (opts->subsys_bits & mask))
902 return -EINVAL;
903
889 /* We can't have an empty hierarchy */ 904 /* We can't have an empty hierarchy */
890 if (!opts->subsys_bits) 905 if (!opts->subsys_bits)
891 return -EINVAL; 906 return -EINVAL;
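The added check allows the legacy "noprefix" mount option only when cpuset is the sole subsystem being mounted: it clears the cpuset bit from the selected subsystems and rejects the mount if anything else remains. A small standalone sketch of the same mask test follows; the bit position and flag value are made up, and the kernel tests opts->flags with test_bit() rather than a plain bitwise AND.

#include <stdio.h>

#define CPUSET_SUBSYS_ID 3          /* made-up bit position */
#define OPT_NOPREFIX     (1UL << 0) /* made-up flag         */

/* Allow noprefix only when no subsystem other than cpuset is selected. */
static int check_opts(unsigned long subsys_bits, unsigned long flags)
{
        unsigned long mask = ~(1UL << CPUSET_SUBSYS_ID);

        if ((flags & OPT_NOPREFIX) && (subsys_bits & mask))
                return -1;              /* -EINVAL in the kernel */
        return 0;
}

int main(void)
{
        unsigned long cpuset_only = 1UL << CPUSET_SUBSYS_ID;
        unsigned long cpuset_plus = cpuset_only | (1UL << 1);

        printf("%d\n", check_opts(cpuset_only, OPT_NOPREFIX)); /* 0: allowed  */
        printf("%d\n", check_opts(cpuset_plus, OPT_NOPREFIX)); /* -1: refused */
        return 0;
}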
@@ -900,6 +915,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
900 struct cgroup *cgrp = &root->top_cgroup; 915 struct cgroup *cgrp = &root->top_cgroup;
901 struct cgroup_sb_opts opts; 916 struct cgroup_sb_opts opts;
902 917
918 lock_kernel();
903 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 919 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
904 mutex_lock(&cgroup_mutex); 920 mutex_lock(&cgroup_mutex);
905 921
@@ -927,6 +943,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
927 kfree(opts.release_agent); 943 kfree(opts.release_agent);
928 mutex_unlock(&cgroup_mutex); 944 mutex_unlock(&cgroup_mutex);
929 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 945 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
946 unlock_kernel();
930 return ret; 947 return ret;
931} 948}
932 949
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
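The new compat entry point converts a 32-bit siginfo and hands it to do_rt_tgsigqueueinfo(), the same helper behind the native rt_tgsigqueueinfo syscall, which queues a signal plus payload to one specific thread of a thread group. A hedged userspace sketch of invoking the native syscall is shown below; it assumes a libc whose headers define SYS_rt_tgsigqueueinfo and SYS_gettid, and the printf in the handler is for illustration only (it is not async-signal-safe).

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static void handler(int sig, siginfo_t *si, void *ctx)
{
        (void)ctx;
        /* The payload arrives in si->si_value, as with sigqueue(). */
        printf("got signal %d, value %d\n", sig, si->si_value.sival_int);
}

int main(void)
{
        struct sigaction sa;
        siginfo_t info;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigaction(SIGUSR1, &sa, NULL);

        memset(&info, 0, sizeof(info));
        info.si_signo = SIGUSR1;
        info.si_code = SI_QUEUE;        /* a negative si_code, as sigqueue() uses */
        info.si_pid = getpid();
        info.si_uid = getuid();
        info.si_value.sival_int = 42;

        /* Target is (tgid, tid); here we queue the signal to ourselves. */
        if (syscall(SYS_rt_tgsigqueueinfo, getpid(),
                    (pid_t)syscall(SYS_gettid), SIGUSR1, &info))
                perror("rt_tgsigqueueinfo");
        return 0;
}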
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869..7e75a41bd508 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -97,12 +97,6 @@ struct cpuset {
97 97
98 struct cpuset *parent; /* my parent */ 98 struct cpuset *parent; /* my parent */
99 99
100 /*
101 * Copy of global cpuset_mems_generation as of the most
102 * recent time this cpuset changed its mems_allowed.
103 */
104 int mems_generation;
105
106 struct fmeter fmeter; /* memory_pressure filter */ 100 struct fmeter fmeter; /* memory_pressure filter */
107 101
108 /* partition number for rebuild_sched_domains() */ 102 /* partition number for rebuild_sched_domains() */
@@ -176,27 +170,6 @@ static inline int is_spread_slab(const struct cpuset *cs)
176 return test_bit(CS_SPREAD_SLAB, &cs->flags); 170 return test_bit(CS_SPREAD_SLAB, &cs->flags);
177} 171}
178 172
179/*
180 * Increment this integer everytime any cpuset changes its
181 * mems_allowed value. Users of cpusets can track this generation
182 * number, and avoid having to lock and reload mems_allowed unless
183 * the cpuset they're using changes generation.
184 *
185 * A single, global generation is needed because cpuset_attach_task() could
186 * reattach a task to a different cpuset, which must not have its
187 * generation numbers aliased with those of that tasks previous cpuset.
188 *
189 * Generations are needed for mems_allowed because one task cannot
190 * modify another's memory placement. So we must enable every task,
191 * on every visit to __alloc_pages(), to efficiently check whether
192 * its current->cpuset->mems_allowed has changed, requiring an update
193 * of its current->mems_allowed.
194 *
195 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
196 * there is no need to mark it atomic.
197 */
198static int cpuset_mems_generation;
199
200static struct cpuset top_cpuset = { 173static struct cpuset top_cpuset = {
201 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), 174 .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
202}; 175};
@@ -228,8 +201,9 @@ static struct cpuset top_cpuset = {
228 * If a task is only holding callback_mutex, then it has read-only 201 * If a task is only holding callback_mutex, then it has read-only
229 * access to cpusets. 202 * access to cpusets.
230 * 203 *
231 * The task_struct fields mems_allowed and mems_generation may only 204 * Now, the task_struct fields mems_allowed and mempolicy may be changed
232 * be accessed in the context of that task, so require no locks. 205 * by other task, we use alloc_lock in the task_struct fields to protect
206 * them.
233 * 207 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 208 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 209 * small pieces of code, such as when reading out possibly multi-word
@@ -331,75 +305,22 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
331 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); 305 BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
332} 306}
333 307
334/** 308/*
335 * cpuset_update_task_memory_state - update task memory placement 309 * update task's spread flag if cpuset's page/slab spread flag is set
336 * 310 *
337 * If the current tasks cpusets mems_allowed changed behind our 311 * Called with callback_mutex/cgroup_mutex held
338 * backs, update current->mems_allowed, mems_generation and task NUMA
339 * mempolicy to the new value.
340 *
341 * Task mempolicy is updated by rebinding it relative to the
342 * current->cpuset if a task has its memory placement changed.
343 * Do not call this routine if in_interrupt().
344 *
345 * Call without callback_mutex or task_lock() held. May be
346 * called with or without cgroup_mutex held. Thanks in part to
347 * 'the_top_cpuset_hack', the task's cpuset pointer will never
348 * be NULL. This routine also might acquire callback_mutex during
349 * call.
350 *
351 * Reading current->cpuset->mems_generation doesn't need task_lock
352 * to guard the current->cpuset derefence, because it is guarded
353 * from concurrent freeing of current->cpuset using RCU.
354 *
355 * The rcu_dereference() is technically probably not needed,
356 * as I don't actually mind if I see a new cpuset pointer but
357 * an old value of mems_generation. However this really only
358 * matters on alpha systems using cpusets heavily. If I dropped
359 * that rcu_dereference(), it would save them a memory barrier.
360 * For all other arch's, rcu_dereference is a no-op anyway, and for
361 * alpha systems not using cpusets, another planned optimization,
362 * avoiding the rcu critical section for tasks in the root cpuset
363 * which is statically allocated, so can't vanish, will make this
364 * irrelevant. Better to use RCU as intended, than to engage in
365 * some cute trick to save a memory barrier that is impossible to
366 * test, for alpha systems using cpusets heavily, which might not
367 * even exist.
368 *
369 * This routine is needed to update the per-task mems_allowed data,
370 * within the tasks context, when it is trying to allocate memory
371 * (in various mm/mempolicy.c routines) and notices that some other
372 * task has been modifying its cpuset.
373 */ 312 */
374 313static void cpuset_update_task_spread_flag(struct cpuset *cs,
375void cpuset_update_task_memory_state(void) 314 struct task_struct *tsk)
376{ 315{
377 int my_cpusets_mem_gen; 316 if (is_spread_page(cs))
378 struct task_struct *tsk = current; 317 tsk->flags |= PF_SPREAD_PAGE;
379 struct cpuset *cs; 318 else
380 319 tsk->flags &= ~PF_SPREAD_PAGE;
381 rcu_read_lock(); 320 if (is_spread_slab(cs))
382 my_cpusets_mem_gen = task_cs(tsk)->mems_generation; 321 tsk->flags |= PF_SPREAD_SLAB;
383 rcu_read_unlock(); 322 else
384 323 tsk->flags &= ~PF_SPREAD_SLAB;
385 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
386 mutex_lock(&callback_mutex);
387 task_lock(tsk);
388 cs = task_cs(tsk); /* Maybe changed when task not locked */
389 guarantee_online_mems(cs, &tsk->mems_allowed);
390 tsk->cpuset_mems_generation = cs->mems_generation;
391 if (is_spread_page(cs))
392 tsk->flags |= PF_SPREAD_PAGE;
393 else
394 tsk->flags &= ~PF_SPREAD_PAGE;
395 if (is_spread_slab(cs))
396 tsk->flags |= PF_SPREAD_SLAB;
397 else
398 tsk->flags &= ~PF_SPREAD_SLAB;
399 task_unlock(tsk);
400 mutex_unlock(&callback_mutex);
401 mpol_rebind_task(tsk, &tsk->mems_allowed);
402 }
403} 324}
404 325
405/* 326/*
@@ -1007,14 +928,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1007 * other task, the task_struct mems_allowed that we are hacking 928 * other task, the task_struct mems_allowed that we are hacking
1008 * is for our current task, which must allocate new pages for that 929 * is for our current task, which must allocate new pages for that
1009 * migrating memory region. 930 * migrating memory region.
1010 *
1011 * We call cpuset_update_task_memory_state() before hacking
1012 * our tasks mems_allowed, so that we are assured of being in
1013 * sync with our tasks cpuset, and in particular, callbacks to
1014 * cpuset_update_task_memory_state() from nested page allocations
1015 * won't see any mismatch of our cpuset and task mems_generation
1016 * values, so won't overwrite our hacked tasks mems_allowed
1017 * nodemask.
1018 */ 931 */
1019 932
1020static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, 933static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
@@ -1022,22 +935,37 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1022{ 935{
1023 struct task_struct *tsk = current; 936 struct task_struct *tsk = current;
1024 937
1025 cpuset_update_task_memory_state();
1026
1027 mutex_lock(&callback_mutex);
1028 tsk->mems_allowed = *to; 938 tsk->mems_allowed = *to;
1029 mutex_unlock(&callback_mutex);
1030 939
1031 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 940 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
1032 941
1033 mutex_lock(&callback_mutex);
1034 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 942 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
1035 mutex_unlock(&callback_mutex);
1036} 943}
1037 944
1038/* 945/*
1039 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new 946 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
1040 * nodes if memory_migrate flag is set. Called with cgroup_mutex held. 947 * @tsk: the task to change
948 * @newmems: new nodes that the task will be set
949 *
950 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
951 * we structure updates as setting all new allowed nodes, then clearing newly
952 * disallowed ones.
953 *
954 * Called with task's alloc_lock held
955 */
956static void cpuset_change_task_nodemask(struct task_struct *tsk,
957 nodemask_t *newmems)
958{
959 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
960 mpol_rebind_task(tsk, &tsk->mems_allowed);
961 mpol_rebind_task(tsk, newmems);
962 tsk->mems_allowed = *newmems;
963}
964
965/*
966 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
967 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
968 * memory_migrate flag is set. Called with cgroup_mutex held.
1041 */ 969 */
1042static void cpuset_change_nodemask(struct task_struct *p, 970static void cpuset_change_nodemask(struct task_struct *p,
1043 struct cgroup_scanner *scan) 971 struct cgroup_scanner *scan)
@@ -1046,12 +974,19 @@ static void cpuset_change_nodemask(struct task_struct *p,
1046 struct cpuset *cs; 974 struct cpuset *cs;
1047 int migrate; 975 int migrate;
1048 const nodemask_t *oldmem = scan->data; 976 const nodemask_t *oldmem = scan->data;
977 nodemask_t newmems;
978
979 cs = cgroup_cs(scan->cg);
980 guarantee_online_mems(cs, &newmems);
981
982 task_lock(p);
983 cpuset_change_task_nodemask(p, &newmems);
984 task_unlock(p);
1049 985
1050 mm = get_task_mm(p); 986 mm = get_task_mm(p);
1051 if (!mm) 987 if (!mm)
1052 return; 988 return;
1053 989
1054 cs = cgroup_cs(scan->cg);
1055 migrate = is_memory_migrate(cs); 990 migrate = is_memory_migrate(cs);
1056 991
1057 mpol_rebind_mm(mm, &cs->mems_allowed); 992 mpol_rebind_mm(mm, &cs->mems_allowed);
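cpuset_change_task_nodemask() above deliberately ORs the new nodes into tsk->mems_allowed before rebinding the mempolicy and assigning the final mask, so a concurrent allocation never observes an empty node set even when the old and new masks are disjoint. The following userspace sketch models that two-step update, with a plain bitmask standing in for a nodemask_t.

#include <stdio.h>

/* Model of the two-step update: grow first, then shrink.
 * At every intermediate point the mask is a superset of either the
 * old or the new mask, so it is never empty if both are non-empty. */
static void change_nodemask(unsigned long *mems_allowed, unsigned long newmems)
{
        *mems_allowed |= newmems;   /* step 1: allow all new nodes         */
        /* (the kernel rebinds the task's mempolicy between the two steps) */
        *mems_allowed = newmems;    /* step 2: drop newly disallowed nodes */
}

int main(void)
{
        unsigned long mems = 0x3;   /* old nodes {0,1} */

        change_nodemask(&mems, 0xc);            /* new nodes {2,3}, disjoint  */
        printf("final mask: 0x%lx\n", mems);    /* 0xc; never 0 along the way */
        return 0;
}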
@@ -1104,10 +1039,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1104/* 1039/*
1105 * Handle user request to change the 'mems' memory placement 1040 * Handle user request to change the 'mems' memory placement
1106 * of a cpuset. Needs to validate the request, update the 1041 * of a cpuset. Needs to validate the request, update the
1107 * cpusets mems_allowed and mems_generation, and for each 1042 * cpusets mems_allowed, and for each task in the cpuset,
1108 * task in the cpuset, rebind any vma mempolicies and if 1043 * update mems_allowed and rebind task's mempolicy and any vma
1109 * the cpuset is marked 'memory_migrate', migrate the tasks 1044 * mempolicies and if the cpuset is marked 'memory_migrate',
1110 * pages to the new memory. 1045 * migrate the tasks pages to the new memory.
1111 * 1046 *
1112 * Call with cgroup_mutex held. May take callback_mutex during call. 1047 * Call with cgroup_mutex held. May take callback_mutex during call.
1113 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1048 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
@@ -1160,7 +1095,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1160 1095
1161 mutex_lock(&callback_mutex); 1096 mutex_lock(&callback_mutex);
1162 cs->mems_allowed = trialcs->mems_allowed; 1097 cs->mems_allowed = trialcs->mems_allowed;
1163 cs->mems_generation = cpuset_mems_generation++;
1164 mutex_unlock(&callback_mutex); 1098 mutex_unlock(&callback_mutex);
1165 1099
1166 update_tasks_nodemask(cs, &oldmem, &heap); 1100 update_tasks_nodemask(cs, &oldmem, &heap);
@@ -1193,6 +1127,46 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1193} 1127}
1194 1128
1195/* 1129/*
1130 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
1131 * @tsk: task to be updated
1132 * @scan: struct cgroup_scanner containing the cgroup of the task
1133 *
1134 * Called by cgroup_scan_tasks() for each task in a cgroup.
1135 *
1136 * We don't need to re-check for the cgroup/cpuset membership, since we're
1137 * holding cgroup_lock() at this point.
1138 */
1139static void cpuset_change_flag(struct task_struct *tsk,
1140 struct cgroup_scanner *scan)
1141{
1142 cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
1143}
1144
1145/*
1146 * update_tasks_flags - update the spread flags of tasks in the cpuset.
1147 * @cs: the cpuset in which each task's spread flags needs to be changed
1148 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1149 *
1150 * Called with cgroup_mutex held
1151 *
1152 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1153 * calling callback functions for each.
1154 *
1155 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1156 * if @heap != NULL.
1157 */
1158static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1159{
1160 struct cgroup_scanner scan;
1161
1162 scan.cg = cs->css.cgroup;
1163 scan.test_task = NULL;
1164 scan.process_task = cpuset_change_flag;
1165 scan.heap = heap;
1166 cgroup_scan_tasks(&scan);
1167}
1168
1169/*
1196 * update_flag - read a 0 or a 1 in a file and update associated flag 1170 * update_flag - read a 0 or a 1 in a file and update associated flag
1197 * bit: the bit to update (see cpuset_flagbits_t) 1171 * bit: the bit to update (see cpuset_flagbits_t)
1198 * cs: the cpuset to update 1172 * cs: the cpuset to update
@@ -1205,8 +1179,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1205 int turning_on) 1179 int turning_on)
1206{ 1180{
1207 struct cpuset *trialcs; 1181 struct cpuset *trialcs;
1208 int err;
1209 int balance_flag_changed; 1182 int balance_flag_changed;
1183 int spread_flag_changed;
1184 struct ptr_heap heap;
1185 int err;
1210 1186
1211 trialcs = alloc_trial_cpuset(cs); 1187 trialcs = alloc_trial_cpuset(cs);
1212 if (!trialcs) 1188 if (!trialcs)
@@ -1221,9 +1197,16 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1221 if (err < 0) 1197 if (err < 0)
1222 goto out; 1198 goto out;
1223 1199
1200 err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
1201 if (err < 0)
1202 goto out;
1203
1224 balance_flag_changed = (is_sched_load_balance(cs) != 1204 balance_flag_changed = (is_sched_load_balance(cs) !=
1225 is_sched_load_balance(trialcs)); 1205 is_sched_load_balance(trialcs));
1226 1206
1207 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1208 || (is_spread_page(cs) != is_spread_page(trialcs)));
1209
1227 mutex_lock(&callback_mutex); 1210 mutex_lock(&callback_mutex);
1228 cs->flags = trialcs->flags; 1211 cs->flags = trialcs->flags;
1229 mutex_unlock(&callback_mutex); 1212 mutex_unlock(&callback_mutex);
@@ -1231,6 +1214,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1231 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) 1214 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1232 async_rebuild_sched_domains(); 1215 async_rebuild_sched_domains();
1233 1216
1217 if (spread_flag_changed)
1218 update_tasks_flags(cs, &heap);
1219 heap_free(&heap);
1234out: 1220out:
1235 free_trial_cpuset(trialcs); 1221 free_trial_cpuset(trialcs);
1236 return err; 1222 return err;
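update_tasks_flags() above reuses the cgroup_scanner pattern: fill in a struct with an optional filter callback and a per-task worker callback, then let cgroup_scan_tasks() walk every task in the cgroup. The sketch below imitates that callback-driven scan in plain userspace C; the struct and function names are stand-ins, not the cgroup API.

#include <stdio.h>

struct task { const char *name; int flags; };

/* Stand-in for struct cgroup_scanner: a filter plus a worker callback. */
struct scanner {
        int  (*test_task)(struct task *t);     /* NULL means "all tasks" */
        void (*process_task)(struct task *t);
};

static void scan_tasks(struct scanner *s, struct task *tasks, int n)
{
        for (int i = 0; i < n; i++) {
                if (s->test_task && !s->test_task(&tasks[i]))
                        continue;
                s->process_task(&tasks[i]);
        }
}

static void set_spread_flag(struct task *t)
{
        t->flags |= 1;                          /* think PF_SPREAD_PAGE */
        printf("updated %s\n", t->name);
}

int main(void)
{
        struct task tasks[] = { { "a", 0 }, { "b", 0 } };
        struct scanner s = { .test_task = NULL,
                             .process_task = set_spread_flag };

        scan_tasks(&s, tasks, 2);
        return 0;
}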
@@ -1372,15 +1358,20 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1372 1358
1373 if (cs == &top_cpuset) { 1359 if (cs == &top_cpuset) {
1374 cpumask_copy(cpus_attach, cpu_possible_mask); 1360 cpumask_copy(cpus_attach, cpu_possible_mask);
1361 to = node_possible_map;
1375 } else { 1362 } else {
1376 mutex_lock(&callback_mutex);
1377 guarantee_online_cpus(cs, cpus_attach); 1363 guarantee_online_cpus(cs, cpus_attach);
1378 mutex_unlock(&callback_mutex); 1364 guarantee_online_mems(cs, &to);
1379 } 1365 }
1380 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1381 if (err) 1367 if (err)
1382 return; 1368 return;
1383 1369
1370 task_lock(tsk);
1371 cpuset_change_task_nodemask(tsk, &to);
1372 task_unlock(tsk);
1373 cpuset_update_task_spread_flag(cs, tsk);
1374
1384 from = oldcs->mems_allowed; 1375 from = oldcs->mems_allowed;
1385 to = cs->mems_allowed; 1376 to = cs->mems_allowed;
1386 mm = get_task_mm(tsk); 1377 mm = get_task_mm(tsk);
@@ -1442,11 +1433,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1442 break; 1433 break;
1443 case FILE_SPREAD_PAGE: 1434 case FILE_SPREAD_PAGE:
1444 retval = update_flag(CS_SPREAD_PAGE, cs, val); 1435 retval = update_flag(CS_SPREAD_PAGE, cs, val);
1445 cs->mems_generation = cpuset_mems_generation++;
1446 break; 1436 break;
1447 case FILE_SPREAD_SLAB: 1437 case FILE_SPREAD_SLAB:
1448 retval = update_flag(CS_SPREAD_SLAB, cs, val); 1438 retval = update_flag(CS_SPREAD_SLAB, cs, val);
1449 cs->mems_generation = cpuset_mems_generation++;
1450 break; 1439 break;
1451 default: 1440 default:
1452 retval = -EINVAL; 1441 retval = -EINVAL;
@@ -1786,8 +1775,6 @@ static struct cgroup_subsys_state *cpuset_create(
1786 struct cpuset *parent; 1775 struct cpuset *parent;
1787 1776
1788 if (!cont->parent) { 1777 if (!cont->parent) {
1789 /* This is early initialization for the top cgroup */
1790 top_cpuset.mems_generation = cpuset_mems_generation++;
1791 return &top_cpuset.css; 1778 return &top_cpuset.css;
1792 } 1779 }
1793 parent = cgroup_cs(cont->parent); 1780 parent = cgroup_cs(cont->parent);
@@ -1799,7 +1786,6 @@ static struct cgroup_subsys_state *cpuset_create(
1799 return ERR_PTR(-ENOMEM); 1786 return ERR_PTR(-ENOMEM);
1800 } 1787 }
1801 1788
1802 cpuset_update_task_memory_state();
1803 cs->flags = 0; 1789 cs->flags = 0;
1804 if (is_spread_page(parent)) 1790 if (is_spread_page(parent))
1805 set_bit(CS_SPREAD_PAGE, &cs->flags); 1791 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1808,7 +1794,6 @@ static struct cgroup_subsys_state *cpuset_create(
1808 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1794 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
1809 cpumask_clear(cs->cpus_allowed); 1795 cpumask_clear(cs->cpus_allowed);
1810 nodes_clear(cs->mems_allowed); 1796 nodes_clear(cs->mems_allowed);
1811 cs->mems_generation = cpuset_mems_generation++;
1812 fmeter_init(&cs->fmeter); 1797 fmeter_init(&cs->fmeter);
1813 cs->relax_domain_level = -1; 1798 cs->relax_domain_level = -1;
1814 1799
@@ -1827,8 +1812,6 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1827{ 1812{
1828 struct cpuset *cs = cgroup_cs(cont); 1813 struct cpuset *cs = cgroup_cs(cont);
1829 1814
1830 cpuset_update_task_memory_state();
1831
1832 if (is_sched_load_balance(cs)) 1815 if (is_sched_load_balance(cs))
1833 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1816 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
1834 1817
@@ -1849,21 +1832,6 @@ struct cgroup_subsys cpuset_subsys = {
1849 .early_init = 1, 1832 .early_init = 1,
1850}; 1833};
1851 1834
1852/*
1853 * cpuset_init_early - just enough so that the calls to
1854 * cpuset_update_task_memory_state() in early init code
1855 * are harmless.
1856 */
1857
1858int __init cpuset_init_early(void)
1859{
1860 alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
1861
1862 top_cpuset.mems_generation = cpuset_mems_generation++;
1863 return 0;
1864}
1865
1866
1867/** 1835/**
1868 * cpuset_init - initialize cpusets at system boot 1836 * cpuset_init - initialize cpusets at system boot
1869 * 1837 *
@@ -1874,11 +1842,13 @@ int __init cpuset_init(void)
1874{ 1842{
1875 int err = 0; 1843 int err = 0;
1876 1844
1845 if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
1846 BUG();
1847
1877 cpumask_setall(top_cpuset.cpus_allowed); 1848 cpumask_setall(top_cpuset.cpus_allowed);
1878 nodes_setall(top_cpuset.mems_allowed); 1849 nodes_setall(top_cpuset.mems_allowed);
1879 1850
1880 fmeter_init(&top_cpuset.fmeter); 1851 fmeter_init(&top_cpuset.fmeter);
1881 top_cpuset.mems_generation = cpuset_mems_generation++;
1882 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); 1852 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1883 top_cpuset.relax_domain_level = -1; 1853 top_cpuset.relax_domain_level = -1;
1884 1854
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d707..1bb4d7e5d616 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
167 167
168/* 168/*
169 * Prepare credentials for current to perform an execve() 169 * Prepare credentials for current to perform an execve()
170 * - The caller must hold current->cred_exec_mutex 170 * - The caller must hold current->cred_guard_mutex
171 */ 171 */
172struct cred *prepare_exec_creds(void) 172struct cred *prepare_exec_creds(void)
173{ 173{
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
276 struct cred *new; 276 struct cred *new;
277 int ret; 277 int ret;
278 278
279 mutex_init(&p->cred_exec_mutex); 279 mutex_init(&p->cred_guard_mutex);
280 280
281 if ( 281 if (
282#ifdef CONFIG_KEYS 282#ifdef CONFIG_KEYS
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..628d41f0dd54 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,8 @@
48#include <linux/tracehook.h> 48#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 49#include <linux/fs_struct.h>
50#include <linux/init_task.h> 50#include <linux/init_task.h>
51#include <trace/sched.h> 51#include <linux/perf_counter.h>
52#include <trace/events/sched.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/unistd.h> 55#include <asm/unistd.h>
@@ -56,10 +57,6 @@
56#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
57#include "cred-internals.h" 58#include "cred-internals.h"
58 59
59DEFINE_TRACE(sched_process_free);
60DEFINE_TRACE(sched_process_exit);
61DEFINE_TRACE(sched_process_wait);
62
63static void exit_mm(struct task_struct * tsk); 60static void exit_mm(struct task_struct * tsk);
64 61
65static void __unhash_process(struct task_struct *p) 62static void __unhash_process(struct task_struct *p)
@@ -158,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 155{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 156 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 157
158#ifdef CONFIG_PERF_COUNTERS
159 WARN_ON_ONCE(tsk->perf_counter_ctxp);
160#endif
161 trace_sched_process_free(tsk); 161 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 162 put_task_struct(tsk);
163} 163}
@@ -174,6 +174,7 @@ repeat:
174 atomic_dec(&__task_cred(p)->user->processes); 174 atomic_dec(&__task_cred(p)->user->processes);
175 175
176 proc_flush_task(p); 176 proc_flush_task(p);
177
177 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
178 tracehook_finish_release_task(p); 179 tracehook_finish_release_task(p);
179 __exit_signal(p); 180 __exit_signal(p);
@@ -374,9 +375,8 @@ static void set_special_pids(struct pid *pid)
374} 375}
375 376
376/* 377/*
377 * Let kernel threads use this to say that they 378 * Let kernel threads use this to say that they allow a certain signal.
378 * allow a certain signal (since daemonize() will 379 * Must not be used if kthread was cloned with CLONE_SIGHAND.
379 * have disabled all of them by default).
380 */ 380 */
381int allow_signal(int sig) 381int allow_signal(int sig)
382{ 382{
@@ -384,14 +384,14 @@ int allow_signal(int sig)
384 return -EINVAL; 384 return -EINVAL;
385 385
386 spin_lock_irq(&current->sighand->siglock); 386 spin_lock_irq(&current->sighand->siglock);
387 /* This is only needed for daemonize()'ed kthreads */
387 sigdelset(&current->blocked, sig); 388 sigdelset(&current->blocked, sig);
388 if (!current->mm) { 389 /*
389 /* Kernel threads handle their own signals. 390 * Kernel threads handle their own signals. Let the signal code
390 Let the signal code know it'll be handled, so 391 * know it'll be handled, so that they don't get converted to
391 that they don't get converted to SIGKILL or 392 * SIGKILL or just silently dropped.
392 just silently dropped */ 393 */
393 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; 394 current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
394 }
395 recalc_sigpending(); 395 recalc_sigpending();
396 spin_unlock_irq(&current->sighand->siglock); 396 spin_unlock_irq(&current->sighand->siglock);
397 return 0; 397 return 0;
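With daemonize() no longer blocking every signal up front, allow_signal() is simply how a kernel thread opts in to receiving a particular signal. The sketch below shows the usual pattern in a minimal module built around kthread_run(); it assumes headers roughly contemporary with this patch (newer kernels moved some declarations, for example signal_pending() to <linux/sched/signal.h>), and the thread name and sleep intervals are arbitrary.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
        allow_signal(SIGTERM);                  /* opt in to SIGTERM */

        while (!kthread_should_stop()) {
                if (signal_pending(current)) {
                        pr_info("worker: got a signal, winding down\n");
                        break;
                }
                msleep_interruptible(1000);     /* interruptible so a signal wakes us */
        }
        /* If we broke out on a signal, still wait for kthread_stop(). */
        while (!kthread_should_stop())
                msleep_interruptible(100);
        return 0;
}

static int __init worker_init(void)
{
        worker = kthread_run(worker_fn, NULL, "allow_signal_demo");
        return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit worker_exit(void)
{
        kthread_stop(worker);
}

module_init(worker_init);
module_exit(worker_exit);
MODULE_LICENSE("GPL");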
@@ -590,7 +590,7 @@ retry:
590 /* 590 /*
591 * Search in the siblings 591 * Search in the siblings
592 */ 592 */
593 list_for_each_entry(c, &p->parent->children, sibling) { 593 list_for_each_entry(c, &p->real_parent->children, sibling) {
594 if (c->mm == mm) 594 if (c->mm == mm)
595 goto assign_new_owner; 595 goto assign_new_owner;
596 } 596 }
@@ -757,7 +757,7 @@ static void reparent_thread(struct task_struct *father, struct task_struct *p,
757 p->exit_signal = SIGCHLD; 757 p->exit_signal = SIGCHLD;
758 758
759 /* If it has exited notify the new parent about this child's death. */ 759 /* If it has exited notify the new parent about this child's death. */
760 if (!p->ptrace && 760 if (!task_ptrace(p) &&
761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { 761 p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
762 do_notify_parent(p, p->exit_signal); 762 do_notify_parent(p, p->exit_signal);
763 if (task_detached(p)) { 763 if (task_detached(p)) {
@@ -782,7 +782,7 @@ static void forget_original_parent(struct task_struct *father)
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 782 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 783 p->real_parent = reaper;
784 if (p->parent == father) { 784 if (p->parent == father) {
785 BUG_ON(p->ptrace); 785 BUG_ON(task_ptrace(p));
786 p->parent = p->real_parent; 786 p->parent = p->real_parent;
787 } 787 }
788 reparent_thread(father, p, &dead_children); 788 reparent_thread(father, p, &dead_children);
@@ -975,16 +975,19 @@ NORET_TYPE void do_exit(long code)
975 module_put(tsk->binfmt->module); 975 module_put(tsk->binfmt->module);
976 976
977 proc_exit_connector(tsk); 977 proc_exit_connector(tsk);
978
979 /*
980 * Flush inherited counters to the parent - before the parent
981 * gets woken up by child-exit notifications.
982 */
983 perf_counter_exit_task(tsk);
984
978 exit_notify(tsk, group_dead); 985 exit_notify(tsk, group_dead);
979#ifdef CONFIG_NUMA 986#ifdef CONFIG_NUMA
980 mpol_put(tsk->mempolicy); 987 mpol_put(tsk->mempolicy);
981 tsk->mempolicy = NULL; 988 tsk->mempolicy = NULL;
982#endif 989#endif
983#ifdef CONFIG_FUTEX 990#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 991 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 992 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 993 if (unlikely(current->pi_state_cache))
@@ -1077,6 +1080,18 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
1077 return 0; 1080 return 0;
1078} 1081}
1079 1082
1083struct wait_opts {
1084 enum pid_type wo_type;
1085 int wo_flags;
1086 struct pid *wo_pid;
1087
1088 struct siginfo __user *wo_info;
1089 int __user *wo_stat;
1090 struct rusage __user *wo_rusage;
1091
1092 int notask_error;
1093};
1094
1080static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1095static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1081{ 1096{
1082 struct pid *pid = NULL; 1097 struct pid *pid = NULL;
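The new struct wait_opts gathers the wait type, pid, flags and user-space destination pointers that were previously threaded through every helper as separate arguments, so the later hunks can shrink the signatures of eligible_child(), wait_task_zombie() and friends to a single descriptor. A small userspace illustration of the same options-struct refactoring follows; the fields and the fake child list are simplified stand-ins.

#include <stdio.h>

/* All of the wait parameters travel together in one descriptor,
 * mirroring what struct wait_opts does for do_wait() and friends. */
struct wait_opts {
        int   wo_flags;     /* option bits (unused in this toy)   */
        int   wo_pid;       /* which child we are waiting for     */
        int  *wo_stat;      /* where to store the exit status     */
        int   notask_error; /* running "-ECHILD or 0" bookkeeping */
};

static int eligible_child(struct wait_opts *wo, int child_pid)
{
        return wo->wo_pid == -1 || wo->wo_pid == child_pid;
}

static int do_wait(struct wait_opts *wo)
{
        int children[] = { 100, 101 };

        wo->notask_error = -1;                  /* assume "no child" (-ECHILD) */
        for (int i = 0; i < 2; i++) {
                if (!eligible_child(wo, children[i]))
                        continue;
                wo->notask_error = 0;           /* found something eligible */
                if (wo->wo_stat)
                        *wo->wo_stat = 0;       /* pretend it exited cleanly */
                return children[i];
        }
        return wo->notask_error;
}

int main(void)
{
        int status = -1;
        struct wait_opts wo = { .wo_flags = 0, .wo_pid = 101,
                                .wo_stat = &status, .notask_error = 0 };

        printf("reaped %d, status %d\n", do_wait(&wo), status);
        return 0;
}

The benefit mirrors the kernel change: adding or reordering a field touches one struct definition rather than every call chain that forwards it.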
@@ -1087,13 +1102,12 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1087 return pid; 1102 return pid;
1088} 1103}
1089 1104
1090static int eligible_child(enum pid_type type, struct pid *pid, int options, 1105static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1091 struct task_struct *p)
1092{ 1106{
1093 int err; 1107 int err;
1094 1108
1095 if (type < PIDTYPE_MAX) { 1109 if (wo->wo_type < PIDTYPE_MAX) {
1096 if (task_pid_type(p, type) != pid) 1110 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1097 return 0; 1111 return 0;
1098 } 1112 }
1099 1113
@@ -1102,8 +1116,8 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1102 * set; otherwise, wait for non-clone children *only*. (Note: 1116 * set; otherwise, wait for non-clone children *only*. (Note:
1103 * A "clone" child here is one that reports to its parent 1117 * A "clone" child here is one that reports to its parent
1104 * using a signal other than SIGCHLD.) */ 1118 * using a signal other than SIGCHLD.) */
1105 if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0)) 1119 if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
1106 && !(options & __WALL)) 1120 && !(wo->wo_flags & __WALL))
1107 return 0; 1121 return 0;
1108 1122
1109 err = security_task_wait(p); 1123 err = security_task_wait(p);
@@ -1113,14 +1127,15 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1113 return 1; 1127 return 1;
1114} 1128}
1115 1129
1116static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1130static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1117 int why, int status, 1131 pid_t pid, uid_t uid, int why, int status)
1118 struct siginfo __user *infop,
1119 struct rusage __user *rusagep)
1120{ 1132{
1121 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1133 struct siginfo __user *infop;
1134 int retval = wo->wo_rusage
1135 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1122 1136
1123 put_task_struct(p); 1137 put_task_struct(p);
1138 infop = wo->wo_info;
1124 if (!retval) 1139 if (!retval)
1125 retval = put_user(SIGCHLD, &infop->si_signo); 1140 retval = put_user(SIGCHLD, &infop->si_signo);
1126 if (!retval) 1141 if (!retval)
@@ -1144,19 +1159,18 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1144 * the lock and this task is uninteresting. If we return nonzero, we have 1159 * the lock and this task is uninteresting. If we return nonzero, we have
1145 * released the lock and the system call should return. 1160 * released the lock and the system call should return.
1146 */ 1161 */
1147static int wait_task_zombie(struct task_struct *p, int options, 1162static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1148 struct siginfo __user *infop,
1149 int __user *stat_addr, struct rusage __user *ru)
1150{ 1163{
1151 unsigned long state; 1164 unsigned long state;
1152 int retval, status, traced; 1165 int retval, status, traced;
1153 pid_t pid = task_pid_vnr(p); 1166 pid_t pid = task_pid_vnr(p);
1154 uid_t uid = __task_cred(p)->uid; 1167 uid_t uid = __task_cred(p)->uid;
1168 struct siginfo __user *infop;
1155 1169
1156 if (!likely(options & WEXITED)) 1170 if (!likely(wo->wo_flags & WEXITED))
1157 return 0; 1171 return 0;
1158 1172
1159 if (unlikely(options & WNOWAIT)) { 1173 if (unlikely(wo->wo_flags & WNOWAIT)) {
1160 int exit_code = p->exit_code; 1174 int exit_code = p->exit_code;
1161 int why, status; 1175 int why, status;
1162 1176
@@ -1169,8 +1183,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1169 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED; 1183 why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
1170 status = exit_code & 0x7f; 1184 status = exit_code & 0x7f;
1171 } 1185 }
1172 return wait_noreap_copyout(p, pid, uid, why, 1186 return wait_noreap_copyout(wo, p, pid, uid, why, status);
1173 status, infop, ru);
1174 } 1187 }
1175 1188
1176 /* 1189 /*
@@ -1184,11 +1197,13 @@ static int wait_task_zombie(struct task_struct *p, int options,
1184 } 1197 }
1185 1198
1186 traced = ptrace_reparented(p); 1199 traced = ptrace_reparented(p);
1187 1200 /*
1188 if (likely(!traced)) { 1201 * It can be ptraced but not reparented, check
1202 * !task_detached() to filter out sub-threads.
1203 */
1204 if (likely(!traced) && likely(!task_detached(p))) {
1189 struct signal_struct *psig; 1205 struct signal_struct *psig;
1190 struct signal_struct *sig; 1206 struct signal_struct *sig;
1191 struct task_cputime cputime;
1192 1207
1193 /* 1208 /*
1194 * The resource counters for the group leader are in its 1209 * The resource counters for the group leader are in its
@@ -1201,26 +1216,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1201 * p->signal fields, because they are only touched by 1216 * p->signal fields, because they are only touched by
1202 * __exit_signal, which runs with tasklist_lock 1217 * __exit_signal, which runs with tasklist_lock
1203 * write-locked anyway, and so is excluded here. We do 1218 * write-locked anyway, and so is excluded here. We do
1204 * need to protect the access to p->parent->signal fields, 1219 * need to protect the access to parent->signal fields,
1205 * as other threads in the parent group can be right 1220 * as other threads in the parent group can be right
1206 * here reaping other children at the same time. 1221 * here reaping other children at the same time.
1207 *
1208 * We use thread_group_cputime() to get times for the thread
1209 * group, which consolidates times for all threads in the
1210 * group including the group leader.
1211 */ 1222 */
1212 thread_group_cputime(p, &cputime); 1223 spin_lock_irq(&p->real_parent->sighand->siglock);
1213 spin_lock_irq(&p->parent->sighand->siglock); 1224 psig = p->real_parent->signal;
1214 psig = p->parent->signal;
1215 sig = p->signal; 1225 sig = p->signal;
1216 psig->cutime = 1226 psig->cutime =
1217 cputime_add(psig->cutime, 1227 cputime_add(psig->cutime,
1218 cputime_add(cputime.utime, 1228 cputime_add(p->utime,
1219 sig->cutime)); 1229 cputime_add(sig->utime,
1230 sig->cutime)));
1220 psig->cstime = 1231 psig->cstime =
1221 cputime_add(psig->cstime, 1232 cputime_add(psig->cstime,
1222 cputime_add(cputime.stime, 1233 cputime_add(p->stime,
1223 sig->cstime)); 1234 cputime_add(sig->stime,
1235 sig->cstime)));
1224 psig->cgtime = 1236 psig->cgtime =
1225 cputime_add(psig->cgtime, 1237 cputime_add(psig->cgtime,
1226 cputime_add(p->gtime, 1238 cputime_add(p->gtime,
@@ -1242,7 +1254,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1242 sig->oublock + sig->coublock; 1254 sig->oublock + sig->coublock;
1243 task_io_accounting_add(&psig->ioac, &p->ioac); 1255 task_io_accounting_add(&psig->ioac, &p->ioac);
1244 task_io_accounting_add(&psig->ioac, &sig->ioac); 1256 task_io_accounting_add(&psig->ioac, &sig->ioac);
1245 spin_unlock_irq(&p->parent->sighand->siglock); 1257 spin_unlock_irq(&p->real_parent->sighand->siglock);
1246 } 1258 }
1247 1259
1248 /* 1260 /*
@@ -1251,11 +1263,14 @@ static int wait_task_zombie(struct task_struct *p, int options,
1251 */ 1263 */
1252 read_unlock(&tasklist_lock); 1264 read_unlock(&tasklist_lock);
1253 1265
1254 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1266 retval = wo->wo_rusage
1267 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1255 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1268 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1256 ? p->signal->group_exit_code : p->exit_code; 1269 ? p->signal->group_exit_code : p->exit_code;
1257 if (!retval && stat_addr) 1270 if (!retval && wo->wo_stat)
1258 retval = put_user(status, stat_addr); 1271 retval = put_user(status, wo->wo_stat);
1272
1273 infop = wo->wo_info;
1259 if (!retval && infop) 1274 if (!retval && infop)
1260 retval = put_user(SIGCHLD, &infop->si_signo); 1275 retval = put_user(SIGCHLD, &infop->si_signo);
1261 if (!retval && infop) 1276 if (!retval && infop)
@@ -1323,15 +1338,18 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
1323 * the lock and this task is uninteresting. If we return nonzero, we have 1338 * the lock and this task is uninteresting. If we return nonzero, we have
1324 * released the lock and the system call should return. 1339 * released the lock and the system call should return.
1325 */ 1340 */
1326static int wait_task_stopped(int ptrace, struct task_struct *p, 1341static int wait_task_stopped(struct wait_opts *wo,
1327 int options, struct siginfo __user *infop, 1342 int ptrace, struct task_struct *p)
1328 int __user *stat_addr, struct rusage __user *ru)
1329{ 1343{
1344 struct siginfo __user *infop;
1330 int retval, exit_code, *p_code, why; 1345 int retval, exit_code, *p_code, why;
1331 uid_t uid = 0; /* unneeded, required by compiler */ 1346 uid_t uid = 0; /* unneeded, required by compiler */
1332 pid_t pid; 1347 pid_t pid;
1333 1348
1334 if (!(options & WUNTRACED)) 1349 /*
1350 * Traditionally we see ptrace'd stopped tasks regardless of options.
1351 */
1352 if (!ptrace && !(wo->wo_flags & WUNTRACED))
1335 return 0; 1353 return 0;
1336 1354
1337 exit_code = 0; 1355 exit_code = 0;
@@ -1345,7 +1363,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
1345 if (!exit_code) 1363 if (!exit_code)
1346 goto unlock_sig; 1364 goto unlock_sig;
1347 1365
1348 if (!unlikely(options & WNOWAIT)) 1366 if (!unlikely(wo->wo_flags & WNOWAIT))
1349 *p_code = 0; 1367 *p_code = 0;
1350 1368
1351 /* don't need the RCU readlock here as we're holding a spinlock */ 1369 /* don't need the RCU readlock here as we're holding a spinlock */
@@ -1367,14 +1385,15 @@ unlock_sig:
1367 why = ptrace ? CLD_TRAPPED : CLD_STOPPED; 1385 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1368 read_unlock(&tasklist_lock); 1386 read_unlock(&tasklist_lock);
1369 1387
1370 if (unlikely(options & WNOWAIT)) 1388 if (unlikely(wo->wo_flags & WNOWAIT))
1371 return wait_noreap_copyout(p, pid, uid, 1389 return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
1372 why, exit_code, 1390
1373 infop, ru); 1391 retval = wo->wo_rusage
1392 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1393 if (!retval && wo->wo_stat)
1394 retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
1374 1395
1375 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1396 infop = wo->wo_info;
1376 if (!retval && stat_addr)
1377 retval = put_user((exit_code << 8) | 0x7f, stat_addr);
1378 if (!retval && infop) 1397 if (!retval && infop)
1379 retval = put_user(SIGCHLD, &infop->si_signo); 1398 retval = put_user(SIGCHLD, &infop->si_signo);
1380 if (!retval && infop) 1399 if (!retval && infop)
@@ -1401,15 +1420,13 @@ unlock_sig:
1401 * the lock and this task is uninteresting. If we return nonzero, we have 1420 * the lock and this task is uninteresting. If we return nonzero, we have
1402 * released the lock and the system call should return. 1421 * released the lock and the system call should return.
1403 */ 1422 */
1404static int wait_task_continued(struct task_struct *p, int options, 1423static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1405 struct siginfo __user *infop,
1406 int __user *stat_addr, struct rusage __user *ru)
1407{ 1424{
1408 int retval; 1425 int retval;
1409 pid_t pid; 1426 pid_t pid;
1410 uid_t uid; 1427 uid_t uid;
1411 1428
1412 if (!unlikely(options & WCONTINUED)) 1429 if (!unlikely(wo->wo_flags & WCONTINUED))
1413 return 0; 1430 return 0;
1414 1431
1415 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1432 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
@@ -1421,7 +1438,7 @@ static int wait_task_continued(struct task_struct *p, int options,
1421 spin_unlock_irq(&p->sighand->siglock); 1438 spin_unlock_irq(&p->sighand->siglock);
1422 return 0; 1439 return 0;
1423 } 1440 }
1424 if (!unlikely(options & WNOWAIT)) 1441 if (!unlikely(wo->wo_flags & WNOWAIT))
1425 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1442 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1426 uid = __task_cred(p)->uid; 1443 uid = __task_cred(p)->uid;
1427 spin_unlock_irq(&p->sighand->siglock); 1444 spin_unlock_irq(&p->sighand->siglock);
@@ -1430,17 +1447,17 @@ static int wait_task_continued(struct task_struct *p, int options,
1430 get_task_struct(p); 1447 get_task_struct(p);
1431 read_unlock(&tasklist_lock); 1448 read_unlock(&tasklist_lock);
1432 1449
1433 if (!infop) { 1450 if (!wo->wo_info) {
1434 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1451 retval = wo->wo_rusage
1452 ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
1435 put_task_struct(p); 1453 put_task_struct(p);
1436 if (!retval && stat_addr) 1454 if (!retval && wo->wo_stat)
1437 retval = put_user(0xffff, stat_addr); 1455 retval = put_user(0xffff, wo->wo_stat);
1438 if (!retval) 1456 if (!retval)
1439 retval = pid; 1457 retval = pid;
1440 } else { 1458 } else {
1441 retval = wait_noreap_copyout(p, pid, uid, 1459 retval = wait_noreap_copyout(wo, p, pid, uid,
1442 CLD_CONTINUED, SIGCONT, 1460 CLD_CONTINUED, SIGCONT);
1443 infop, ru);
1444 BUG_ON(retval == 0); 1461 BUG_ON(retval == 0);
1445 } 1462 }
1446 1463
@@ -1450,19 +1467,16 @@ static int wait_task_continued(struct task_struct *p, int options,
1450/* 1467/*
1451 * Consider @p for a wait by @parent. 1468 * Consider @p for a wait by @parent.
1452 * 1469 *
1453 * -ECHILD should be in *@notask_error before the first call. 1470 * -ECHILD should be in ->notask_error before the first call.
1454 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1471 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1455 * Returns zero if the search for a child should continue; 1472 * Returns zero if the search for a child should continue;
1456 * then *@notask_error is 0 if @p is an eligible child, 1473 * then ->notask_error is 0 if @p is an eligible child,
1457 * or another error from security_task_wait(), or still -ECHILD. 1474 * or another error from security_task_wait(), or still -ECHILD.
1458 */ 1475 */
1459static int wait_consider_task(struct task_struct *parent, int ptrace, 1476static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent,
1460 struct task_struct *p, int *notask_error, 1477 int ptrace, struct task_struct *p)
1461 enum pid_type type, struct pid *pid, int options,
1462 struct siginfo __user *infop,
1463 int __user *stat_addr, struct rusage __user *ru)
1464{ 1478{
1465 int ret = eligible_child(type, pid, options, p); 1479 int ret = eligible_child(wo, p);
1466 if (!ret) 1480 if (!ret)
1467 return ret; 1481 return ret;
1468 1482
@@ -1474,16 +1488,17 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1474 * to look for security policy problems, rather 1488 * to look for security policy problems, rather
1475 * than for mysterious wait bugs. 1489 * than for mysterious wait bugs.
1476 */ 1490 */
1477 if (*notask_error) 1491 if (wo->notask_error)
1478 *notask_error = ret; 1492 wo->notask_error = ret;
1493 return 0;
1479 } 1494 }
1480 1495
1481 if (likely(!ptrace) && unlikely(p->ptrace)) { 1496 if (likely(!ptrace) && unlikely(task_ptrace(p))) {
1482 /* 1497 /*
1483 * This child is hidden by ptrace. 1498 * This child is hidden by ptrace.
1484 * We aren't allowed to see it now, but eventually we will. 1499 * We aren't allowed to see it now, but eventually we will.
1485 */ 1500 */
1486 *notask_error = 0; 1501 wo->notask_error = 0;
1487 return 0; 1502 return 0;
1488 } 1503 }
1489 1504
@@ -1494,34 +1509,30 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
1494 * We don't reap group leaders with subthreads. 1509 * We don't reap group leaders with subthreads.
1495 */ 1510 */
1496 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) 1511 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1497 return wait_task_zombie(p, options, infop, stat_addr, ru); 1512 return wait_task_zombie(wo, p);
1498 1513
1499 /* 1514 /*
1500 * It's stopped or running now, so it might 1515 * It's stopped or running now, so it might
1501 * later continue, exit, or stop again. 1516 * later continue, exit, or stop again.
1502 */ 1517 */
1503 *notask_error = 0; 1518 wo->notask_error = 0;
1504 1519
1505 if (task_stopped_code(p, ptrace)) 1520 if (task_stopped_code(p, ptrace))
1506 return wait_task_stopped(ptrace, p, options, 1521 return wait_task_stopped(wo, ptrace, p);
1507 infop, stat_addr, ru);
1508 1522
1509 return wait_task_continued(p, options, infop, stat_addr, ru); 1523 return wait_task_continued(wo, p);
1510} 1524}
1511 1525
1512/* 1526/*
1513 * Do the work of do_wait() for one thread in the group, @tsk. 1527 * Do the work of do_wait() for one thread in the group, @tsk.
1514 * 1528 *
1515 * -ECHILD should be in *@notask_error before the first call. 1529 * -ECHILD should be in ->notask_error before the first call.
1516 * Returns nonzero for a final return, when we have unlocked tasklist_lock. 1530 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1517 * Returns zero if the search for a child should continue; then 1531 * Returns zero if the search for a child should continue; then
1518 * *@notask_error is 0 if there were any eligible children, 1532 * ->notask_error is 0 if there were any eligible children,
1519 * or another error from security_task_wait(), or still -ECHILD. 1533 * or another error from security_task_wait(), or still -ECHILD.
1520 */ 1534 */
1521static int do_wait_thread(struct task_struct *tsk, int *notask_error, 1535static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1522 enum pid_type type, struct pid *pid, int options,
1523 struct siginfo __user *infop, int __user *stat_addr,
1524 struct rusage __user *ru)
1525{ 1536{
1526 struct task_struct *p; 1537 struct task_struct *p;
1527 1538
@@ -1530,9 +1541,7 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1530 * Do not consider detached threads. 1541 * Do not consider detached threads.
1531 */ 1542 */
1532 if (!task_detached(p)) { 1543 if (!task_detached(p)) {
1533 int ret = wait_consider_task(tsk, 0, p, notask_error, 1544 int ret = wait_consider_task(wo, tsk, 0, p);
1534 type, pid, options,
1535 infop, stat_addr, ru);
1536 if (ret) 1545 if (ret)
1537 return ret; 1546 return ret;
1538 } 1547 }
@@ -1541,22 +1550,12 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1541 return 0; 1550 return 0;
1542} 1551}
1543 1552
1544static int ptrace_do_wait(struct task_struct *tsk, int *notask_error, 1553static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1545 enum pid_type type, struct pid *pid, int options,
1546 struct siginfo __user *infop, int __user *stat_addr,
1547 struct rusage __user *ru)
1548{ 1554{
1549 struct task_struct *p; 1555 struct task_struct *p;
1550 1556
1551 /*
1552 * Traditionally we see ptrace'd stopped tasks regardless of options.
1553 */
1554 options |= WUNTRACED;
1555
1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1557 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1557 int ret = wait_consider_task(tsk, 1, p, notask_error, 1558 int ret = wait_consider_task(wo, tsk, 1, p);
1558 type, pid, options,
1559 infop, stat_addr, ru);
1560 if (ret) 1559 if (ret)
1561 return ret; 1560 return ret;
1562 } 1561 }
@@ -1564,65 +1563,59 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1564 return 0; 1563 return 0;
1565} 1564}
1566 1565
1567static long do_wait(enum pid_type type, struct pid *pid, int options, 1566static long do_wait(struct wait_opts *wo)
1568 struct siginfo __user *infop, int __user *stat_addr,
1569 struct rusage __user *ru)
1570{ 1567{
1571 DECLARE_WAITQUEUE(wait, current); 1568 DECLARE_WAITQUEUE(wait, current);
1572 struct task_struct *tsk; 1569 struct task_struct *tsk;
1573 int retval; 1570 int retval;
1574 1571
1575 trace_sched_process_wait(pid); 1572 trace_sched_process_wait(wo->wo_pid);
1576 1573
1577 add_wait_queue(&current->signal->wait_chldexit,&wait); 1574 add_wait_queue(&current->signal->wait_chldexit,&wait);
1578repeat: 1575repeat:
1579 /* 1576 /*
1580 * If there is nothing that can match our criteria just get out. 1577 * If there is nothing that can match our criteria just get out.
1581 * We will clear @retval to zero if we see any child that might later 1578 * We will clear ->notask_error to zero if we see any child that
1582 * match our criteria, even if we are not able to reap it yet. 1579 * might later match our criteria, even if we are not able to reap
1580 * it yet.
1583 */ 1581 */
1584 retval = -ECHILD; 1582 wo->notask_error = -ECHILD;
1585 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1583 if ((wo->wo_type < PIDTYPE_MAX) &&
1586 goto end; 1584 (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
1585 goto notask;
1587 1586
1588 current->state = TASK_INTERRUPTIBLE; 1587 set_current_state(TASK_INTERRUPTIBLE);
1589 read_lock(&tasklist_lock); 1588 read_lock(&tasklist_lock);
1590 tsk = current; 1589 tsk = current;
1591 do { 1590 do {
1592 int tsk_result = do_wait_thread(tsk, &retval, 1591 retval = do_wait_thread(wo, tsk);
1593 type, pid, options, 1592 if (retval)
1594 infop, stat_addr, ru);
1595 if (!tsk_result)
1596 tsk_result = ptrace_do_wait(tsk, &retval,
1597 type, pid, options,
1598 infop, stat_addr, ru);
1599 if (tsk_result) {
1600 /*
1601 * tasklist_lock is unlocked and we have a final result.
1602 */
1603 retval = tsk_result;
1604 goto end; 1593 goto end;
1605 }
1606 1594
1607 if (options & __WNOTHREAD) 1595 retval = ptrace_do_wait(wo, tsk);
1596 if (retval)
1597 goto end;
1598
1599 if (wo->wo_flags & __WNOTHREAD)
1608 break; 1600 break;
1609 tsk = next_thread(tsk); 1601 } while_each_thread(current, tsk);
1610 BUG_ON(tsk->signal != current->signal);
1611 } while (tsk != current);
1612 read_unlock(&tasklist_lock); 1602 read_unlock(&tasklist_lock);
1613 1603
1614 if (!retval && !(options & WNOHANG)) { 1604notask:
1605 retval = wo->notask_error;
1606 if (!retval && !(wo->wo_flags & WNOHANG)) {
1615 retval = -ERESTARTSYS; 1607 retval = -ERESTARTSYS;
1616 if (!signal_pending(current)) { 1608 if (!signal_pending(current)) {
1617 schedule(); 1609 schedule();
1618 goto repeat; 1610 goto repeat;
1619 } 1611 }
1620 } 1612 }
1621
1622end: 1613end:
1623 current->state = TASK_RUNNING; 1614 __set_current_state(TASK_RUNNING);
1624 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1615 remove_wait_queue(&current->signal->wait_chldexit,&wait);
1625 if (infop) { 1616 if (wo->wo_info) {
1617 struct siginfo __user *infop = wo->wo_info;
1618
1626 if (retval > 0) 1619 if (retval > 0)
1627 retval = 0; 1620 retval = 0;
1628 else { 1621 else {
@@ -1651,6 +1644,7 @@ end:
1651SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, 1644SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1652 infop, int, options, struct rusage __user *, ru) 1645 infop, int, options, struct rusage __user *, ru)
1653{ 1646{
1647 struct wait_opts wo;
1654 struct pid *pid = NULL; 1648 struct pid *pid = NULL;
1655 enum pid_type type; 1649 enum pid_type type;
1656 long ret; 1650 long ret;
@@ -1680,7 +1674,14 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1680 1674
1681 if (type < PIDTYPE_MAX) 1675 if (type < PIDTYPE_MAX)
1682 pid = find_get_pid(upid); 1676 pid = find_get_pid(upid);
1683 ret = do_wait(type, pid, options, infop, NULL, ru); 1677
1678 wo.wo_type = type;
1679 wo.wo_pid = pid;
1680 wo.wo_flags = options;
1681 wo.wo_info = infop;
1682 wo.wo_stat = NULL;
1683 wo.wo_rusage = ru;
1684 ret = do_wait(&wo);
1684 put_pid(pid); 1685 put_pid(pid);
1685 1686
1686 /* avoid REGPARM breakage on x86: */ 1687 /* avoid REGPARM breakage on x86: */
@@ -1691,6 +1692,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1691SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, 1692SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1692 int, options, struct rusage __user *, ru) 1693 int, options, struct rusage __user *, ru)
1693{ 1694{
1695 struct wait_opts wo;
1694 struct pid *pid = NULL; 1696 struct pid *pid = NULL;
1695 enum pid_type type; 1697 enum pid_type type;
1696 long ret; 1698 long ret;
@@ -1712,7 +1714,13 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
1712 pid = find_get_pid(upid); 1714 pid = find_get_pid(upid);
1713 } 1715 }
1714 1716
1715 ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru); 1717 wo.wo_type = type;
1718 wo.wo_pid = pid;
1719 wo.wo_flags = options | WEXITED;
1720 wo.wo_info = NULL;
1721 wo.wo_stat = stat_addr;
1722 wo.wo_rusage = ru;
1723 ret = do_wait(&wo);
1716 put_pid(pid); 1724 put_pid(pid);
1717 1725
1718 /* avoid REGPARM breakage on x86: */ 1726 /* avoid REGPARM breakage on x86: */
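The two conversions above replace the six-argument calling convention of do_wait() and its helpers with a single struct wait_opts filled in by the syscalls. The structure definition itself sits earlier in kernel/exit.c and is not part of these hunks; the sketch below is assembled from the fields written by waitid()/wait4() and read by do_wait(), so treat the exact layout and the comments as an approximation.

struct wait_opts {
        enum pid_type           wo_type;        /* PIDTYPE_PID/PGID, or PIDTYPE_MAX for "any child" */
        int                     wo_flags;       /* WNOHANG, WEXITED, WUNTRACED, __WNOTHREAD, ... */
        struct pid              *wo_pid;        /* pid to wait for, NULL for "any" */

        struct siginfo __user   *wo_info;       /* waitid() siginfo destination, or NULL */
        int __user              *wo_stat;       /* wait4() status word, or NULL */
        struct rusage __user    *wo_rusage;     /* optional rusage destination */

        int                     notask_error;   /* -ECHILD until an eligible child is seen */
};

Carrying notask_error inside the struct is what lets wait_consider_task() and do_wait_thread() report "no eligible child yet" without threading an extra int * through every level, which is most of what the deleted parameter lists were doing.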
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd00726..467746b3f0aa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,8 +61,8 @@
61#include <linux/proc_fs.h> 61#include <linux/proc_fs.h>
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <trace/sched.h>
65#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_counter.h>
66 66
67#include <asm/pgtable.h> 67#include <asm/pgtable.h>
68#include <asm/pgalloc.h> 68#include <asm/pgalloc.h>
@@ -71,6 +71,8 @@
71#include <asm/cacheflush.h> 71#include <asm/cacheflush.h>
72#include <asm/tlbflush.h> 72#include <asm/tlbflush.h>
73 73
74#include <trace/events/sched.h>
75
74/* 76/*
75 * Protected counters by write_lock_irq(&tasklist_lock) 77 * Protected counters by write_lock_irq(&tasklist_lock)
76 */ 78 */
@@ -83,8 +85,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
83 85
84__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 86__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
85 87
86DEFINE_TRACE(sched_process_fork);
87
88int nr_processes(void) 88int nr_processes(void)
89{ 89{
90 int cpu; 90 int cpu;
@@ -178,7 +178,7 @@ void __init fork_init(unsigned long mempages)
178 /* create a slab on which task_structs can be allocated */ 178 /* create a slab on which task_structs can be allocated */
179 task_struct_cachep = 179 task_struct_cachep =
180 kmem_cache_create("task_struct", sizeof(struct task_struct), 180 kmem_cache_create("task_struct", sizeof(struct task_struct),
181 ARCH_MIN_TASKALIGN, SLAB_PANIC, NULL); 181 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
182#endif 182#endif
183 183
184 /* do the arch specific task caches init */ 184 /* do the arch specific task caches init */
@@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
982 if (!p) 982 if (!p)
983 goto fork_out; 983 goto fork_out;
984 984
985 ftrace_graph_init_task(p);
986
985 rt_mutex_init_task(p); 987 rt_mutex_init_task(p);
986 988
987#ifdef CONFIG_PROVE_LOCKING 989#ifdef CONFIG_PROVE_LOCKING
@@ -1027,7 +1029,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1027 p->vfork_done = NULL; 1029 p->vfork_done = NULL;
1028 spin_lock_init(&p->alloc_lock); 1030 spin_lock_init(&p->alloc_lock);
1029 1031
1030 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1031 init_sigpending(&p->pending); 1032 init_sigpending(&p->pending);
1032 1033
1033 p->utime = cputime_zero; 1034 p->utime = cputime_zero;
@@ -1089,12 +1090,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1089#ifdef CONFIG_DEBUG_MUTEXES 1090#ifdef CONFIG_DEBUG_MUTEXES
1090 p->blocked_on = NULL; /* not blocked yet */ 1091 p->blocked_on = NULL; /* not blocked yet */
1091#endif 1092#endif
1092 if (unlikely(current->ptrace)) 1093
1093 ptrace_fork(p, clone_flags); 1094 p->bts = NULL;
1094 1095
1095 /* Perform scheduler related setup. Assign this task to a CPU. */ 1096 /* Perform scheduler related setup. Assign this task to a CPU. */
1096 sched_fork(p, clone_flags); 1097 sched_fork(p, clone_flags);
1097 1098
1099 retval = perf_counter_init_task(p);
1100 if (retval)
1101 goto bad_fork_cleanup_policy;
1102
1098 if ((retval = audit_alloc(p))) 1103 if ((retval = audit_alloc(p)))
1099 goto bad_fork_cleanup_policy; 1104 goto bad_fork_cleanup_policy;
1100 /* copy all the process information */ 1105 /* copy all the process information */
@@ -1131,8 +1136,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1131 } 1136 }
1132 } 1137 }
1133 1138
1134 ftrace_graph_init_task(p);
1135
1136 p->pid = pid_nr(pid); 1139 p->pid = pid_nr(pid);
1137 p->tgid = p->pid; 1140 p->tgid = p->pid;
1138 if (clone_flags & CLONE_THREAD) 1141 if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1144,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1141 if (current->nsproxy != p->nsproxy) { 1144 if (current->nsproxy != p->nsproxy) {
1142 retval = ns_cgroup_clone(p, pid); 1145 retval = ns_cgroup_clone(p, pid);
1143 if (retval) 1146 if (retval)
1144 goto bad_fork_free_graph; 1147 goto bad_fork_free_pid;
1145 } 1148 }
1146 1149
1147 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1150 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1236,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 spin_unlock(&current->sighand->siglock); 1236 spin_unlock(&current->sighand->siglock);
1234 write_unlock_irq(&tasklist_lock); 1237 write_unlock_irq(&tasklist_lock);
1235 retval = -ERESTARTNOINTR; 1238 retval = -ERESTARTNOINTR;
1236 goto bad_fork_free_graph; 1239 goto bad_fork_free_pid;
1237 } 1240 }
1238 1241
1239 if (clone_flags & CLONE_THREAD) { 1242 if (clone_flags & CLONE_THREAD) {
@@ -1268,8 +1271,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1268 cgroup_post_fork(p); 1271 cgroup_post_fork(p);
1269 return p; 1272 return p;
1270 1273
1271bad_fork_free_graph:
1272 ftrace_graph_exit_task(p);
1273bad_fork_free_pid: 1274bad_fork_free_pid:
1274 if (pid != &init_struct_pid) 1275 if (pid != &init_struct_pid)
1275 free_pid(pid); 1276 free_pid(pid);
@@ -1293,6 +1294,7 @@ bad_fork_cleanup_semundo:
1293bad_fork_cleanup_audit: 1294bad_fork_cleanup_audit:
1294 audit_free(p); 1295 audit_free(p);
1295bad_fork_cleanup_policy: 1296bad_fork_cleanup_policy:
1297 perf_counter_free_task(p);
1296#ifdef CONFIG_NUMA 1298#ifdef CONFIG_NUMA
1297 mpol_put(p->mempolicy); 1299 mpol_put(p->mempolicy);
1298bad_fork_cleanup_cgroup: 1300bad_fork_cleanup_cgroup:
@@ -1406,10 +1408,16 @@ long do_fork(unsigned long clone_flags,
1406 if (clone_flags & CLONE_VFORK) { 1408 if (clone_flags & CLONE_VFORK) {
1407 p->vfork_done = &vfork; 1409 p->vfork_done = &vfork;
1408 init_completion(&vfork); 1410 init_completion(&vfork);
1411 } else if (!(clone_flags & CLONE_VM)) {
1412 /*
1413 * vfork will do an exec which will call
1414 * set_task_comm()
1415 */
1416 perf_counter_fork(p);
1409 } 1417 }
1410 1418
1411 audit_finish_fork(p); 1419 audit_finish_fork(p);
1412 tracehook_report_clone(trace, regs, clone_flags, nr, p); 1420 tracehook_report_clone(regs, clone_flags, nr, p);
1413 1421
1414 /* 1422 /*
1415 * We set PF_STARTING at creation in case tracing wants to 1423 * We set PF_STARTING at creation in case tracing wants to
@@ -1461,20 +1469,20 @@ void __init proc_caches_init(void)
1461{ 1469{
1462 sighand_cachep = kmem_cache_create("sighand_cache", 1470 sighand_cachep = kmem_cache_create("sighand_cache",
1463 sizeof(struct sighand_struct), 0, 1471 sizeof(struct sighand_struct), 0,
1464 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU, 1472 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
1465 sighand_ctor); 1473 SLAB_NOTRACK, sighand_ctor);
1466 signal_cachep = kmem_cache_create("signal_cache", 1474 signal_cachep = kmem_cache_create("signal_cache",
1467 sizeof(struct signal_struct), 0, 1475 sizeof(struct signal_struct), 0,
1468 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1476 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1469 files_cachep = kmem_cache_create("files_cache", 1477 files_cachep = kmem_cache_create("files_cache",
1470 sizeof(struct files_struct), 0, 1478 sizeof(struct files_struct), 0,
1471 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1479 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1472 fs_cachep = kmem_cache_create("fs_cache", 1480 fs_cachep = kmem_cache_create("fs_cache",
1473 sizeof(struct fs_struct), 0, 1481 sizeof(struct fs_struct), 0,
1474 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1482 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1475 mm_cachep = kmem_cache_create("mm_struct", 1483 mm_cachep = kmem_cache_create("mm_struct",
1476 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, 1484 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
1477 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 1485 SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
1478 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); 1486 vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
1479 mmap_init(); 1487 mmap_init();
1480} 1488}
diff --git a/kernel/futex.c b/kernel/futex.c
index eef8cd26b5e5..80b5ce716596 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
19 * PRIVATE futexes by Eric Dumazet 19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> 20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 * 21 *
22 * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23 * Copyright (C) IBM Corporation, 2009
24 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
25 *
22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 26 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
23 * enough at me, Linus for the original (flawed) idea, Matthew 27 * enough at me, Linus for the original (flawed) idea, Matthew
24 * Kirkwood for proof-of-concept implementation. 28 * Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
96 */ 100 */
97struct futex_q { 101struct futex_q {
98 struct plist_node list; 102 struct plist_node list;
99 /* There can only be a single waiter */ 103 /* Waiter reference */
100 wait_queue_head_t waiter; 104 struct task_struct *task;
101 105
102 /* Which hash list lock to use: */ 106 /* Which hash list lock to use: */
103 spinlock_t *lock_ptr; 107 spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
107 111
108 /* Optional priority inheritance state: */ 112 /* Optional priority inheritance state: */
109 struct futex_pi_state *pi_state; 113 struct futex_pi_state *pi_state;
110 struct task_struct *task; 114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter;
111 117
112 /* Bitset for the optional bitmasked wakeup */ 118 /* Bitset for the optional bitmasked wakeup */
113 u32 bitset; 119 u32 bitset;
@@ -193,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key)
193 * @uaddr: virtual address of the futex 199 * @uaddr: virtual address of the futex
194 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 200 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
195 * @key: address where result is stored. 201 * @key: address where result is stored.
202 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
196 * 203 *
197 * Returns a negative error code or 0 204 * Returns a negative error code or 0
198 * The key words are stored in *key on success. 205 * The key words are stored in *key on success.
@@ -203,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key)
203 * 210 *
204 * lock_page() might sleep, the caller should not hold a spinlock. 211 * lock_page() might sleep, the caller should not hold a spinlock.
205 */ 212 */
206static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 213static int
214get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
207{ 215{
208 unsigned long address = (unsigned long)uaddr; 216 unsigned long address = (unsigned long)uaddr;
209 struct mm_struct *mm = current->mm; 217 struct mm_struct *mm = current->mm;
@@ -226,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
226 * but access_ok() should be faster than find_vma() 234 * but access_ok() should be faster than find_vma()
227 */ 235 */
228 if (!fshared) { 236 if (!fshared) {
229 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) 237 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
230 return -EFAULT; 238 return -EFAULT;
231 key->private.mm = mm; 239 key->private.mm = mm;
232 key->private.address = address; 240 key->private.address = address;
@@ -235,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
235 } 243 }
236 244
237again: 245again:
238 err = get_user_pages_fast(address, 1, 0, &page); 246 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
239 if (err < 0) 247 if (err < 0)
240 return err; 248 return err;
241 249
@@ -276,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
276 drop_futex_key_refs(key); 284 drop_futex_key_refs(key);
277} 285}
278 286
287/**
288 * futex_top_waiter() - Return the highest priority waiter on a futex
289 * @hb: the hash bucket the futex_q's reside in
290 * @key: the futex key (to distinguish it from other futex futex_q's)
291 *
292 * Must be called with the hb lock held.
293 */
294static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
295 union futex_key *key)
296{
297 struct futex_q *this;
298
299 plist_for_each_entry(this, &hb->chain, list) {
300 if (match_futex(&this->key, key))
301 return this;
302 }
303 return NULL;
304}
305
279static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 306static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
280{ 307{
281 u32 curval; 308 u32 curval;
@@ -537,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
537 return 0; 564 return 0;
538} 565}
539 566
567/**
568 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
569 * @uaddr: the pi futex user address
570 * @hb: the pi futex hash bucket
571 * @key: the futex key associated with uaddr and hb
572 * @ps: the pi_state pointer where we store the result of the
573 * lookup
574 * @task: the task to perform the atomic lock work for. This will
575 * be "current" except in the case of requeue pi.
576 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
577 *
578 * Returns:
579 * 0 - ready to wait
580 * 1 - acquired the lock
581 * <0 - error
582 *
583 * The hb->lock and futex_key refs shall be held by the caller.
584 */
585static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
586 union futex_key *key,
587 struct futex_pi_state **ps,
588 struct task_struct *task, int set_waiters)
589{
590 int lock_taken, ret, ownerdied = 0;
591 u32 uval, newval, curval;
592
593retry:
594 ret = lock_taken = 0;
595
596 /*
597 * To avoid races, we attempt to take the lock here again
598 * (by doing a 0 -> TID atomic cmpxchg), while holding all
599 * the locks. It will most likely not succeed.
600 */
601 newval = task_pid_vnr(task);
602 if (set_waiters)
603 newval |= FUTEX_WAITERS;
604
605 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
606
607 if (unlikely(curval == -EFAULT))
608 return -EFAULT;
609
610 /*
611 * Detect deadlocks.
612 */
613 if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
614 return -EDEADLK;
615
616 /*
617 * Surprise - we got the lock. Just return to userspace:
618 */
619 if (unlikely(!curval))
620 return 1;
621
622 uval = curval;
623
624 /*
625 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
626 * to wake at the next unlock.
627 */
628 newval = curval | FUTEX_WAITERS;
629
630 /*
631 * There are two cases, where a futex might have no owner (the
632 * owner TID is 0): OWNER_DIED. We take over the futex in this
633 * case. We also do an unconditional take over, when the owner
634 * of the futex died.
635 *
636 * This is safe as we are protected by the hash bucket lock !
637 */
638 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
639 /* Keep the OWNER_DIED bit */
640 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
641 ownerdied = 0;
642 lock_taken = 1;
643 }
644
645 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
646
647 if (unlikely(curval == -EFAULT))
648 return -EFAULT;
649 if (unlikely(curval != uval))
650 goto retry;
651
652 /*
653 * We took the lock due to owner died take over.
654 */
655 if (unlikely(lock_taken))
656 return 1;
657
658 /*
659 * We don't have the lock. Look up the PI state (or create it if
660 * we are the first waiter):
661 */
662 ret = lookup_pi_state(uval, hb, key, ps);
663
664 if (unlikely(ret)) {
665 switch (ret) {
666 case -ESRCH:
667 /*
668 * No owner found for this futex. Check if the
669 * OWNER_DIED bit is set to figure out whether
670 * this is a robust futex or not.
671 */
672 if (get_futex_value_locked(&curval, uaddr))
673 return -EFAULT;
674
675 /*
676 * We simply start over in case of a robust
677 * futex. The code above will take the futex
678 * and return happy.
679 */
680 if (curval & FUTEX_OWNER_DIED) {
681 ownerdied = 1;
682 goto retry;
683 }
684 default:
685 break;
686 }
687 }
688
689 return ret;
690}
691
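futex_lock_pi_atomic() is the kernel half of the PI-futex locking protocol; userspace is expected to try the same 0 -> TID transition itself and only enter the kernel when it fails. A minimal sketch of that fast path follows. It uses GCC/Clang atomic builtins and the raw futex(2) syscall, and is an illustration of the protocol rather than the glibc implementation (FUTEX_PRIVATE_FLAG and error handling omitted).

#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Lock: 0 -> TID in userspace; on contention let the kernel queue us behind
 * the owner (FUTEX_LOCK_PI is what eventually reaches the function above). */
static void pi_lock(uint32_t *futex)
{
        uint32_t expected = 0;
        uint32_t tid = (uint32_t)syscall(SYS_gettid);

        if (!__atomic_compare_exchange_n(futex, &expected, tid, 0,
                                         __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

The FUTEX_WAITERS bit that the kernel ORs into the word is what forces the eventual unlock back into the kernel, and a cmpxchg that finds the caller's own TID already stored is the -EDEADLK case handled above.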
540/* 692/*
541 * The hash bucket lock must be held when this is called. 693 * The hash bucket lock must be held when this is called.
542 * Afterwards, the futex_q must not be accessed. 694 * Afterwards, the futex_q must not be accessed.
543 */ 695 */
544static void wake_futex(struct futex_q *q) 696static void wake_futex(struct futex_q *q)
545{ 697{
546 plist_del(&q->list, &q->list.plist); 698 struct task_struct *p = q->task;
699
547 /* 700 /*
548 * The lock in wake_up_all() is a crucial memory barrier after the 701 * We set q->lock_ptr = NULL _before_ we wake up the task. If
549 * plist_del() and also before assigning to q->lock_ptr. 702 * a non futex wake up happens on another CPU then the task
703 * might exit and p would dereference a non-existent task
704 * struct. Prevent this by holding a reference on p across the
705 * wake up.
550 */ 706 */
551 wake_up(&q->waiter); 707 get_task_struct(p);
708
709 plist_del(&q->list, &q->list.plist);
552 /* 710 /*
553 * The waiting task can free the futex_q as soon as this is written, 711 * The waiting task can free the futex_q as soon as
554 * without taking any locks. This must come last. 712 * q->lock_ptr = NULL is written, without taking any locks. A
555 * 713 * memory barrier is required here to prevent the following
556 * A memory barrier is required here to prevent the following store to 714 * store to lock_ptr from getting ahead of the plist_del.
557 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
558 * end of wake_up() does not prevent this store from moving.
559 */ 715 */
560 smp_wmb(); 716 smp_wmb();
561 q->lock_ptr = NULL; 717 q->lock_ptr = NULL;
718
719 wake_up_state(p, TASK_NORMAL);
720 put_task_struct(p);
562} 721}
563 722
564static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) 723static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -677,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
677 if (!bitset) 836 if (!bitset)
678 return -EINVAL; 837 return -EINVAL;
679 838
680 ret = get_futex_key(uaddr, fshared, &key); 839 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
681 if (unlikely(ret != 0)) 840 if (unlikely(ret != 0))
682 goto out; 841 goto out;
683 842
@@ -687,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
687 846
688 plist_for_each_entry_safe(this, next, head, list) { 847 plist_for_each_entry_safe(this, next, head, list) {
689 if (match_futex (&this->key, &key)) { 848 if (match_futex (&this->key, &key)) {
690 if (this->pi_state) { 849 if (this->pi_state || this->rt_waiter) {
691 ret = -EINVAL; 850 ret = -EINVAL;
692 break; 851 break;
693 } 852 }
@@ -723,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
723 int ret, op_ret; 882 int ret, op_ret;
724 883
725retry: 884retry:
726 ret = get_futex_key(uaddr1, fshared, &key1); 885 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
727 if (unlikely(ret != 0)) 886 if (unlikely(ret != 0))
728 goto out; 887 goto out;
729 ret = get_futex_key(uaddr2, fshared, &key2); 888 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
730 if (unlikely(ret != 0)) 889 if (unlikely(ret != 0))
731 goto out_put_key1; 890 goto out_put_key1;
732 891
@@ -800,24 +959,185 @@ out:
800 return ret; 959 return ret;
801} 960}
802 961
803/* 962/**
804 * Requeue all waiters hashed on one physical page to another 963 * requeue_futex() - Requeue a futex_q from one hb to another
805 * physical page. 964 * @q: the futex_q to requeue
965 * @hb1: the source hash_bucket
966 * @hb2: the target hash_bucket
967 * @key2: the new key for the requeued futex_q
968 */
969static inline
970void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
971 struct futex_hash_bucket *hb2, union futex_key *key2)
972{
973
974 /*
975 * If key1 and key2 hash to the same bucket, no need to
976 * requeue.
977 */
978 if (likely(&hb1->chain != &hb2->chain)) {
979 plist_del(&q->list, &hb1->chain);
980 plist_add(&q->list, &hb2->chain);
981 q->lock_ptr = &hb2->lock;
982#ifdef CONFIG_DEBUG_PI_LIST
983 q->list.plist.lock = &hb2->lock;
984#endif
985 }
986 get_futex_key_refs(key2);
987 q->key = *key2;
988}
989
990/**
991 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
992 * @q: the futex_q
993 * @key: the key of the requeue target futex
994 *
995 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
996 * target futex if it is uncontended or via a lock steal. Set the futex_q key
997 * to the requeue target futex so the waiter can detect the wakeup on the right
998 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
999 * atomic lock acquisition. Must be called with the q->lock_ptr held.
1000 */
1001static inline
1002void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1003{
1004 drop_futex_key_refs(&q->key);
1005 get_futex_key_refs(key);
1006 q->key = *key;
1007
1008 WARN_ON(plist_node_empty(&q->list));
1009 plist_del(&q->list, &q->list.plist);
1010
1011 WARN_ON(!q->rt_waiter);
1012 q->rt_waiter = NULL;
1013
1014 wake_up_state(q->task, TASK_NORMAL);
1015}
1016
1017/**
1018 * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1019 * @pifutex: the user address of the to futex
1020 * @hb1: the from futex hash bucket, must be locked by the caller
1021 * @hb2: the to futex hash bucket, must be locked by the caller
1022 * @key1: the from futex key
1023 * @key2: the to futex key
1024 * @ps: address to store the pi_state pointer
1025 * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
1026 *
1027 * Try and get the lock on behalf of the top waiter if we can do it atomically.
1028 * Wake the top waiter if we succeed. If the caller specified set_waiters,
1029 * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1030 * hb1 and hb2 must be held by the caller.
1031 *
1032 * Returns:
1033 * 0 - failed to acquire the lock atomically
1034 * 1 - acquired the lock
1035 * <0 - error
1036 */
1037static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1038 struct futex_hash_bucket *hb1,
1039 struct futex_hash_bucket *hb2,
1040 union futex_key *key1, union futex_key *key2,
1041 struct futex_pi_state **ps, int set_waiters)
1042{
1043 struct futex_q *top_waiter = NULL;
1044 u32 curval;
1045 int ret;
1046
1047 if (get_futex_value_locked(&curval, pifutex))
1048 return -EFAULT;
1049
1050 /*
1051 * Find the top_waiter and determine if there are additional waiters.
1052 * If the caller intends to requeue more than 1 waiter to pifutex,
1053 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1054 * as we have means to handle the possible fault. If not, don't set
1055 * the bit unnecessarily as it will force the subsequent unlock to enter
1056 * the kernel.
1057 */
1058 top_waiter = futex_top_waiter(hb1, key1);
1059
1060 /* There are no waiters, nothing for us to do. */
1061 if (!top_waiter)
1062 return 0;
1063
1064 /*
1065 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1066 * the contended case or if set_waiters is 1. The pi_state is returned
1067 * in ps in contended cases.
1068 */
1069 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1070 set_waiters);
1071 if (ret == 1)
1072 requeue_pi_wake_futex(top_waiter, key2);
1073
1074 return ret;
1075}
1076
1077/**
1078 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1079 * @uaddr1: source futex user address
1080 * @uaddr2: target futex user address
1081 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1082 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1083 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1084 * pi futex (pi to pi requeue is not supported)
1085 *
1086 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1087 * uaddr2 atomically on behalf of the top waiter.
1088 *
1089 * Returns:
1090 * >=0 - on success, the number of tasks requeued or woken
1091 * <0 - on error
806 */ 1092 */
807static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, 1093static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
808 int nr_wake, int nr_requeue, u32 *cmpval) 1094 int nr_wake, int nr_requeue, u32 *cmpval,
1095 int requeue_pi)
809{ 1096{
810 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1097 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1098 int drop_count = 0, task_count = 0, ret;
1099 struct futex_pi_state *pi_state = NULL;
811 struct futex_hash_bucket *hb1, *hb2; 1100 struct futex_hash_bucket *hb1, *hb2;
812 struct plist_head *head1; 1101 struct plist_head *head1;
813 struct futex_q *this, *next; 1102 struct futex_q *this, *next;
814 int ret, drop_count = 0; 1103 u32 curval2;
1104
1105 if (requeue_pi) {
1106 /*
1107 * requeue_pi requires a pi_state, try to allocate it now
1108 * without any locks in case it fails.
1109 */
1110 if (refill_pi_state_cache())
1111 return -ENOMEM;
1112 /*
1113 * requeue_pi must wake as many tasks as it can, up to nr_wake
1114 * + nr_requeue, since it acquires the rt_mutex prior to
1115 * returning to userspace, so as to not leave the rt_mutex with
1116 * waiters and no owner. However, second and third wake-ups
1117 * cannot be predicted as they involve race conditions with the
1118 * first wake and a fault while looking up the pi_state. Both
1119 * pthread_cond_signal() and pthread_cond_broadcast() should
1120 * use nr_wake=1.
1121 */
1122 if (nr_wake != 1)
1123 return -EINVAL;
1124 }
815 1125
816retry: 1126retry:
817 ret = get_futex_key(uaddr1, fshared, &key1); 1127 if (pi_state != NULL) {
1128 /*
1129 * We will have to lookup the pi_state again, so free this one
1130 * to keep the accounting correct.
1131 */
1132 free_pi_state(pi_state);
1133 pi_state = NULL;
1134 }
1135
1136 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
818 if (unlikely(ret != 0)) 1137 if (unlikely(ret != 0))
819 goto out; 1138 goto out;
820 ret = get_futex_key(uaddr2, fshared, &key2); 1139 ret = get_futex_key(uaddr2, fshared, &key2,
1140 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
821 if (unlikely(ret != 0)) 1141 if (unlikely(ret != 0))
822 goto out_put_key1; 1142 goto out_put_key1;
823 1143
@@ -852,32 +1172,99 @@ retry_private:
852 } 1172 }
853 } 1173 }
854 1174
1175 if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1176 /*
1177 * Attempt to acquire uaddr2 and wake the top waiter. If we
1178 * intend to requeue waiters, force setting the FUTEX_WAITERS
1179 * bit. We force this here where we are able to easily handle
1180 * faults rather in the requeue loop below.
1181 */
1182 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1183 &key2, &pi_state, nr_requeue);
1184
1185 /*
1186 * At this point the top_waiter has either taken uaddr2 or is
1187 * waiting on it. If the former, then the pi_state will not
1188 * exist yet, look it up one more time to ensure we have a
1189 * reference to it.
1190 */
1191 if (ret == 1) {
1192 WARN_ON(pi_state);
1193 task_count++;
1194 ret = get_futex_value_locked(&curval2, uaddr2);
1195 if (!ret)
1196 ret = lookup_pi_state(curval2, hb2, &key2,
1197 &pi_state);
1198 }
1199
1200 switch (ret) {
1201 case 0:
1202 break;
1203 case -EFAULT:
1204 double_unlock_hb(hb1, hb2);
1205 put_futex_key(fshared, &key2);
1206 put_futex_key(fshared, &key1);
1207 ret = get_user(curval2, uaddr2);
1208 if (!ret)
1209 goto retry;
1210 goto out;
1211 case -EAGAIN:
1212 /* The owner was exiting, try again. */
1213 double_unlock_hb(hb1, hb2);
1214 put_futex_key(fshared, &key2);
1215 put_futex_key(fshared, &key1);
1216 cond_resched();
1217 goto retry;
1218 default:
1219 goto out_unlock;
1220 }
1221 }
1222
855 head1 = &hb1->chain; 1223 head1 = &hb1->chain;
856 plist_for_each_entry_safe(this, next, head1, list) { 1224 plist_for_each_entry_safe(this, next, head1, list) {
857 if (!match_futex (&this->key, &key1)) 1225 if (task_count - nr_wake >= nr_requeue)
1226 break;
1227
1228 if (!match_futex(&this->key, &key1))
858 continue; 1229 continue;
859 if (++ret <= nr_wake) { 1230
1231 WARN_ON(!requeue_pi && this->rt_waiter);
1232 WARN_ON(requeue_pi && !this->rt_waiter);
1233
1234 /*
1235 * Wake nr_wake waiters. For requeue_pi, if we acquired the
1236 * lock, we already woke the top_waiter. If not, it will be
1237 * woken by futex_unlock_pi().
1238 */
1239 if (++task_count <= nr_wake && !requeue_pi) {
860 wake_futex(this); 1240 wake_futex(this);
861 } else { 1241 continue;
862 /* 1242 }
863 * If key1 and key2 hash to the same bucket, no need to
864 * requeue.
865 */
866 if (likely(head1 != &hb2->chain)) {
867 plist_del(&this->list, &hb1->chain);
868 plist_add(&this->list, &hb2->chain);
869 this->lock_ptr = &hb2->lock;
870#ifdef CONFIG_DEBUG_PI_LIST
871 this->list.plist.lock = &hb2->lock;
872#endif
873 }
874 this->key = key2;
875 get_futex_key_refs(&key2);
876 drop_count++;
877 1243
878 if (ret - nr_wake >= nr_requeue) 1244 /*
879 break; 1245 * Requeue nr_requeue waiters and possibly one more in the case
1246 * of requeue_pi if we couldn't acquire the lock atomically.
1247 */
1248 if (requeue_pi) {
1249 /* Prepare the waiter to take the rt_mutex. */
1250 atomic_inc(&pi_state->refcount);
1251 this->pi_state = pi_state;
1252 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1253 this->rt_waiter,
1254 this->task, 1);
1255 if (ret == 1) {
1256 /* We got the lock. */
1257 requeue_pi_wake_futex(this, &key2);
1258 continue;
1259 } else if (ret) {
1260 /* -EDEADLK */
1261 this->pi_state = NULL;
1262 free_pi_state(pi_state);
1263 goto out_unlock;
1264 }
880 } 1265 }
1266 requeue_futex(this, hb1, hb2, &key2);
1267 drop_count++;
881 } 1268 }
882 1269
883out_unlock: 1270out_unlock:
@@ -897,7 +1284,9 @@ out_put_keys:
897out_put_key1: 1284out_put_key1:
898 put_futex_key(fshared, &key1); 1285 put_futex_key(fshared, &key1);
899out: 1286out:
900 return ret; 1287 if (pi_state != NULL)
1288 free_pi_state(pi_state);
1289 return ret ? ret : task_count;
901} 1290}
902 1291
903/* The key must be already stored in q->key. */ 1292/* The key must be already stored in q->key. */
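The kernel-doc for futex_requeue() above spells out the requeue_pi constraints: nr_wake must be 1, and waiters are moved onto a PI futex, possibly taking it atomically on the way. The intended consumer is a pthread condvar paired with a PI mutex, where waiters sleep on the condvar futex and the signaller requeues them onto the mutex futex so they wake up either owning the rt_mutex or already queued on it. A very rough userspace sketch of that shape follows; the op names FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI come from the rest of this patch series rather than from these hunks, so treat them, and the whole sketch, as an assumption -- real condvar code carries far more state and retries on EAGAIN.

#include <limits.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

struct pi_cond {
        uint32_t seq;           /* condvar futex word (wakeup sequence) */
        uint32_t *pi_mutex;     /* futex word of the associated PI mutex */
};

/* Waiter: drop the mutex, then wait on the condvar futex and ask the kernel
 * to move us onto the PI mutex when we are signalled. */
static long cond_wait(struct pi_cond *c)
{
        uint32_t seq = __atomic_load_n(&c->seq, __ATOMIC_ACQUIRE);

        /* pi_unlock(c->pi_mutex) would go here (see the locking sketch earlier). */
        return syscall(SYS_futex, &c->seq, FUTEX_WAIT_REQUEUE_PI, seq,
                       NULL /* no timeout */, c->pi_mutex, 0);
        /* Roughly: 0 means we were requeued and now hold the PI mutex; failure
         * with ETIMEDOUT/EINTR means we woke on the condvar futex itself and
         * must lock the mutex ourselves. */
}

/* Signaller: wake at most one waiter (nr_wake == 1, as required above) and
 * requeue the rest onto the PI mutex futex. */
static long cond_signal(struct pi_cond *c, int broadcast)
{
        uint32_t seq = __atomic_add_fetch(&c->seq, 1, __ATOMIC_RELEASE);

        return syscall(SYS_futex, &c->seq, FUTEX_CMP_REQUEUE_PI, 1,
                       (long)(broadcast ? INT_MAX : 0), c->pi_mutex, seq);
}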
@@ -905,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
905{ 1294{
906 struct futex_hash_bucket *hb; 1295 struct futex_hash_bucket *hb;
907 1296
908 init_waitqueue_head(&q->waiter);
909
910 get_futex_key_refs(&q->key); 1297 get_futex_key_refs(&q->key);
911 hb = hash_futex(&q->key); 1298 hb = hash_futex(&q->key);
912 q->lock_ptr = &hb->lock; 1299 q->lock_ptr = &hb->lock;
@@ -1117,35 +1504,149 @@ handle_fault:
1117 */ 1504 */
1118#define FLAGS_SHARED 0x01 1505#define FLAGS_SHARED 0x01
1119#define FLAGS_CLOCKRT 0x02 1506#define FLAGS_CLOCKRT 0x02
1507#define FLAGS_HAS_TIMEOUT 0x04
1120 1508
1121static long futex_wait_restart(struct restart_block *restart); 1509static long futex_wait_restart(struct restart_block *restart);
1122 1510
1123static int futex_wait(u32 __user *uaddr, int fshared, 1511/**
1124 u32 val, ktime_t *abs_time, u32 bitset, int clockrt) 1512 * fixup_owner() - Post lock pi_state and corner case management
1513 * @uaddr: user address of the futex
1514 * @fshared: whether the futex is shared (1) or not (0)
1515 * @q: futex_q (contains pi_state and access to the rt_mutex)
1516 * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
1517 *
1518 * After attempting to lock an rt_mutex, this function is called to cleanup
1519 * the pi_state owner as well as handle race conditions that may allow us to
1520 * acquire the lock. Must be called with the hb lock held.
1521 *
1522 * Returns:
1523 * 1 - success, lock taken
1524 * 0 - success, lock not taken
1525 * <0 - on error (-EFAULT)
1526 */
1527static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
1528 int locked)
1125{ 1529{
1126 struct task_struct *curr = current; 1530 struct task_struct *owner;
1127 struct restart_block *restart; 1531 int ret = 0;
1128 DECLARE_WAITQUEUE(wait, curr);
1129 struct futex_hash_bucket *hb;
1130 struct futex_q q;
1131 u32 uval;
1132 int ret;
1133 struct hrtimer_sleeper t;
1134 int rem = 0;
1135 1532
1136 if (!bitset) 1533 if (locked) {
1137 return -EINVAL; 1534 /*
1535 * Got the lock. We might not be the anticipated owner if we
1536 * did a lock-steal - fix up the PI-state in that case:
1537 */
1538 if (q->pi_state->owner != current)
1539 ret = fixup_pi_state_owner(uaddr, q, current, fshared);
1540 goto out;
1541 }
1138 1542
1139 q.pi_state = NULL; 1543 /*
1140 q.bitset = bitset; 1544 * Catch the rare case, where the lock was released when we were on the
1141retry: 1545 * way back before we locked the hash bucket.
1142 q.key = FUTEX_KEY_INIT; 1546 */
1143 ret = get_futex_key(uaddr, fshared, &q.key); 1547 if (q->pi_state->owner == current) {
1144 if (unlikely(ret != 0)) 1548 /*
1549 * Try to get the rt_mutex now. This might fail as some other
1550 * task acquired the rt_mutex after we removed ourself from the
1551 * rt_mutex waiters list.
1552 */
1553 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1554 locked = 1;
1555 goto out;
1556 }
1557
1558 /*
1559 * pi_state is incorrect, some other task did a lock steal and
1560 * we returned due to timeout or signal without taking the
1561 * rt_mutex. Too late. We can access the rt_mutex_owner without
1562 * locking, as the other task is now blocked on the hash bucket
1563 * lock. Fix the state up.
1564 */
1565 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1566 ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
1145 goto out; 1567 goto out;
1568 }
1146 1569
1147retry_private: 1570 /*
1148 hb = queue_lock(&q); 1571 * Paranoia check. If we did not take the lock, then we should not be
1572 * the owner, nor the pending owner, of the rt_mutex.
1573 */
1574 if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1575 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1576 "pi-state %p\n", ret,
1577 q->pi_state->pi_mutex.owner,
1578 q->pi_state->owner);
1579
1580out:
1581 return ret ? ret : locked;
1582}
1583
1584/**
1585 * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1586 * @hb: the futex hash bucket, must be locked by the caller
1587 * @q: the futex_q to queue up on
1588 * @timeout: the prepared hrtimer_sleeper, or null for no timeout
1589 */
1590static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1591 struct hrtimer_sleeper *timeout)
1592{
1593 queue_me(q, hb);
1594
1595 /*
1596 * There might have been scheduling since the queue_me(), as we
1597 * cannot hold a spinlock across the get_user() in case it
1598 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
1599 * queueing ourselves into the futex hash. This code thus has to
1600 * rely on the futex_wake() code removing us from hash when it
1601 * wakes us up.
1602 */
1603 set_current_state(TASK_INTERRUPTIBLE);
1604
1605 /* Arm the timer */
1606 if (timeout) {
1607 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1608 if (!hrtimer_active(&timeout->timer))
1609 timeout->task = NULL;
1610 }
1611
1612 /*
1613 * !plist_node_empty() is safe here without any lock.
1614 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1615 */
1616 if (likely(!plist_node_empty(&q->list))) {
1617 /*
1618 * If the timer has already expired, current will already be
1619 * flagged for rescheduling. Only call schedule if there
1620 * is no timeout, or if it has yet to expire.
1621 */
1622 if (!timeout || timeout->task)
1623 schedule();
1624 }
1625 __set_current_state(TASK_RUNNING);
1626}
1627
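futex_wait_queue_me() follows the standard ordering for avoiding lost wakeups: publish yourself as a waiter first, set the task state, re-check that you are still queued, and only then schedule(). Stripped of the futex specifics, and with hypothetical helpers standing in for queue_me() and the plist check, the pattern looks like this:

#include <linux/sched.h>

struct waiter;                                  /* hypothetical */
void enqueue_waiter(struct waiter *w);          /* hypothetical: plays the role of queue_me() */
bool still_queued(struct waiter *w);            /* hypothetical: plays the role of !plist_node_empty() */

static void wait_for_event(struct waiter *w)
{
        enqueue_waiter(w);                      /* 1. make ourselves findable by wakers */
        set_current_state(TASK_INTERRUPTIBLE);  /* 2. then mark ourselves as sleeping */

        if (still_queued(w))                    /* 3. a waker that ran in between has */
                schedule();                     /*    already dequeued us, so skip the */
                                                /*    sleep instead of losing the wake */
        __set_current_state(TASK_RUNNING);
}

Doing step 2 before step 3 is what makes the re-check safe: set_current_state() implies a full barrier, so either the waker sees us on the queue or we see its dequeue and skip schedule().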
1628/**
1629 * futex_wait_setup() - Prepare to wait on a futex
1630 * @uaddr: the futex userspace address
1631 * @val: the expected value
1632 * @fshared: whether the futex is shared (1) or not (0)
1633 * @q: the associated futex_q
1634 * @hb: storage for hash_bucket pointer to be returned to caller
1635 *
1636 * Setup the futex_q and locate the hash_bucket. Get the futex value and
1637 * compare it with the expected value. Handle atomic faults internally.
1638 * Return with the hb lock held and a q.key reference on success, and unlocked
1639 * with no q.key reference on failure.
1640 *
1641 * Returns:
1642 * 0 - uaddr contains val and hb has been locked
1643 * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1644 */
1645static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1646 struct futex_q *q, struct futex_hash_bucket **hb)
1647{
1648 u32 uval;
1649 int ret;
1149 1650
1150 /* 1651 /*
1151 * Access the page AFTER the hash-bucket is locked. 1652 * Access the page AFTER the hash-bucket is locked.
@@ -1163,95 +1664,83 @@ retry_private:
1163 * A consequence is that futex_wait() can return zero and absorb 1664 * A consequence is that futex_wait() can return zero and absorb
1164 * a wakeup when *uaddr != val on entry to the syscall. This is 1665 * a wakeup when *uaddr != val on entry to the syscall. This is
1165 * rare, but normal. 1666 * rare, but normal.
1166 *
1167 * For shared futexes, we hold the mmap semaphore, so the mapping
1168 * cannot have changed since we looked it up in get_futex_key.
1169 */ 1667 */
1668retry:
1669 q->key = FUTEX_KEY_INIT;
1670 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1671 if (unlikely(ret != 0))
1672 return ret;
1673
1674retry_private:
1675 *hb = queue_lock(q);
1676
1170 ret = get_futex_value_locked(&uval, uaddr); 1677 ret = get_futex_value_locked(&uval, uaddr);
1171 1678
1172 if (unlikely(ret)) { 1679 if (ret) {
1173 queue_unlock(&q, hb); 1680 queue_unlock(q, *hb);
1174 1681
1175 ret = get_user(uval, uaddr); 1682 ret = get_user(uval, uaddr);
1176 if (ret) 1683 if (ret)
1177 goto out_put_key; 1684 goto out;
1178 1685
1179 if (!fshared) 1686 if (!fshared)
1180 goto retry_private; 1687 goto retry_private;
1181 1688
1182 put_futex_key(fshared, &q.key); 1689 put_futex_key(fshared, &q->key);
1183 goto retry; 1690 goto retry;
1184 } 1691 }
1185 ret = -EWOULDBLOCK;
1186 if (unlikely(uval != val)) {
1187 queue_unlock(&q, hb);
1188 goto out_put_key;
1189 }
1190 1692
1191 /* Only actually queue if *uaddr contained val. */ 1693 if (uval != val) {
1192 queue_me(&q, hb); 1694 queue_unlock(q, *hb);
1695 ret = -EWOULDBLOCK;
1696 }
1193 1697
1194 /* 1698out:
1195 * There might have been scheduling since the queue_me(), as we 1699 if (ret)
1196 * cannot hold a spinlock across the get_user() in case it 1700 put_futex_key(fshared, &q->key);
1197 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1701 return ret;
1198 * queueing ourselves into the futex hash. This code thus has to 1702}
1199 * rely on the futex_wake() code removing us from hash when it
1200 * wakes us up.
1201 */
1202 1703
1203 /* add_wait_queue is the barrier after __set_current_state. */ 1704static int futex_wait(u32 __user *uaddr, int fshared,
1204 __set_current_state(TASK_INTERRUPTIBLE); 1705 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1205 add_wait_queue(&q.waiter, &wait); 1706{
1206 /* 1707 struct hrtimer_sleeper timeout, *to = NULL;
1207 * !plist_node_empty() is safe here without any lock. 1708 struct restart_block *restart;
1208 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1709 struct futex_hash_bucket *hb;
1209 */ 1710 struct futex_q q;
1210 if (likely(!plist_node_empty(&q.list))) { 1711 int ret;
1211 if (!abs_time)
1212 schedule();
1213 else {
1214 hrtimer_init_on_stack(&t.timer,
1215 clockrt ? CLOCK_REALTIME :
1216 CLOCK_MONOTONIC,
1217 HRTIMER_MODE_ABS);
1218 hrtimer_init_sleeper(&t, current);
1219 hrtimer_set_expires_range_ns(&t.timer, *abs_time,
1220 current->timer_slack_ns);
1221
1222 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1223 if (!hrtimer_active(&t.timer))
1224 t.task = NULL;
1225 1712
1226 /* 1713 if (!bitset)
1227 * the timer could have already expired, in which 1714 return -EINVAL;
1228 * case current would be flagged for rescheduling.
1229 * Don't bother calling schedule.
1230 */
1231 if (likely(t.task))
1232 schedule();
1233 1715
1234 hrtimer_cancel(&t.timer); 1716 q.pi_state = NULL;
1717 q.bitset = bitset;
1718 q.rt_waiter = NULL;
1235 1719
1236 /* Flag if a timeout occured */ 1720 if (abs_time) {
1237 rem = (t.task == NULL); 1721 to = &timeout;
1238 1722
1239 destroy_hrtimer_on_stack(&t.timer); 1723 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
1240 } 1724 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1725 hrtimer_init_sleeper(to, current);
1726 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
1727 current->timer_slack_ns);
1241 } 1728 }
1242 __set_current_state(TASK_RUNNING);
1243 1729
1244 /* 1730 /* Prepare to wait on uaddr. */
1245 * NOTE: we don't remove ourselves from the waitqueue because 1731 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1246 * we are the only user of it. 1732 if (ret)
1247 */ 1733 goto out;
1734
1735 /* queue_me and wait for wakeup, timeout, or a signal. */
1736 futex_wait_queue_me(hb, &q, to);
1248 1737
1249 /* If we were woken (and unqueued), we succeeded, whatever. */ 1738 /* If we were woken (and unqueued), we succeeded, whatever. */
1250 ret = 0; 1739 ret = 0;
1251 if (!unqueue_me(&q)) 1740 if (!unqueue_me(&q))
1252 goto out_put_key; 1741 goto out_put_key;
1253 ret = -ETIMEDOUT; 1742 ret = -ETIMEDOUT;
1254 if (rem) 1743 if (to && !to->task)
1255 goto out_put_key; 1744 goto out_put_key;
1256 1745
1257 /* 1746 /*
@@ -1268,7 +1757,7 @@ retry_private:
1268 restart->futex.val = val; 1757 restart->futex.val = val;
1269 restart->futex.time = abs_time->tv64; 1758 restart->futex.time = abs_time->tv64;
1270 restart->futex.bitset = bitset; 1759 restart->futex.bitset = bitset;
1271 restart->futex.flags = 0; 1760 restart->futex.flags = FLAGS_HAS_TIMEOUT;
1272 1761
1273 if (fshared) 1762 if (fshared)
1274 restart->futex.flags |= FLAGS_SHARED; 1763 restart->futex.flags |= FLAGS_SHARED;
@@ -1280,6 +1769,10 @@ retry_private:
1280out_put_key: 1769out_put_key:
1281 put_futex_key(fshared, &q.key); 1770 put_futex_key(fshared, &q.key);
1282out: 1771out:
1772 if (to) {
1773 hrtimer_cancel(&to->timer);
1774 destroy_hrtimer_on_stack(&to->timer);
1775 }
1283 return ret; 1776 return ret;
1284} 1777}
1285 1778
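The comment kept at the top of futex_wait_setup() -- a wakeup can be absorbed when *uaddr != val -- is the reason FUTEX_WAIT takes an expected value at all, and the new helper re-checks that value under the hash-bucket lock. For the userspace side of the contract, a small self-contained illustration of plain FUTEX_WAIT (ordinary futex(2) usage, nothing specific to this patch; a second thread would store 1 to flag and call FUTEX_WAKE):

#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

static uint32_t flag;   /* 0 = not ready, 1 = ready */

static long futex_wait(uint32_t *uaddr, uint32_t expected,
                       const struct timespec *timeout)
{
        /* Sleeps only if *uaddr still equals 'expected'. */
        return syscall(SYS_futex, uaddr, FUTEX_WAIT, expected, timeout, NULL, 0);
}

int main(void)
{
        struct timespec to = { .tv_sec = 1, .tv_nsec = 0 };     /* relative timeout */

        while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0) {
                if (futex_wait(&flag, 0, &to) == -1) {
                        if (errno == EWOULDBLOCK)       /* value changed under us: don't sleep */
                                continue;
                        if (errno == ETIMEDOUT || errno == EINTR)
                                continue;               /* re-check the flag and try again */
                        return 1;
                }
        }
        puts("flag is set");
        return 0;
}

The EWOULDBLOCK branch corresponds to the -EWOULDBLOCK path in futex_wait_setup() above; without the expected-value check, a wakeup racing with the store to flag could be lost while the waiter was on its way to sleep.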
@@ -1288,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
1288{ 1781{
1289 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1782 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1290 int fshared = 0; 1783 int fshared = 0;
1291 ktime_t t; 1784 ktime_t t, *tp = NULL;
1292 1785
1293 t.tv64 = restart->futex.time; 1786 if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
1787 t.tv64 = restart->futex.time;
1788 tp = &t;
1789 }
1294 restart->fn = do_no_restart_syscall; 1790 restart->fn = do_no_restart_syscall;
1295 if (restart->futex.flags & FLAGS_SHARED) 1791 if (restart->futex.flags & FLAGS_SHARED)
1296 fshared = 1; 1792 fshared = 1;
1297 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1793 return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
1298 restart->futex.bitset, 1794 restart->futex.bitset,
1299 restart->futex.flags & FLAGS_CLOCKRT); 1795 restart->futex.flags & FLAGS_CLOCKRT);
1300} 1796}
@@ -1310,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1310 int detect, ktime_t *time, int trylock) 1806 int detect, ktime_t *time, int trylock)
1311{ 1807{
1312 struct hrtimer_sleeper timeout, *to = NULL; 1808 struct hrtimer_sleeper timeout, *to = NULL;
1313 struct task_struct *curr = current;
1314 struct futex_hash_bucket *hb; 1809 struct futex_hash_bucket *hb;
1315 u32 uval, newval, curval; 1810 u32 uval;
1316 struct futex_q q; 1811 struct futex_q q;
1317 int ret, lock_taken, ownerdied = 0; 1812 int res, ret;
1318 1813
1319 if (refill_pi_state_cache()) 1814 if (refill_pi_state_cache())
1320 return -ENOMEM; 1815 return -ENOMEM;
@@ -1328,90 +1823,25 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1328 } 1823 }
1329 1824
1330 q.pi_state = NULL; 1825 q.pi_state = NULL;
1826 q.rt_waiter = NULL;
1331retry: 1827retry:
1332 q.key = FUTEX_KEY_INIT; 1828 q.key = FUTEX_KEY_INIT;
1333 ret = get_futex_key(uaddr, fshared, &q.key); 1829 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
1334 if (unlikely(ret != 0)) 1830 if (unlikely(ret != 0))
1335 goto out; 1831 goto out;
1336 1832
1337retry_private: 1833retry_private:
1338 hb = queue_lock(&q); 1834 hb = queue_lock(&q);
1339 1835
1340retry_locked: 1836 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
1341 ret = lock_taken = 0;
1342
1343 /*
1344 * To avoid races, we attempt to take the lock here again
1345 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1346 * the locks. It will most likely not succeed.
1347 */
1348 newval = task_pid_vnr(current);
1349
1350 curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
1351
1352 if (unlikely(curval == -EFAULT))
1353 goto uaddr_faulted;
1354
1355 /*
1356 * Detect deadlocks. In case of REQUEUE_PI this is a valid
1357 * situation and we return success to user space.
1358 */
1359 if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
1360 ret = -EDEADLK;
1361 goto out_unlock_put_key;
1362 }
1363
1364 /*
1365 * Surprise - we got the lock. Just return to userspace:
1366 */
1367 if (unlikely(!curval))
1368 goto out_unlock_put_key;
1369
1370 uval = curval;
1371
1372 /*
1373 * Set the WAITERS flag, so the owner will know it has someone
1374 * to wake at next unlock
1375 */
1376 newval = curval | FUTEX_WAITERS;
1377
1378 /*
1379 * There are two cases, where a futex might have no owner (the
1380 * owner TID is 0): OWNER_DIED. We take over the futex in this
1381 * case. We also do an unconditional take over, when the owner
1382 * of the futex died.
1383 *
1384 * This is safe as we are protected by the hash bucket lock !
1385 */
1386 if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
1387 /* Keep the OWNER_DIED bit */
1388 newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
1389 ownerdied = 0;
1390 lock_taken = 1;
1391 }
1392
1393 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1394
1395 if (unlikely(curval == -EFAULT))
1396 goto uaddr_faulted;
1397 if (unlikely(curval != uval))
1398 goto retry_locked;
1399
1400 /*
1401 * We took the lock due to owner died take over.
1402 */
1403 if (unlikely(lock_taken))
1404 goto out_unlock_put_key;
1405
1406 /*
1407 * We dont have the lock. Look up the PI state (or create it if
1408 * we are the first waiter):
1409 */
1410 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1411
1412 if (unlikely(ret)) { 1837 if (unlikely(ret)) {
1413 switch (ret) { 1838 switch (ret) {
1414 1839 case 1:
1840 /* We got the lock. */
1841 ret = 0;
1842 goto out_unlock_put_key;
1843 case -EFAULT:
1844 goto uaddr_faulted;
1415 case -EAGAIN: 1845 case -EAGAIN:
1416 /* 1846 /*
1417 * Task is exiting and we just wait for the 1847 * Task is exiting and we just wait for the
@@ -1421,25 +1851,6 @@ retry_locked:
1421 put_futex_key(fshared, &q.key); 1851 put_futex_key(fshared, &q.key);
1422 cond_resched(); 1852 cond_resched();
1423 goto retry; 1853 goto retry;
1424
1425 case -ESRCH:
1426 /*
1427 * No owner found for this futex. Check if the
1428 * OWNER_DIED bit is set to figure out whether
1429 * this is a robust futex or not.
1430 */
1431 if (get_futex_value_locked(&curval, uaddr))
1432 goto uaddr_faulted;
1433
1434 /*
1435 * We simply start over in case of a robust
1436 * futex. The code above will take the futex
1437 * and return happy.
1438 */
1439 if (curval & FUTEX_OWNER_DIED) {
1440 ownerdied = 1;
1441 goto retry_locked;
1442 }
1443 default: 1854 default:
1444 goto out_unlock_put_key; 1855 goto out_unlock_put_key;
1445 } 1856 }
@@ -1463,71 +1874,21 @@ retry_locked:
1463 } 1874 }
1464 1875
1465 spin_lock(q.lock_ptr); 1876 spin_lock(q.lock_ptr);
1466 1877 /*
1467 if (!ret) { 1878 * Fixup the pi_state owner and possibly acquire the lock if we
1468 /* 1879 * haven't already.
1469 * Got the lock. We might not be the anticipated owner 1880 */
1470 * if we did a lock-steal - fix up the PI-state in 1881 res = fixup_owner(uaddr, fshared, &q, !ret);
1471 * that case: 1882 /*
1472 */ 1883 * If fixup_owner() returned an error, propagate that. If it acquired
1473 if (q.pi_state->owner != curr) 1884 * the lock, clear our -ETIMEDOUT or -EINTR.
1474 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); 1885 */
1475 } else { 1886 if (res)
1476 /* 1887 ret = (res < 0) ? res : 0;
1477 * Catch the rare case, where the lock was released
1478 * when we were on the way back before we locked the
1479 * hash bucket.
1480 */
1481 if (q.pi_state->owner == curr) {
1482 /*
1483 * Try to get the rt_mutex now. This might
1484 * fail as some other task acquired the
1485 * rt_mutex after we removed ourself from the
1486 * rt_mutex waiters list.
1487 */
1488 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1489 ret = 0;
1490 else {
1491 /*
1492 * pi_state is incorrect, some other
1493 * task did a lock steal and we
1494 * returned due to timeout or signal
1495 * without taking the rt_mutex. Too
1496 * late. We can access the
1497 * rt_mutex_owner without locking, as
1498 * the other task is now blocked on
1499 * the hash bucket lock. Fix the state
1500 * up.
1501 */
1502 struct task_struct *owner;
1503 int res;
1504
1505 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1506 res = fixup_pi_state_owner(uaddr, &q, owner,
1507 fshared);
1508
1509 /* propagate -EFAULT, if the fixup failed */
1510 if (res)
1511 ret = res;
1512 }
1513 } else {
1514 /*
1515 * Paranoia check. If we did not take the lock
1516 * in the trylock above, then we should not be
1517 * the owner of the rtmutex, neither the real
1518 * nor the pending one:
1519 */
1520 if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
1521 printk(KERN_ERR "futex_lock_pi: ret = %d "
1522 "pi-mutex: %p pi-state %p\n", ret,
1523 q.pi_state->pi_mutex.owner,
1524 q.pi_state->owner);
1525 }
1526 }
1527 1888
1528 /* 1889 /*
1529 * If fixup_pi_state_owner() faulted and was unable to handle the 1890 * If fixup_owner() faulted and was unable to handle the fault, unlock
1530 * fault, unlock it and return the fault to userspace. 1891 * it and return the fault to userspace.
1531 */ 1892 */
1532 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) 1893 if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
1533 rt_mutex_unlock(&q.pi_state->pi_mutex); 1894 rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1535,9 +1896,7 @@ retry_locked:
1535 /* Unqueue and drop the lock */ 1896 /* Unqueue and drop the lock */
1536 unqueue_me_pi(&q); 1897 unqueue_me_pi(&q);
1537 1898
1538 if (to) 1899 goto out;
1539 destroy_hrtimer_on_stack(&to->timer);
1540 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1541 1900
1542out_unlock_put_key: 1901out_unlock_put_key:
1543 queue_unlock(&q, hb); 1902 queue_unlock(&q, hb);
@@ -1547,7 +1906,7 @@ out_put_key:
1547out: 1906out:
1548 if (to) 1907 if (to)
1549 destroy_hrtimer_on_stack(&to->timer); 1908 destroy_hrtimer_on_stack(&to->timer);
1550 return ret; 1909 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1551 1910
1552uaddr_faulted: 1911uaddr_faulted:
1553 /* 1912 /*
@@ -1570,7 +1929,6 @@ uaddr_faulted:
1570 goto retry; 1929 goto retry;
1571} 1930}
1572 1931
1573
1574/* 1932/*
1575 * Userspace attempted a TID -> 0 atomic transition, and failed. 1933 * Userspace attempted a TID -> 0 atomic transition, and failed.
1576 * This is the in-kernel slowpath: we look up the PI state (if any), 1934 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1594,7 +1952,7 @@ retry:
1594 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1952 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1595 return -EPERM; 1953 return -EPERM;
1596 1954
1597 ret = get_futex_key(uaddr, fshared, &key); 1955 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
1598 if (unlikely(ret != 0)) 1956 if (unlikely(ret != 0))
1599 goto out; 1957 goto out;
1600 1958
@@ -1672,6 +2030,229 @@ pi_faulted:
1672 return ret; 2030 return ret;
1673} 2031}
1674 2032
2033/**
2034 * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2035 * @hb: the hash_bucket futex_q was originally enqueued on
2036 * @q: the futex_q woken while waiting to be requeued
2037 * @key2: the futex_key of the requeue target futex
2038 * @timeout: the timeout associated with the wait (NULL if none)
2039 *
2040 * Detect if the task was woken on the initial futex as opposed to the requeue
2041 * target futex. If so, determine if it was a timeout or a signal that caused
2042 * the wakeup and return the appropriate error code to the caller. Must be
2043 * called with the hb lock held.
2044 *
2045 * Returns
2046 * 0 - no early wakeup detected
2047 * <0 - -ETIMEDOUT or -ERESTARTNOINTR
2048 */
2049static inline
2050int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2051 struct futex_q *q, union futex_key *key2,
2052 struct hrtimer_sleeper *timeout)
2053{
2054 int ret = 0;
2055
2056 /*
2057 * With the hb lock held, we avoid races while we process the wakeup.
2058 * We only need to hold hb (and not hb2) to ensure atomicity as the
2059 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2060 * It can't be requeued from uaddr2 to something else since we don't
2061 * support a PI aware source futex for requeue.
2062 */
2063 if (!match_futex(&q->key, key2)) {
2064 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2065 /*
2066 * We were woken prior to requeue by a timeout or a signal.
2067 * Unqueue the futex_q and determine which it was.
2068 */
2069 plist_del(&q->list, &q->list.plist);
2070 drop_futex_key_refs(&q->key);
2071
2072 if (timeout && !timeout->task)
2073 ret = -ETIMEDOUT;
2074 else
2075 ret = -ERESTARTNOINTR;
2076 }
2077 return ret;
2078}
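To make the two premature-wakeup cases concrete: when the hrtimer fires, the sleeper callback clears timeout->task, so the caller sees -ETIMEDOUT; any other wakeup on the original futex is attributed to a signal and reported as -ERESTARTNOINTR so the syscall can be restarted.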
2079
2080/**
2081 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2082 * @uaddr: the futex we initially wait on (non-pi)
2083 * @fshared: whether the futexes are shared (1) or not (0). They must be
2084 * the same type, no requeueing from private to shared, etc.
2085 * @val: the expected value of uaddr
2086 * @abs_time: absolute timeout
2087 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all.
2088 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2089 * @uaddr2: the pi futex we will take prior to returning to user-space
2090 *
2091 * The caller will wait on uaddr and will be requeued by futex_requeue() to
2092 * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
2093 * complete the acquisition of the rt_mutex prior to returning to userspace.
2094 * This ensures the rt_mutex maintains an owner when it has waiters; without
2095 * one, the pi logic wouldn't know which task to boost/deboost, if there was a
2096 * need to.
2097 *
2098 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2099 * via the following:
2100 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2101 * 2) wakeup on uaddr2 after a requeue and subsequent unlock
2102 * 3) signal (before or after requeue)
2103 * 4) timeout (before or after requeue)
2104 *
2105 * If 3, we set up a restart_block with futex_wait_requeue_pi() as the function.
2106 *
2107 * If 2, we may then block on trying to take the rt_mutex and return via:
2108 * 5) successful lock
2109 * 6) signal
2110 * 7) timeout
2111 * 8) other lock acquisition failure
2112 *
2113 * If 6, we set up a restart_block with futex_lock_pi() as the function.
2114 *
2115 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2116 *
2117 * Returns:
2118 * 0 - On success
2119 * <0 - On error
2120 */
2121static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2122 u32 val, ktime_t *abs_time, u32 bitset,
2123 int clockrt, u32 __user *uaddr2)
2124{
2125 struct hrtimer_sleeper timeout, *to = NULL;
2126 struct rt_mutex_waiter rt_waiter;
2127 struct rt_mutex *pi_mutex = NULL;
2128 struct futex_hash_bucket *hb;
2129 union futex_key key2;
2130 struct futex_q q;
2131 int res, ret;
2132
2133 if (!bitset)
2134 return -EINVAL;
2135
2136 if (abs_time) {
2137 to = &timeout;
2138 hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
2139 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2140 hrtimer_init_sleeper(to, current);
2141 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2142 current->timer_slack_ns);
2143 }
2144
2145 /*
2146 * The waiter is allocated on our stack, manipulated by the requeue
2147 * code while we sleep on uaddr.
2148 */
2149 debug_rt_mutex_init_waiter(&rt_waiter);
2150 rt_waiter.task = NULL;
2151
2152 q.pi_state = NULL;
2153 q.bitset = bitset;
2154 q.rt_waiter = &rt_waiter;
2155
2156 key2 = FUTEX_KEY_INIT;
2157 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2158 if (unlikely(ret != 0))
2159 goto out;
2160
2161 /* Prepare to wait on uaddr. */
2162 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2163 if (ret)
2164 goto out_key2;
2165
2166 /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2167 futex_wait_queue_me(hb, &q, to);
2168
2169 spin_lock(&hb->lock);
2170 ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2171 spin_unlock(&hb->lock);
2172 if (ret)
2173 goto out_put_keys;
2174
2175 /*
2176 * In order for us to be here, we know our q.key == key2, and since
2177 * we took the hb->lock above, we also know that futex_requeue() has
2178 * completed and we no longer have to concern ourselves with a wakeup
2179 * race with the atomic proxy lock acquisition by the requeue code.
2180 */
2181
2182 /* Check if the requeue code acquired the second futex for us. */
2183 if (!q.rt_waiter) {
2184 /*
2185 * Got the lock. We might not be the anticipated owner if we
2186 * did a lock-steal - fix up the PI-state in that case.
2187 */
2188 if (q.pi_state && (q.pi_state->owner != current)) {
2189 spin_lock(q.lock_ptr);
2190 ret = fixup_pi_state_owner(uaddr2, &q, current,
2191 fshared);
2192 spin_unlock(q.lock_ptr);
2193 }
2194 } else {
2195 /*
2196 * We have been woken up by futex_unlock_pi(), a timeout, or a
2197 * signal. futex_unlock_pi() will not destroy the lock_ptr nor
2198 * the pi_state.
2199 */
2200 WARN_ON(!q.pi_state);
2201 pi_mutex = &q.pi_state->pi_mutex;
2202 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2203 debug_rt_mutex_free_waiter(&rt_waiter);
2204
2205 spin_lock(q.lock_ptr);
2206 /*
2207 * Fixup the pi_state owner and possibly acquire the lock if we
2208 * haven't already.
2209 */
2210 res = fixup_owner(uaddr2, fshared, &q, !ret);
2211 /*
2212 * If fixup_owner() returned an error, propagate that. If it
2213 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
2214 */
2215 if (res)
2216 ret = (res < 0) ? res : 0;
2217
2218 /* Unqueue and drop the lock. */
2219 unqueue_me_pi(&q);
2220 }
2221
2222 /*
2223 * If fixup_pi_state_owner() faulted and was unable to handle the
2224 * fault, unlock the rt_mutex and return the fault to userspace.
2225 */
2226 if (ret == -EFAULT) {
2227 if (rt_mutex_owner(pi_mutex) == current)
2228 rt_mutex_unlock(pi_mutex);
2229 } else if (ret == -EINTR) {
2230 /*
2231 * We've already been requeued, but we have no way to
2232 * restart by calling futex_lock_pi() directly. We
2233 * could restart the syscall, but that will look at
2234 * the user space value and return right away. So we
2235 * drop back with EWOULDBLOCK to tell user space that
2236 * "val" has been changed. That's the same as what the
2237 * restart of the syscall would do in
2238 * futex_wait_setup().
2239 */
2240 ret = -EWOULDBLOCK;
2241 }
2242
2243out_put_keys:
2244 put_futex_key(fshared, &q.key);
2245out_key2:
2246 put_futex_key(fshared, &key2);
2247
2248out:
2249 if (to) {
2250 hrtimer_cancel(&to->timer);
2251 destroy_hrtimer_on_stack(&to->timer);
2252 }
2253 return ret;
2254}
2255
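To show how the two new opcodes pair up from user space, here is a minimal sketch of a condvar-like pattern. It is illustrative only and not glibc's implementation: the futex() wrapper and the cond_word/pi_lock_word names are assumptions, all error handling and the PI-mutex user-space fast paths are omitted, and it assumes the FUTEX_WAIT_REQUEUE_PI/FUTEX_CMP_REQUEUE_PI definitions added to <linux/futex.h> by this series.

#define _GNU_SOURCE
#include <limits.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Thin wrapper; the fourth argument doubles as the timeout pointer or val2. */
static long futex(uint32_t *uaddr, int op, uint32_t val,
		  void *timeout_or_val2, uint32_t *uaddr2, uint32_t val3)
{
	return syscall(SYS_futex, uaddr, op, val, timeout_or_val2, uaddr2, val3);
}

/*
 * Waiter side: block on the non-PI futex @cond_word and, once requeued and
 * woken, return owning the PI futex @pi_lock_word (the kernel fixes up the
 * pi_state/rt_mutex owner before returning to user space).
 */
static long cond_wait_requeue_pi(uint32_t *cond_word, uint32_t expected,
				 uint32_t *pi_lock_word)
{
	/* val3 is ignored here; do_futex() forces FUTEX_BITSET_MATCH_ANY. */
	return futex(cond_word, FUTEX_WAIT_REQUEUE_PI, expected,
		     NULL /* no timeout */, pi_lock_word, 0);
}

/*
 * Waker side: wake one waiter and requeue the rest onto the PI futex.
 * nr_requeue travels in the timeout slot (val2) and the expected value of
 * @cond_word in val3, mirroring FUTEX_CMP_REQUEUE.
 */
static long cond_broadcast_requeue_pi(uint32_t *cond_word, uint32_t expected,
				      uint32_t *pi_lock_word)
{
	return futex(cond_word, FUTEX_CMP_REQUEUE_PI, 1,
		     (void *)(unsigned long)INT_MAX, pi_lock_word, expected);
}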
1675/* 2256/*
1676 * Support for robust futexes: the kernel cleans up held futexes at 2257 * Support for robust futexes: the kernel cleans up held futexes at
1677 * thread exit time. 2258 * thread exit time.
@@ -1894,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1894 fshared = 1; 2475 fshared = 1;
1895 2476
1896 clockrt = op & FUTEX_CLOCK_REALTIME; 2477 clockrt = op & FUTEX_CLOCK_REALTIME;
1897 if (clockrt && cmd != FUTEX_WAIT_BITSET) 2478 if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
1898 return -ENOSYS; 2479 return -ENOSYS;
1899 2480
1900 switch (cmd) { 2481 switch (cmd) {
@@ -1909,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1909 ret = futex_wake(uaddr, fshared, val, val3); 2490 ret = futex_wake(uaddr, fshared, val, val3);
1910 break; 2491 break;
1911 case FUTEX_REQUEUE: 2492 case FUTEX_REQUEUE:
1912 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); 2493 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
1913 break; 2494 break;
1914 case FUTEX_CMP_REQUEUE: 2495 case FUTEX_CMP_REQUEUE:
1915 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); 2496 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2497 0);
1916 break; 2498 break;
1917 case FUTEX_WAKE_OP: 2499 case FUTEX_WAKE_OP:
1918 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); 2500 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1929,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1929 if (futex_cmpxchg_enabled) 2511 if (futex_cmpxchg_enabled)
1930 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); 2512 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
1931 break; 2513 break;
2514 case FUTEX_WAIT_REQUEUE_PI:
2515 val3 = FUTEX_BITSET_MATCH_ANY;
2516 ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
2517 clockrt, uaddr2);
2518 break;
2519 case FUTEX_CMP_REQUEUE_PI:
2520 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
2521 1);
2522 break;
1932 default: 2523 default:
1933 ret = -ENOSYS; 2524 ret = -ENOSYS;
1934 } 2525 }
@@ -1946,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1946 int cmd = op & FUTEX_CMD_MASK; 2537 int cmd = op & FUTEX_CMD_MASK;
1947 2538
1948 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 2539 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
1949 cmd == FUTEX_WAIT_BITSET)) { 2540 cmd == FUTEX_WAIT_BITSET ||
2541 cmd == FUTEX_WAIT_REQUEUE_PI)) {
1950 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2542 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1951 return -EFAULT; 2543 return -EFAULT;
1952 if (!timespec_valid(&ts)) 2544 if (!timespec_valid(&ts))
@@ -1958,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
1958 tp = &t; 2550 tp = &t;
1959 } 2551 }
1960 /* 2552 /*
1961 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. 2553 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
1962 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 2554 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
1963 */ 2555 */
1964 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || 2556 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
1965 cmd == FUTEX_WAKE_OP) 2557 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
1966 val2 = (u32) (unsigned long) utime; 2558 val2 = (u32) (unsigned long) utime;
1967 2559
1968 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2560 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
new file mode 100644
index 000000000000..22e9dcfaa3d3
--- /dev/null
+++ b/kernel/gcov/Kconfig
@@ -0,0 +1,48 @@
1menu "GCOV-based kernel profiling"
2
3config GCOV_KERNEL
4 bool "Enable gcov-based kernel profiling"
5 depends on DEBUG_FS && CONSTRUCTORS
6 default n
7 ---help---
8 This option enables gcov-based code profiling (e.g. for code coverage
9 measurements).
10
11 If unsure, say N.
12
13 Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data
14 for the entire kernel. To enable profiling for specific files or
15 directories, add a line similar to the following to the respective
16 Makefile:
17
18 For a single file (e.g. main.o):
19 GCOV_PROFILE_main.o := y
20
21 For all files in one directory:
22 GCOV_PROFILE := y
23
24 To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL
25 is specified, use:
26
27 GCOV_PROFILE_main.o := n
28 and:
29 GCOV_PROFILE := n
30
31 Note that the debugfs filesystem has to be mounted to access
32 profiling data.
33
34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL
37 depends on S390 || X86
38 default n
39 ---help---
40 This option activates profiling for the entire kernel.
41
42 If unsure, say N.
43
44 Note that a kernel compiled with profiling flags will be significantly
45 larger and run slower. Also be sure to exclude files from profiling
46 which are not linked to the kernel image to prevent linker errors.
47
48endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
new file mode 100644
index 000000000000..3f761001d517
--- /dev/null
+++ b/kernel/gcov/Makefile
@@ -0,0 +1,3 @@
1EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
2
3obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
new file mode 100644
index 000000000000..9b22d03cc581
--- /dev/null
+++ b/kernel/gcov/base.c
@@ -0,0 +1,148 @@
1/*
2 * This code maintains a list of active profiling data structures.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 */
15
16#define pr_fmt(fmt) "gcov: " fmt
17
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/mutex.h>
21#include "gcov.h"
22
23static struct gcov_info *gcov_info_head;
24static int gcov_events_enabled;
25static DEFINE_MUTEX(gcov_lock);
26
27/*
28 * __gcov_init is called by gcc-generated constructor code for each object
29 * file compiled with -fprofile-arcs.
30 */
31void __gcov_init(struct gcov_info *info)
32{
33 static unsigned int gcov_version;
34
35 mutex_lock(&gcov_lock);
36 if (gcov_version == 0) {
37 gcov_version = info->version;
38 /*
39 * Printing gcc's version magic may prove useful for debugging
40 * incompatibility reports.
41 */
42 pr_info("version magic: 0x%x\n", gcov_version);
43 }
44 /*
45 * Add new profiling data structure to list and inform event
46 * listener.
47 */
48 info->next = gcov_info_head;
49 gcov_info_head = info;
50 if (gcov_events_enabled)
51 gcov_event(GCOV_ADD, info);
52 mutex_unlock(&gcov_lock);
53}
54EXPORT_SYMBOL(__gcov_init);
55
56/*
57 * These functions may be referenced by gcc-generated profiling code but serve
58 * no function for kernel profiling.
59 */
60void __gcov_flush(void)
61{
62 /* Unused. */
63}
64EXPORT_SYMBOL(__gcov_flush);
65
66void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
67{
68 /* Unused. */
69}
70EXPORT_SYMBOL(__gcov_merge_add);
71
72void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
73{
74 /* Unused. */
75}
76EXPORT_SYMBOL(__gcov_merge_single);
77
78void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
79{
80 /* Unused. */
81}
82EXPORT_SYMBOL(__gcov_merge_delta);
83
84/**
85 * gcov_enable_events - enable event reporting through gcov_event()
86 *
87 * Turn on reporting of profiling data load/unload-events through the
88 * gcov_event() callback. Also replay all previous events once. This function
89 * is needed because some events are potentially generated too early for the
90 * callback implementation to handle them initially.
91 */
92void gcov_enable_events(void)
93{
94 struct gcov_info *info;
95
96 mutex_lock(&gcov_lock);
97 gcov_events_enabled = 1;
98 /* Perform event callback for previously registered entries. */
99 for (info = gcov_info_head; info; info = info->next)
100 gcov_event(GCOV_ADD, info);
101 mutex_unlock(&gcov_lock);
102}
103
104#ifdef CONFIG_MODULES
105static inline int within(void *addr, void *start, unsigned long size)
106{
107 return ((addr >= start) && (addr < start + size));
108}
109
110/* Update list and generate events when modules are unloaded. */
111static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
112 void *data)
113{
114 struct module *mod = data;
115 struct gcov_info *info;
116 struct gcov_info *prev;
117
118 if (event != MODULE_STATE_GOING)
119 return NOTIFY_OK;
120 mutex_lock(&gcov_lock);
121 prev = NULL;
122 /* Remove entries located in module from linked list. */
123 for (info = gcov_info_head; info; info = info->next) {
124 if (within(info, mod->module_core, mod->core_size)) {
125 if (prev)
126 prev->next = info->next;
127 else
128 gcov_info_head = info->next;
129 if (gcov_events_enabled)
130 gcov_event(GCOV_REMOVE, info);
131 } else
132 prev = info;
133 }
134 mutex_unlock(&gcov_lock);
135
136 return NOTIFY_OK;
137}
138
139static struct notifier_block gcov_nb = {
140 .notifier_call = gcov_module_notifier,
141};
142
143static int __init gcov_init(void)
144{
145 return register_module_notifier(&gcov_nb);
146}
147device_initcall(gcov_init);
148#endif /* CONFIG_MODULES */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
new file mode 100644
index 000000000000..ef3c3f88a7a3
--- /dev/null
+++ b/kernel/gcov/fs.c
@@ -0,0 +1,673 @@
1/*
2 * This code exports profiling data as debugfs files to userspace.
3 *
4 * Copyright IBM Corp. 2009
5 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
6 *
7 * Uses gcc-internal data definitions.
8 * Based on the gcov-kernel patch by:
9 * Hubertus Franke <frankeh@us.ibm.com>
10 * Nigel Hinds <nhinds@us.ibm.com>
11 * Rajan Ravindran <rajancr@us.ibm.com>
12 * Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 * Paul Larson
14 * Yi CDL Yang
15 */
16
17#define pr_fmt(fmt) "gcov: " fmt
18
19#include <linux/init.h>
20#include <linux/module.h>
21#include <linux/debugfs.h>
22#include <linux/fs.h>
23#include <linux/list.h>
24#include <linux/string.h>
25#include <linux/slab.h>
26#include <linux/mutex.h>
27#include <linux/seq_file.h>
28#include "gcov.h"
29
30/**
31 * struct gcov_node - represents a debugfs entry
32 * @list: list head for child node list
33 * @children: child nodes
34 * @all: list head for list of all nodes
35 * @parent: parent node
36 * @info: associated profiling data structure if not a directory
37 * @ghost: when an object file containing profiling data is unloaded we keep a
38 * copy of the profiling data here to allow collecting coverage data
39 * for cleanup code. Such a node is called a "ghost".
40 * @dentry: main debugfs entry, either a directory or data file
41 * @links: associated symbolic links
42 * @name: data file basename
43 *
44 * struct gcov_node represents an entity within the gcov/ subdirectory
45 * of debugfs. There are directory and data file nodes. The latter represent
46 * the actual synthesized data file plus any associated symbolic links which
47 * are needed by the gcov tool to work correctly.
48 */
49struct gcov_node {
50 struct list_head list;
51 struct list_head children;
52 struct list_head all;
53 struct gcov_node *parent;
54 struct gcov_info *info;
55 struct gcov_info *ghost;
56 struct dentry *dentry;
57 struct dentry **links;
58 char name[0];
59};
60
61static const char objtree[] = OBJTREE;
62static const char srctree[] = SRCTREE;
63static struct gcov_node root_node;
64static struct dentry *reset_dentry;
65static LIST_HEAD(all_head);
66static DEFINE_MUTEX(node_lock);
67
68/* If non-zero, keep copies of profiling data for unloaded modules. */
69static int gcov_persist = 1;
70
71static int __init gcov_persist_setup(char *str)
72{
73 unsigned long val;
74
75 if (strict_strtoul(str, 0, &val)) {
76 pr_warning("invalid gcov_persist parameter '%s'\n", str);
77 return 0;
78 }
79 gcov_persist = val;
80 pr_info("setting gcov_persist to %d\n", gcov_persist);
81
82 return 1;
83}
84__setup("gcov_persist=", gcov_persist_setup);
85
86/*
87 * seq_file.start() implementation for gcov data files. Note that the
88 * gcov_iterator interface is designed to be more restrictive than seq_file
89 * (no start from arbitrary position, etc.), to simplify the iterator
90 * implementation.
91 */
92static void *gcov_seq_start(struct seq_file *seq, loff_t *pos)
93{
94 loff_t i;
95
96 gcov_iter_start(seq->private);
97 for (i = 0; i < *pos; i++) {
98 if (gcov_iter_next(seq->private))
99 return NULL;
100 }
101 return seq->private;
102}
103
104/* seq_file.next() implementation for gcov data files. */
105static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
106{
107 struct gcov_iterator *iter = data;
108
109 if (gcov_iter_next(iter))
110 return NULL;
111 (*pos)++;
112
113 return iter;
114}
115
116/* seq_file.show() implementation for gcov data files. */
117static int gcov_seq_show(struct seq_file *seq, void *data)
118{
119 struct gcov_iterator *iter = data;
120
121 if (gcov_iter_write(iter, seq))
122 return -EINVAL;
123 return 0;
124}
125
126static void gcov_seq_stop(struct seq_file *seq, void *data)
127{
128 /* Unused. */
129}
130
131static const struct seq_operations gcov_seq_ops = {
132 .start = gcov_seq_start,
133 .next = gcov_seq_next,
134 .show = gcov_seq_show,
135 .stop = gcov_seq_stop,
136};
137
138/*
139 * Return the profiling data set for a given node. This can either be the
140 * original profiling data structure or a duplicate (also called "ghost")
141 * in case the associated object file has been unloaded.
142 */
143static struct gcov_info *get_node_info(struct gcov_node *node)
144{
145 if (node->info)
146 return node->info;
147
148 return node->ghost;
149}
150
151/*
152 * open() implementation for gcov data files. Create a copy of the profiling
153 * data set and initialize the iterator and seq_file interface.
154 */
155static int gcov_seq_open(struct inode *inode, struct file *file)
156{
157 struct gcov_node *node = inode->i_private;
158 struct gcov_iterator *iter;
159 struct seq_file *seq;
160 struct gcov_info *info;
161 int rc = -ENOMEM;
162
163 mutex_lock(&node_lock);
164 /*
165 * Read from a profiling data copy to minimize reference tracking
166 * complexity and concurrent access.
167 */
168 info = gcov_info_dup(get_node_info(node));
169 if (!info)
170 goto out_unlock;
171 iter = gcov_iter_new(info);
172 if (!iter)
173 goto err_free_info;
174 rc = seq_open(file, &gcov_seq_ops);
175 if (rc)
176 goto err_free_iter_info;
177 seq = file->private_data;
178 seq->private = iter;
179out_unlock:
180 mutex_unlock(&node_lock);
181 return rc;
182
183err_free_iter_info:
184 gcov_iter_free(iter);
185err_free_info:
186 gcov_info_free(info);
187 goto out_unlock;
188}
189
190/*
191 * release() implementation for gcov data files. Release resources allocated
192 * by open().
193 */
194static int gcov_seq_release(struct inode *inode, struct file *file)
195{
196 struct gcov_iterator *iter;
197 struct gcov_info *info;
198 struct seq_file *seq;
199
200 seq = file->private_data;
201 iter = seq->private;
202 info = gcov_iter_get_info(iter);
203 gcov_iter_free(iter);
204 gcov_info_free(info);
205 seq_release(inode, file);
206
207 return 0;
208}
209
210/*
211 * Find a node by the associated data file name. Needs to be called with
212 * node_lock held.
213 */
214static struct gcov_node *get_node_by_name(const char *name)
215{
216 struct gcov_node *node;
217 struct gcov_info *info;
218
219 list_for_each_entry(node, &all_head, all) {
220 info = get_node_info(node);
221 if (info && (strcmp(info->filename, name) == 0))
222 return node;
223 }
224
225 return NULL;
226}
227
228static void remove_node(struct gcov_node *node);
229
230/*
231 * write() implementation for gcov data files. Reset profiling data for the
232 * associated file. If the object file has been unloaded (i.e. this is
233 * a "ghost" node), remove the debugfs node as well.
234 */
235static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
236 size_t len, loff_t *pos)
237{
238 struct seq_file *seq;
239 struct gcov_info *info;
240 struct gcov_node *node;
241
242 seq = file->private_data;
243 info = gcov_iter_get_info(seq->private);
244 mutex_lock(&node_lock);
245 node = get_node_by_name(info->filename);
246 if (node) {
247 /* Reset counts or remove node for unloaded modules. */
248 if (node->ghost)
249 remove_node(node);
250 else
251 gcov_info_reset(node->info);
252 }
253 /* Reset counts for open file. */
254 gcov_info_reset(info);
255 mutex_unlock(&node_lock);
256
257 return len;
258}
259
260/*
261 * Given a string <path> representing a file path of format:
262 * path/to/file.gcda
263 * construct and return a new string:
264 * <dir/>path/to/file.<ext>
265 */
266static char *link_target(const char *dir, const char *path, const char *ext)
267{
268 char *target;
269 char *old_ext;
270 char *copy;
271
272 copy = kstrdup(path, GFP_KERNEL);
273 if (!copy)
274 return NULL;
275 old_ext = strrchr(copy, '.');
276 if (old_ext)
277 *old_ext = '\0';
278 if (dir)
279 target = kasprintf(GFP_KERNEL, "%s/%s.%s", dir, copy, ext);
280 else
281 target = kasprintf(GFP_KERNEL, "%s.%s", copy, ext);
282 kfree(copy);
283
284 return target;
285}
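As a hypothetical worked example, link_target("/kbuild/obj", "kernel/gcov/fs.gcda", "gcno") strips the old extension and returns "/kbuild/obj/kernel/gcov/fs.gcno"; passing a NULL dir instead yields "kernel/gcov/fs.gcno".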
286
287/*
288 * Construct a string representing the symbolic link target for the given
289 * gcov data file name and link type. Depending on the link type and the
290 * location of the data file, the link target can point either to a
291 * subdirectory of srctree or objtree, or to an external location.
292 */
293static char *get_link_target(const char *filename, const struct gcov_link *ext)
294{
295 const char *rel;
296 char *result;
297
298 if (strncmp(filename, objtree, strlen(objtree)) == 0) {
299 rel = filename + strlen(objtree) + 1;
300 if (ext->dir == SRC_TREE)
301 result = link_target(srctree, rel, ext->ext);
302 else
303 result = link_target(objtree, rel, ext->ext);
304 } else {
305 /* External compilation. */
306 result = link_target(NULL, filename, ext->ext);
307 }
308
309 return result;
310}
311
312#define SKEW_PREFIX ".tmp_"
313
314/*
315 * For a filename .tmp_filename.ext return filename.ext. Needed to compensate
316 * for filename skewing caused by the mod-versioning mechanism.
317 */
318static const char *deskew(const char *basename)
319{
320 if (strncmp(basename, SKEW_PREFIX, sizeof(SKEW_PREFIX) - 1) == 0)
321 return basename + sizeof(SKEW_PREFIX) - 1;
322 return basename;
323}
324
325/*
326 * Create links to additional files (usually .c and .gcno files) which the
327 * gcov tool expects to find in the same directory as the gcov data file.
328 */
329static void add_links(struct gcov_node *node, struct dentry *parent)
330{
331 char *basename;
332 char *target;
333 int num;
334 int i;
335
336 for (num = 0; gcov_link[num].ext; num++)
337 /* Nothing. */;
338 node->links = kcalloc(num, sizeof(struct dentry *), GFP_KERNEL);
339 if (!node->links)
340 return;
341 for (i = 0; i < num; i++) {
342 target = get_link_target(get_node_info(node)->filename,
343 &gcov_link[i]);
344 if (!target)
345 goto out_err;
346 basename = strrchr(target, '/');
347 if (!basename)
348 goto out_err;
349 basename++;
350 node->links[i] = debugfs_create_symlink(deskew(basename),
351 parent, target);
352 if (!node->links[i])
353 goto out_err;
354 kfree(target);
355 }
356
357 return;
358out_err:
359 kfree(target);
360 while (i-- > 0)
361 debugfs_remove(node->links[i]);
362 kfree(node->links);
363 node->links = NULL;
364}
365
366static const struct file_operations gcov_data_fops = {
367 .open = gcov_seq_open,
368 .release = gcov_seq_release,
369 .read = seq_read,
370 .llseek = seq_lseek,
371 .write = gcov_seq_write,
372};
373
374/* Basic initialization of a new node. */
375static void init_node(struct gcov_node *node, struct gcov_info *info,
376 const char *name, struct gcov_node *parent)
377{
378 INIT_LIST_HEAD(&node->list);
379 INIT_LIST_HEAD(&node->children);
380 INIT_LIST_HEAD(&node->all);
381 node->info = info;
382 node->parent = parent;
383 if (name)
384 strcpy(node->name, name);
385}
386
387/*
388 * Create a new node and associated debugfs entry. Needs to be called with
389 * node_lock held.
390 */
391static struct gcov_node *new_node(struct gcov_node *parent,
392 struct gcov_info *info, const char *name)
393{
394 struct gcov_node *node;
395
396 node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
397 if (!node) {
398 pr_warning("out of memory\n");
399 return NULL;
400 }
401 init_node(node, info, name, parent);
402 /* Differentiate between gcov data file nodes and directory nodes. */
403 if (info) {
404 node->dentry = debugfs_create_file(deskew(node->name), 0600,
405 parent->dentry, node, &gcov_data_fops);
406 } else
407 node->dentry = debugfs_create_dir(node->name, parent->dentry);
408 if (!node->dentry) {
409 pr_warning("could not create file\n");
410 kfree(node);
411 return NULL;
412 }
413 if (info)
414 add_links(node, parent->dentry);
415 list_add(&node->list, &parent->children);
416 list_add(&node->all, &all_head);
417
418 return node;
419}
420
421/* Remove symbolic links associated with node. */
422static void remove_links(struct gcov_node *node)
423{
424 int i;
425
426 if (!node->links)
427 return;
428 for (i = 0; gcov_link[i].ext; i++)
429 debugfs_remove(node->links[i]);
430 kfree(node->links);
431 node->links = NULL;
432}
433
434/*
435 * Remove node from all lists and debugfs and release associated resources.
436 * Needs to be called with node_lock held.
437 */
438static void release_node(struct gcov_node *node)
439{
440 list_del(&node->list);
441 list_del(&node->all);
442 debugfs_remove(node->dentry);
443 remove_links(node);
444 if (node->ghost)
445 gcov_info_free(node->ghost);
446 kfree(node);
447}
448
449/* Release node and empty parents. Needs to be called with node_lock held. */
450static void remove_node(struct gcov_node *node)
451{
452 struct gcov_node *parent;
453
454 while ((node != &root_node) && list_empty(&node->children)) {
455 parent = node->parent;
456 release_node(node);
457 node = parent;
458 }
459}
460
461/*
462 * Find child node with given basename. Needs to be called with node_lock
463 * held.
464 */
465static struct gcov_node *get_child_by_name(struct gcov_node *parent,
466 const char *name)
467{
468 struct gcov_node *node;
469
470 list_for_each_entry(node, &parent->children, list) {
471 if (strcmp(node->name, name) == 0)
472 return node;
473 }
474
475 return NULL;
476}
477
478/*
479 * write() implementation for reset file. Reset all profiling data to zero
480 * and remove ghost nodes.
481 */
482static ssize_t reset_write(struct file *file, const char __user *addr,
483 size_t len, loff_t *pos)
484{
485 struct gcov_node *node;
486
487 mutex_lock(&node_lock);
488restart:
489 list_for_each_entry(node, &all_head, all) {
490 if (node->info)
491 gcov_info_reset(node->info);
492 else if (list_empty(&node->children)) {
493 remove_node(node);
494 /* Several nodes may have gone - restart loop. */
495 goto restart;
496 }
497 }
498 mutex_unlock(&node_lock);
499
500 return len;
501}
502
503/* read() implementation for reset file. Unused. */
504static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
505 loff_t *pos)
506{
507 /* Allow read operation so that a recursive copy won't fail. */
508 return 0;
509}
510
511static const struct file_operations gcov_reset_fops = {
512 .write = reset_write,
513 .read = reset_read,
514};
515
516/*
517 * Create a node for a given profiling data set and add it to all lists and
518 * debugfs. Needs to be called with node_lock held.
519 */
520static void add_node(struct gcov_info *info)
521{
522 char *filename;
523 char *curr;
524 char *next;
525 struct gcov_node *parent;
526 struct gcov_node *node;
527
528 filename = kstrdup(info->filename, GFP_KERNEL);
529 if (!filename)
530 return;
531 parent = &root_node;
532 /* Create directory nodes along the path. */
533 for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) {
534 if (curr == next)
535 continue;
536 *next = 0;
537 if (strcmp(curr, ".") == 0)
538 continue;
539 if (strcmp(curr, "..") == 0) {
540 if (!parent->parent)
541 goto err_remove;
542 parent = parent->parent;
543 continue;
544 }
545 node = get_child_by_name(parent, curr);
546 if (!node) {
547 node = new_node(parent, NULL, curr);
548 if (!node)
549 goto err_remove;
550 }
551 parent = node;
552 }
553 /* Create file node. */
554 node = new_node(parent, info, curr);
555 if (!node)
556 goto err_remove;
557out:
558 kfree(filename);
559 return;
560
561err_remove:
562 remove_node(parent);
563 goto out;
564}
565
566/*
567 * The profiling data set associated with this node is being unloaded. Store a
568 * copy of the profiling data and turn this node into a "ghost".
569 */
570static int ghost_node(struct gcov_node *node)
571{
572 node->ghost = gcov_info_dup(node->info);
573 if (!node->ghost) {
574 pr_warning("could not save data for '%s' (out of memory)\n",
575 node->info->filename);
576 return -ENOMEM;
577 }
578 node->info = NULL;
579
580 return 0;
581}
582
583/*
584 * Profiling data for this node has been loaded again. Add profiling data
585 * from previous instantiation and turn this node into a regular node.
586 */
587static void revive_node(struct gcov_node *node, struct gcov_info *info)
588{
589 if (gcov_info_is_compatible(node->ghost, info))
590 gcov_info_add(info, node->ghost);
591 else {
592 pr_warning("discarding saved data for '%s' (version changed)\n",
593 info->filename);
594 }
595 gcov_info_free(node->ghost);
596 node->ghost = NULL;
597 node->info = info;
598}
599
600/*
601 * Callback to create/remove profiling files when code compiled with
602 * -fprofile-arcs is loaded/unloaded.
603 */
604void gcov_event(enum gcov_action action, struct gcov_info *info)
605{
606 struct gcov_node *node;
607
608 mutex_lock(&node_lock);
609 node = get_node_by_name(info->filename);
610 switch (action) {
611 case GCOV_ADD:
612 /* Add new node or revive ghost. */
613 if (!node) {
614 add_node(info);
615 break;
616 }
617 if (gcov_persist)
618 revive_node(node, info);
619 else {
620 pr_warning("could not add '%s' (already exists)\n",
621 info->filename);
622 }
623 break;
624 case GCOV_REMOVE:
625 /* Remove node or turn into ghost. */
626 if (!node) {
627 pr_warning("could not remove '%s' (not found)\n",
628 info->filename);
629 break;
630 }
631 if (gcov_persist) {
632 if (!ghost_node(node))
633 break;
634 }
635 remove_node(node);
636 break;
637 }
638 mutex_unlock(&node_lock);
639}
640
641/* Create debugfs entries. */
642static __init int gcov_fs_init(void)
643{
644 int rc = -EIO;
645
646 init_node(&root_node, NULL, NULL, NULL);
647 /*
648 * /sys/kernel/debug/gcov will be parent for the reset control file
649 * and all profiling files.
650 */
651 root_node.dentry = debugfs_create_dir("gcov", NULL);
652 if (!root_node.dentry)
653 goto err_remove;
654 /*
655 * Create reset file which resets all profiling counts when written
656 * to.
657 */
658 reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry,
659 NULL, &gcov_reset_fops);
660 if (!reset_dentry)
661 goto err_remove;
662 /* Replay previous events to get our fs hierarchy up-to-date. */
663 gcov_enable_events();
664 return 0;
665
666err_remove:
667 pr_err("init failed\n");
668 if (root_node.dentry)
669 debugfs_remove(root_node.dentry);
670
671 return rc;
672}
673device_initcall(gcov_fs_init);
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
new file mode 100644
index 000000000000..ae5bb4260033
--- /dev/null
+++ b/kernel/gcov/gcc_3_4.c
@@ -0,0 +1,447 @@
1/*
2 * This code provides functions to handle gcc's profiling data format
3 * introduced with gcc 3.4. Future versions of gcc may change the gcov
4 * format (as happened before), so all format-specific information needs
5 * to be kept modular and easily exchangeable.
6 *
7 * This file is based on gcc-internal definitions. Functions and data
8 * structures are defined to be compatible with gcc counterparts.
9 * For a better understanding, refer to gcc source: gcc/gcov-io.h.
10 *
11 * Copyright IBM Corp. 2009
12 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
13 *
14 * Uses gcc-internal data definitions.
15 */
16
17#include <linux/errno.h>
18#include <linux/slab.h>
19#include <linux/string.h>
20#include <linux/seq_file.h>
21#include <linux/vmalloc.h>
22#include "gcov.h"
23
24/* Symbolic links to be created for each profiling data file. */
25const struct gcov_link gcov_link[] = {
26 { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
27 { 0, NULL},
28};
29
30/*
31 * Determine whether a counter is active. Based on gcc magic. Doesn't change
32 * at run-time.
33 */
34static int counter_active(struct gcov_info *info, unsigned int type)
35{
36 return (1 << type) & info->ctr_mask;
37}
38
39/* Determine number of active counters. Based on gcc magic. */
40static unsigned int num_counter_active(struct gcov_info *info)
41{
42 unsigned int i;
43 unsigned int result = 0;
44
45 for (i = 0; i < GCOV_COUNTERS; i++) {
46 if (counter_active(info, i))
47 result++;
48 }
49 return result;
50}
51
52/**
53 * gcov_info_reset - reset profiling data to zero
54 * @info: profiling data set
55 */
56void gcov_info_reset(struct gcov_info *info)
57{
58 unsigned int active = num_counter_active(info);
59 unsigned int i;
60
61 for (i = 0; i < active; i++) {
62 memset(info->counts[i].values, 0,
63 info->counts[i].num * sizeof(gcov_type));
64 }
65}
66
67/**
68 * gcov_info_is_compatible - check if profiling data can be added
69 * @info1: first profiling data set
70 * @info2: second profiling data set
71 *
72 * Returns non-zero if profiling data can be added, zero otherwise.
73 */
74int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
75{
76 return (info1->stamp == info2->stamp);
77}
78
79/**
80 * gcov_info_add - add up profiling data
81 * @dest: profiling data set to which data is added
82 * @source: profiling data set which is added
83 *
84 * Adds profiling counts of @source to @dest.
85 */
86void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
87{
88 unsigned int i;
89 unsigned int j;
90
91 for (i = 0; i < num_counter_active(dest); i++) {
92 for (j = 0; j < dest->counts[i].num; j++) {
93 dest->counts[i].values[j] +=
94 source->counts[i].values[j];
95 }
96 }
97}
98
99/* Get size of function info entry. Based on gcc magic. */
100static size_t get_fn_size(struct gcov_info *info)
101{
102 size_t size;
103
104 size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
105 sizeof(unsigned int);
106 if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
107 size = ALIGN(size, __alignof__(struct gcov_fn_info));
108 return size;
109}
110
111/* Get address of function info entry. Based on gcc magic. */
112static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
113{
114 return (struct gcov_fn_info *)
115 ((char *) info->functions + fn * get_fn_size(info));
116}
117
118/**
119 * gcov_info_dup - duplicate profiling data set
120 * @info: profiling data set to duplicate
121 *
122 * Return newly allocated duplicate on success, %NULL on error.
123 */
124struct gcov_info *gcov_info_dup(struct gcov_info *info)
125{
126 struct gcov_info *dup;
127 unsigned int i;
128 unsigned int active;
129
130 /* Duplicate gcov_info. */
131 active = num_counter_active(info);
132 dup = kzalloc(sizeof(struct gcov_info) +
133 sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
134 if (!dup)
135 return NULL;
136 dup->version = info->version;
137 dup->stamp = info->stamp;
138 dup->n_functions = info->n_functions;
139 dup->ctr_mask = info->ctr_mask;
140 /* Duplicate filename. */
141 dup->filename = kstrdup(info->filename, GFP_KERNEL);
142 if (!dup->filename)
143 goto err_free;
144 /* Duplicate table of functions. */
145 dup->functions = kmemdup(info->functions, info->n_functions *
146 get_fn_size(info), GFP_KERNEL);
147 if (!dup->functions)
148 goto err_free;
149 /* Duplicate counter arrays. */
150 for (i = 0; i < active ; i++) {
151 struct gcov_ctr_info *ctr = &info->counts[i];
152 size_t size = ctr->num * sizeof(gcov_type);
153
154 dup->counts[i].num = ctr->num;
155 dup->counts[i].merge = ctr->merge;
156 dup->counts[i].values = vmalloc(size);
157 if (!dup->counts[i].values)
158 goto err_free;
159 memcpy(dup->counts[i].values, ctr->values, size);
160 }
161 return dup;
162
163err_free:
164 gcov_info_free(dup);
165 return NULL;
166}
167
168/**
169 * gcov_info_free - release memory for profiling data set duplicate
170 * @info: profiling data set duplicate to free
171 */
172void gcov_info_free(struct gcov_info *info)
173{
174 unsigned int active = num_counter_active(info);
175 unsigned int i;
176
177 for (i = 0; i < active ; i++)
178 vfree(info->counts[i].values);
179 kfree(info->functions);
180 kfree(info->filename);
181 kfree(info);
182}
183
184/**
185 * struct type_info - iterator helper array
186 * @ctr_type: counter type
187 * @offset: index of the first value of the current function for this type
188 *
189 * This array is needed to convert the in-memory data format into the in-file
190 * data format:
191 *
192 * In-memory:
193 * for each counter type
194 * for each function
195 * values
196 *
197 * In-file:
198 * for each function
199 * for each counter type
200 * values
201 *
202 * See gcc source gcc/gcov-io.h for more information on data organization.
203 */
204struct type_info {
205 int ctr_type;
206 unsigned int offset;
207};
208
209/**
210 * struct gcov_iterator - specifies current file position in logical records
211 * @info: associated profiling data
212 * @record: record type
213 * @function: function number
214 * @type: counter type
215 * @count: index into values array
216 * @num_types: number of counter types
217 * @type_info: helper array to get values-array offset for current function
218 */
219struct gcov_iterator {
220 struct gcov_info *info;
221
222 int record;
223 unsigned int function;
224 unsigned int type;
225 unsigned int count;
226
227 int num_types;
228 struct type_info type_info[0];
229};
230
231static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
232{
233 return get_fn_info(iter->info, iter->function);
234}
235
236static struct type_info *get_type(struct gcov_iterator *iter)
237{
238 return &iter->type_info[iter->type];
239}
240
241/**
242 * gcov_iter_new - allocate and initialize profiling data iterator
243 * @info: profiling data set to be iterated
244 *
245 * Return file iterator on success, %NULL otherwise.
246 */
247struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
248{
249 struct gcov_iterator *iter;
250
251 iter = kzalloc(sizeof(struct gcov_iterator) +
252 num_counter_active(info) * sizeof(struct type_info),
253 GFP_KERNEL);
254 if (iter)
255 iter->info = info;
256
257 return iter;
258}
259
260/**
261 * gcov_iter_free - release memory for iterator
262 * @iter: file iterator to free
263 */
264void gcov_iter_free(struct gcov_iterator *iter)
265{
266 kfree(iter);
267}
268
269/**
270 * gcov_iter_get_info - return profiling data set for given file iterator
271 * @iter: file iterator
272 */
273struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
274{
275 return iter->info;
276}
277
278/**
279 * gcov_iter_start - reset file iterator to starting position
280 * @iter: file iterator
281 */
282void gcov_iter_start(struct gcov_iterator *iter)
283{
284 int i;
285
286 iter->record = 0;
287 iter->function = 0;
288 iter->type = 0;
289 iter->count = 0;
290 iter->num_types = 0;
291 for (i = 0; i < GCOV_COUNTERS; i++) {
292 if (counter_active(iter->info, i)) {
293 iter->type_info[iter->num_types].ctr_type = i;
294 iter->type_info[iter->num_types++].offset = 0;
295 }
296 }
297}
298
299/* Mapping of logical record number to actual file content. */
300#define RECORD_FILE_MAGIC 0
301#define RECORD_GCOV_VERSION 1
302#define RECORD_TIME_STAMP 2
303#define RECORD_FUNCTION_TAG 3
304#define RECORD_FUNCTON_TAG_LEN 4
305#define RECORD_FUNCTION_IDENT 5
306#define RECORD_FUNCTION_CHECK 6
307#define RECORD_COUNT_TAG 7
308#define RECORD_COUNT_LEN 8
309#define RECORD_COUNT 9
310
311/**
312 * gcov_iter_next - advance file iterator to next logical record
313 * @iter: file iterator
314 *
315 * Return zero if new position is valid, non-zero if iterator has reached end.
316 */
317int gcov_iter_next(struct gcov_iterator *iter)
318{
319 switch (iter->record) {
320 case RECORD_FILE_MAGIC:
321 case RECORD_GCOV_VERSION:
322 case RECORD_FUNCTION_TAG:
323 case RECORD_FUNCTON_TAG_LEN:
324 case RECORD_FUNCTION_IDENT:
325 case RECORD_COUNT_TAG:
326 /* Advance to next record */
327 iter->record++;
328 break;
329 case RECORD_COUNT:
330 /* Advance to next count */
331 iter->count++;
332 /* fall through */
333 case RECORD_COUNT_LEN:
334 if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
335 iter->record = RECORD_COUNT;
336 break;
337 }
338 /* Advance to next counter type */
339 get_type(iter)->offset += iter->count;
340 iter->count = 0;
341 iter->type++;
342 /* fall through */
343 case RECORD_FUNCTION_CHECK:
344 if (iter->type < iter->num_types) {
345 iter->record = RECORD_COUNT_TAG;
346 break;
347 }
348 /* Advance to next function */
349 iter->type = 0;
350 iter->function++;
351 /* fall through */
352 case RECORD_TIME_STAMP:
353 if (iter->function < iter->info->n_functions)
354 iter->record = RECORD_FUNCTION_TAG;
355 else
356 iter->record = -1;
357 break;
358 }
359 /* Check for EOF. */
360 if (iter->record == -1)
361 return -EINVAL;
362 else
363 return 0;
364}
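Taken together, the iterator emits the file header (records 0-2) once, then for each function records 3-6 followed, per active counter type, by records 7-8 and one record 9 per counter value; the RECORD_TIME_STAMP case is then reused to decide whether another function follows or the end of file has been reached.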
365
366/**
367 * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
368 * @seq: seq_file handle
369 * @v: value to be stored
370 *
371 * Number format defined by gcc: numbers are recorded in the 32 bit
372 * unsigned binary form of the endianness of the machine generating the
373 * file.
374 */
375static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
376{
377 return seq_write(seq, &v, sizeof(v));
378}
379
380/**
381 * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
382 * @seq: seq_file handle
383 * @v: value to be stored
384 *
385 * Number format defined by gcc: numbers are recorded in the 32 bit
386 * unsigned binary form of the endianness of the machine generating the
387 * file. 64 bit numbers are stored as two 32 bit numbers, the low part
388 * first.
389 */
390static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
391{
392 u32 data[2];
393
394 data[0] = (v & 0xffffffffUL);
395 data[1] = (v >> 32);
396 return seq_write(seq, data, sizeof(data));
397}
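For example, a counter value of 0x0000000100000002 is emitted as the 32-bit word 0x00000002 followed by 0x00000001, i.e. low part first as described above.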
398
399/**
400 * gcov_iter_write - write data for current pos to seq_file
401 * @iter: file iterator
402 * @seq: seq_file handle
403 *
404 * Return zero on success, non-zero otherwise.
405 */
406int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
407{
408 int rc = -EINVAL;
409
410 switch (iter->record) {
411 case RECORD_FILE_MAGIC:
412 rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
413 break;
414 case RECORD_GCOV_VERSION:
415 rc = seq_write_gcov_u32(seq, iter->info->version);
416 break;
417 case RECORD_TIME_STAMP:
418 rc = seq_write_gcov_u32(seq, iter->info->stamp);
419 break;
420 case RECORD_FUNCTION_TAG:
421 rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
422 break;
423 case RECORD_FUNCTON_TAG_LEN:
424 rc = seq_write_gcov_u32(seq, 2);
425 break;
426 case RECORD_FUNCTION_IDENT:
427 rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
428 break;
429 case RECORD_FUNCTION_CHECK:
430 rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
431 break;
432 case RECORD_COUNT_TAG:
433 rc = seq_write_gcov_u32(seq,
434 GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
435 break;
436 case RECORD_COUNT_LEN:
437 rc = seq_write_gcov_u32(seq,
438 get_func(iter)->n_ctrs[iter->type] * 2);
439 break;
440 case RECORD_COUNT:
441 rc = seq_write_gcov_u64(seq,
442 iter->info->counts[iter->type].
443 values[iter->count + get_type(iter)->offset]);
444 break;
445 }
446 return rc;
447}
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
new file mode 100644
index 000000000000..060073ebf7a6
--- /dev/null
+++ b/kernel/gcov/gcov.h
@@ -0,0 +1,128 @@
1/*
2 * Profiling infrastructure declarations.
3 *
4 * This file is based on gcc-internal definitions. Data structures are
5 * defined to be compatible with gcc counterparts. For a better
6 * understanding, refer to gcc source: gcc/gcov-io.h.
7 *
8 * Copyright IBM Corp. 2009
9 * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
10 *
11 * Uses gcc-internal data definitions.
12 */
13
14#ifndef GCOV_H
15#define GCOV_H GCOV_H
16
17#include <linux/types.h>
18
19/*
20 * Profiling data types used for gcc 3.4 and above - these are defined by
21 * gcc and need to be kept as close to the original definition as possible to
22 * remain compatible.
23 */
24#define GCOV_COUNTERS 5
25#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
26#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
27#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
28#define GCOV_TAG_FOR_COUNTER(count) \
29 (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
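/* Worked values derived from the shift above: GCOV_TAG_FOR_COUNTER(0) is
 * 0x01a10000 and GCOV_TAG_FOR_COUNTER(1) is 0x01a30000. */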
30
31#if BITS_PER_LONG >= 64
32typedef long gcov_type;
33#else
34typedef long long gcov_type;
35#endif
36
37/**
38 * struct gcov_fn_info - profiling meta data per function
39 * @ident: object file-unique function identifier
40 * @checksum: function checksum
41 * @n_ctrs: number of values per counter type belonging to this function
42 *
43 * This data is generated by gcc during compilation and doesn't change
44 * at run-time.
45 */
46struct gcov_fn_info {
47 unsigned int ident;
48 unsigned int checksum;
49 unsigned int n_ctrs[0];
50};
51
52/**
53 * struct gcov_ctr_info - profiling data per counter type
54 * @num: number of counter values for this type
55 * @values: array of counter values for this type
56 * @merge: merge function for counter values of this type (unused)
57 *
58 * This data is generated by gcc during compilation and doesn't change
59 * at run-time with the exception of the values array.
60 */
61struct gcov_ctr_info {
62 unsigned int num;
63 gcov_type *values;
64 void (*merge)(gcov_type *, unsigned int);
65};
66
67/**
68 * struct gcov_info - profiling data per object file
69 * @version: gcov version magic indicating the gcc version used for compilation
70 * @next: list head for a singly-linked list
71 * @stamp: time stamp
72 * @filename: name of the associated gcov data file
73 * @n_functions: number of instrumented functions
74 * @functions: function data
75 * @ctr_mask: mask specifying which counter types are active
76 * @counts: counter data per counter type
77 *
78 * This data is generated by gcc during compilation and doesn't change
79 * at run-time with the exception of the next pointer.
80 */
81struct gcov_info {
82 unsigned int version;
83 struct gcov_info *next;
84 unsigned int stamp;
85 const char *filename;
86 unsigned int n_functions;
87 const struct gcov_fn_info *functions;
88 unsigned int ctr_mask;
89 struct gcov_ctr_info counts[0];
90};
91
92/* Base interface. */
93enum gcov_action {
94 GCOV_ADD,
95 GCOV_REMOVE,
96};
97
98void gcov_event(enum gcov_action action, struct gcov_info *info);
99void gcov_enable_events(void);
100
101/* Iterator control. */
102struct seq_file;
103struct gcov_iterator;
104
105struct gcov_iterator *gcov_iter_new(struct gcov_info *info);
106void gcov_iter_free(struct gcov_iterator *iter);
107void gcov_iter_start(struct gcov_iterator *iter);
108int gcov_iter_next(struct gcov_iterator *iter);
109int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq);
110struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter);
111
112/* gcov_info control. */
113void gcov_info_reset(struct gcov_info *info);
114int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2);
115void gcov_info_add(struct gcov_info *dest, struct gcov_info *source);
116struct gcov_info *gcov_info_dup(struct gcov_info *info);
117void gcov_info_free(struct gcov_info *info);
118
119struct gcov_link {
120 enum {
121 OBJ_TREE,
122 SRC_TREE,
123 } dir;
124 const char *ext;
125};
126extern const struct gcov_link gcov_link[];
127
128#endif /* GCOV_H */
diff --git a/kernel/groups.c b/kernel/groups.c
new file mode 100644
index 000000000000..2b45b2ee3964
--- /dev/null
+++ b/kernel/groups.c
@@ -0,0 +1,288 @@
1/*
2 * Supplementary group IDs
3 */
4#include <linux/cred.h>
5#include <linux/module.h>
6#include <linux/slab.h>
7#include <linux/security.h>
8#include <linux/syscalls.h>
9#include <asm/uaccess.h>
10
11/* init to 2 - one for init_task, one to ensure it is never freed */
12struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
13
14struct group_info *groups_alloc(int gidsetsize)
15{
16 struct group_info *group_info;
17 int nblocks;
18 int i;
19
20 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
21 /* Make sure we always allocate at least one indirect block pointer */
22 nblocks = nblocks ? : 1;
23 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
24 if (!group_info)
25 return NULL;
26 group_info->ngroups = gidsetsize;
27 group_info->nblocks = nblocks;
28 atomic_set(&group_info->usage, 1);
29
30 if (gidsetsize <= NGROUPS_SMALL)
31 group_info->blocks[0] = group_info->small_block;
32 else {
33 for (i = 0; i < nblocks; i++) {
34 gid_t *b;
35 b = (void *)__get_free_page(GFP_USER);
36 if (!b)
37 goto out_undo_partial_alloc;
38 group_info->blocks[i] = b;
39 }
40 }
41 return group_info;
42
43out_undo_partial_alloc:
44 while (--i >= 0) {
45 free_page((unsigned long)group_info->blocks[i]);
46 }
47 kfree(group_info);
48 return NULL;
49}
50
51EXPORT_SYMBOL(groups_alloc);
52
53void groups_free(struct group_info *group_info)
54{
55 if (group_info->blocks[0] != group_info->small_block) {
56 int i;
57 for (i = 0; i < group_info->nblocks; i++)
58 free_page((unsigned long)group_info->blocks[i]);
59 }
60 kfree(group_info);
61}
62
63EXPORT_SYMBOL(groups_free);
64
65/* export the group_info to a user-space array */
66static int groups_to_user(gid_t __user *grouplist,
67 const struct group_info *group_info)
68{
69 int i;
70 unsigned int count = group_info->ngroups;
71
72 for (i = 0; i < group_info->nblocks; i++) {
73 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
74 unsigned int len = cp_count * sizeof(*grouplist);
75
76 if (copy_to_user(grouplist, group_info->blocks[i], len))
77 return -EFAULT;
78
79 grouplist += NGROUPS_PER_BLOCK;
80 count -= cp_count;
81 }
82 return 0;
83}
84
85/* fill a group_info from a user-space array - it must be allocated already */
86static int groups_from_user(struct group_info *group_info,
87 gid_t __user *grouplist)
88{
89 int i;
90 unsigned int count = group_info->ngroups;
91
92 for (i = 0; i < group_info->nblocks; i++) {
93 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
94 unsigned int len = cp_count * sizeof(*grouplist);
95
96 if (copy_from_user(group_info->blocks[i], grouplist, len))
97 return -EFAULT;
98
99 grouplist += NGROUPS_PER_BLOCK;
100 count -= cp_count;
101 }
102 return 0;
103}
104
105/* a simple Shell sort */
106static void groups_sort(struct group_info *group_info)
107{
108 int base, max, stride;
109 int gidsetsize = group_info->ngroups;
110
111 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
112 ; /* nothing */
113 stride /= 3;
114
115 while (stride) {
116 max = gidsetsize - stride;
117 for (base = 0; base < max; base++) {
118 int left = base;
119 int right = left + stride;
120 gid_t tmp = GROUP_AT(group_info, right);
121
122 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
123 GROUP_AT(group_info, right) =
124 GROUP_AT(group_info, left);
125 right = left;
126 left -= stride;
127 }
128 GROUP_AT(group_info, right) = tmp;
129 }
130 stride /= 3;
131 }
132}
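groups_sort() is a textbook Shell sort using the Knuth gap sequence (1, 4, 13, 40, ..., i.e. stride = 3 * stride + 1). The same algorithm detached from GROUP_AT() so it compiles and runs standalone; plain unsigned int stands in for gid_t.

    #include <stdio.h>

    static void shell_sort(unsigned int *g, int n)
    {
        int base, max, stride;

        for (stride = 1; stride < n; stride = 3 * stride + 1)
            ;       /* find the largest gap below n */
        stride /= 3;

        while (stride) {
            max = n - stride;
            for (base = 0; base < max; base++) {
                int left = base;
                int right = left + stride;
                unsigned int tmp = g[right];

                /* gapped insertion: shift larger elements right */
                while (left >= 0 && g[left] > tmp) {
                    g[right] = g[left];
                    right = left;
                    left -= stride;
                }
                g[right] = tmp;
            }
            stride /= 3;
        }
    }

    int main(void)
    {
        unsigned int g[] = { 100, 4, 27, 4, 65534, 0, 12 };
        int i, n = sizeof(g) / sizeof(g[0]);

        shell_sort(g, n);
        for (i = 0; i < n; i++)
            printf("%u ", g[i]);
        printf("\n");
        return 0;
    }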
133
134/* a simple bsearch */
135int groups_search(const struct group_info *group_info, gid_t grp)
136{
137 unsigned int left, right;
138
139 if (!group_info)
140 return 0;
141
142 left = 0;
143 right = group_info->ngroups;
144 while (left < right) {
145 unsigned int mid = (left+right)/2;
146 int cmp = grp - GROUP_AT(group_info, mid);
147 if (cmp > 0)
148 left = mid + 1;
149 else if (cmp < 0)
150 right = mid;
151 else
152 return 1;
153 }
154 return 0;
155}
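groups_search() is a plain binary search, which only works because set_groups() below runs groups_sort() before publishing the list. A standalone version of the same halving loop; it uses explicit greater/less comparisons instead of the kernel's signed subtraction so it stays obviously correct over the full unsigned GID range.

    #include <stdio.h>

    /* Returns 1 if grp is present in the sorted array, 0 otherwise. */
    static int gid_search(const unsigned int *groups, unsigned int ngroups,
                          unsigned int grp)
    {
        unsigned int left = 0, right = ngroups;

        while (left < right) {
            unsigned int mid = (left + right) / 2;

            if (grp > groups[mid])
                left = mid + 1;
            else if (grp < groups[mid])
                right = mid;
            else
                return 1;
        }
        return 0;
    }

    int main(void)
    {
        unsigned int groups[] = { 0, 4, 12, 27, 100, 65534 };  /* must be sorted */
        unsigned int n = sizeof(groups) / sizeof(groups[0]);

        printf("27 in set: %d\n", gid_search(groups, n, 27));
        printf("42 in set: %d\n", gid_search(groups, n, 42));
        return 0;
    }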
156
157/**
158 * set_groups - Change a group subscription in a set of credentials
159 * @new: The newly prepared set of credentials to alter
160 * @group_info: The group list to install
161 *
162 * Validate a group subscription and, if valid, insert it into a set
163 * of credentials.
164 */
165int set_groups(struct cred *new, struct group_info *group_info)
166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info);
174 groups_sort(group_info);
175 get_group_info(group_info);
176 new->group_info = group_info;
177 return 0;
178}
179
180EXPORT_SYMBOL(set_groups);
181
182/**
183 * set_current_groups - Change current's group subscription
184 * @group_info: The group list to impose
185 *
186 * Validate a group subscription and, if valid, impose it upon current's task
187 * security record.
188 */
189int set_current_groups(struct group_info *group_info)
190{
191 struct cred *new;
192 int ret;
193
194 new = prepare_creds();
195 if (!new)
196 return -ENOMEM;
197
198 ret = set_groups(new, group_info);
199 if (ret < 0) {
200 abort_creds(new);
201 return ret;
202 }
203
204 return commit_creds(new);
205}
206
207EXPORT_SYMBOL(set_current_groups);
208
209SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
210{
211 const struct cred *cred = current_cred();
212 int i;
213
214 if (gidsetsize < 0)
215 return -EINVAL;
216
217 /* no need to grab task_lock here; it cannot change */
218 i = cred->group_info->ngroups;
219 if (gidsetsize) {
220 if (i > gidsetsize) {
221 i = -EINVAL;
222 goto out;
223 }
224 if (groups_to_user(grouplist, cred->group_info)) {
225 i = -EFAULT;
226 goto out;
227 }
228 }
229out:
230 return i;
231}
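The `if (gidsetsize)` branch above is what lets userspace size its buffer: calling getgroups() with a zero count only reports how many supplementary groups there are. A minimal userspace example of that two-call pattern:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <sys/types.h>

    int main(void)
    {
        int i, n = getgroups(0, NULL);  /* size 0: just report the count */
        gid_t *list;

        if (n < 0) {
            perror("getgroups");
            return 1;
        }
        list = calloc(n ? n : 1, sizeof(*list));
        if (!list)
            return 1;
        if (getgroups(n, list) < 0) {
            perror("getgroups");
            free(list);
            return 1;
        }
        for (i = 0; i < n; i++)
            printf("%u ", (unsigned)list[i]);
        printf("\n");
        free(list);
        return 0;
    }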
232
233/*
234 * SMP: Our groups are copy-on-write. We can set them safely
235 * without another task interfering.
236 */
237
238SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
239{
240 struct group_info *group_info;
241 int retval;
242
243 if (!capable(CAP_SETGID))
244 return -EPERM;
245 if ((unsigned)gidsetsize > NGROUPS_MAX)
246 return -EINVAL;
247
248 group_info = groups_alloc(gidsetsize);
249 if (!group_info)
250 return -ENOMEM;
251 retval = groups_from_user(group_info, grouplist);
252 if (retval) {
253 put_group_info(group_info);
254 return retval;
255 }
256
257 retval = set_current_groups(group_info);
258 put_group_info(group_info);
259
260 return retval;
261}
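setgroups() is the privilege-dropping counterpart: a process holding CAP_SETGID (the capable() check above) typically clears its supplementary groups before giving up its UID. A hedged sketch of the usual ordering rather than a complete privilege-drop routine; the uid/gid 65534 ("nobody") values are examples only, and the program must start with the relevant capabilities or setgroups() fails with -EPERM exactly as in the syscall above.

    #include <stdio.h>
    #include <unistd.h>
    #include <grp.h>
    #include <sys/types.h>

    /* Drop supplementary groups first, then the primary gid, then the uid. */
    static int drop_to(uid_t uid, gid_t gid)
    {
        if (setgroups(0, NULL) < 0)     /* empty the supplementary list */
            return -1;
        if (setgid(gid) < 0)
            return -1;
        if (setuid(uid) < 0)
            return -1;
        return 0;
    }

    int main(void)
    {
        if (drop_to(65534, 65534) < 0) {
            perror("drop_to");
            return 1;
        }
        printf("now uid=%u gid=%u\n", (unsigned)getuid(), (unsigned)getgid());
        return 0;
    }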
262
263/*
264 * Check whether we're fsgid/egid or in the supplemental group..
265 */
266int in_group_p(gid_t grp)
267{
268 const struct cred *cred = current_cred();
269 int retval = 1;
270
271 if (grp != cred->fsgid)
272 retval = groups_search(cred->group_info, grp);
273 return retval;
274}
275
276EXPORT_SYMBOL(in_group_p);
277
278int in_egroup_p(gid_t grp)
279{
280 const struct cred *cred = current_cred();
281 int retval = 1;
282
283 if (grp != cred->egid)
284 retval = groups_search(cred->group_info, grp);
285 return retval;
286}
287
288EXPORT_SYMBOL(in_egroup_p);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cb8a15c19583..9002958a96e7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -43,6 +43,8 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/err.h> 44#include <linux/err.h>
45#include <linux/debugobjects.h> 45#include <linux/debugobjects.h>
46#include <linux/sched.h>
47#include <linux/timer.h>
46 48
47#include <asm/uaccess.h> 49#include <asm/uaccess.h>
48 50
@@ -193,12 +195,24 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
193 * Switch the timer base to the current CPU when possible. 195 * Switch the timer base to the current CPU when possible.
194 */ 196 */
195static inline struct hrtimer_clock_base * 197static inline struct hrtimer_clock_base *
196switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base) 198switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
199 int pinned)
197{ 200{
198 struct hrtimer_clock_base *new_base; 201 struct hrtimer_clock_base *new_base;
199 struct hrtimer_cpu_base *new_cpu_base; 202 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1;
204
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
200 213
201 new_cpu_base = &__get_cpu_var(hrtimer_bases); 214again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
202 new_base = &new_cpu_base->clock_base[base->index]; 216 new_base = &new_cpu_base->clock_base[base->index];
203 217
204 if (base != new_base) { 218 if (base != new_base) {
@@ -218,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base)
218 timer->base = NULL; 232 timer->base = NULL;
219 spin_unlock(&base->cpu_base->lock); 233 spin_unlock(&base->cpu_base->lock);
220 spin_lock(&new_base->cpu_base->lock); 234 spin_lock(&new_base->cpu_base->lock);
235
236 /* Optimized away for NOHZ=n SMP=n */
237 if (cpu == preferred_cpu) {
238 /* Calculate clock monotonic expiry time */
239#ifdef CONFIG_HIGH_RES_TIMERS
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer),
241 new_base->offset);
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 }
221 timer->base = new_base; 269 timer->base = new_base;
222 } 270 }
223 return new_base; 271 return new_base;
@@ -235,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
235 return base; 283 return base;
236} 284}
237 285
238# define switch_hrtimer_base(t, b) (b) 286# define switch_hrtimer_base(t, b, p) (b)
239 287
240#endif /* !CONFIG_SMP */ 288#endif /* !CONFIG_SMP */
241 289
@@ -332,6 +380,8 @@ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
332 return res; 380 return res;
333} 381}
334 382
383EXPORT_SYMBOL_GPL(ktime_add_safe);
384
335#ifdef CONFIG_DEBUG_OBJECTS_TIMERS 385#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
336 386
337static struct debug_obj_descr hrtimer_debug_descr; 387static struct debug_obj_descr hrtimer_debug_descr;
@@ -907,9 +957,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
907 ret = remove_hrtimer(timer, base); 957 ret = remove_hrtimer(timer, base);
908 958
909 /* Switch the timer base, if necessary: */ 959 /* Switch the timer base, if necessary: */
910 new_base = switch_hrtimer_base(timer, base); 960 new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
911 961
912 if (mode == HRTIMER_MODE_REL) { 962 if (mode & HRTIMER_MODE_REL) {
913 tim = ktime_add_safe(tim, new_base->get_time()); 963 tim = ktime_add_safe(tim, new_base->get_time());
914 /* 964 /*
915 * CONFIG_TIME_LOW_RES is a temporary way for architectures 965 * CONFIG_TIME_LOW_RES is a temporary way for architectures
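The change from `mode == HRTIMER_MODE_REL` to `mode & HRTIMER_MODE_REL` matters because HRTIMER_MODE_PINNED can now be OR'd into the mode argument, so an equality test would stop recognizing relative timers that are also pinned. A tiny standalone illustration of why flag fields are tested with `&`; the enum values here are illustrative, not copied from include/linux/hrtimer.h.

    #include <stdio.h>

    enum demo_hrtimer_mode {
        DEMO_MODE_ABS    = 0x0,
        DEMO_MODE_REL    = 0x1,
        DEMO_MODE_PINNED = 0x2,
    };

    static void classify(unsigned int mode)
    {
        /* An equality test (mode == DEMO_MODE_REL) would misclassify
         * DEMO_MODE_REL | DEMO_MODE_PINNED as absolute. */
        printf("mode 0x%x: %s, %s\n", mode,
               (mode & DEMO_MODE_REL) ? "relative" : "absolute",
               (mode & DEMO_MODE_PINNED) ? "pinned" : "migratable");
    }

    int main(void)
    {
        classify(DEMO_MODE_REL);
        classify(DEMO_MODE_REL | DEMO_MODE_PINNED);
        classify(DEMO_MODE_ABS | DEMO_MODE_PINNED);
        return 0;
    }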
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964..7d047808419d 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o 6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 7obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2..13c68e71b726 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
359 359
360 spin_lock(&desc->lock); 360 spin_lock(&desc->lock);
361 mask_ack_irq(desc, irq); 361 mask_ack_irq(desc, irq);
362 desc = irq_remap_to_desc(irq, desc);
363 362
364 if (unlikely(desc->status & IRQ_INPROGRESS)) 363 if (unlikely(desc->status & IRQ_INPROGRESS))
365 goto out_unlock; 364 goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
438 desc->status &= ~IRQ_INPROGRESS; 437 desc->status &= ~IRQ_INPROGRESS;
439out: 438out:
440 desc->chip->eoi(irq); 439 desc->chip->eoi(irq);
441 desc = irq_remap_to_desc(irq, desc);
442 440
443 spin_unlock(&desc->lock); 441 spin_unlock(&desc->lock);
444} 442}
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
475 !desc->action)) { 473 !desc->action)) {
476 desc->status |= (IRQ_PENDING | IRQ_MASKED); 474 desc->status |= (IRQ_PENDING | IRQ_MASKED);
477 mask_ack_irq(desc, irq); 475 mask_ack_irq(desc, irq);
478 desc = irq_remap_to_desc(irq, desc);
479 goto out_unlock; 476 goto out_unlock;
480 } 477 }
481 kstat_incr_irqs_this_cpu(irq, desc); 478 kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
483 /* Start handling the irq */ 480 /* Start handling the irq */
484 if (desc->chip->ack) 481 if (desc->chip->ack)
485 desc->chip->ack(irq); 482 desc->chip->ack(irq);
486 desc = irq_remap_to_desc(irq, desc);
487 483
488 /* Mark the IRQ currently in progress.*/ 484 /* Mark the IRQ currently in progress.*/
489 desc->status |= IRQ_INPROGRESS; 485 desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 if (!noirqdebug) 540 if (!noirqdebug)
545 note_interrupt(irq, desc, action_ret); 541 note_interrupt(irq, desc, action_ret);
546 542
547 if (desc->chip->eoi) { 543 if (desc->chip->eoi)
548 desc->chip->eoi(irq); 544 desc->chip->eoi(irq);
549 desc = irq_remap_to_desc(irq, desc);
550 }
551} 545}
552 546
553void 547void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
582 576
583 /* Uninstall? */ 577 /* Uninstall? */
584 if (handle == handle_bad_irq) { 578 if (handle == handle_bad_irq) {
585 if (desc->chip != &no_irq_chip) { 579 if (desc->chip != &no_irq_chip)
586 mask_ack_irq(desc, irq); 580 mask_ack_irq(desc, irq);
587 desc = irq_remap_to_desc(irq, desc);
588 }
589 desc->status |= IRQ_DISABLED; 581 desc->status |= IRQ_DISABLED;
590 desc->depth = 1; 582 desc->depth = 1;
591 } 583 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e08754744f..065205bdd920 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,14 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/slab.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/random.h> 16#include <linux/random.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 18#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 19#include <linux/rculist.h>
19#include <linux/hash.h> 20#include <linux/hash.h>
20#include <trace/irq.h>
21#include <linux/bootmem.h> 21#include <linux/bootmem.h>
22#include <trace/events/irq.h>
22 23
23#include "internals.h" 24#include "internals.h"
24 25
@@ -44,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
44#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) 45#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
45static void __init init_irq_default_affinity(void) 46static void __init init_irq_default_affinity(void)
46{ 47{
47 alloc_bootmem_cpumask_var(&irq_default_affinity); 48 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
48 cpumask_setall(irq_default_affinity); 49 cpumask_setall(irq_default_affinity);
49} 50}
50#else 51#else
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
81 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 82 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
82}; 83};
83 84
84void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) 85void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
85{ 86{
86 int node;
87 void *ptr; 87 void *ptr;
88 88
89 node = cpu_to_node(cpu); 89 if (slab_is_available())
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node); 90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92 else
93 ptr = alloc_bootmem_node(NODE_DATA(node),
94 nr * sizeof(*desc->kstat_irqs));
91 95
92 /* 96 /*
93 * don't overwite if can not get new one 97 * don't overwite if can not get new one
94 * init_copy_kstat_irqs() could still use old one 98 * init_copy_kstat_irqs() could still use old one
95 */ 99 */
96 if (ptr) { 100 if (ptr) {
97 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", 101 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
98 cpu, node);
99 desc->kstat_irqs = ptr; 102 desc->kstat_irqs = ptr;
100 } 103 }
101} 104}
102 105
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) 106static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{ 107{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 108 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106 109
107 spin_lock_init(&desc->lock); 110 spin_lock_init(&desc->lock);
108 desc->irq = irq; 111 desc->irq = irq;
109#ifdef CONFIG_SMP 112#ifdef CONFIG_SMP
110 desc->cpu = cpu; 113 desc->node = node;
111#endif 114#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 115 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, cpu, nr_cpu_ids); 116 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) { 117 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n"); 118 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1); 119 BUG_ON(1);
117 } 120 }
118 if (!init_alloc_desc_masks(desc, cpu, false)) { 121 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); 122 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1); 123 BUG_ON(1);
121 } 124 }
122 arch_init_chip_data(desc, cpu); 125 init_desc_masks(desc);
126 arch_init_chip_data(desc, node);
123} 127}
124 128
125/* 129/*
@@ -146,6 +150,7 @@ int __init early_irq_init(void)
146{ 150{
147 struct irq_desc *desc; 151 struct irq_desc *desc;
148 int legacy_count; 152 int legacy_count;
153 int node;
149 int i; 154 int i;
150 155
151 init_irq_default_affinity(); 156 init_irq_default_affinity();
@@ -156,20 +161,21 @@ int __init early_irq_init(void)
156 161
157 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
158 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node;
159 165
160 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
161 irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
162 168
163 /* allocate based on nr_cpu_ids */ 169 /* allocate based on nr_cpu_ids */
164 /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ 170 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
165 kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * 171 sizeof(int), GFP_NOWAIT, node);
166 sizeof(int));
167 172
168 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
169 desc[i].irq = i; 174 desc[i].irq = i;
170 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
171 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
172 init_alloc_desc_masks(&desc[i], 0, true); 177 alloc_desc_masks(&desc[i], node, true);
178 init_desc_masks(&desc[i]);
173 irq_desc_ptrs[i] = desc + i; 179 irq_desc_ptrs[i] = desc + i;
174 } 180 }
175 181
@@ -187,11 +193,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
187 return NULL; 193 return NULL;
188} 194}
189 195
190struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 196struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
191{ 197{
192 struct irq_desc *desc; 198 struct irq_desc *desc;
193 unsigned long flags; 199 unsigned long flags;
194 int node;
195 200
196 if (irq >= nr_irqs) { 201 if (irq >= nr_irqs) {
197 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", 202 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +215,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
210 if (desc) 215 if (desc)
211 goto out_unlock; 216 goto out_unlock;
212 217
213 node = cpu_to_node(cpu); 218 if (slab_is_available())
214 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 219 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
215 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", 220 else
216 irq, cpu, node); 221 desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
222
223 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
217 if (!desc) { 224 if (!desc) {
218 printk(KERN_ERR "can not alloc irq_desc\n"); 225 printk(KERN_ERR "can not alloc irq_desc\n");
219 BUG_ON(1); 226 BUG_ON(1);
220 } 227 }
221 init_one_irq_desc(irq, desc, cpu); 228 init_one_irq_desc(irq, desc, node);
222 229
223 irq_desc_ptrs[irq] = desc; 230 irq_desc_ptrs[irq] = desc;
224 231
@@ -256,7 +263,8 @@ int __init early_irq_init(void)
256 263
257 for (i = 0; i < count; i++) { 264 for (i = 0; i < count; i++) {
258 desc[i].irq = i; 265 desc[i].irq = i;
259 init_alloc_desc_masks(&desc[i], 0, true); 266 alloc_desc_masks(&desc[i], 0, true);
267 init_desc_masks(&desc[i]);
260 desc[i].kstat_irqs = kstat_irqs_all[i]; 268 desc[i].kstat_irqs = kstat_irqs_all[i];
261 } 269 }
262 return arch_early_irq_init(); 270 return arch_early_irq_init();
@@ -267,7 +275,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
267 return (irq < NR_IRQS) ? irq_desc + irq : NULL; 275 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
268} 276}
269 277
270struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) 278struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
271{ 279{
272 return irq_to_desc(irq); 280 return irq_to_desc(irq);
273} 281}
@@ -348,9 +356,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
348 "but no thread function available.", irq, action->name); 356 "but no thread function available.", irq, action->name);
349} 357}
350 358
351DEFINE_TRACE(irq_handler_entry);
352DEFINE_TRACE(irq_handler_exit);
353
354/** 359/**
355 * handle_IRQ_event - irq action chain handler 360 * handle_IRQ_event - irq action chain handler
356 * @irq: the interrupt number 361 * @irq: the interrupt number
@@ -453,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
453 /* 458 /*
454 * No locking required for CPU-local interrupts: 459 * No locking required for CPU-local interrupts:
455 */ 460 */
456 if (desc->chip->ack) { 461 if (desc->chip->ack)
457 desc->chip->ack(irq); 462 desc->chip->ack(irq);
458 /* get new one */
459 desc = irq_remap_to_desc(irq, desc);
460 }
461 if (likely(!(desc->status & IRQ_DISABLED))) { 463 if (likely(!(desc->status & IRQ_DISABLED))) {
462 action_ret = handle_IRQ_event(irq, desc->action); 464 action_ret = handle_IRQ_event(irq, desc->action);
463 if (!noirqdebug) 465 if (!noirqdebug)
@@ -468,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
468 } 470 }
469 471
470 spin_lock(&desc->lock); 472 spin_lock(&desc->lock);
471 if (desc->chip->ack) { 473 if (desc->chip->ack)
472 desc->chip->ack(irq); 474 desc->chip->ack(irq);
473 desc = irq_remap_to_desc(irq, desc);
474 }
475 /* 475 /*
476 * REPLAY is when Linux resends an IRQ that was dropped earlier 476 * REPLAY is when Linux resends an IRQ that was dropped earlier
477 * WAITING is used by probe to mark irqs that are being tested 477 * WAITING is used by probe to mark irqs that are being tested
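The recurring pattern in this hunk, try the slab allocator and fall back to bootmem when it is not up yet, is what lets these descriptor paths run both in very early boot and later at runtime. Condensed into one hedged kernel-side fragment (not a standalone program): the calls are the same ones the hunk uses, but the helper name is made up for illustration.

    /* Hedged sketch of the slab-vs-bootmem decision used above. */
    static void *early_or_slab_alloc_node(size_t size, int node)
    {
        if (slab_is_available())
            return kzalloc_node(size, GFP_ATOMIC, node);

        /* bootmem memory is pre-zeroed and never given back */
        return alloc_bootmem_node(NODE_DATA(node), size);
    }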
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38f..73468253143b 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 17
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern spinlock_t sparse_irq_lock;
22 22
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
47
45/* 48/*
46 * Debugging printout: 49 * Debugging printout:
47 */ 50 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb47f8b80557..50da67672901 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83static void 83void
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
85{ 85{
86 struct irqaction *action = desc->action; 86 struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
109 spin_lock_irqsave(&desc->lock, flags); 109 spin_lock_irqsave(&desc->lock, flags);
110 110
111#ifdef CONFIG_GENERIC_PENDING_IRQ 111#ifdef CONFIG_GENERIC_PENDING_IRQ
112 if (desc->status & IRQ_MOVE_PCNTXT) 112 if (desc->status & IRQ_MOVE_PCNTXT) {
113 desc->chip->set_affinity(irq, cpumask); 113 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask);
116 }
117 }
114 else { 118 else {
115 desc->status |= IRQ_MOVE_PENDING; 119 desc->status |= IRQ_MOVE_PENDING;
116 cpumask_copy(desc->pending_mask, cpumask); 120 cpumask_copy(desc->pending_mask, cpumask);
117 } 121 }
118#else 122#else
119 cpumask_copy(desc->affinity, cpumask); 123 if (!desc->chip->set_affinity(irq, cpumask)) {
120 desc->chip->set_affinity(irq, cpumask); 124 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask);
126 }
121#endif 127#endif
122 irq_set_thread_affinity(desc, cpumask);
123 desc->status |= IRQ_AFFINITY_SET; 128 desc->status |= IRQ_AFFINITY_SET;
124 spin_unlock_irqrestore(&desc->lock, flags); 129 spin_unlock_irqrestore(&desc->lock, flags);
125 return 0; 130 return 0;
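irq_set_affinity() is what ultimately services writes to /proc/irq/<N>/smp_affinity, and the change above means the recorded affinity and the IRQ thread's CPU mask are only updated when the chip actually accepted the new mask. From userspace the same path is exercised by writing a hexadecimal CPU mask to that proc file (root only); the IRQ number and mask below are arbitrary examples.

    #include <stdio.h>
    #include <errno.h>

    /* Write a hex CPU mask to /proc/irq/<irq>/smp_affinity. */
    static int set_irq_affinity(unsigned int irq, unsigned long mask)
    {
        char path[64];
        FILE *f;
        int rc = 0;

        snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity", irq);
        f = fopen(path, "w");
        if (!f)
            return -errno;
        if (fprintf(f, "%lx\n", mask) < 0)
            rc = -EIO;
        if (fclose(f) != 0 && rc == 0)
            rc = -errno;        /* kernel rejects masks it cannot honour */
        return rc;
    }

    int main(void)
    {
        int rc = set_irq_affinity(19, 0x2);     /* try to pin IRQ 19 to CPU 1 */

        if (rc)
            fprintf(stderr, "set_irq_affinity: error %d\n", rc);
        return rc ? 1 : 0;
    }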
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7..cfe767ca1545 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
1 1
2#include <linux/irq.h> 2#include <linux/irq.h>
3#include <linux/interrupt.h>
4
5#include "internals.h"
3 6
4void move_masked_irq(int irq) 7void move_masked_irq(int irq)
5{ 8{
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
39 * masking the irqs. 42 * masking the irqs.
40 */ 43 */
41 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
42 < nr_cpu_ids)) { 45 < nr_cpu_ids))
43 cpumask_and(desc->affinity, 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
44 desc->pending_mask, cpu_online_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
45 desc->chip->set_affinity(irq, desc->affinity); 48 irq_set_thread_affinity(desc, desc->pending_mask);
46 } 49 }
50
47 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
48} 52}
49 53
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2..2f69bee57bf2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
15 15
16static void init_copy_kstat_irqs(struct irq_desc *old_desc, 16static void init_copy_kstat_irqs(struct irq_desc *old_desc,
17 struct irq_desc *desc, 17 struct irq_desc *desc,
18 int cpu, int nr) 18 int node, int nr)
19{ 19{
20 init_kstat_irqs(desc, cpu, nr); 20 init_kstat_irqs(desc, node, nr);
21 21
22 if (desc->kstat_irqs != old_desc->kstat_irqs) 22 if (desc->kstat_irqs != old_desc->kstat_irqs)
23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs, 23 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
34} 34}
35 35
36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, 36static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
37 struct irq_desc *desc, int cpu) 37 struct irq_desc *desc, int node)
38{ 38{
39 memcpy(desc, old_desc, sizeof(struct irq_desc)); 39 memcpy(desc, old_desc, sizeof(struct irq_desc));
40 if (!init_alloc_desc_masks(desc, cpu, false)) { 40 if (!alloc_desc_masks(desc, node, false)) {
41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " 41 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 spin_lock_init(&desc->lock);
46 desc->cpu = cpu; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
49 init_copy_desc_masks(old_desc, desc); 49 init_copy_desc_masks(old_desc, desc);
50 arch_init_copy_chip_data(old_desc, desc, cpu); 50 arch_init_copy_chip_data(old_desc, desc, node);
51 return true; 51 return true;
52} 52}
53 53
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
59} 59}
60 60
61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, 61static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
62 int cpu) 62 int node)
63{ 63{
64 struct irq_desc *desc; 64 struct irq_desc *desc;
65 unsigned int irq; 65 unsigned int irq;
66 unsigned long flags; 66 unsigned long flags;
67 int node;
68 67
69 irq = old_desc->irq; 68 irq = old_desc->irq;
70 69
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
76 if (desc && old_desc != desc) 75 if (desc && old_desc != desc)
77 goto out_unlock; 76 goto out_unlock;
78 77
79 node = cpu_to_node(cpu);
80 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); 78 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
81 if (!desc) { 79 if (!desc) {
82 printk(KERN_ERR "irq %d: can not get new irq_desc " 80 printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
85 desc = old_desc; 83 desc = old_desc;
86 goto out_unlock; 84 goto out_unlock;
87 } 85 }
88 if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { 86 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
89 /* still use old one */ 87 /* still use old one */
90 kfree(desc); 88 kfree(desc);
91 desc = old_desc; 89 desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
97 95
98 /* free the old one */ 96 /* free the old one */
99 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
100 spin_unlock(&old_desc->lock);
101 kfree(old_desc); 98 kfree(old_desc);
102 spin_lock(&desc->lock);
103 99
104 return desc; 100 return desc;
105 101
@@ -109,24 +105,14 @@ out_unlock:
109 return desc; 105 return desc;
110} 106}
111 107
112struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
113{ 109{
114 int old_cpu;
115 int node, old_node;
116
117 /* those all static, do move them */ 110 /* those all static, do move them */
118 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY)
119 return desc; 112 return desc;
120 113
121 old_cpu = desc->cpu; 114 if (desc->node != node)
122 if (old_cpu != cpu) { 115 desc = __real_move_irq_desc(desc, node);
123 node = cpu_to_node(cpu);
124 old_node = cpu_to_node(old_cpu);
125 if (old_node != node)
126 desc = __real_move_irq_desc(desc, cpu);
127 else
128 desc->cpu = cpu;
129 }
130 116
131 return desc; 117 return desc;
132} 118}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 374faf9bfdc7..3a29dbe7898e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,12 +30,16 @@
30#define all_var 0 30#define all_var 0
31#endif 31#endif
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/*
34 * These will be re-linked against their real values
35 * during the second link stage.
36 */
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 37extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const u8 kallsyms_names[] __attribute__((weak)); 38extern const u8 kallsyms_names[] __attribute__((weak));
36 39
37/* tell the compiler that the count isn't in the small data section if the arch 40/*
38 * has one (eg: FRV) 41 * Tell the compiler that the count isn't in the small data section if the arch
42 * has one (eg: FRV).
39 */ 43 */
40extern const unsigned long kallsyms_num_syms 44extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata"))); 45__attribute__((weak, section(".rodata")));
@@ -75,31 +79,37 @@ static int is_ksym_addr(unsigned long addr)
75 return is_kernel_text(addr) || is_kernel_inittext(addr); 79 return is_kernel_text(addr) || is_kernel_inittext(addr);
76} 80}
77 81
78/* expand a compressed symbol data into the resulting uncompressed string, 82/*
79 given the offset to where the symbol is in the compressed stream */ 83 * Expand a compressed symbol data into the resulting uncompressed string,
84 * given the offset to where the symbol is in the compressed stream.
85 */
80static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) 86static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
81{ 87{
82 int len, skipped_first = 0; 88 int len, skipped_first = 0;
83 const u8 *tptr, *data; 89 const u8 *tptr, *data;
84 90
85 /* get the compressed symbol length from the first symbol byte */ 91 /* Get the compressed symbol length from the first symbol byte. */
86 data = &kallsyms_names[off]; 92 data = &kallsyms_names[off];
87 len = *data; 93 len = *data;
88 data++; 94 data++;
89 95
90 /* update the offset to return the offset for the next symbol on 96 /*
91 * the compressed stream */ 97 * Update the offset to return the offset for the next symbol on
98 * the compressed stream.
99 */
92 off += len + 1; 100 off += len + 1;
93 101
94 /* for every byte on the compressed symbol data, copy the table 102 /*
95 entry for that byte */ 103 * For every byte on the compressed symbol data, copy the table
96 while(len) { 104 * entry for that byte.
97 tptr = &kallsyms_token_table[ kallsyms_token_index[*data] ]; 105 */
106 while (len) {
107 tptr = &kallsyms_token_table[kallsyms_token_index[*data]];
98 data++; 108 data++;
99 len--; 109 len--;
100 110
101 while (*tptr) { 111 while (*tptr) {
102 if(skipped_first) { 112 if (skipped_first) {
103 *result = *tptr; 113 *result = *tptr;
104 result++; 114 result++;
105 } else 115 } else
@@ -110,36 +120,46 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
110 120
111 *result = '\0'; 121 *result = '\0';
112 122
113 /* return to offset to the next symbol */ 123 /* Return to offset to the next symbol. */
114 return off; 124 return off;
115} 125}
116 126
117/* get symbol type information. This is encoded as a single char at the 127/*
118 * begining of the symbol name */ 128 * Get symbol type information. This is encoded as a single char at the
129 * beginning of the symbol name.
130 */
119static char kallsyms_get_symbol_type(unsigned int off) 131static char kallsyms_get_symbol_type(unsigned int off)
120{ 132{
121 /* get just the first code, look it up in the token table, and return the 133 /*
122 * first char from this token */ 134 * Get just the first code, look it up in the token table,
123 return kallsyms_token_table[ kallsyms_token_index[ kallsyms_names[off+1] ] ]; 135 * and return the first char from this token.
136 */
137 return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]];
124} 138}
125 139
126 140
127/* find the offset on the compressed stream given and index in the 141/*
128 * kallsyms array */ 142 * Find the offset on the compressed stream given and index in the
143 * kallsyms array.
144 */
129static unsigned int get_symbol_offset(unsigned long pos) 145static unsigned int get_symbol_offset(unsigned long pos)
130{ 146{
131 const u8 *name; 147 const u8 *name;
132 int i; 148 int i;
133 149
134 /* use the closest marker we have. We have markers every 256 positions, 150 /*
135 * so that should be close enough */ 151 * Use the closest marker we have. We have markers every 256 positions,
136 name = &kallsyms_names[ kallsyms_markers[pos>>8] ]; 152 * so that should be close enough.
153 */
154 name = &kallsyms_names[kallsyms_markers[pos >> 8]];
137 155
138 /* sequentially scan all the symbols up to the point we're searching for. 156 /*
139 * Every symbol is stored in a [<len>][<len> bytes of data] format, so we 157 * Sequentially scan all the symbols up to the point we're searching
140 * just need to add the len to the current pointer for every symbol we 158 * for. Every symbol is stored in a [<len>][<len> bytes of data] format,
141 * wish to skip */ 159 * so we just need to add the len to the current pointer for every
142 for(i = 0; i < (pos&0xFF); i++) 160 * symbol we wish to skip.
161 */
162 for (i = 0; i < (pos & 0xFF); i++)
143 name = name + (*name) + 1; 163 name = name + (*name) + 1;
144 164
145 return name - kallsyms_names; 165 return name - kallsyms_names;
@@ -190,7 +210,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
190 /* This kernel should never had been booted. */ 210 /* This kernel should never had been booted. */
191 BUG_ON(!kallsyms_addresses); 211 BUG_ON(!kallsyms_addresses);
192 212
193 /* do a binary search on the sorted kallsyms_addresses array */ 213 /* Do a binary search on the sorted kallsyms_addresses array. */
194 low = 0; 214 low = 0;
195 high = kallsyms_num_syms; 215 high = kallsyms_num_syms;
196 216
@@ -203,15 +223,15 @@ static unsigned long get_symbol_pos(unsigned long addr,
203 } 223 }
204 224
205 /* 225 /*
206 * search for the first aliased symbol. Aliased 226 * Search for the first aliased symbol. Aliased
207 * symbols are symbols with the same address 227 * symbols are symbols with the same address.
208 */ 228 */
209 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) 229 while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
210 --low; 230 --low;
211 231
212 symbol_start = kallsyms_addresses[low]; 232 symbol_start = kallsyms_addresses[low];
213 233
214 /* Search for next non-aliased symbol */ 234 /* Search for next non-aliased symbol. */
215 for (i = low + 1; i < kallsyms_num_syms; i++) { 235 for (i = low + 1; i < kallsyms_num_syms; i++) {
216 if (kallsyms_addresses[i] > symbol_start) { 236 if (kallsyms_addresses[i] > symbol_start) {
217 symbol_end = kallsyms_addresses[i]; 237 symbol_end = kallsyms_addresses[i];
@@ -219,7 +239,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
219 } 239 }
220 } 240 }
221 241
222 /* if we found no next symbol, we use the end of the section */ 242 /* If we found no next symbol, we use the end of the section. */
223 if (!symbol_end) { 243 if (!symbol_end) {
224 if (is_kernel_inittext(addr)) 244 if (is_kernel_inittext(addr))
225 symbol_end = (unsigned long)_einittext; 245 symbol_end = (unsigned long)_einittext;
@@ -252,10 +272,10 @@ int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
252 272
253/* 273/*
254 * Lookup an address 274 * Lookup an address
255 * - modname is set to NULL if it's in the kernel 275 * - modname is set to NULL if it's in the kernel.
256 * - we guarantee that the returned name is valid until we reschedule even if 276 * - We guarantee that the returned name is valid until we reschedule even if.
257 * it resides in a module 277 * It resides in a module.
258 * - we also guarantee that modname will be valid until rescheduled 278 * - We also guarantee that modname will be valid until rescheduled.
259 */ 279 */
260const char *kallsyms_lookup(unsigned long addr, 280const char *kallsyms_lookup(unsigned long addr,
261 unsigned long *symbolsize, 281 unsigned long *symbolsize,
@@ -276,7 +296,7 @@ const char *kallsyms_lookup(unsigned long addr,
276 return namebuf; 296 return namebuf;
277 } 297 }
278 298
279 /* see if it's in a module */ 299 /* See if it's in a module. */
280 return module_address_lookup(addr, symbolsize, offset, modname, 300 return module_address_lookup(addr, symbolsize, offset, modname,
281 namebuf); 301 namebuf);
282} 302}
@@ -294,7 +314,7 @@ int lookup_symbol_name(unsigned long addr, char *symname)
294 kallsyms_expand_symbol(get_symbol_offset(pos), symname); 314 kallsyms_expand_symbol(get_symbol_offset(pos), symname);
295 return 0; 315 return 0;
296 } 316 }
297 /* see if it's in a module */ 317 /* See if it's in a module. */
298 return lookup_module_symbol_name(addr, symname); 318 return lookup_module_symbol_name(addr, symname);
299} 319}
300 320
@@ -313,7 +333,7 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
313 modname[0] = '\0'; 333 modname[0] = '\0';
314 return 0; 334 return 0;
315 } 335 }
316 /* see if it's in a module */ 336 /* See if it's in a module. */
317 return lookup_module_symbol_attrs(addr, size, offset, modname, name); 337 return lookup_module_symbol_attrs(addr, size, offset, modname, name);
318} 338}
319 339
@@ -342,6 +362,7 @@ int sprint_symbol(char *buffer, unsigned long address)
342 362
343 return len; 363 return len;
344} 364}
365EXPORT_SYMBOL_GPL(sprint_symbol);
345 366
346/* Look up a kernel symbol and print it to the kernel messages. */ 367/* Look up a kernel symbol and print it to the kernel messages. */
347void __print_symbol(const char *fmt, unsigned long address) 368void __print_symbol(const char *fmt, unsigned long address)
@@ -352,13 +373,13 @@ void __print_symbol(const char *fmt, unsigned long address)
352 373
353 printk(fmt, buffer); 374 printk(fmt, buffer);
354} 375}
376EXPORT_SYMBOL(__print_symbol);
355 377
356/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ 378/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
357struct kallsym_iter 379struct kallsym_iter {
358{
359 loff_t pos; 380 loff_t pos;
360 unsigned long value; 381 unsigned long value;
361 unsigned int nameoff; /* If iterating in core kernel symbols */ 382 unsigned int nameoff; /* If iterating in core kernel symbols. */
362 char type; 383 char type;
363 char name[KSYM_NAME_LEN]; 384 char name[KSYM_NAME_LEN];
364 char module_name[MODULE_NAME_LEN]; 385 char module_name[MODULE_NAME_LEN];
@@ -404,7 +425,7 @@ static int update_iter(struct kallsym_iter *iter, loff_t pos)
404 iter->pos = pos; 425 iter->pos = pos;
405 return get_ksymbol_mod(iter); 426 return get_ksymbol_mod(iter);
406 } 427 }
407 428
408 /* If we're not on the desired position, reset to new position. */ 429 /* If we're not on the desired position, reset to new position. */
409 if (pos != iter->pos) 430 if (pos != iter->pos)
410 reset_iter(iter, pos); 431 reset_iter(iter, pos);
@@ -439,23 +460,25 @@ static int s_show(struct seq_file *m, void *p)
439{ 460{
440 struct kallsym_iter *iter = m->private; 461 struct kallsym_iter *iter = m->private;
441 462
442 /* Some debugging symbols have no name. Ignore them. */ 463 /* Some debugging symbols have no name. Ignore them. */
443 if (!iter->name[0]) 464 if (!iter->name[0])
444 return 0; 465 return 0;
445 466
446 if (iter->module_name[0]) { 467 if (iter->module_name[0]) {
447 char type; 468 char type;
448 469
449 /* Label it "global" if it is exported, 470 /*
450 * "local" if not exported. */ 471 * Label it "global" if it is exported,
472 * "local" if not exported.
473 */
451 type = iter->exported ? toupper(iter->type) : 474 type = iter->exported ? toupper(iter->type) :
452 tolower(iter->type); 475 tolower(iter->type);
453 seq_printf(m, "%0*lx %c %s\t[%s]\n", 476 seq_printf(m, "%0*lx %c %s\t[%s]\n",
454 (int)(2*sizeof(void*)), 477 (int)(2 * sizeof(void *)),
455 iter->value, type, iter->name, iter->module_name); 478 iter->value, type, iter->name, iter->module_name);
456 } else 479 } else
457 seq_printf(m, "%0*lx %c %s\n", 480 seq_printf(m, "%0*lx %c %s\n",
458 (int)(2*sizeof(void*)), 481 (int)(2 * sizeof(void *)),
459 iter->value, iter->type, iter->name); 482 iter->value, iter->type, iter->name);
460 return 0; 483 return 0;
461} 484}
@@ -469,9 +492,11 @@ static const struct seq_operations kallsyms_op = {
469 492
470static int kallsyms_open(struct inode *inode, struct file *file) 493static int kallsyms_open(struct inode *inode, struct file *file)
471{ 494{
472 /* We keep iterator in m->private, since normal case is to 495 /*
496 * We keep iterator in m->private, since normal case is to
473 * s_start from where we left off, so we avoid doing 497 * s_start from where we left off, so we avoid doing
474 * using get_symbol_offset for every symbol */ 498 * using get_symbol_offset for every symbol.
499 */
475 struct kallsym_iter *iter; 500 struct kallsym_iter *iter;
476 int ret; 501 int ret;
477 502
@@ -500,7 +525,4 @@ static int __init kallsyms_init(void)
500 proc_create("kallsyms", 0444, NULL, &kallsyms_operations); 525 proc_create("kallsyms", 0444, NULL, &kallsyms_operations);
501 return 0; 526 return 0;
502} 527}
503__initcall(kallsyms_init); 528device_initcall(kallsyms_init);
504
505EXPORT_SYMBOL(__print_symbol);
506EXPORT_SYMBOL_GPL(sprint_symbol);
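The name compression that kallsyms_expand_symbol() and get_symbol_offset() walk is simple enough to model in a few lines: each symbol is stored as a length byte followed by that many token indices, each index selects a replacement string from a 256-entry token table, and markers record the stream offset of every 256th symbol so a lookup never scans from the very start. A self-contained toy decoder; the tables are invented for the demo (the real ones are generated by scripts/kallsyms), and it omits the detail that the first expanded character encodes the symbol type.

    #include <stdio.h>

    /* Toy token table: unassigned entries decode as the literal byte. */
    static const char *token_table[256] = {
        [0] = "sys_", [1] = "read", [2] = "write",
    };

    /* Compressed stream: [len][indices...] per symbol. */
    static const unsigned char names[] = {
        2, 0, 1,        /* "sys_" + "read"  -> sys_read  */
        2, 0, 2,        /* "sys_" + "write" -> sys_write */
    };

    static unsigned int expand(unsigned int off, char *out)
    {
        unsigned int len = names[off++];

        while (len--) {
            unsigned char idx = names[off++];
            char single[2] = { (char)idx, 0 };
            const char *tok = token_table[idx];

            if (!tok)
                tok = single;           /* untokenized byte */
            while (*tok)
                *out++ = *tok++;
        }
        *out = '\0';
        return off;                     /* offset of the next symbol */
    }

    int main(void)
    {
        char buf[64];
        unsigned int off = 0, i;

        for (i = 0; i < 2; i++) {
            off = expand(off, buf);
            printf("symbol %u: %s\n", i, buf);
        }
        return 0;
    }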
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 5a758c6e4950..ae1c35201cc8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1448,18 +1448,17 @@ int kernel_kexec(void)
1448 goto Restore_console; 1448 goto Restore_console;
1449 } 1449 }
1450 suspend_console(); 1450 suspend_console();
1451 error = device_suspend(PMSG_FREEZE); 1451 error = dpm_suspend_start(PMSG_FREEZE);
1452 if (error) 1452 if (error)
1453 goto Resume_console; 1453 goto Resume_console;
1454 device_pm_lock(); 1454 /* At this point, dpm_suspend_start() has been called,
1455 /* At this point, device_suspend() has been called, 1455 * but *not* dpm_suspend_noirq(). We *must* call
1456 * but *not* device_power_down(). We *must* 1456 * dpm_suspend_noirq() now. Otherwise, drivers for
1457 * device_power_down() now. Otherwise, drivers for
1458 * some devices (e.g. interrupt controllers) become 1457 * some devices (e.g. interrupt controllers) become
1459 * desynchronized with the actual state of the 1458 * desynchronized with the actual state of the
1460 * hardware at resume time, and evil weirdness ensues. 1459 * hardware at resume time, and evil weirdness ensues.
1461 */ 1460 */
1462 error = device_power_down(PMSG_FREEZE); 1461 error = dpm_suspend_noirq(PMSG_FREEZE);
1463 if (error) 1462 if (error)
1464 goto Resume_devices; 1463 goto Resume_devices;
1465 error = disable_nonboot_cpus(); 1464 error = disable_nonboot_cpus();
@@ -1487,10 +1486,9 @@ int kernel_kexec(void)
1487 local_irq_enable(); 1486 local_irq_enable();
1488 Enable_cpus: 1487 Enable_cpus:
1489 enable_nonboot_cpus(); 1488 enable_nonboot_cpus();
1490 device_power_up(PMSG_RESTORE); 1489 dpm_resume_noirq(PMSG_RESTORE);
1491 Resume_devices: 1490 Resume_devices:
1492 device_pm_unlock(); 1491 dpm_resume_end(PMSG_RESTORE);
1493 device_resume(PMSG_RESTORE);
1494 Resume_console: 1492 Resume_console:
1495 resume_console(); 1493 resume_console();
1496 thaw_processes(); 1494 thaw_processes();
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index bc41ad0f24f8..26539e3228e5 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -72,9 +72,9 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
72 72
73 /* 73 /*
74 * round up to the next power of 2, since our 'let the indices 74 * round up to the next power of 2, since our 'let the indices
75 * wrap' tachnique works only in this case. 75 * wrap' technique works only in this case.
76 */ 76 */
77 if (size & (size - 1)) { 77 if (!is_power_of_2(size)) {
78 BUG_ON(size > 0x80000000); 78 BUG_ON(size > 0x80000000);
79 size = roundup_pow_of_two(size); 79 size = roundup_pow_of_two(size);
80 } 80 }
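The `size & (size - 1)` test the old code used and the is_power_of_2()/roundup_pow_of_two() helpers the new code uses agree for the non-zero sizes kfifo accepts; the helper additionally rejects zero, which the bitmask trick alone would call a power of two. A quick standalone check (the round-up loop here is a simple stand-in for the kernel's fls()-based helper):

    #include <stdio.h>

    static int is_pow2(unsigned int x)
    {
        return x != 0 && (x & (x - 1)) == 0;
    }

    static unsigned int roundup_pow2(unsigned int x)
    {
        unsigned int r = 1;

        while (r < x)
            r <<= 1;
        return r;
    }

    int main(void)
    {
        unsigned int sizes[] = { 1, 100, 128, 4096, 5000 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
            printf("%u: pow2=%d rounded=%u\n",
                   sizes[i], is_pow2(sizes[i]), roundup_pow2(sizes[i]));
        return 0;
    }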
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index e4dcfb2272a4..9147a3190c9d 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1583 1583
1584static struct sysrq_key_op sysrq_gdb_op = { 1584static struct sysrq_key_op sysrq_gdb_op = {
1585 .handler = sysrq_handle_gdb, 1585 .handler = sysrq_handle_gdb,
1586 .help_msg = "Gdb", 1586 .help_msg = "debug(G)",
1587 .action_msg = "GDB", 1587 .action_msg = "DEBUG",
1588}; 1588};
1589#endif 1589#endif
1590 1590
diff --git a/kernel/kmod.c b/kernel/kmod.c
index b750675251e5..7e95bedb2bfc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -370,8 +370,10 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
370 sub_info->argv = argv; 370 sub_info->argv = argv;
371 sub_info->envp = envp; 371 sub_info->envp = envp;
372 sub_info->cred = prepare_usermodehelper_creds(); 372 sub_info->cred = prepare_usermodehelper_creds();
373 if (!sub_info->cred) 373 if (!sub_info->cred) {
374 kfree(sub_info);
374 return NULL; 375 return NULL;
376 }
375 377
376 out: 378 out:
377 return sub_info; 379 return sub_info;
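The kmod fix above is a textbook partial-initialization leak: once sub_info has been allocated, every later failure path has to free it before returning NULL. The same shape in miniature, using the usual goto-unwind idiom; the struct and helper names are invented for the example.

    #include <stdlib.h>
    #include <string.h>

    struct widget {
        char *name;
        void *cred;     /* stands in for sub_info->cred */
    };

    static void *get_cred(void) { return malloc(16); }  /* may fail */

    static struct widget *widget_create(const char *name)
    {
        struct widget *w = calloc(1, sizeof(*w));

        if (!w)
            return NULL;
        w->name = strdup(name);
        if (!w->name)
            goto err_free_w;        /* free what we already own */
        w->cred = get_cred();
        if (!w->cred)
            goto err_free_name;     /* the bug fixed above: not a bare return */
        return w;

    err_free_name:
        free(w->name);
    err_free_w:
        free(w);
        return NULL;
    }

    int main(void)
    {
        struct widget *w = widget_create("demo");

        if (w) {
            free(w->cred);
            free(w->name);
            free(w);
        }
        return 0;
    }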
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf8519abf..9b1a7de26979 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -9,11 +9,12 @@
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/completion.h> 10#include <linux/completion.h>
11#include <linux/err.h> 11#include <linux/err.h>
12#include <linux/cpuset.h>
12#include <linux/unistd.h> 13#include <linux/unistd.h>
13#include <linux/file.h> 14#include <linux/file.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
16#include <trace/sched.h> 17#include <trace/events/sched.h>
17 18
18#define KTHREAD_NICE_LEVEL (-5) 19#define KTHREAD_NICE_LEVEL (-5)
19 20
@@ -21,15 +22,11 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 22static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 23struct task_struct *kthreadd_task;
23 24
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
27struct kthread_create_info 25struct kthread_create_info
28{ 26{
29 /* Information passed to kthread() from kthreadd. */ 27 /* Information passed to kthread() from kthreadd. */
30 int (*threadfn)(void *data); 28 int (*threadfn)(void *data);
31 void *data; 29 void *data;
32 struct completion started;
33 30
34 /* Result passed back to kthread_create() from kthreadd. */ 31 /* Result passed back to kthread_create() from kthreadd. */
35 struct task_struct *result; 32 struct task_struct *result;
@@ -38,17 +35,13 @@ struct kthread_create_info
38 struct list_head list; 35 struct list_head list;
39}; 36};
40 37
41struct kthread_stop_info 38struct kthread {
42{ 39 int should_stop;
43 struct task_struct *k; 40 struct completion exited;
44 int err;
45 struct completion done;
46}; 41};
47 42
48/* Thread stopping is done by setthing this var: lock serializes 43#define to_kthread(tsk) \
49 * multiple kthread_stop calls. */ 44 container_of((tsk)->vfork_done, struct kthread, exited)
50static DEFINE_MUTEX(kthread_stop_lock);
51static struct kthread_stop_info kthread_stop_info;
52 45
53/** 46/**
54 * kthread_should_stop - should this kthread return now? 47 * kthread_should_stop - should this kthread return now?
@@ -59,36 +52,35 @@ static struct kthread_stop_info kthread_stop_info;
59 */ 52 */
60int kthread_should_stop(void) 53int kthread_should_stop(void)
61{ 54{
62 return (kthread_stop_info.k == current); 55 return to_kthread(current)->should_stop;
63} 56}
64EXPORT_SYMBOL(kthread_should_stop); 57EXPORT_SYMBOL(kthread_should_stop);
65 58
66static int kthread(void *_create) 59static int kthread(void *_create)
67{ 60{
61 /* Copy data: it's on kthread's stack */
68 struct kthread_create_info *create = _create; 62 struct kthread_create_info *create = _create;
69 int (*threadfn)(void *data); 63 int (*threadfn)(void *data) = create->threadfn;
70 void *data; 64 void *data = create->data;
71 int ret = -EINTR; 65 struct kthread self;
66 int ret;
72 67
73 /* Copy data: it's on kthread's stack */ 68 self.should_stop = 0;
74 threadfn = create->threadfn; 69 init_completion(&self.exited);
75 data = create->data; 70 current->vfork_done = &self.exited;
76 71
77 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
78 __set_current_state(TASK_UNINTERRUPTIBLE); 73 __set_current_state(TASK_UNINTERRUPTIBLE);
79 create->result = current; 74 create->result = current;
80 complete(&create->started); 75 complete(&create->done);
81 schedule(); 76 schedule();
82 77
83 if (!kthread_should_stop()) 78 ret = -EINTR;
79 if (!self.should_stop)
84 ret = threadfn(data); 80 ret = threadfn(data);
85 81
86 /* It might have exited on its own, w/o kthread_stop. Check. */ 82 /* we can't just return, we must preserve "self" on stack */
87 if (kthread_should_stop()) { 83 do_exit(ret);
88 kthread_stop_info.err = ret;
89 complete(&kthread_stop_info.done);
90 }
91 return 0;
92} 84}
93 85
94static void create_kthread(struct kthread_create_info *create) 86static void create_kthread(struct kthread_create_info *create)
@@ -97,11 +89,10 @@ static void create_kthread(struct kthread_create_info *create)
97 89
98 /* We want our own signal handler (we take no signals by default). */ 90 /* We want our own signal handler (we take no signals by default). */
99 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 91 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
100 if (pid < 0) 92 if (pid < 0) {
101 create->result = ERR_PTR(pid); 93 create->result = ERR_PTR(pid);
102 else 94 complete(&create->done);
103 wait_for_completion(&create->started); 95 }
104 complete(&create->done);
105} 96}
106 97
107/** 98/**
@@ -132,7 +123,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
132 123
133 create.threadfn = threadfn; 124 create.threadfn = threadfn;
134 create.data = data; 125 create.data = data;
135 init_completion(&create.started);
136 init_completion(&create.done); 126 init_completion(&create.done);
137 127
138 spin_lock(&kthread_create_lock); 128 spin_lock(&kthread_create_lock);
@@ -200,30 +190,22 @@ EXPORT_SYMBOL(kthread_bind);
200 */ 190 */
201int kthread_stop(struct task_struct *k) 191int kthread_stop(struct task_struct *k)
202{ 192{
193 struct kthread *kthread;
203 int ret; 194 int ret;
204 195
205 mutex_lock(&kthread_stop_lock);
206
207 /* It could exit after stop_info.k set, but before wake_up_process. */
208 get_task_struct(k);
209
210 trace_sched_kthread_stop(k); 196 trace_sched_kthread_stop(k);
197 get_task_struct(k);
211 198
212 /* Must init completion *before* thread sees kthread_stop_info.k */ 199 kthread = to_kthread(k);
213 init_completion(&kthread_stop_info.done); 200 barrier(); /* it might have exited */
214 smp_wmb(); 201 if (k->vfork_done != NULL) {
202 kthread->should_stop = 1;
203 wake_up_process(k);
204 wait_for_completion(&kthread->exited);
205 }
206 ret = k->exit_code;
215 207
216 /* Now set kthread_should_stop() to true, and wake it up. */
217 kthread_stop_info.k = k;
218 wake_up_process(k);
219 put_task_struct(k); 208 put_task_struct(k);
220
221 /* Once it dies, reset stop ptr, gather result and we're done. */
222 wait_for_completion(&kthread_stop_info.done);
223 kthread_stop_info.k = NULL;
224 ret = kthread_stop_info.err;
225 mutex_unlock(&kthread_stop_lock);
226
227 trace_sched_kthread_stop_ret(ret); 209 trace_sched_kthread_stop_ret(ret);
228 210
229 return ret; 211 return ret;
@@ -239,6 +221,7 @@ int kthreadd(void *unused)
239 ignore_signals(tsk); 221 ignore_signals(tsk);
240 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 222 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
241 set_cpus_allowed_ptr(tsk, cpu_all_mask); 223 set_cpus_allowed_ptr(tsk, cpu_all_mask);
224 set_mems_allowed(node_possible_map);
242 225
243 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 226 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
244 227
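The kthread rework replaces the single global kthread_stop_info, serialized by a mutex, with a per-thread struct kthread reached through vfork_done, so several kthread_stop() calls can run in parallel. The lifecycle is easiest to see in a userspace analogue: a should_stop flag plus a "completion" the worker signals on exit, and a stop routine that sets the flag, wakes the worker, and waits. A POSIX-threads sketch of that shape (compile with -pthread); a semaphore stands in for the kernel completion and is not a claim about how completions are implemented.

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>
    #include <unistd.h>

    struct worker {
        int should_stop;        /* analogue of kthread->should_stop */
        sem_t exited;           /* analogue of kthread->exited */
        pthread_t tid;
        int exit_code;
    };

    static void *worker_fn(void *arg)
    {
        struct worker *w = arg;

        while (!__atomic_load_n(&w->should_stop, __ATOMIC_ACQUIRE)) {
            /* ... do work ... */
            usleep(10000);
        }
        w->exit_code = 0;
        sem_post(&w->exited);   /* like complete(&self.exited) */
        return NULL;
    }

    static int worker_stop(struct worker *w)    /* kthread_stop() analogue */
    {
        __atomic_store_n(&w->should_stop, 1, __ATOMIC_RELEASE);
        sem_wait(&w->exited);   /* like wait_for_completion(&kthread->exited) */
        pthread_join(w->tid, NULL);
        return w->exit_code;
    }

    int main(void)
    {
        struct worker w = { 0 };

        sem_init(&w.exited, 0, 0);
        pthread_create(&w.tid, NULL, worker_fn, &w);
        usleep(50000);
        printf("worker returned %d\n", worker_stop(&w));
        sem_destroy(&w.exited);
        return 0;
    }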
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index accb40cdb12a..8bbeef996c76 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <trace/lockdep.h>
46 45
47#include <asm/sections.h> 46#include <asm/sections.h>
48 47
49#include "lockdep_internals.h" 48#include "lockdep_internals.h"
50 49
50#define CREATE_TRACE_POINTS
51#include <trace/events/lockdep.h>
52
51#ifdef CONFIG_PROVE_LOCKING 53#ifdef CONFIG_PROVE_LOCKING
52int prove_locking = 1; 54int prove_locking = 1;
53module_param(prove_locking, int, 0644); 55module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2935} 2937}
2936EXPORT_SYMBOL_GPL(lock_set_class); 2938EXPORT_SYMBOL_GPL(lock_set_class);
2937 2939
2938DEFINE_TRACE(lock_acquire);
2939
2940/* 2940/*
2941 * We are not always called with irqs disabled - do that here, 2941 * We are not always called with irqs disabled - do that here,
2942 * and also avoid lockdep recursion: 2942 * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963} 2963}
2964EXPORT_SYMBOL_GPL(lock_acquire); 2964EXPORT_SYMBOL_GPL(lock_acquire);
2965 2965
2966DEFINE_TRACE(lock_release);
2967
2968void lock_release(struct lockdep_map *lock, int nested, 2966void lock_release(struct lockdep_map *lock, int nested,
2969 unsigned long ip) 2967 unsigned long ip)
2970{ 2968{
@@ -3105,6 +3103,8 @@ found_it:
3105 hlock->holdtime_stamp = now; 3103 hlock->holdtime_stamp = now;
3106 } 3104 }
3107 3105
3106 trace_lock_acquired(lock, ip, waittime);
3107
3108 stats = get_lock_stats(hlock_class(hlock)); 3108 stats = get_lock_stats(hlock_class(hlock));
3109 if (waittime) { 3109 if (waittime) {
3110 if (hlock->read) 3110 if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
3120 lock->ip = ip; 3120 lock->ip = ip;
3121} 3121}
3122 3122
3123DEFINE_TRACE(lock_contended);
3124
3125void lock_contended(struct lockdep_map *lock, unsigned long ip) 3123void lock_contended(struct lockdep_map *lock, unsigned long ip)
3126{ 3124{
3127 unsigned long flags; 3125 unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3143} 3141}
3144EXPORT_SYMBOL_GPL(lock_contended); 3142EXPORT_SYMBOL_GPL(lock_contended);
3145 3143
3146DEFINE_TRACE(lock_acquired);
3147
3148void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3144void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3149{ 3145{
3150 unsigned long flags; 3146 unsigned long flags;
3151 3147
3152 trace_lock_acquired(lock, ip);
3153
3154 if (unlikely(!lock_stat)) 3148 if (unlikely(!lock_stat))
3155 return; 3149 return;
3156 3150
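
Note on the lockdep.c hunk above: the open-coded DEFINE_TRACE() statements are replaced by the generated-tracepoint scheme, where exactly one .c file defines CREATE_TRACE_POINTS before including the trace/events header and every other user includes the header plainly. A hedged sketch of that pattern for a hypothetical subsystem "foo" (the header path and trace_foo_work() event are made up for illustration):

/* kernel/foo.c -- the one translation unit that instantiates the events. */
#define CREATE_TRACE_POINTS
#include <trace/events/foo.h>	/* hypothetical header built on TRACE_EVENT() */

void foo_do_work(int id)
{
	/* Generated tracepoint call; does nothing unless enabled at runtime. */
	trace_foo_work(id);
}

/* Every other file that emits the same event simply does:
 *	#include <trace/events/foo.h>
 * without the CREATE_TRACE_POINTS define. */
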
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2cc7e9a6e84..699a2ac3a0d7 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 54 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 55 * conflicts and deadlocks.
56 */ 56 */
57#define MAX_LOCKDEP_ENTRIES 8192UL 57#define MAX_LOCKDEP_ENTRIES 16384UL
58 58
59#define MAX_LOCKDEP_CHAINS_BITS 14 59#define MAX_LOCKDEP_CHAINS_BITS 15
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61 61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) 62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
diff --git a/kernel/module.c b/kernel/module.c
index e797812a4d95..38928fcaff2b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
18*/ 18*/
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
23#include <linux/fs.h> 24#include <linux/fs.h>
@@ -52,6 +53,7 @@
52#include <linux/ftrace.h> 53#include <linux/ftrace.h>
53#include <linux/async.h> 54#include <linux/async.h>
54#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h>
55 57
56#if 0 58#if 0
57#define DEBUGP printk 59#define DEBUGP printk
@@ -72,6 +74,9 @@ DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex); 74EXPORT_SYMBOL_GPL(module_mutex);
73static LIST_HEAD(modules); 75static LIST_HEAD(modules);
74 76
77/* Block module loading/unloading? */
78int modules_disabled = 0;
79
75/* Waiting for a module to finish initializing? */ 80/* Waiting for a module to finish initializing? */
76static DECLARE_WAIT_QUEUE_HEAD(module_wq); 81static DECLARE_WAIT_QUEUE_HEAD(module_wq);
77 82
@@ -429,6 +434,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
429 unsigned long extra; 434 unsigned long extra;
430 unsigned int i; 435 unsigned int i;
431 void *ptr; 436 void *ptr;
437 int cpu;
432 438
433 if (align > PAGE_SIZE) { 439 if (align > PAGE_SIZE) {
434 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 440 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -458,6 +464,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
458 if (!split_block(i, size)) 464 if (!split_block(i, size))
459 return NULL; 465 return NULL;
460 466
467 /* add the per-cpu scanning areas */
468 for_each_possible_cpu(cpu)
469 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
470 GFP_KERNEL);
471
461 /* Mark allocated */ 472 /* Mark allocated */
462 pcpu_size[i] = -pcpu_size[i]; 473 pcpu_size[i] = -pcpu_size[i];
463 return ptr; 474 return ptr;
@@ -472,6 +483,7 @@ static void percpu_modfree(void *freeme)
472{ 483{
473 unsigned int i; 484 unsigned int i;
474 void *ptr = __per_cpu_start + block_size(pcpu_size[0]); 485 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
486 int cpu;
475 487
476 /* First entry is core kernel percpu data. */ 488 /* First entry is core kernel percpu data. */
477 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 489 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -483,6 +495,10 @@ static void percpu_modfree(void *freeme)
483 BUG(); 495 BUG();
484 496
485 free: 497 free:
498 /* remove the per-cpu scanning areas */
499 for_each_possible_cpu(cpu)
500 kmemleak_free(freeme + per_cpu_offset(cpu));
501
486 /* Merge with previous? */ 502 /* Merge with previous? */
487 if (pcpu_size[i-1] >= 0) { 503 if (pcpu_size[i-1] >= 0) {
488 pcpu_size[i-1] += pcpu_size[i]; 504 pcpu_size[i-1] += pcpu_size[i];
@@ -777,7 +793,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
777 char name[MODULE_NAME_LEN]; 793 char name[MODULE_NAME_LEN];
778 int ret, forced = 0; 794 int ret, forced = 0;
779 795
780 if (!capable(CAP_SYS_MODULE)) 796 if (!capable(CAP_SYS_MODULE) || modules_disabled)
781 return -EPERM; 797 return -EPERM;
782 798
783 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) 799 if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -1489,9 +1505,6 @@ static void free_module(struct module *mod)
1489 /* Free any allocated parameters. */ 1505 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp); 1506 destroy_params(mod->kp, mod->num_kp);
1491 1507
1492 /* release any pointers to mcount in this module */
1493 ftrace_release(mod->module_core, mod->core_size);
1494
1495 /* This may be NULL, but that's OK */ 1508 /* This may be NULL, but that's OK */
1496 module_free(mod, mod->module_init); 1509 module_free(mod, mod->module_init);
1497 kfree(mod->args); 1510 kfree(mod->args);
@@ -1878,6 +1891,36 @@ static void *module_alloc_update_bounds(unsigned long size)
1878 return ret; 1891 return ret;
1879} 1892}
1880 1893
1894#ifdef CONFIG_DEBUG_KMEMLEAK
1895static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1896 Elf_Shdr *sechdrs, char *secstrings)
1897{
1898 unsigned int i;
1899
1900 /* only scan the sections containing data */
1901 kmemleak_scan_area(mod->module_core, (unsigned long)mod -
1902 (unsigned long)mod->module_core,
1903 sizeof(struct module), GFP_KERNEL);
1904
1905 for (i = 1; i < hdr->e_shnum; i++) {
1906 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
1907 continue;
1908 if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
1909 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
1910 continue;
1911
1912 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
1913 (unsigned long)mod->module_core,
1914 sechdrs[i].sh_size, GFP_KERNEL);
1915 }
1916}
1917#else
1918static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
1919 Elf_Shdr *sechdrs, char *secstrings)
1920{
1921}
1922#endif
1923
1881/* Allocate and load the module: note that size of section 0 is always 1924/* Allocate and load the module: note that size of section 0 is always
1882 zero, and we rely on this for optional sections. */ 1925 zero, and we rely on this for optional sections. */
1883static noinline struct module *load_module(void __user *umod, 1926static noinline struct module *load_module(void __user *umod,
@@ -1892,11 +1935,9 @@ static noinline struct module *load_module(void __user *umod,
1892 unsigned int symindex = 0; 1935 unsigned int symindex = 0;
1893 unsigned int strindex = 0; 1936 unsigned int strindex = 0;
1894 unsigned int modindex, versindex, infoindex, pcpuindex; 1937 unsigned int modindex, versindex, infoindex, pcpuindex;
1895 unsigned int num_mcount;
1896 struct module *mod; 1938 struct module *mod;
1897 long err = 0; 1939 long err = 0;
1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1940 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1899 unsigned long *mseg;
1900 mm_segment_t old_fs; 1941 mm_segment_t old_fs;
1901 1942
1902 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1943 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2050,6 +2091,12 @@ static noinline struct module *load_module(void __user *umod,
2050 2091
2051 /* Do the allocs. */ 2092 /* Do the allocs. */
2052 ptr = module_alloc_update_bounds(mod->core_size); 2093 ptr = module_alloc_update_bounds(mod->core_size);
2094 /*
2095 * The pointer to this block is stored in the module structure
2096 * which is inside the block. Just mark it as not being a
2097 * leak.
2098 */
2099 kmemleak_not_leak(ptr);
2053 if (!ptr) { 2100 if (!ptr) {
2054 err = -ENOMEM; 2101 err = -ENOMEM;
2055 goto free_percpu; 2102 goto free_percpu;
@@ -2058,6 +2105,13 @@ static noinline struct module *load_module(void __user *umod,
2058 mod->module_core = ptr; 2105 mod->module_core = ptr;
2059 2106
2060 ptr = module_alloc_update_bounds(mod->init_size); 2107 ptr = module_alloc_update_bounds(mod->init_size);
2108 /*
2109 * The pointer to this block is stored in the module structure
2110 * which is inside the block. This block doesn't need to be
2111 * scanned as it contains data and code that will be freed
2112 * after the module is initialized.
2113 */
2114 kmemleak_ignore(ptr);
2061 if (!ptr && mod->init_size) { 2115 if (!ptr && mod->init_size) {
2062 err = -ENOMEM; 2116 err = -ENOMEM;
2063 goto free_core; 2117 goto free_core;
@@ -2088,6 +2142,7 @@ static noinline struct module *load_module(void __user *umod,
2088 } 2142 }
2089 /* Module has been moved. */ 2143 /* Module has been moved. */
2090 mod = (void *)sechdrs[modindex].sh_addr; 2144 mod = (void *)sechdrs[modindex].sh_addr;
2145 kmemleak_load_module(mod, hdr, sechdrs, secstrings);
2091 2146
2092#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2147#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2093 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), 2148 mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
@@ -2161,6 +2216,10 @@ static noinline struct module *load_module(void __user *umod,
2161 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, 2216 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2162 "__kcrctab_unused_gpl"); 2217 "__kcrctab_unused_gpl");
2163#endif 2218#endif
2219#ifdef CONFIG_CONSTRUCTORS
2220 mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors",
2221 sizeof(*mod->ctors), &mod->num_ctors);
2222#endif
2164 2223
2165#ifdef CONFIG_MARKERS 2224#ifdef CONFIG_MARKERS
2166 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers", 2225 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
@@ -2172,7 +2231,19 @@ static noinline struct module *load_module(void __user *umod,
2172 sizeof(*mod->tracepoints), 2231 sizeof(*mod->tracepoints),
2173 &mod->num_tracepoints); 2232 &mod->num_tracepoints);
2174#endif 2233#endif
2175 2234#ifdef CONFIG_EVENT_TRACING
2235 mod->trace_events = section_objs(hdr, sechdrs, secstrings,
2236 "_ftrace_events",
2237 sizeof(*mod->trace_events),
2238 &mod->num_trace_events);
2239#endif
2240#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2241 /* sechdrs[0].sh_size is always zero */
2242 mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
2243 "__mcount_loc",
2244 sizeof(*mod->ftrace_callsites),
2245 &mod->num_ftrace_callsites);
2246#endif
2176#ifdef CONFIG_MODVERSIONS 2247#ifdef CONFIG_MODVERSIONS
2177 if ((mod->num_syms && !mod->crcs) 2248 if ((mod->num_syms && !mod->crcs)
2178 || (mod->num_gpl_syms && !mod->gpl_crcs) 2249 || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2308,6 @@ static noinline struct module *load_module(void __user *umod,
2237 dynamic_debug_setup(debug, num_debug); 2308 dynamic_debug_setup(debug, num_debug);
2238 } 2309 }
2239 2310
2240 /* sechdrs[0].sh_size is always zero */
2241 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2242 sizeof(*mseg), &num_mcount);
2243 ftrace_init_module(mod, mseg, mseg + num_mcount);
2244
2245 err = module_finalize(hdr, sechdrs, mod); 2311 err = module_finalize(hdr, sechdrs, mod);
2246 if (err < 0) 2312 if (err < 0)
2247 goto cleanup; 2313 goto cleanup;
@@ -2302,7 +2368,6 @@ static noinline struct module *load_module(void __user *umod,
2302 cleanup: 2368 cleanup:
2303 kobject_del(&mod->mkobj.kobj); 2369 kobject_del(&mod->mkobj.kobj);
2304 kobject_put(&mod->mkobj.kobj); 2370 kobject_put(&mod->mkobj.kobj);
2305 ftrace_release(mod->module_core, mod->core_size);
2306 free_unload: 2371 free_unload:
2307 module_unload_free(mod); 2372 module_unload_free(mod);
2308#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2373#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
@@ -2328,6 +2393,17 @@ static noinline struct module *load_module(void __user *umod,
2328 goto free_hdr; 2393 goto free_hdr;
2329} 2394}
2330 2395
2396/* Call module constructors. */
2397static void do_mod_ctors(struct module *mod)
2398{
2399#ifdef CONFIG_CONSTRUCTORS
2400 unsigned long i;
2401
2402 for (i = 0; i < mod->num_ctors; i++)
2403 mod->ctors[i]();
2404#endif
2405}
2406
2331/* This is where the real work happens */ 2407/* This is where the real work happens */
2332SYSCALL_DEFINE3(init_module, void __user *, umod, 2408SYSCALL_DEFINE3(init_module, void __user *, umod,
2333 unsigned long, len, const char __user *, uargs) 2409 unsigned long, len, const char __user *, uargs)
@@ -2336,7 +2412,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2336 int ret = 0; 2412 int ret = 0;
2337 2413
2338 /* Must have permission */ 2414 /* Must have permission */
2339 if (!capable(CAP_SYS_MODULE)) 2415 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2340 return -EPERM; 2416 return -EPERM;
2341 2417
2342 /* Only one module load at a time, please */ 2418 /* Only one module load at a time, please */
@@ -2356,6 +2432,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2356 blocking_notifier_call_chain(&module_notify_list, 2432 blocking_notifier_call_chain(&module_notify_list,
2357 MODULE_STATE_COMING, mod); 2433 MODULE_STATE_COMING, mod);
2358 2434
2435 do_mod_ctors(mod);
2359 /* Start the module */ 2436 /* Start the module */
2360 if (mod->init != NULL) 2437 if (mod->init != NULL)
2361 ret = do_one_initcall(mod->init); 2438 ret = do_one_initcall(mod->init);
@@ -2394,6 +2471,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2394 mutex_lock(&module_mutex); 2471 mutex_lock(&module_mutex);
2395 /* Drop initial reference. */ 2472 /* Drop initial reference. */
2396 module_put(mod); 2473 module_put(mod);
2474 trim_init_extable(mod);
2397 module_free(mod, mod->module_init); 2475 module_free(mod, mod->module_init);
2398 mod->module_init = NULL; 2476 mod->module_init = NULL;
2399 mod->init_size = 0; 2477 mod->init_size = 0;
@@ -2837,7 +2915,7 @@ void print_modules(void)
2837 struct module *mod; 2915 struct module *mod;
2838 char buf[8]; 2916 char buf[8];
2839 2917
2840 printk("Modules linked in:"); 2918 printk(KERN_DEFAULT "Modules linked in:");
2841 /* Most callers should already have preempt disabled, but make sure */ 2919 /* Most callers should already have preempt disabled, but make sure */
2842 preempt_disable(); 2920 preempt_disable();
2843 list_for_each_entry_rcu(mod, &modules, list) 2921 list_for_each_entry_rcu(mod, &modules, list)
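
Note on the module.c hunks above: module.c now annotates memory it manages by hand for kmemleak: per-cpu copies are registered with kmemleak_alloc()/kmemleak_free(), the core block is marked with kmemleak_not_leak() because its only reference lives inside the block itself, and the init block is kmemleak_ignore()d. A minimal sketch of the same annotation on a hypothetical object (struct ring and ring_create() are illustrative, not from this patch):

#include <linux/slab.h>
#include <linux/kmemleak.h>

/* Hypothetical object whose only reference is stored inside itself. */
struct ring {
	struct ring *self;
	char data[64];
};

static struct ring *ring_create(void)
{
	struct ring *r = kmalloc(sizeof(*r), GFP_KERNEL);

	if (!r)
		return NULL;
	r->self = r;
	/*
	 * As with mod->module_core above: kmemleak would see no external
	 * pointer to this block and report it, so mark it as not a leak.
	 * kmemleak_ignore() would go further and also skip scanning the
	 * block for pointers, as done for the init section.
	 */
	kmemleak_not_leak(r);
	return r;
}
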
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..947b3ad551f8 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
249 249
250 /* didn't get the lock, go to sleep: */ 250 /* didn't get the lock, go to sleep: */
251 spin_unlock_mutex(&lock->wait_lock, flags); 251 spin_unlock_mutex(&lock->wait_lock, flags);
252 __schedule(); 252 preempt_enable_no_resched();
253 schedule();
254 preempt_disable();
253 spin_lock_mutex(&lock->wait_lock, flags); 255 spin_lock_mutex(&lock->wait_lock, flags);
254 } 256 }
255 257
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 473
472 return ret; 474 return ret;
473} 475}
474
475EXPORT_SYMBOL(mutex_trylock); 476EXPORT_SYMBOL(mutex_trylock);
477
478/**
479 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
480 * @cnt: the atomic counter to decrement
481 * @lock: the mutex to return holding if we dec to 0
482 *
483 * Returns 1 holding @lock if the decrement reached 0, returns 0 otherwise.
484 */
485int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
486{
487 /* dec if we can't possibly hit 0 */
488 if (atomic_add_unless(cnt, -1, 1))
489 return 0;
490 /* we might hit 0, so take the lock */
491 mutex_lock(lock);
492 if (!atomic_dec_and_test(cnt)) {
493 /* when we actually did the dec, we didn't hit 0 */
494 mutex_unlock(lock);
495 return 0;
496 }
497 /* we hit 0, and we hold the lock */
498 return 1;
499}
500EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
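
Note on the mutex.c hunk above: the new atomic_dec_and_mutex_lock() mirrors atomic_dec_and_lock() for mutexes: the fast path decrements without touching the mutex, and only a decrement that may reach zero takes the lock. A hedged sketch of the intended call pattern around a made-up refcounted object (struct foo, foo_destroy() and foo_put() are illustrative names):

#include <linux/mutex.h>
#include <linux/slab.h>
#include <asm/atomic.h>

/* Hypothetical refcounted object protected by a teardown mutex. */
struct foo {
	atomic_t	refcnt;
	struct mutex	teardown_lock;
	/* ... */
};

static void foo_destroy(struct foo *f)
{
	kfree(f);	/* illustrative teardown */
}

static void foo_put(struct foo *f)
{
	/*
	 * Fast path: the count stays above zero, no lock taken.
	 * Slow path: returns 1 with teardown_lock held, so the final
	 * teardown runs serialized against other lock holders.
	 */
	if (atomic_dec_and_mutex_lock(&f->refcnt, &f->teardown_lock)) {
		foo_destroy(f);
		mutex_unlock(&f->teardown_lock);
	}
}
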
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 63598dca2d0c..09b4ff9711b2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,14 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29/* 29static inline struct nsproxy *create_nsproxy(void)
30 * creates a copy of "orig" with refcount 1.
31 */
32static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
33{ 30{
34 struct nsproxy *ns; 31 struct nsproxy *nsproxy;
35 32
36 ns = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); 33 nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
37 if (ns) { 34 if (nsproxy)
38 memcpy(ns, orig, sizeof(struct nsproxy)); 35 atomic_set(&nsproxy->count, 1);
39 atomic_set(&ns->count, 1); 36 return nsproxy;
40 }
41 return ns;
42} 37}
43 38
44/* 39/*
@@ -52,7 +47,7 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
52 struct nsproxy *new_nsp; 47 struct nsproxy *new_nsp;
53 int err; 48 int err;
54 49
55 new_nsp = clone_nsproxy(tsk->nsproxy); 50 new_nsp = create_nsproxy();
56 if (!new_nsp) 51 if (!new_nsp)
57 return ERR_PTR(-ENOMEM); 52 return ERR_PTR(-ENOMEM);
58 53
diff --git a/kernel/panic.c b/kernel/panic.c
index 874ecf1307ae..984b3ecbd72c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -340,39 +340,44 @@ void oops_exit(void)
340} 340}
341 341
342#ifdef WANT_WARN_ON_SLOWPATH 342#ifdef WANT_WARN_ON_SLOWPATH
343void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 343struct slowpath_args {
344{ 344 const char *fmt;
345 va_list args; 345 va_list args;
346 char function[KSYM_SYMBOL_LEN]; 346};
347 unsigned long caller = (unsigned long)__builtin_return_address(0);
348 const char *board;
349 347
350 sprint_symbol(function, caller); 348static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args)
349{
350 const char *board;
351 351
352 printk(KERN_WARNING "------------[ cut here ]------------\n"); 352 printk(KERN_WARNING "------------[ cut here ]------------\n");
353 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 353 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
354 line, function);
355 board = dmi_get_system_info(DMI_PRODUCT_NAME); 354 board = dmi_get_system_info(DMI_PRODUCT_NAME);
356 if (board) 355 if (board)
357 printk(KERN_WARNING "Hardware name: %s\n", board); 356 printk(KERN_WARNING "Hardware name: %s\n", board);
358 357
359 if (*fmt) { 358 if (args)
360 va_start(args, fmt); 359 vprintk(args->fmt, args->args);
361 vprintk(fmt, args);
362 va_end(args);
363 }
364 360
365 print_modules(); 361 print_modules();
366 dump_stack(); 362 dump_stack();
367 print_oops_end_marker(); 363 print_oops_end_marker();
368 add_taint(TAINT_WARN); 364 add_taint(TAINT_WARN);
369} 365}
366
367void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
368{
369 struct slowpath_args args;
370
371 args.fmt = fmt;
372 va_start(args.args, fmt);
373 warn_slowpath_common(file, line, __builtin_return_address(0), &args);
374 va_end(args.args);
375}
370EXPORT_SYMBOL(warn_slowpath_fmt); 376EXPORT_SYMBOL(warn_slowpath_fmt);
371 377
372void warn_slowpath_null(const char *file, int line) 378void warn_slowpath_null(const char *file, int line)
373{ 379{
374 static const char *empty = ""; 380 warn_slowpath_common(file, line, __builtin_return_address(0), NULL);
375 warn_slowpath_fmt(file, line, empty);
376} 381}
377EXPORT_SYMBOL(warn_slowpath_null); 382EXPORT_SYMBOL(warn_slowpath_null);
378#endif 383#endif
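
Note on the panic.c hunk above: warn_slowpath_fmt() and warn_slowpath_null() now share warn_slowpath_common(), passing the printf arguments through a small struct slowpath_args so the variadic part stays in the one function that called va_start(). The same refactoring pattern as a self-contained user-space sketch (all names here are illustrative, not kernel API):

#include <stdarg.h>
#include <stdio.h>

struct msg_args {
	const char *fmt;
	va_list args;
};

/* Shared tail: prints the banner and, if present, the formatted message. */
static void warn_common(const char *file, int line, struct msg_args *ma)
{
	fprintf(stderr, "WARNING: at %s:%d\n", file, line);
	if (ma)
		vfprintf(stderr, ma->fmt, ma->args);
}

static void warn_fmt(const char *file, int line, const char *fmt, ...)
{
	struct msg_args ma;

	ma.fmt = fmt;
	va_start(ma.args, fmt);
	warn_common(file, line, &ma);
	va_end(ma.args);
}

static void warn_null(const char *file, int line)
{
	warn_common(file, line, NULL);	/* no format string: banner only */
}

int main(void)
{
	warn_fmt(__FILE__, __LINE__, "value=%d\n", 42);
	warn_null(__FILE__, __LINE__);
	return 0;
}
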
diff --git a/kernel/params.c b/kernel/params.c
index de273ec85bd2..7f6912ced2ba 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,9 +24,6 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
30#if 0 27#if 0
31#define DEBUGP printk 28#define DEBUGP printk
32#else 29#else
@@ -220,13 +217,13 @@ int param_set_charp(const char *val, struct kernel_param *kp)
220 return -ENOSPC; 217 return -ENOSPC;
221 } 218 }
222 219
223 if (kp->perm & KPARAM_KMALLOCED) 220 if (kp->flags & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg); 221 kfree(*(char **)kp->arg);
225 222
226 /* This is a hack. We can't strdup in early boot, and we 223 /* This is a hack. We can't strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */ 224 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) { 225 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED; 226 kp->flags |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL); 227 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg) 228 if (!kp->arg)
232 return -ENOMEM; 229 return -ENOMEM;
@@ -241,44 +238,63 @@ int param_get_charp(char *buffer, struct kernel_param *kp)
241 return sprintf(buffer, "%s", *((char **)kp->arg)); 238 return sprintf(buffer, "%s", *((char **)kp->arg));
242} 239}
243 240
241/* Actually could be a bool or an int, for historical reasons. */
244int param_set_bool(const char *val, struct kernel_param *kp) 242int param_set_bool(const char *val, struct kernel_param *kp)
245{ 243{
244 bool v;
245
246 /* No equals means "set"... */ 246 /* No equals means "set"... */
247 if (!val) val = "1"; 247 if (!val) val = "1";
248 248
249 /* One of =[yYnN01] */ 249 /* One of =[yYnN01] */
250 switch (val[0]) { 250 switch (val[0]) {
251 case 'y': case 'Y': case '1': 251 case 'y': case 'Y': case '1':
252 *(int *)kp->arg = 1; 252 v = true;
253 return 0; 253 break;
254 case 'n': case 'N': case '0': 254 case 'n': case 'N': case '0':
255 *(int *)kp->arg = 0; 255 v = false;
256 return 0; 256 break;
257 default:
258 return -EINVAL;
257 } 259 }
258 return -EINVAL; 260
261 if (kp->flags & KPARAM_ISBOOL)
262 *(bool *)kp->arg = v;
263 else
264 *(int *)kp->arg = v;
265 return 0;
259} 266}
260 267
261int param_get_bool(char *buffer, struct kernel_param *kp) 268int param_get_bool(char *buffer, struct kernel_param *kp)
262{ 269{
270 bool val;
271 if (kp->flags & KPARAM_ISBOOL)
272 val = *(bool *)kp->arg;
273 else
274 val = *(int *)kp->arg;
275
263 /* Y and N chosen as being relatively non-coder friendly */ 276 /* Y and N chosen as being relatively non-coder friendly */
264 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'Y' : 'N'); 277 return sprintf(buffer, "%c", val ? 'Y' : 'N');
265} 278}
266 279
280/* This one must be bool. */
267int param_set_invbool(const char *val, struct kernel_param *kp) 281int param_set_invbool(const char *val, struct kernel_param *kp)
268{ 282{
269 int boolval, ret; 283 int ret;
284 bool boolval;
270 struct kernel_param dummy; 285 struct kernel_param dummy;
271 286
272 dummy.arg = &boolval; 287 dummy.arg = &boolval;
288 dummy.flags = KPARAM_ISBOOL;
273 ret = param_set_bool(val, &dummy); 289 ret = param_set_bool(val, &dummy);
274 if (ret == 0) 290 if (ret == 0)
275 *(int *)kp->arg = !boolval; 291 *(bool *)kp->arg = !boolval;
276 return ret; 292 return ret;
277} 293}
278 294
279int param_get_invbool(char *buffer, struct kernel_param *kp) 295int param_get_invbool(char *buffer, struct kernel_param *kp)
280{ 296{
281 return sprintf(buffer, "%c", (*(int *)kp->arg) ? 'N' : 'Y'); 297 return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
282} 298}
283 299
284/* We break the rule and mangle the string. */ 300/* We break the rule and mangle the string. */
@@ -591,7 +607,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
591 unsigned int i; 607 unsigned int i;
592 608
593 for (i = 0; i < num; i++) 609 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED) 610 if (params[i].flags & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg); 611 kfree(*(char **)params[i].arg);
596} 612}
597 613
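
Note on the params.c hunk above: with KPARAM_ISBOOL recorded in kp->flags, param_set_bool()/param_get_bool() can back either a real bool or a legacy int, and param_set_invbool() now requires bool storage. A short sketch of how a module would declare such parameters; the variable names and descriptions are illustrative, module_param() itself arranges the flags:

#include <linux/module.h>
#include <linux/moduleparam.h>

/* Backed by a real bool: settable as foo.enable=Y/N/1/0. */
static bool enable = true;
module_param(enable, bool, 0644);
MODULE_PARM_DESC(enable, "enable the hypothetical foo feature");

/* invbool stores the inverse: foo.noverify=Y clears 'verify'. */
static bool verify = true;
module_param_named(noverify, verify, invbool, 0644);
MODULE_PARM_DESC(noverify, "skip verification");
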
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..1a933a221ea4
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,4383 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45
46/*
47 * perf counter paranoia level:
48 * 0 - not paranoid
49 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv
51 */
52int sysctl_perf_counter_paranoid __read_mostly;
53
54static inline bool perf_paranoid_cpu(void)
55{
56 return sysctl_perf_counter_paranoid > 0;
57}
58
59static inline bool perf_paranoid_kernel(void)
60{
61 return sysctl_perf_counter_paranoid > 1;
62}
63
64int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
65
66/*
67 * max perf counter sample rate
68 */
69int sysctl_perf_counter_sample_rate __read_mostly = 100000;
70
71static atomic64_t perf_counter_id;
72
73/*
74 * Lock for (sysadmin-configurable) counter reservations:
75 */
76static DEFINE_SPINLOCK(perf_resource_lock);
77
78/*
79 * Architecture provided APIs - weak aliases:
80 */
81extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
82{
83 return NULL;
84}
85
86void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); }
88
89void __weak hw_perf_counter_setup(int cpu) { barrier(); }
90
91int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader,
93 struct perf_cpu_context *cpuctx,
94 struct perf_counter_context *ctx, int cpu)
95{
96 return 0;
97}
98
99void __weak perf_counter_print_debug(void) { }
100
101static DEFINE_PER_CPU(int, disable_count);
102
103void __perf_disable(void)
104{
105 __get_cpu_var(disable_count)++;
106}
107
108bool __perf_enable(void)
109{
110 return !--__get_cpu_var(disable_count);
111}
112
113void perf_disable(void)
114{
115 __perf_disable();
116 hw_perf_disable();
117}
118
119void perf_enable(void)
120{
121 if (__perf_enable())
122 hw_perf_enable();
123}
124
125static void get_ctx(struct perf_counter_context *ctx)
126{
127 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128}
129
130static void free_ctx(struct rcu_head *head)
131{
132 struct perf_counter_context *ctx;
133
134 ctx = container_of(head, struct perf_counter_context, rcu_head);
135 kfree(ctx);
136}
137
138static void put_ctx(struct perf_counter_context *ctx)
139{
140 if (atomic_dec_and_test(&ctx->refcount)) {
141 if (ctx->parent_ctx)
142 put_ctx(ctx->parent_ctx);
143 if (ctx->task)
144 put_task_struct(ctx->task);
145 call_rcu(&ctx->rcu_head, free_ctx);
146 }
147}
148
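
Editorial aside on get_ctx()/put_ctx() above: they combine an atomic refcount with an RCU-deferred kfree so that lockless readers inside rcu_read_lock() never see the context freed under them. The same idiom in isolation, as a hedged sketch (struct item and its helpers are hypothetical, not part of this file):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <asm/atomic.h>

/* Hypothetical object combining a refcount with RCU-deferred freeing. */
struct item {
	atomic_t	refcount;
	struct rcu_head	rcu_head;
	/* ... payload ... */
};

static void item_free_rcu(struct rcu_head *head)
{
	/* Runs after a grace period: no rcu_read_lock() section that
	 * could still hold a stale pointer is running any more. */
	kfree(container_of(head, struct item, rcu_head));
}

static void item_put(struct item *it)
{
	if (atomic_dec_and_test(&it->refcount))
		call_rcu(&it->rcu_head, item_free_rcu);
}
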
149/*
150 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with the fact that until it is locked,
152 * the context could get moved to another task.
153 */
154static struct perf_counter_context *
155perf_lock_task_context(struct task_struct *task, unsigned long *flags)
156{
157 struct perf_counter_context *ctx;
158
159 rcu_read_lock();
160 retry:
161 ctx = rcu_dereference(task->perf_counter_ctxp);
162 if (ctx) {
163 /*
164 * If this context is a clone of another, it might
165 * get swapped for another underneath us by
166 * perf_counter_task_sched_out, though the
167 * rcu_read_lock() protects us from any context
168 * getting freed. Lock the context and check if it
169 * got swapped before we could get the lock, and retry
170 * if so. If we locked the right context, then it
171 * can't get swapped on us any more.
172 */
173 spin_lock_irqsave(&ctx->lock, *flags);
174 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry;
177 }
178
179 if (!atomic_inc_not_zero(&ctx->refcount)) {
180 spin_unlock_irqrestore(&ctx->lock, *flags);
181 ctx = NULL;
182 }
183 }
184 rcu_read_unlock();
185 return ctx;
186}
187
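
Editorial aside on perf_lock_task_context() above: it uses the classic lockless-lookup idiom of dereferencing under rcu_read_lock(), taking the object's lock, then re-checking that the published pointer still refers to the object that was locked, retrying if it moved. A stripped-down sketch of that idiom (struct box, struct slot and slot_lock_current() are hypothetical names; the real function additionally takes a reference with atomic_inc_not_zero() before dropping the RCU read lock):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct box {
	spinlock_t lock;
	/* ... */
};

struct slot {
	struct box *cur;	/* published with rcu_assign_pointer() */
};

static struct box *slot_lock_current(struct slot *s, unsigned long *flags)
{
	struct box *b;

	rcu_read_lock();
retry:
	b = rcu_dereference(s->cur);
	if (b) {
		spin_lock_irqsave(&b->lock, *flags);
		/* Did s->cur move between the dereference and the lock? */
		if (b != rcu_dereference(s->cur)) {
			spin_unlock_irqrestore(&b->lock, *flags);
			goto retry;
		}
		/* We locked the object that is still published, so it
		 * cannot be swapped out from under us any more. */
	}
	rcu_read_unlock();
	return b;	/* NULL, or locked */
}
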
188/*
189 * Get the context for a task and increment its pin_count so it
190 * can't get swapped to another task. This also increments its
191 * reference count so that the context can't get freed.
192 */
193static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
194{
195 struct perf_counter_context *ctx;
196 unsigned long flags;
197
198 ctx = perf_lock_task_context(task, &flags);
199 if (ctx) {
200 ++ctx->pin_count;
201 spin_unlock_irqrestore(&ctx->lock, flags);
202 }
203 return ctx;
204}
205
206static void perf_unpin_context(struct perf_counter_context *ctx)
207{
208 unsigned long flags;
209
210 spin_lock_irqsave(&ctx->lock, flags);
211 --ctx->pin_count;
212 spin_unlock_irqrestore(&ctx->lock, flags);
213 put_ctx(ctx);
214}
215
216/*
217 * Add a counter to the lists for its context.
218 * Must be called with ctx->mutex and ctx->lock held.
219 */
220static void
221list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
222{
223 struct perf_counter *group_leader = counter->group_leader;
224
225 /*
226 * Depending on whether it is a standalone or sibling counter,
227 * add it straight to the context's counter list, or to the group
228 * leader's sibling list:
229 */
230 if (group_leader == counter)
231 list_add_tail(&counter->list_entry, &ctx->counter_list);
232 else {
233 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
234 group_leader->nr_siblings++;
235 }
236
237 list_add_rcu(&counter->event_entry, &ctx->event_list);
238 ctx->nr_counters++;
239}
240
241/*
242 * Remove a counter from the lists for its context.
243 * Must be called with ctx->mutex and ctx->lock held.
244 */
245static void
246list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
247{
248 struct perf_counter *sibling, *tmp;
249
250 if (list_empty(&counter->list_entry))
251 return;
252 ctx->nr_counters--;
253
254 list_del_init(&counter->list_entry);
255 list_del_rcu(&counter->event_entry);
256
257 if (counter->group_leader != counter)
258 counter->group_leader->nr_siblings--;
259
260 /*
261 * If this was a group counter with sibling counters then
262 * upgrade the siblings to singleton counters by adding them
263 * to the context list directly:
264 */
265 list_for_each_entry_safe(sibling, tmp,
266 &counter->sibling_list, list_entry) {
267
268 list_move_tail(&sibling->list_entry, &ctx->counter_list);
269 sibling->group_leader = sibling;
270 }
271}
272
273static void
274counter_sched_out(struct perf_counter *counter,
275 struct perf_cpu_context *cpuctx,
276 struct perf_counter_context *ctx)
277{
278 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
279 return;
280
281 counter->state = PERF_COUNTER_STATE_INACTIVE;
282 counter->tstamp_stopped = ctx->time;
283 counter->pmu->disable(counter);
284 counter->oncpu = -1;
285
286 if (!is_software_counter(counter))
287 cpuctx->active_oncpu--;
288 ctx->nr_active--;
289 if (counter->attr.exclusive || !cpuctx->active_oncpu)
290 cpuctx->exclusive = 0;
291}
292
293static void
294group_sched_out(struct perf_counter *group_counter,
295 struct perf_cpu_context *cpuctx,
296 struct perf_counter_context *ctx)
297{
298 struct perf_counter *counter;
299
300 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
301 return;
302
303 counter_sched_out(group_counter, cpuctx, ctx);
304
305 /*
306 * Schedule out siblings (if any):
307 */
308 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
309 counter_sched_out(counter, cpuctx, ctx);
310
311 if (group_counter->attr.exclusive)
312 cpuctx->exclusive = 0;
313}
314
315/*
316 * Cross CPU call to remove a performance counter
317 *
318 * We disable the counter on the hardware level first. After that we
319 * remove it from the context list.
320 */
321static void __perf_counter_remove_from_context(void *info)
322{
323 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
324 struct perf_counter *counter = info;
325 struct perf_counter_context *ctx = counter->ctx;
326
327 /*
328 * If this is a task context, we need to check whether it is
329 * the current task context of this cpu. If not it has been
330 * scheduled out before the smp call arrived.
331 */
332 if (ctx->task && cpuctx->task_ctx != ctx)
333 return;
334
335 spin_lock(&ctx->lock);
336 /*
337 * Protect the list operation against NMI by disabling the
338 * counters on a global level.
339 */
340 perf_disable();
341
342 counter_sched_out(counter, cpuctx, ctx);
343
344 list_del_counter(counter, ctx);
345
346 if (!ctx->task) {
347 /*
348 * Allow more per task counters with respect to the
349 * reservation:
350 */
351 cpuctx->max_pertask =
352 min(perf_max_counters - ctx->nr_counters,
353 perf_max_counters - perf_reserved_percpu);
354 }
355
356 perf_enable();
357 spin_unlock(&ctx->lock);
358}
359
360
361/*
362 * Remove the counter from a task's (or a CPU's) list of counters.
363 *
364 * Must be called with ctx->mutex held.
365 *
366 * CPU counters are removed with a smp call. For task counters we only
367 * call when the task is on a CPU.
368 *
369 * If counter->ctx is a cloned context, callers must make sure that
370 * every task struct that counter->ctx->task could possibly point to
371 * remains valid. This is OK when called from perf_release since
372 * that only calls us on the top-level context, which can't be a clone.
373 * When called from perf_counter_exit_task, it's OK because the
374 * context has been detached from its task.
375 */
376static void perf_counter_remove_from_context(struct perf_counter *counter)
377{
378 struct perf_counter_context *ctx = counter->ctx;
379 struct task_struct *task = ctx->task;
380
381 if (!task) {
382 /*
383 * Per cpu counters are removed via an smp call and
384 * the removal is always successful.
385 */
386 smp_call_function_single(counter->cpu,
387 __perf_counter_remove_from_context,
388 counter, 1);
389 return;
390 }
391
392retry:
393 task_oncpu_function_call(task, __perf_counter_remove_from_context,
394 counter);
395
396 spin_lock_irq(&ctx->lock);
397 /*
398 * If the context is active we need to retry the smp call.
399 */
400 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
401 spin_unlock_irq(&ctx->lock);
402 goto retry;
403 }
404
405 /*
406 * The lock prevents this context from being scheduled in, so we
407 * can remove the counter safely if the call above did not
408 * succeed.
409 */
410 if (!list_empty(&counter->list_entry)) {
411 list_del_counter(counter, ctx);
412 }
413 spin_unlock_irq(&ctx->lock);
414}
415
416static inline u64 perf_clock(void)
417{
418 return cpu_clock(smp_processor_id());
419}
420
421/*
422 * Update the record of the current time in a context.
423 */
424static void update_context_time(struct perf_counter_context *ctx)
425{
426 u64 now = perf_clock();
427
428 ctx->time += now - ctx->timestamp;
429 ctx->timestamp = now;
430}
431
432/*
433 * Update the total_time_enabled and total_time_running fields for a counter.
434 */
435static void update_counter_times(struct perf_counter *counter)
436{
437 struct perf_counter_context *ctx = counter->ctx;
438 u64 run_end;
439
440 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
441 return;
442
443 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
444
445 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
446 run_end = counter->tstamp_stopped;
447 else
448 run_end = ctx->time;
449
450 counter->total_time_running = run_end - counter->tstamp_running;
451}
452
453/*
454 * Update total_time_enabled and total_time_running for all counters in a group.
455 */
456static void update_group_times(struct perf_counter *leader)
457{
458 struct perf_counter *counter;
459
460 update_counter_times(leader);
461 list_for_each_entry(counter, &leader->sibling_list, list_entry)
462 update_counter_times(counter);
463}
464
465/*
466 * Cross CPU call to disable a performance counter
467 */
468static void __perf_counter_disable(void *info)
469{
470 struct perf_counter *counter = info;
471 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
472 struct perf_counter_context *ctx = counter->ctx;
473
474 /*
475 * If this is a per-task counter, need to check whether this
476 * counter's task is the current task on this cpu.
477 */
478 if (ctx->task && cpuctx->task_ctx != ctx)
479 return;
480
481 spin_lock(&ctx->lock);
482
483 /*
484 * If the counter is on, turn it off.
485 * If it is in error state, leave it in error state.
486 */
487 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
488 update_context_time(ctx);
489 update_counter_times(counter);
490 if (counter == counter->group_leader)
491 group_sched_out(counter, cpuctx, ctx);
492 else
493 counter_sched_out(counter, cpuctx, ctx);
494 counter->state = PERF_COUNTER_STATE_OFF;
495 }
496
497 spin_unlock(&ctx->lock);
498}
499
500/*
501 * Disable a counter.
502 *
503 * If counter->ctx is a cloned context, callers must make sure that
504 * every task struct that counter->ctx->task could possibly point to
505 * remains valid. This condition is satisfied when called through
506 * perf_counter_for_each_child or perf_counter_for_each because they
507 * hold the top-level counter's child_mutex, so any descendant that
508 * goes to exit will block in sync_child_counter.
509 * When called from perf_pending_counter it's OK because counter->ctx
510 * is the current context on this CPU and preemption is disabled,
511 * hence we can't get into perf_counter_task_sched_out for this context.
512 */
513static void perf_counter_disable(struct perf_counter *counter)
514{
515 struct perf_counter_context *ctx = counter->ctx;
516 struct task_struct *task = ctx->task;
517
518 if (!task) {
519 /*
520 * Disable the counter on the cpu that it's on
521 */
522 smp_call_function_single(counter->cpu, __perf_counter_disable,
523 counter, 1);
524 return;
525 }
526
527 retry:
528 task_oncpu_function_call(task, __perf_counter_disable, counter);
529
530 spin_lock_irq(&ctx->lock);
531 /*
532 * If the counter is still active, we need to retry the cross-call.
533 */
534 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
535 spin_unlock_irq(&ctx->lock);
536 goto retry;
537 }
538
539 /*
540 * Since we have the lock this context can't be scheduled
541 * in, so we can change the state safely.
542 */
543 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
544 update_counter_times(counter);
545 counter->state = PERF_COUNTER_STATE_OFF;
546 }
547
548 spin_unlock_irq(&ctx->lock);
549}
550
551static int
552counter_sched_in(struct perf_counter *counter,
553 struct perf_cpu_context *cpuctx,
554 struct perf_counter_context *ctx,
555 int cpu)
556{
557 if (counter->state <= PERF_COUNTER_STATE_OFF)
558 return 0;
559
560 counter->state = PERF_COUNTER_STATE_ACTIVE;
561 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
562 /*
563 * The new state must be visible before we turn it on in the hardware:
564 */
565 smp_wmb();
566
567 if (counter->pmu->enable(counter)) {
568 counter->state = PERF_COUNTER_STATE_INACTIVE;
569 counter->oncpu = -1;
570 return -EAGAIN;
571 }
572
573 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
574
575 if (!is_software_counter(counter))
576 cpuctx->active_oncpu++;
577 ctx->nr_active++;
578
579 if (counter->attr.exclusive)
580 cpuctx->exclusive = 1;
581
582 return 0;
583}
584
585static int
586group_sched_in(struct perf_counter *group_counter,
587 struct perf_cpu_context *cpuctx,
588 struct perf_counter_context *ctx,
589 int cpu)
590{
591 struct perf_counter *counter, *partial_group;
592 int ret;
593
594 if (group_counter->state == PERF_COUNTER_STATE_OFF)
595 return 0;
596
597 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
598 if (ret)
599 return ret < 0 ? ret : 0;
600
601 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
602 return -EAGAIN;
603
604 /*
605 * Schedule in siblings as one group (if any):
606 */
607 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
608 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
609 partial_group = counter;
610 goto group_error;
611 }
612 }
613
614 return 0;
615
616group_error:
617 /*
618 * Groups can be scheduled in as one unit only, so undo any
619 * partial group before returning:
620 */
621 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
622 if (counter == partial_group)
623 break;
624 counter_sched_out(counter, cpuctx, ctx);
625 }
626 counter_sched_out(group_counter, cpuctx, ctx);
627
628 return -EAGAIN;
629}
630
631/*
632 * Return 1 for a group consisting entirely of software counters,
633 * 0 if the group contains any hardware counters.
634 */
635static int is_software_only_group(struct perf_counter *leader)
636{
637 struct perf_counter *counter;
638
639 if (!is_software_counter(leader))
640 return 0;
641
642 list_for_each_entry(counter, &leader->sibling_list, list_entry)
643 if (!is_software_counter(counter))
644 return 0;
645
646 return 1;
647}
648
649/*
650 * Work out whether we can put this counter group on the CPU now.
651 */
652static int group_can_go_on(struct perf_counter *counter,
653 struct perf_cpu_context *cpuctx,
654 int can_add_hw)
655{
656 /*
657 * Groups consisting entirely of software counters can always go on.
658 */
659 if (is_software_only_group(counter))
660 return 1;
661 /*
662 * If an exclusive group is already on, no other hardware
663 * counters can go on.
664 */
665 if (cpuctx->exclusive)
666 return 0;
667 /*
668 * If this group is exclusive and there are already
669 * counters on the CPU, it can't go on.
670 */
671 if (counter->attr.exclusive && cpuctx->active_oncpu)
672 return 0;
673 /*
674 * Otherwise, try to add it if all previous groups were able
675 * to go on.
676 */
677 return can_add_hw;
678}
679
680static void add_counter_to_ctx(struct perf_counter *counter,
681 struct perf_counter_context *ctx)
682{
683 list_add_counter(counter, ctx);
684 counter->tstamp_enabled = ctx->time;
685 counter->tstamp_running = ctx->time;
686 counter->tstamp_stopped = ctx->time;
687}
688
689/*
690 * Cross CPU call to install and enable a performance counter
691 *
692 * Must be called with ctx->mutex held
693 */
694static void __perf_install_in_context(void *info)
695{
696 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
697 struct perf_counter *counter = info;
698 struct perf_counter_context *ctx = counter->ctx;
699 struct perf_counter *leader = counter->group_leader;
700 int cpu = smp_processor_id();
701 int err;
702
703 /*
704 * If this is a task context, we need to check whether it is
705 * the current task context of this cpu. If not it has been
706 * scheduled out before the smp call arrived.
707 * Or possibly this is the right context but it isn't
708 * on this cpu because it had no counters.
709 */
710 if (ctx->task && cpuctx->task_ctx != ctx) {
711 if (cpuctx->task_ctx || ctx->task != current)
712 return;
713 cpuctx->task_ctx = ctx;
714 }
715
716 spin_lock(&ctx->lock);
717 ctx->is_active = 1;
718 update_context_time(ctx);
719
720 /*
721 * Protect the list operation against NMI by disabling the
722 * counters on a global level. NOP for non NMI based counters.
723 */
724 perf_disable();
725
726 add_counter_to_ctx(counter, ctx);
727
728 /*
729 * Don't put the counter on if it is disabled or if
730 * it is in a group and the group isn't on.
731 */
732 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
733 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
734 goto unlock;
735
736 /*
737 * An exclusive counter can't go on if there are already active
738 * hardware counters, and no hardware counter can go on if there
739 * is already an exclusive counter on.
740 */
741 if (!group_can_go_on(counter, cpuctx, 1))
742 err = -EEXIST;
743 else
744 err = counter_sched_in(counter, cpuctx, ctx, cpu);
745
746 if (err) {
747 /*
748 * This counter couldn't go on. If it is in a group
749 * then we have to pull the whole group off.
750 * If the counter group is pinned then put it in error state.
751 */
752 if (leader != counter)
753 group_sched_out(leader, cpuctx, ctx);
754 if (leader->attr.pinned) {
755 update_group_times(leader);
756 leader->state = PERF_COUNTER_STATE_ERROR;
757 }
758 }
759
760 if (!err && !ctx->task && cpuctx->max_pertask)
761 cpuctx->max_pertask--;
762
763 unlock:
764 perf_enable();
765
766 spin_unlock(&ctx->lock);
767}
768
769/*
770 * Attach a performance counter to a context
771 *
772 * First we add the counter to the list with the hardware enable bit
773 * in counter->hw_config cleared.
774 *
775 * If the counter is attached to a task which is on a CPU we use a smp
776 * call to enable it in the task context. The task might have been
777 * scheduled away, but we check this in the smp call again.
778 *
779 * Must be called with ctx->mutex held.
780 */
781static void
782perf_install_in_context(struct perf_counter_context *ctx,
783 struct perf_counter *counter,
784 int cpu)
785{
786 struct task_struct *task = ctx->task;
787
788 if (!task) {
789 /*
790 * Per cpu counters are installed via an smp call and
791 * the install is always successful.
792 */
793 smp_call_function_single(cpu, __perf_install_in_context,
794 counter, 1);
795 return;
796 }
797
798retry:
799 task_oncpu_function_call(task, __perf_install_in_context,
800 counter);
801
802 spin_lock_irq(&ctx->lock);
803 /*
804 * If the context is active we need to retry the smp call.
805 */
806 if (ctx->is_active && list_empty(&counter->list_entry)) {
807 spin_unlock_irq(&ctx->lock);
808 goto retry;
809 }
810
811 /*
812 * The lock prevents this context from being scheduled in, so we
813 * can add the counter safely if the call above did not
814 * succeed.
815 */
816 if (list_empty(&counter->list_entry))
817 add_counter_to_ctx(counter, ctx);
818 spin_unlock_irq(&ctx->lock);
819}
820
821/*
822 * Cross CPU call to enable a performance counter
823 */
824static void __perf_counter_enable(void *info)
825{
826 struct perf_counter *counter = info;
827 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
828 struct perf_counter_context *ctx = counter->ctx;
829 struct perf_counter *leader = counter->group_leader;
830 int err;
831
832 /*
833 * If this is a per-task counter, need to check whether this
834 * counter's task is the current task on this cpu.
835 */
836 if (ctx->task && cpuctx->task_ctx != ctx) {
837 if (cpuctx->task_ctx || ctx->task != current)
838 return;
839 cpuctx->task_ctx = ctx;
840 }
841
842 spin_lock(&ctx->lock);
843 ctx->is_active = 1;
844 update_context_time(ctx);
845
846 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
847 goto unlock;
848 counter->state = PERF_COUNTER_STATE_INACTIVE;
849 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
850
851 /*
852 * If the counter is in a group and isn't the group leader,
853 * then don't put it on unless the group is on.
854 */
855 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
856 goto unlock;
857
858 if (!group_can_go_on(counter, cpuctx, 1)) {
859 err = -EEXIST;
860 } else {
861 perf_disable();
862 if (counter == leader)
863 err = group_sched_in(counter, cpuctx, ctx,
864 smp_processor_id());
865 else
866 err = counter_sched_in(counter, cpuctx, ctx,
867 smp_processor_id());
868 perf_enable();
869 }
870
871 if (err) {
872 /*
873 * If this counter can't go on and it's part of a
874 * group, then the whole group has to come off.
875 */
876 if (leader != counter)
877 group_sched_out(leader, cpuctx, ctx);
878 if (leader->attr.pinned) {
879 update_group_times(leader);
880 leader->state = PERF_COUNTER_STATE_ERROR;
881 }
882 }
883
884 unlock:
885 spin_unlock(&ctx->lock);
886}
887
888/*
889 * Enable a counter.
890 *
891 * If counter->ctx is a cloned context, callers must make sure that
892 * every task struct that counter->ctx->task could possibly point to
893 * remains valid. This condition is satisfied when called through
894 * perf_counter_for_each_child or perf_counter_for_each as described
895 * for perf_counter_disable.
896 */
897static void perf_counter_enable(struct perf_counter *counter)
898{
899 struct perf_counter_context *ctx = counter->ctx;
900 struct task_struct *task = ctx->task;
901
902 if (!task) {
903 /*
904 * Enable the counter on the cpu that it's on
905 */
906 smp_call_function_single(counter->cpu, __perf_counter_enable,
907 counter, 1);
908 return;
909 }
910
911 spin_lock_irq(&ctx->lock);
912 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
913 goto out;
914
915 /*
916 * If the counter is in error state, clear that first.
917 * That way, if we see the counter in error state below, we
918 * know that it has gone back into error state, as distinct
919 * from the task having been scheduled away before the
920 * cross-call arrived.
921 */
922 if (counter->state == PERF_COUNTER_STATE_ERROR)
923 counter->state = PERF_COUNTER_STATE_OFF;
924
925 retry:
926 spin_unlock_irq(&ctx->lock);
927 task_oncpu_function_call(task, __perf_counter_enable, counter);
928
929 spin_lock_irq(&ctx->lock);
930
931 /*
932 * If the context is active and the counter is still off,
933 * we need to retry the cross-call.
934 */
935 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
936 goto retry;
937
938 /*
939 * Since we have the lock this context can't be scheduled
940 * in, so we can change the state safely.
941 */
942 if (counter->state == PERF_COUNTER_STATE_OFF) {
943 counter->state = PERF_COUNTER_STATE_INACTIVE;
944 counter->tstamp_enabled =
945 ctx->time - counter->total_time_enabled;
946 }
947 out:
948 spin_unlock_irq(&ctx->lock);
949}
950
951static int perf_counter_refresh(struct perf_counter *counter, int refresh)
952{
953 /*
954 * not supported on inherited counters
955 */
956 if (counter->attr.inherit)
957 return -EINVAL;
958
959 atomic_add(refresh, &counter->event_limit);
960 perf_counter_enable(counter);
961
962 return 0;
963}
964
965void __perf_counter_sched_out(struct perf_counter_context *ctx,
966 struct perf_cpu_context *cpuctx)
967{
968 struct perf_counter *counter;
969
970 spin_lock(&ctx->lock);
971 ctx->is_active = 0;
972 if (likely(!ctx->nr_counters))
973 goto out;
974 update_context_time(ctx);
975
976 perf_disable();
977 if (ctx->nr_active) {
978 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
979 if (counter != counter->group_leader)
980 counter_sched_out(counter, cpuctx, ctx);
981 else
982 group_sched_out(counter, cpuctx, ctx);
983 }
984 }
985 perf_enable();
986 out:
987 spin_unlock(&ctx->lock);
988}
989
990/*
991 * Test whether two contexts are equivalent, i.e. whether they
992 * have both been cloned from the same version of the same context
993 * and they both have the same number of enabled counters.
994 * If the number of enabled counters is the same, then the set
995 * of enabled counters should be the same, because these are both
996 * inherited contexts, therefore we can't access individual counters
997 * in them directly with an fd; we can only enable/disable all
998 * counters via prctl, or enable/disable all counters in a family
999 * via ioctl, which will have the same effect on both contexts.
1000 */
1001static int context_equiv(struct perf_counter_context *ctx1,
1002 struct perf_counter_context *ctx2)
1003{
1004 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1005 && ctx1->parent_gen == ctx2->parent_gen
1006 && !ctx1->pin_count && !ctx2->pin_count;
1007}
1008
1009/*
1010 * Called from scheduler to remove the counters of the current task,
1011 * with interrupts disabled.
1012 *
1013 * We stop each counter and update the counter value in counter->count.
1014 *
1015 * This does not protect us against NMI, but disable()
1016 * sets the disabled bit in the control field of counter _before_
1017 * accessing the counter control register. If an NMI hits, then it will
1018 * not restart the counter.
1019 */
1020void perf_counter_task_sched_out(struct task_struct *task,
1021 struct task_struct *next, int cpu)
1022{
1023 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1024 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1025 struct perf_counter_context *next_ctx;
1026 struct perf_counter_context *parent;
1027 struct pt_regs *regs;
1028 int do_switch = 1;
1029
1030 regs = task_pt_regs(task);
1031 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1032
1033 if (likely(!ctx || !cpuctx->task_ctx))
1034 return;
1035
1036 update_context_time(ctx);
1037
1038 rcu_read_lock();
1039 parent = rcu_dereference(ctx->parent_ctx);
1040 next_ctx = next->perf_counter_ctxp;
1041 if (parent && next_ctx &&
1042 rcu_dereference(next_ctx->parent_ctx) == parent) {
1043 /*
1044 * Looks like the two contexts are clones, so we might be
1045 * able to optimize the context switch. We lock both
1046 * contexts and check that they are clones under the
1047 * lock (including re-checking that neither has been
1048 * uncloned in the meantime). It doesn't matter which
1049 * order we take the locks because no other cpu could
1050 * be trying to lock both of these tasks.
1051 */
1052 spin_lock(&ctx->lock);
1053 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1054 if (context_equiv(ctx, next_ctx)) {
1055 /*
1056 * XXX do we need a memory barrier of sorts
1057 * wrt rcu_dereference() of perf_counter_ctxp
1058 */
1059 task->perf_counter_ctxp = next_ctx;
1060 next->perf_counter_ctxp = ctx;
1061 ctx->task = next;
1062 next_ctx->task = task;
1063 do_switch = 0;
1064 }
1065 spin_unlock(&next_ctx->lock);
1066 spin_unlock(&ctx->lock);
1067 }
1068 rcu_read_unlock();
1069
1070 if (do_switch) {
1071 __perf_counter_sched_out(ctx, cpuctx);
1072 cpuctx->task_ctx = NULL;
1073 }
1074}
1075
1076/*
1077 * Called with IRQs disabled
1078 */
1079static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1080{
1081 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1082
1083 if (!cpuctx->task_ctx)
1084 return;
1085
1086 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1087 return;
1088
1089 __perf_counter_sched_out(ctx, cpuctx);
1090 cpuctx->task_ctx = NULL;
1091}
1092
1093/*
1094 * Called with IRQs disabled
1095 */
1096static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1097{
1098 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1099}
1100
1101static void
1102__perf_counter_sched_in(struct perf_counter_context *ctx,
1103 struct perf_cpu_context *cpuctx, int cpu)
1104{
1105 struct perf_counter *counter;
1106 int can_add_hw = 1;
1107
1108 spin_lock(&ctx->lock);
1109 ctx->is_active = 1;
1110 if (likely(!ctx->nr_counters))
1111 goto out;
1112
1113 ctx->timestamp = perf_clock();
1114
1115 perf_disable();
1116
1117 /*
1118 * First go through the list and put on any pinned groups
1119 * in order to give them the best chance of going on.
1120 */
1121 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1122 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1123 !counter->attr.pinned)
1124 continue;
1125 if (counter->cpu != -1 && counter->cpu != cpu)
1126 continue;
1127
1128 if (counter != counter->group_leader)
1129 counter_sched_in(counter, cpuctx, ctx, cpu);
1130 else {
1131 if (group_can_go_on(counter, cpuctx, 1))
1132 group_sched_in(counter, cpuctx, ctx, cpu);
1133 }
1134
1135 /*
1136 * If this pinned group hasn't been scheduled,
1137 * put it in error state.
1138 */
1139 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1140 update_group_times(counter);
1141 counter->state = PERF_COUNTER_STATE_ERROR;
1142 }
1143 }
1144
1145 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1146 /*
1147 * Ignore counters in OFF or ERROR state, and
1148 * ignore pinned counters since we did them already.
1149 */
1150 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1151 counter->attr.pinned)
1152 continue;
1153
1154 /*
1155 * Listen to the 'cpu' scheduling filter constraint
1156 * of counters:
1157 */
1158 if (counter->cpu != -1 && counter->cpu != cpu)
1159 continue;
1160
1161 if (counter != counter->group_leader) {
1162 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1163 can_add_hw = 0;
1164 } else {
1165 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1166 if (group_sched_in(counter, cpuctx, ctx, cpu))
1167 can_add_hw = 0;
1168 }
1169 }
1170 }
1171 perf_enable();
1172 out:
1173 spin_unlock(&ctx->lock);
1174}
1175
1176/*
1177 * Called from scheduler to add the counters of the current task
1178 * with interrupts disabled.
1179 *
1180 * We restore the counter value and then enable it.
1181 *
1182 * This does not protect us against NMI, but enable()
1183 * sets the enabled bit in the control field of counter _before_
1184 * accessing the counter control register. If an NMI hits, then it will
1185 * keep the counter running.
1186 */
1187void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1188{
1189 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1190 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1191
1192 if (likely(!ctx))
1193 return;
1194 if (cpuctx->task_ctx == ctx)
1195 return;
1196 __perf_counter_sched_in(ctx, cpuctx, cpu);
1197 cpuctx->task_ctx = ctx;
1198}
1199
1200static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1201{
1202 struct perf_counter_context *ctx = &cpuctx->ctx;
1203
1204 __perf_counter_sched_in(ctx, cpuctx, cpu);
1205}
1206
1207#define MAX_INTERRUPTS (~0ULL)
1208
1209static void perf_log_throttle(struct perf_counter *counter, int enable);
1210static void perf_log_period(struct perf_counter *counter, u64 period);
1211
1212static void perf_adjust_period(struct perf_counter *counter, u64 events)
1213{
1214 struct hw_perf_counter *hwc = &counter->hw;
1215 u64 period, sample_period;
1216 s64 delta;
1217
1218 events *= hwc->sample_period;
1219 period = div64_u64(events, counter->attr.sample_freq);
1220
1221 delta = (s64)(period - hwc->sample_period);
1222 delta = (delta + 7) / 8; /* low pass filter */
1223
1224 sample_period = hwc->sample_period + delta;
1225
1226 if (!sample_period)
1227 sample_period = 1;
1228
1229 perf_log_period(counter, sample_period);
1230
1231 hwc->sample_period = sample_period;
1232}
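
/*
 * Worked example of the adjustment above (illustrative only, not part of
 * the original file; the concrete numbers are made up).  The interrupt
 * rate times the current period estimates the events per second; dividing
 * by sample_freq gives the period that would hit the requested frequency,
 * and the /8 low-pass filter moves only an eighth of the way there per
 * tick to avoid oscillation.
 */
#if 0	/* never built */
	u64 cur_period   = 20000;	/* hwc->sample_period */
	u64 irqs_per_sec = 2000;	/* observed interrupt rate */
	u64 sample_freq  = 1000;	/* attr.sample_freq requested */

	u64 events = irqs_per_sec * cur_period;		/* ~40,000,000 events/sec */
	u64 target = div64_u64(events, sample_freq);	/* 40000: period for 1000 Hz */
	s64 delta  = (s64)(target - cur_period);	/* +20000 */

	delta = (delta + 7) / 8;			/* low pass: +2500 */
	cur_period += delta;				/* next sample_period: 22500 */
#endif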
1233
1234static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1235{
1236 struct perf_counter *counter;
1237 struct hw_perf_counter *hwc;
1238 u64 interrupts, freq;
1239
1240 spin_lock(&ctx->lock);
1241 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1242 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1243 continue;
1244
1245 hwc = &counter->hw;
1246
1247 interrupts = hwc->interrupts;
1248 hwc->interrupts = 0;
1249
1250 /*
1251 * unthrottle counters on the tick
1252 */
1253 if (interrupts == MAX_INTERRUPTS) {
1254 perf_log_throttle(counter, 1);
1255 counter->pmu->unthrottle(counter);
1256 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1257 }
1258
1259 if (!counter->attr.freq || !counter->attr.sample_freq)
1260 continue;
1261
1262 /*
1263 * if the specified freq < HZ then we need to skip ticks
1264 */
1265 if (counter->attr.sample_freq < HZ) {
1266 freq = counter->attr.sample_freq;
1267
1268 hwc->freq_count += freq;
1269 hwc->freq_interrupts += interrupts;
1270
1271 if (hwc->freq_count < HZ)
1272 continue;
1273
1274 interrupts = hwc->freq_interrupts;
1275 hwc->freq_interrupts = 0;
1276 hwc->freq_count -= HZ;
1277 } else
1278 freq = HZ;
1279
1280 perf_adjust_period(counter, freq * interrupts);
1281
1282 /*
1283 * In order to avoid being stalled by an (accidental) huge
1284 * sample period, force reset the sample period if we didn't
1285 * get any events in this freq period.
1286 */
1287 if (!interrupts) {
1288 perf_disable();
1289 counter->pmu->disable(counter);
1290 atomic64_set(&hwc->period_left, 0);
1291 counter->pmu->enable(counter);
1292 perf_enable();
1293 }
1294 }
1295 spin_unlock(&ctx->lock);
1296}
1297
1298/*
1299 * Round-robin a context's counters:
1300 */
1301static void rotate_ctx(struct perf_counter_context *ctx)
1302{
1303 struct perf_counter *counter;
1304
1305 if (!ctx->nr_counters)
1306 return;
1307
1308 spin_lock(&ctx->lock);
1309 /*
1310 * Rotate the first entry last (works just fine for group counters too):
1311 */
1312 perf_disable();
1313 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1314 list_move_tail(&counter->list_entry, &ctx->counter_list);
1315 break;
1316 }
1317 perf_enable();
1318
1319 spin_unlock(&ctx->lock);
1320}
1321
1322void perf_counter_task_tick(struct task_struct *curr, int cpu)
1323{
1324 struct perf_cpu_context *cpuctx;
1325 struct perf_counter_context *ctx;
1326
1327 if (!atomic_read(&nr_counters))
1328 return;
1329
1330 cpuctx = &per_cpu(perf_cpu_context, cpu);
1331 ctx = curr->perf_counter_ctxp;
1332
1333 perf_ctx_adjust_freq(&cpuctx->ctx);
1334 if (ctx)
1335 perf_ctx_adjust_freq(ctx);
1336
1337 perf_counter_cpu_sched_out(cpuctx);
1338 if (ctx)
1339 __perf_counter_task_sched_out(ctx);
1340
1341 rotate_ctx(&cpuctx->ctx);
1342 if (ctx)
1343 rotate_ctx(ctx);
1344
1345 perf_counter_cpu_sched_in(cpuctx, cpu);
1346 if (ctx)
1347 perf_counter_task_sched_in(curr, cpu);
1348}
1349
1350/*
1351 * Cross CPU call to read the hardware counter
1352 */
1353static void __read(void *info)
1354{
1355 struct perf_counter *counter = info;
1356 struct perf_counter_context *ctx = counter->ctx;
1357 unsigned long flags;
1358
1359 local_irq_save(flags);
1360 if (ctx->is_active)
1361 update_context_time(ctx);
1362 counter->pmu->read(counter);
1363 update_counter_times(counter);
1364 local_irq_restore(flags);
1365}
1366
1367static u64 perf_counter_read(struct perf_counter *counter)
1368{
1369 /*
1370 * If counter is enabled and currently active on a CPU, update the
1371 * value in the counter structure:
1372 */
1373 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1374 smp_call_function_single(counter->oncpu,
1375 __read, counter, 1);
1376 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1377 update_counter_times(counter);
1378 }
1379
1380 return atomic64_read(&counter->count);
1381}
1382
1383/*
1384 * Initialize the perf_counter context in a task_struct:
1385 */
1386static void
1387__perf_counter_init_context(struct perf_counter_context *ctx,
1388 struct task_struct *task)
1389{
1390 memset(ctx, 0, sizeof(*ctx));
1391 spin_lock_init(&ctx->lock);
1392 mutex_init(&ctx->mutex);
1393 INIT_LIST_HEAD(&ctx->counter_list);
1394 INIT_LIST_HEAD(&ctx->event_list);
1395 atomic_set(&ctx->refcount, 1);
1396 ctx->task = task;
1397}
1398
1399static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1400{
1401 struct perf_counter_context *parent_ctx;
1402 struct perf_counter_context *ctx;
1403 struct perf_cpu_context *cpuctx;
1404 struct task_struct *task;
1405 unsigned long flags;
1406 int err;
1407
1408 /*
1409 * If cpu is not a wildcard then this is a percpu counter:
1410 */
1411 if (cpu != -1) {
1412 /* Must be root to operate on a CPU counter: */
1413 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1414 return ERR_PTR(-EACCES);
1415
1416		if (cpu < 0 || cpu >= nr_cpumask_bits)
1417 return ERR_PTR(-EINVAL);
1418
1419 /*
1420	 * We could be clever and allow attaching a counter to an
1421 * offline CPU and activate it when the CPU comes up, but
1422 * that's for later.
1423 */
1424 if (!cpu_isset(cpu, cpu_online_map))
1425 return ERR_PTR(-ENODEV);
1426
1427 cpuctx = &per_cpu(perf_cpu_context, cpu);
1428 ctx = &cpuctx->ctx;
1429 get_ctx(ctx);
1430
1431 return ctx;
1432 }
1433
1434 rcu_read_lock();
1435 if (!pid)
1436 task = current;
1437 else
1438 task = find_task_by_vpid(pid);
1439 if (task)
1440 get_task_struct(task);
1441 rcu_read_unlock();
1442
1443 if (!task)
1444 return ERR_PTR(-ESRCH);
1445
1446 /*
1447 * Can't attach counters to a dying task.
1448 */
1449 err = -ESRCH;
1450 if (task->flags & PF_EXITING)
1451 goto errout;
1452
1453 /* Reuse ptrace permission checks for now. */
1454 err = -EACCES;
1455 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1456 goto errout;
1457
1458 retry:
1459 ctx = perf_lock_task_context(task, &flags);
1460 if (ctx) {
1461 parent_ctx = ctx->parent_ctx;
1462 if (parent_ctx) {
1463 put_ctx(parent_ctx);
1464 ctx->parent_ctx = NULL; /* no longer a clone */
1465 }
1466 spin_unlock_irqrestore(&ctx->lock, flags);
1467 }
1468
1469 if (!ctx) {
1470 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1471 err = -ENOMEM;
1472 if (!ctx)
1473 goto errout;
1474 __perf_counter_init_context(ctx, task);
1475 get_ctx(ctx);
1476 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1477 /*
1478 * We raced with some other task; use
1479 * the context they set.
1480 */
1481 kfree(ctx);
1482 goto retry;
1483 }
1484 get_task_struct(task);
1485 }
1486
1487 put_task_struct(task);
1488 return ctx;
1489
1490 errout:
1491 put_task_struct(task);
1492 return ERR_PTR(err);
1493}
1494
1495static void free_counter_rcu(struct rcu_head *head)
1496{
1497 struct perf_counter *counter;
1498
1499 counter = container_of(head, struct perf_counter, rcu_head);
1500 if (counter->ns)
1501 put_pid_ns(counter->ns);
1502 kfree(counter);
1503}
1504
1505static void perf_pending_sync(struct perf_counter *counter);
1506
1507static void free_counter(struct perf_counter *counter)
1508{
1509 perf_pending_sync(counter);
1510
1511 atomic_dec(&nr_counters);
1512 if (counter->attr.mmap)
1513 atomic_dec(&nr_mmap_counters);
1514 if (counter->attr.comm)
1515 atomic_dec(&nr_comm_counters);
1516
1517 if (counter->destroy)
1518 counter->destroy(counter);
1519
1520 put_ctx(counter->ctx);
1521 call_rcu(&counter->rcu_head, free_counter_rcu);
1522}
1523
1524/*
1525 * Called when the last reference to the file is gone.
1526 */
1527static int perf_release(struct inode *inode, struct file *file)
1528{
1529 struct perf_counter *counter = file->private_data;
1530 struct perf_counter_context *ctx = counter->ctx;
1531
1532 file->private_data = NULL;
1533
1534 WARN_ON_ONCE(ctx->parent_ctx);
1535 mutex_lock(&ctx->mutex);
1536 perf_counter_remove_from_context(counter);
1537 mutex_unlock(&ctx->mutex);
1538
1539 mutex_lock(&counter->owner->perf_counter_mutex);
1540 list_del_init(&counter->owner_entry);
1541 mutex_unlock(&counter->owner->perf_counter_mutex);
1542 put_task_struct(counter->owner);
1543
1544 free_counter(counter);
1545
1546 return 0;
1547}
1548
1549/*
1550 * Read the performance counter - simple non blocking version for now
1551 */
1552static ssize_t
1553perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1554{
1555 u64 values[4];
1556 int n;
1557
1558 /*
1559 * Return end-of-file for a read on a counter that is in
1560 * error state (i.e. because it was pinned but it couldn't be
1561 * scheduled on to the CPU at some point).
1562 */
1563 if (counter->state == PERF_COUNTER_STATE_ERROR)
1564 return 0;
1565
1566 WARN_ON_ONCE(counter->ctx->parent_ctx);
1567 mutex_lock(&counter->child_mutex);
1568 values[0] = perf_counter_read(counter);
1569 n = 1;
1570 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1571 values[n++] = counter->total_time_enabled +
1572 atomic64_read(&counter->child_total_time_enabled);
1573 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1574 values[n++] = counter->total_time_running +
1575 atomic64_read(&counter->child_total_time_running);
1576 if (counter->attr.read_format & PERF_FORMAT_ID)
1577 values[n++] = counter->id;
1578 mutex_unlock(&counter->child_mutex);
1579
1580 if (count < n * sizeof(u64))
1581 return -EINVAL;
1582 count = n * sizeof(u64);
1583
1584 if (copy_to_user(buf, values, count))
1585 return -EFAULT;
1586
1587 return count;
1588}
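
/*
 * Illustrative user-space counterpart of the read layout above (a sketch,
 * not part of the original file).  It assumes 'fd' came from the counter
 * open syscall with attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_TOTAL_TIME_RUNNING, so read() returns three u64 values in
 * the order emitted by perf_read_hw().
 */
#if 0	/* never built */
static int read_scaled_count(int fd, unsigned long long *value)
{
	unsigned long long buf[3];	/* count, time_enabled, time_running */

	if (read(fd, buf, sizeof(buf)) != sizeof(buf))
		return -1;

	/*
	 * Scale for the time the counter was scheduled out, e.g. because
	 * the PMU was over-committed.
	 */
	*value = buf[0];
	if (buf[2])
		*value = buf[0] * buf[1] / buf[2];

	return 0;
}
#endif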
1589
1590static ssize_t
1591perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1592{
1593 struct perf_counter *counter = file->private_data;
1594
1595 return perf_read_hw(counter, buf, count);
1596}
1597
1598static unsigned int perf_poll(struct file *file, poll_table *wait)
1599{
1600 struct perf_counter *counter = file->private_data;
1601 struct perf_mmap_data *data;
1602	unsigned int events = POLLHUP;
1603
1604 rcu_read_lock();
1605 data = rcu_dereference(counter->data);
1606 if (data)
1607 events = atomic_xchg(&data->poll, 0);
1608 rcu_read_unlock();
1609
1610 poll_wait(file, &counter->waitq, wait);
1611
1612 return events;
1613}
1614
1615static void perf_counter_reset(struct perf_counter *counter)
1616{
1617 (void)perf_counter_read(counter);
1618 atomic64_set(&counter->count, 0);
1619 perf_counter_update_userpage(counter);
1620}
1621
1622/*
1623 * Holding the top-level counter's child_mutex means that any
1624 * descendant process that has inherited this counter will block
1625 * in sync_child_counter if it goes to exit, thus satisfying the
1626 * task existence requirements of perf_counter_enable/disable.
1627 */
1628static void perf_counter_for_each_child(struct perf_counter *counter,
1629 void (*func)(struct perf_counter *))
1630{
1631 struct perf_counter *child;
1632
1633 WARN_ON_ONCE(counter->ctx->parent_ctx);
1634 mutex_lock(&counter->child_mutex);
1635 func(counter);
1636 list_for_each_entry(child, &counter->child_list, child_list)
1637 func(child);
1638 mutex_unlock(&counter->child_mutex);
1639}
1640
1641static void perf_counter_for_each(struct perf_counter *counter,
1642 void (*func)(struct perf_counter *))
1643{
1644 struct perf_counter_context *ctx = counter->ctx;
1645 struct perf_counter *sibling;
1646
1647 WARN_ON_ONCE(ctx->parent_ctx);
1648 mutex_lock(&ctx->mutex);
1649 counter = counter->group_leader;
1650
1651	perf_counter_for_each_child(counter, func);
1652	/* do the same for every sibling in the group: */
1653	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1654		perf_counter_for_each_child(sibling, func);
1655 mutex_unlock(&ctx->mutex);
1656}
1657
1658static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1659{
1660 struct perf_counter_context *ctx = counter->ctx;
1661 unsigned long size;
1662 int ret = 0;
1663 u64 value;
1664
1665 if (!counter->attr.sample_period)
1666 return -EINVAL;
1667
1668	size = copy_from_user(&value, arg, sizeof(value));
1669	if (size)	/* copy_from_user() returns the number of bytes *not* copied */
1670		return -EFAULT;
1671
1672 if (!value)
1673 return -EINVAL;
1674
1675 spin_lock_irq(&ctx->lock);
1676 if (counter->attr.freq) {
1677 if (value > sysctl_perf_counter_sample_rate) {
1678 ret = -EINVAL;
1679 goto unlock;
1680 }
1681
1682 counter->attr.sample_freq = value;
1683 } else {
1684 perf_log_period(counter, value);
1685
1686 counter->attr.sample_period = value;
1687 counter->hw.sample_period = value;
1688 }
1689unlock:
1690 spin_unlock_irq(&ctx->lock);
1691
1692 return ret;
1693}
1694
1695static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1696{
1697 struct perf_counter *counter = file->private_data;
1698 void (*func)(struct perf_counter *);
1699 u32 flags = arg;
1700
1701 switch (cmd) {
1702 case PERF_COUNTER_IOC_ENABLE:
1703 func = perf_counter_enable;
1704 break;
1705 case PERF_COUNTER_IOC_DISABLE:
1706 func = perf_counter_disable;
1707 break;
1708 case PERF_COUNTER_IOC_RESET:
1709 func = perf_counter_reset;
1710 break;
1711
1712 case PERF_COUNTER_IOC_REFRESH:
1713 return perf_counter_refresh(counter, arg);
1714
1715 case PERF_COUNTER_IOC_PERIOD:
1716 return perf_counter_period(counter, (u64 __user *)arg);
1717
1718 default:
1719 return -ENOTTY;
1720 }
1721
1722 if (flags & PERF_IOC_FLAG_GROUP)
1723 perf_counter_for_each(counter, func);
1724 else
1725 perf_counter_for_each_child(counter, func);
1726
1727 return 0;
1728}
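
/*
 * Illustrative user-space use of the ioctls handled above (a sketch, not
 * part of the original file).  'fd' is an already-open counter file
 * descriptor; passing PERF_IOC_FLAG_GROUP in arg makes ENABLE/DISABLE act
 * on the whole group via perf_counter_for_each().
 */
#if 0	/* never built */
static void toggle_group(int fd)
{
	unsigned long long period = 100000;

	ioctl(fd, PERF_COUNTER_IOC_RESET, 0);
	ioctl(fd, PERF_COUNTER_IOC_PERIOD, &period);	/* pointer to a u64 */
	ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
	/* run the workload being measured */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}
#endif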
1729
1730int perf_counter_task_enable(void)
1731{
1732 struct perf_counter *counter;
1733
1734 mutex_lock(&current->perf_counter_mutex);
1735 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1736 perf_counter_for_each_child(counter, perf_counter_enable);
1737 mutex_unlock(&current->perf_counter_mutex);
1738
1739 return 0;
1740}
1741
1742int perf_counter_task_disable(void)
1743{
1744 struct perf_counter *counter;
1745
1746 mutex_lock(&current->perf_counter_mutex);
1747 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1748 perf_counter_for_each_child(counter, perf_counter_disable);
1749 mutex_unlock(&current->perf_counter_mutex);
1750
1751 return 0;
1752}
1753
1754/*
1755 * Callers need to ensure there can be no nesting of this function, otherwise
1756 * the seqlock logic goes bad. We cannot serialize this because the arch
1757 * code calls this from NMI context.
1758 */
1759void perf_counter_update_userpage(struct perf_counter *counter)
1760{
1761 struct perf_counter_mmap_page *userpg;
1762 struct perf_mmap_data *data;
1763
1764 rcu_read_lock();
1765 data = rcu_dereference(counter->data);
1766 if (!data)
1767 goto unlock;
1768
1769 userpg = data->user_page;
1770
1771 /*
1772	 * Disable preemption so as not to let the corresponding user-space
1773 * spin too long if we get preempted.
1774 */
1775 preempt_disable();
1776 ++userpg->lock;
1777 barrier();
1778 userpg->index = counter->hw.idx;
1779 userpg->offset = atomic64_read(&counter->count);
1780 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1781 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1782
1783 barrier();
1784 ++userpg->lock;
1785 preempt_enable();
1786unlock:
1787 rcu_read_unlock();
1788}
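
/*
 * Illustrative user-space counterpart of the pseudo-seqlock above (a
 * sketch, not part of the original file).  'pc' points at the mmap()ed
 * first page of the counter fd; barrier() stands in for a compiler
 * barrier.  The retry loop mirrors the ++lock / barrier / ++lock sequence
 * in perf_counter_update_userpage().  A real self-monitoring reader would
 * additionally add the live hardware counter value (e.g. via rdpmc on
 * x86) when pc->index says the counter is currently loaded.
 */
#if 0	/* never built */
static unsigned long long
read_userpage_offset(volatile struct perf_counter_mmap_page *pc)
{
	unsigned int seq;
	unsigned long long count;

	do {
		seq = pc->lock;		/* odd while the kernel is updating */
		barrier();

		count = pc->offset;	/* snapshot published by the kernel */

		barrier();
	} while (pc->lock != seq);	/* retry on a concurrent update */

	return count;
}
#endif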
1789
1790static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1791{
1792 struct perf_counter *counter = vma->vm_file->private_data;
1793 struct perf_mmap_data *data;
1794 int ret = VM_FAULT_SIGBUS;
1795
1796 if (vmf->flags & FAULT_FLAG_MKWRITE) {
1797 if (vmf->pgoff == 0)
1798 ret = 0;
1799 return ret;
1800 }
1801
1802 rcu_read_lock();
1803 data = rcu_dereference(counter->data);
1804 if (!data)
1805 goto unlock;
1806
1807 if (vmf->pgoff == 0) {
1808 vmf->page = virt_to_page(data->user_page);
1809 } else {
1810 int nr = vmf->pgoff - 1;
1811
1812		if ((unsigned)nr >= data->nr_pages)
1813 goto unlock;
1814
1815 if (vmf->flags & FAULT_FLAG_WRITE)
1816 goto unlock;
1817
1818 vmf->page = virt_to_page(data->data_pages[nr]);
1819 }
1820
1821 get_page(vmf->page);
1822 vmf->page->mapping = vma->vm_file->f_mapping;
1823 vmf->page->index = vmf->pgoff;
1824
1825 ret = 0;
1826unlock:
1827 rcu_read_unlock();
1828
1829 return ret;
1830}
1831
1832static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1833{
1834 struct perf_mmap_data *data;
1835 unsigned long size;
1836 int i;
1837
1838 WARN_ON(atomic_read(&counter->mmap_count));
1839
1840 size = sizeof(struct perf_mmap_data);
1841 size += nr_pages * sizeof(void *);
1842
1843 data = kzalloc(size, GFP_KERNEL);
1844 if (!data)
1845 goto fail;
1846
1847 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1848 if (!data->user_page)
1849 goto fail_user_page;
1850
1851 for (i = 0; i < nr_pages; i++) {
1852 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1853 if (!data->data_pages[i])
1854 goto fail_data_pages;
1855 }
1856
1857 data->nr_pages = nr_pages;
1858 atomic_set(&data->lock, -1);
1859
1860 rcu_assign_pointer(counter->data, data);
1861
1862 return 0;
1863
1864fail_data_pages:
1865 for (i--; i >= 0; i--)
1866 free_page((unsigned long)data->data_pages[i]);
1867
1868 free_page((unsigned long)data->user_page);
1869
1870fail_user_page:
1871 kfree(data);
1872
1873fail:
1874 return -ENOMEM;
1875}
1876
1877static void perf_mmap_free_page(unsigned long addr)
1878{
1879 struct page *page = virt_to_page(addr);
1880
1881 page->mapping = NULL;
1882 __free_page(page);
1883}
1884
1885static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1886{
1887 struct perf_mmap_data *data;
1888 int i;
1889
1890 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1891
1892 perf_mmap_free_page((unsigned long)data->user_page);
1893 for (i = 0; i < data->nr_pages; i++)
1894 perf_mmap_free_page((unsigned long)data->data_pages[i]);
1895
1896 kfree(data);
1897}
1898
1899static void perf_mmap_data_free(struct perf_counter *counter)
1900{
1901 struct perf_mmap_data *data = counter->data;
1902
1903 WARN_ON(atomic_read(&counter->mmap_count));
1904
1905 rcu_assign_pointer(counter->data, NULL);
1906 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1907}
1908
1909static void perf_mmap_open(struct vm_area_struct *vma)
1910{
1911 struct perf_counter *counter = vma->vm_file->private_data;
1912
1913 atomic_inc(&counter->mmap_count);
1914}
1915
1916static void perf_mmap_close(struct vm_area_struct *vma)
1917{
1918 struct perf_counter *counter = vma->vm_file->private_data;
1919
1920 WARN_ON_ONCE(counter->ctx->parent_ctx);
1921 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
1922 struct user_struct *user = current_user();
1923
1924 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
1925 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1926 perf_mmap_data_free(counter);
1927 mutex_unlock(&counter->mmap_mutex);
1928 }
1929}
1930
1931static struct vm_operations_struct perf_mmap_vmops = {
1932 .open = perf_mmap_open,
1933 .close = perf_mmap_close,
1934 .fault = perf_mmap_fault,
1935 .page_mkwrite = perf_mmap_fault,
1936};
1937
1938static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1939{
1940 struct perf_counter *counter = file->private_data;
1941 unsigned long user_locked, user_lock_limit;
1942 struct user_struct *user = current_user();
1943 unsigned long locked, lock_limit;
1944 unsigned long vma_size;
1945 unsigned long nr_pages;
1946 long user_extra, extra;
1947 int ret = 0;
1948
1949 if (!(vma->vm_flags & VM_SHARED))
1950 return -EINVAL;
1951
1952 vma_size = vma->vm_end - vma->vm_start;
1953 nr_pages = (vma_size / PAGE_SIZE) - 1;
1954
1955 /*
1956 * If we have data pages ensure they're a power-of-two number, so we
1957 * can do bitmasks instead of modulo.
1958 */
1959 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1960 return -EINVAL;
1961
1962 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1963 return -EINVAL;
1964
1965 if (vma->vm_pgoff != 0)
1966 return -EINVAL;
1967
1968 WARN_ON_ONCE(counter->ctx->parent_ctx);
1969 mutex_lock(&counter->mmap_mutex);
1970 if (atomic_inc_not_zero(&counter->mmap_count)) {
1971 if (nr_pages != counter->data->nr_pages)
1972 ret = -EINVAL;
1973 goto unlock;
1974 }
1975
1976 user_extra = nr_pages + 1;
1977 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1978
1979 /*
1980 * Increase the limit linearly with more CPUs:
1981 */
1982 user_lock_limit *= num_online_cpus();
1983
1984 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
1985
1986 extra = 0;
1987 if (user_locked > user_lock_limit)
1988 extra = user_locked - user_lock_limit;
1989
1990 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1991 lock_limit >>= PAGE_SHIFT;
1992 locked = vma->vm_mm->locked_vm + extra;
1993
1994 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1995 ret = -EPERM;
1996 goto unlock;
1997 }
1998
1999 WARN_ON(counter->data);
2000 ret = perf_mmap_data_alloc(counter, nr_pages);
2001 if (ret)
2002 goto unlock;
2003
2004 atomic_set(&counter->mmap_count, 1);
2005 atomic_long_add(user_extra, &user->locked_vm);
2006 vma->vm_mm->locked_vm += extra;
2007 counter->data->nr_locked = extra;
2008 if (vma->vm_flags & VM_WRITE)
2009 counter->data->writable = 1;
2010
2011unlock:
2012 mutex_unlock(&counter->mmap_mutex);
2013
2014 vma->vm_flags |= VM_RESERVED;
2015 vma->vm_ops = &perf_mmap_vmops;
2016
2017 return ret;
2018}
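
/*
 * Illustrative user-space mmap() of a counter fd, matching the checks
 * above (a sketch, not part of the original file).  The mapping must be
 * MAP_SHARED at offset 0 and cover 1 + 2^n pages: one control page
 * (struct perf_counter_mmap_page) followed by a power-of-two data area.
 */
#if 0	/* never built */
static void *map_counter(int fd, int nr_data_pages)
{
	long page_size = sysconf(_SC_PAGESIZE);
	size_t len = (size_t)page_size * (nr_data_pages + 1);
	void *base;

	/* nr_data_pages must be a power of two, the file offset must be 0 */
	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;

	/* page 0: struct perf_counter_mmap_page, pages 1..n: event data */
	return base;
}
#endif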
2019
2020static int perf_fasync(int fd, struct file *filp, int on)
2021{
2022 struct inode *inode = filp->f_path.dentry->d_inode;
2023 struct perf_counter *counter = filp->private_data;
2024 int retval;
2025
2026 mutex_lock(&inode->i_mutex);
2027 retval = fasync_helper(fd, filp, on, &counter->fasync);
2028 mutex_unlock(&inode->i_mutex);
2029
2030 if (retval < 0)
2031 return retval;
2032
2033 return 0;
2034}
2035
2036static const struct file_operations perf_fops = {
2037 .release = perf_release,
2038 .read = perf_read,
2039 .poll = perf_poll,
2040 .unlocked_ioctl = perf_ioctl,
2041 .compat_ioctl = perf_ioctl,
2042 .mmap = perf_mmap,
2043 .fasync = perf_fasync,
2044};
2045
2046/*
2047 * Perf counter wakeup
2048 *
2049 * If there's data, ensure we set the poll() state and publish everything
2050 * to user-space before waking everybody up.
2051 */
2052
2053void perf_counter_wakeup(struct perf_counter *counter)
2054{
2055 wake_up_all(&counter->waitq);
2056
2057 if (counter->pending_kill) {
2058 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2059 counter->pending_kill = 0;
2060 }
2061}
2062
2063/*
2064 * Pending wakeups
2065 *
2066 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2067 *
2068 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2069 * single linked list and use cmpxchg() to add entries lockless.
2070 */
2071
2072static void perf_pending_counter(struct perf_pending_entry *entry)
2073{
2074 struct perf_counter *counter = container_of(entry,
2075 struct perf_counter, pending);
2076
2077 if (counter->pending_disable) {
2078 counter->pending_disable = 0;
2079 perf_counter_disable(counter);
2080 }
2081
2082 if (counter->pending_wakeup) {
2083 counter->pending_wakeup = 0;
2084 perf_counter_wakeup(counter);
2085 }
2086}
2087
2088#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2089
2090static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2091 PENDING_TAIL,
2092};
2093
2094static void perf_pending_queue(struct perf_pending_entry *entry,
2095 void (*func)(struct perf_pending_entry *))
2096{
2097 struct perf_pending_entry **head;
2098
2099 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2100 return;
2101
2102 entry->func = func;
2103
2104 head = &get_cpu_var(perf_pending_head);
2105
2106 do {
2107 entry->next = *head;
2108 } while (cmpxchg(head, entry->next, entry) != entry->next);
2109
2110 set_perf_counter_pending();
2111
2112 put_cpu_var(perf_pending_head);
2113}
2114
2115static int __perf_pending_run(void)
2116{
2117 struct perf_pending_entry *list;
2118 int nr = 0;
2119
2120 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2121 while (list != PENDING_TAIL) {
2122 void (*func)(struct perf_pending_entry *);
2123 struct perf_pending_entry *entry = list;
2124
2125 list = list->next;
2126
2127 func = entry->func;
2128 entry->next = NULL;
2129 /*
2130 * Ensure we observe the unqueue before we issue the wakeup,
2131 * so that we won't be waiting forever.
2132 * -- see perf_not_pending().
2133 */
2134 smp_wmb();
2135
2136 func(entry);
2137 nr++;
2138 }
2139
2140 return nr;
2141}
2142
2143static inline int perf_not_pending(struct perf_counter *counter)
2144{
2145 /*
2146 * If we flush on whatever cpu we run, there is a chance we don't
2147 * need to wait.
2148 */
2149 get_cpu();
2150 __perf_pending_run();
2151 put_cpu();
2152
2153 /*
2154 * Ensure we see the proper queue state before going to sleep
2155	 * so that we do not miss the wakeup. -- see __perf_pending_run()
2156 */
2157 smp_rmb();
2158 return counter->pending.next == NULL;
2159}
2160
2161static void perf_pending_sync(struct perf_counter *counter)
2162{
2163 wait_event(counter->waitq, perf_not_pending(counter));
2164}
2165
2166void perf_counter_do_pending(void)
2167{
2168 __perf_pending_run();
2169}
2170
2171/*
2172 * Callchain support -- arch specific
2173 */
2174
2175__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2176{
2177 return NULL;
2178}
2179
2180/*
2181 * Output
2182 */
2183
2184struct perf_output_handle {
2185 struct perf_counter *counter;
2186 struct perf_mmap_data *data;
2187 unsigned long head;
2188 unsigned long offset;
2189 int nmi;
2190 int sample;
2191 int locked;
2192 unsigned long flags;
2193};
2194
2195static bool perf_output_space(struct perf_mmap_data *data,
2196 unsigned int offset, unsigned int head)
2197{
2198 unsigned long tail;
2199 unsigned long mask;
2200
2201 if (!data->writable)
2202 return true;
2203
2204 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2205 /*
2206	 * Userspace could choose to issue an mb() before updating the tail
2207	 * pointer, so that all reads are completed before the write is
2208	 * issued.
2209 */
2210 tail = ACCESS_ONCE(data->user_page->data_tail);
2211 smp_rmb();
2212
2213 offset = (offset - tail) & mask;
2214 head = (head - tail) & mask;
2215
2216 if ((int)(head - offset) < 0)
2217 return false;
2218
2219 return true;
2220}
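
/*
 * Worked example of the space check above (illustrative only, not part of
 * the original file).  offset, head and tail are free-running byte
 * counters; the masking reduces them to positions inside the 2^n-page
 * data area.
 */
#if 0	/* never built */
	/* 4 data pages of 4096 bytes: mask == 16383 (PAGE_SIZE assumed 4K) */
	unsigned long mask   = (4 << 12) - 1;
	unsigned long tail   = 100000;	/* userspace consumed up to here */
	unsigned int  offset = 115000;	/* writer position before this record */
	unsigned int  head   = 116500;	/* writer position after a 1500 byte record */

	offset = (offset - tail) & mask;	/* 15000 bytes still unread */
	head   = (head   - tail) & mask;	/* 116: wrapped past the unread data */

	/*
	 * (int)(head - offset) == 116 - 15000 < 0: only 1384 of the 16384
	 * bytes are free, so the 1500 byte record does not fit and is
	 * dropped (accounted in data->lost by perf_output_begin()).
	 */
#endif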
2221
2222static void perf_output_wakeup(struct perf_output_handle *handle)
2223{
2224 atomic_set(&handle->data->poll, POLL_IN);
2225
2226 if (handle->nmi) {
2227 handle->counter->pending_wakeup = 1;
2228 perf_pending_queue(&handle->counter->pending,
2229 perf_pending_counter);
2230 } else
2231 perf_counter_wakeup(handle->counter);
2232}
2233
2234/*
2235 * Curious locking construct.
2236 *
2237 * We need to ensure a later event doesn't publish a head when a former
2238 * event isn't done writing. However since we need to deal with NMIs we
2239 * cannot fully serialize things.
2240 *
2241 * What we do is serialize between CPUs so we only have to deal with NMI
2242 * nesting on a single CPU.
2243 *
2244 * We only publish the head (and generate a wakeup) when the outer-most
2245 * event completes.
2246 */
2247static void perf_output_lock(struct perf_output_handle *handle)
2248{
2249 struct perf_mmap_data *data = handle->data;
2250 int cpu;
2251
2252 handle->locked = 0;
2253
2254 local_irq_save(handle->flags);
2255 cpu = smp_processor_id();
2256
2257 if (in_nmi() && atomic_read(&data->lock) == cpu)
2258 return;
2259
2260 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2261 cpu_relax();
2262
2263 handle->locked = 1;
2264}
2265
2266static void perf_output_unlock(struct perf_output_handle *handle)
2267{
2268 struct perf_mmap_data *data = handle->data;
2269 unsigned long head;
2270 int cpu;
2271
2272 data->done_head = data->head;
2273
2274 if (!handle->locked)
2275 goto out;
2276
2277again:
2278 /*
2279 * The xchg implies a full barrier that ensures all writes are done
2280	 * before we publish the new head, matched by an rmb() in userspace when
2281 * reading this position.
2282 */
2283 while ((head = atomic_long_xchg(&data->done_head, 0)))
2284 data->user_page->data_head = head;
2285
2286 /*
2287 * NMI can happen here, which means we can miss a done_head update.
2288 */
2289
2290 cpu = atomic_xchg(&data->lock, -1);
2291 WARN_ON_ONCE(cpu != smp_processor_id());
2292
2293 /*
2294	 * Therefore we have to check that we did not in fact miss one.
2295 */
2296 if (unlikely(atomic_long_read(&data->done_head))) {
2297 /*
2298 * Since we had it locked, we can lock it again.
2299 */
2300 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2301 cpu_relax();
2302
2303 goto again;
2304 }
2305
2306 if (atomic_xchg(&data->wakeup, 0))
2307 perf_output_wakeup(handle);
2308out:
2309 local_irq_restore(handle->flags);
2310}
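
/*
 * Illustrative user-space consumer matching the publishing protocol above
 * (a sketch, not part of the original file).  'pc' is the mmap()ed control
 * page, 'data' the 2^n-page area behind it and 'size' its length in bytes;
 * rmb()/mb() stand in for the architecture's read/full barriers.  The
 * rmb() pairs with the xchg barrier in perf_output_unlock(); the final
 * data_tail store is what perf_output_space() reads to decide whether new
 * records still fit.
 */
#if 0	/* never built */
static unsigned long long
drain_ring(volatile struct perf_counter_mmap_page *pc, void *data,
	   unsigned long long size, unsigned long long tail)
{
	unsigned long long head = pc->data_head;

	rmb();				/* see all record bytes up to data_head */

	while (tail != head) {
		struct perf_event_header *hdr;

		hdr = (struct perf_event_header *)
			((char *)data + (tail & (size - 1)));
		/*
		 * Consume hdr->size bytes according to hdr->type.  A record
		 * may wrap at the end of the data area, so a real consumer
		 * copies it out linearly first.
		 */
		tail += hdr->size;
	}

	mb();				/* finish reading before freeing the space */
	pc->data_tail = tail;

	return tail;
}
#endif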
2311
2312static void perf_output_copy(struct perf_output_handle *handle,
2313 const void *buf, unsigned int len)
2314{
2315 unsigned int pages_mask;
2316 unsigned int offset;
2317 unsigned int size;
2318 void **pages;
2319
2320 offset = handle->offset;
2321 pages_mask = handle->data->nr_pages - 1;
2322 pages = handle->data->data_pages;
2323
2324 do {
2325 unsigned int page_offset;
2326 int nr;
2327
2328 nr = (offset >> PAGE_SHIFT) & pages_mask;
2329 page_offset = offset & (PAGE_SIZE - 1);
2330 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2331
2332 memcpy(pages[nr] + page_offset, buf, size);
2333
2334 len -= size;
2335 buf += size;
2336 offset += size;
2337 } while (len);
2338
2339 handle->offset = offset;
2340
2341 /*
2342 * Check we didn't copy past our reservation window, taking the
2343 * possible unsigned int wrap into account.
2344 */
2345 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2346}
2347
2348#define perf_output_put(handle, x) \
2349 perf_output_copy((handle), &(x), sizeof(x))
2350
2351static int perf_output_begin(struct perf_output_handle *handle,
2352 struct perf_counter *counter, unsigned int size,
2353 int nmi, int sample)
2354{
2355 struct perf_mmap_data *data;
2356 unsigned int offset, head;
2357 int have_lost;
2358 struct {
2359 struct perf_event_header header;
2360 u64 id;
2361 u64 lost;
2362 } lost_event;
2363
2364 /*
2365 * For inherited counters we send all the output towards the parent.
2366 */
2367 if (counter->parent)
2368 counter = counter->parent;
2369
2370 rcu_read_lock();
2371 data = rcu_dereference(counter->data);
2372 if (!data)
2373 goto out;
2374
2375 handle->data = data;
2376 handle->counter = counter;
2377 handle->nmi = nmi;
2378 handle->sample = sample;
2379
2380 if (!data->nr_pages)
2381 goto fail;
2382
2383 have_lost = atomic_read(&data->lost);
2384 if (have_lost)
2385 size += sizeof(lost_event);
2386
2387 perf_output_lock(handle);
2388
2389 do {
2390 offset = head = atomic_long_read(&data->head);
2391 head += size;
2392 if (unlikely(!perf_output_space(data, offset, head)))
2393 goto fail;
2394 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2395
2396 handle->offset = offset;
2397 handle->head = head;
2398
2399 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2400 atomic_set(&data->wakeup, 1);
2401
2402 if (have_lost) {
2403 lost_event.header.type = PERF_EVENT_LOST;
2404 lost_event.header.misc = 0;
2405 lost_event.header.size = sizeof(lost_event);
2406 lost_event.id = counter->id;
2407 lost_event.lost = atomic_xchg(&data->lost, 0);
2408
2409 perf_output_put(handle, lost_event);
2410 }
2411
2412 return 0;
2413
2414fail:
2415 atomic_inc(&data->lost);
2416 perf_output_unlock(handle);
2417out:
2418 rcu_read_unlock();
2419
2420 return -ENOSPC;
2421}
2422
2423static void perf_output_end(struct perf_output_handle *handle)
2424{
2425 struct perf_counter *counter = handle->counter;
2426 struct perf_mmap_data *data = handle->data;
2427
2428 int wakeup_events = counter->attr.wakeup_events;
2429
2430 if (handle->sample && wakeup_events) {
2431 int events = atomic_inc_return(&data->events);
2432 if (events >= wakeup_events) {
2433 atomic_sub(wakeup_events, &data->events);
2434 atomic_set(&data->wakeup, 1);
2435 }
2436 }
2437
2438 perf_output_unlock(handle);
2439 rcu_read_unlock();
2440}
2441
2442static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2443{
2444 /*
2445 * only top level counters have the pid namespace they were created in
2446 */
2447 if (counter->parent)
2448 counter = counter->parent;
2449
2450 return task_tgid_nr_ns(p, counter->ns);
2451}
2452
2453static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2454{
2455 /*
2456 * only top level counters have the pid namespace they were created in
2457 */
2458 if (counter->parent)
2459 counter = counter->parent;
2460
2461 return task_pid_nr_ns(p, counter->ns);
2462}
2463
2464static void perf_counter_output(struct perf_counter *counter, int nmi,
2465 struct perf_sample_data *data)
2466{
2467 int ret;
2468 u64 sample_type = counter->attr.sample_type;
2469 struct perf_output_handle handle;
2470 struct perf_event_header header;
2471 u64 ip;
2472 struct {
2473 u32 pid, tid;
2474 } tid_entry;
2475 struct {
2476 u64 id;
2477 u64 counter;
2478 } group_entry;
2479 struct perf_callchain_entry *callchain = NULL;
2480 int callchain_size = 0;
2481 u64 time;
2482 struct {
2483 u32 cpu, reserved;
2484 } cpu_entry;
2485
2486 header.type = 0;
2487 header.size = sizeof(header);
2488
2489 header.misc = PERF_EVENT_MISC_OVERFLOW;
2490 header.misc |= perf_misc_flags(data->regs);
2491
2492 if (sample_type & PERF_SAMPLE_IP) {
2493 ip = perf_instruction_pointer(data->regs);
2494 header.type |= PERF_SAMPLE_IP;
2495 header.size += sizeof(ip);
2496 }
2497
2498 if (sample_type & PERF_SAMPLE_TID) {
2499 /* namespace issues */
2500 tid_entry.pid = perf_counter_pid(counter, current);
2501 tid_entry.tid = perf_counter_tid(counter, current);
2502
2503 header.type |= PERF_SAMPLE_TID;
2504 header.size += sizeof(tid_entry);
2505 }
2506
2507 if (sample_type & PERF_SAMPLE_TIME) {
2508 /*
2509 * Maybe do better on x86 and provide cpu_clock_nmi()
2510 */
2511 time = sched_clock();
2512
2513 header.type |= PERF_SAMPLE_TIME;
2514 header.size += sizeof(u64);
2515 }
2516
2517 if (sample_type & PERF_SAMPLE_ADDR) {
2518 header.type |= PERF_SAMPLE_ADDR;
2519 header.size += sizeof(u64);
2520 }
2521
2522 if (sample_type & PERF_SAMPLE_ID) {
2523 header.type |= PERF_SAMPLE_ID;
2524 header.size += sizeof(u64);
2525 }
2526
2527 if (sample_type & PERF_SAMPLE_CPU) {
2528 header.type |= PERF_SAMPLE_CPU;
2529 header.size += sizeof(cpu_entry);
2530
2531 cpu_entry.cpu = raw_smp_processor_id();
2532 }
2533
2534 if (sample_type & PERF_SAMPLE_PERIOD) {
2535 header.type |= PERF_SAMPLE_PERIOD;
2536 header.size += sizeof(u64);
2537 }
2538
2539 if (sample_type & PERF_SAMPLE_GROUP) {
2540 header.type |= PERF_SAMPLE_GROUP;
2541 header.size += sizeof(u64) +
2542 counter->nr_siblings * sizeof(group_entry);
2543 }
2544
2545 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2546 callchain = perf_callchain(data->regs);
2547
2548 if (callchain) {
2549 callchain_size = (1 + callchain->nr) * sizeof(u64);
2550
2551 header.type |= PERF_SAMPLE_CALLCHAIN;
2552 header.size += callchain_size;
2553 }
2554 }
2555
2556 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2557 if (ret)
2558 return;
2559
2560 perf_output_put(&handle, header);
2561
2562 if (sample_type & PERF_SAMPLE_IP)
2563 perf_output_put(&handle, ip);
2564
2565 if (sample_type & PERF_SAMPLE_TID)
2566 perf_output_put(&handle, tid_entry);
2567
2568 if (sample_type & PERF_SAMPLE_TIME)
2569 perf_output_put(&handle, time);
2570
2571 if (sample_type & PERF_SAMPLE_ADDR)
2572 perf_output_put(&handle, data->addr);
2573
2574 if (sample_type & PERF_SAMPLE_ID)
2575 perf_output_put(&handle, counter->id);
2576
2577 if (sample_type & PERF_SAMPLE_CPU)
2578 perf_output_put(&handle, cpu_entry);
2579
2580 if (sample_type & PERF_SAMPLE_PERIOD)
2581 perf_output_put(&handle, data->period);
2582
2583 /*
2584 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
2585 */
2586 if (sample_type & PERF_SAMPLE_GROUP) {
2587 struct perf_counter *leader, *sub;
2588 u64 nr = counter->nr_siblings;
2589
2590 perf_output_put(&handle, nr);
2591
2592 leader = counter->group_leader;
2593 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2594 if (sub != counter)
2595 sub->pmu->read(sub);
2596
2597 group_entry.id = sub->id;
2598 group_entry.counter = atomic64_read(&sub->count);
2599
2600 perf_output_put(&handle, group_entry);
2601 }
2602 }
2603
2604 if (callchain)
2605 perf_output_copy(&handle, callchain, callchain_size);
2606
2607 perf_output_end(&handle);
2608}
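
/*
 * Illustrative decode of one overflow sample as laid out above (a sketch,
 * not part of the original file).  'hdr' points at a record whose
 * header.misc has PERF_EVENT_MISC_OVERFLOW set; header.type carries the
 * PERF_SAMPLE_* bitmask and the fields follow in exactly the order they
 * were emitted by perf_counter_output().
 */
#if 0	/* never built */
static void decode_sample(const struct perf_event_header *hdr)
{
	const u64 *p = (const u64 *)(hdr + 1);
	u64 ip = 0, time = 0;
	u32 pid = 0, tid = 0;

	if (hdr->type & PERF_SAMPLE_IP)
		ip = *p++;

	if (hdr->type & PERF_SAMPLE_TID) {
		const u32 *t = (const u32 *)p;

		pid = t[0];
		tid = t[1];
		p++;
	}

	if (hdr->type & PERF_SAMPLE_TIME)
		time = *p++;

	/*
	 * PERF_SAMPLE_ADDR, _ID, _CPU, _PERIOD, _GROUP and _CALLCHAIN
	 * follow in the same order as the header.size accounting above.
	 */
}
#endif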
2609
2610/*
2611 * fork tracking
2612 */
2613
2614struct perf_fork_event {
2615 struct task_struct *task;
2616
2617 struct {
2618 struct perf_event_header header;
2619
2620 u32 pid;
2621 u32 ppid;
2622 } event;
2623};
2624
2625static void perf_counter_fork_output(struct perf_counter *counter,
2626 struct perf_fork_event *fork_event)
2627{
2628 struct perf_output_handle handle;
2629 int size = fork_event->event.header.size;
2630 struct task_struct *task = fork_event->task;
2631 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2632
2633 if (ret)
2634 return;
2635
2636 fork_event->event.pid = perf_counter_pid(counter, task);
2637 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2638
2639 perf_output_put(&handle, fork_event->event);
2640 perf_output_end(&handle);
2641}
2642
2643static int perf_counter_fork_match(struct perf_counter *counter)
2644{
2645 if (counter->attr.comm || counter->attr.mmap)
2646 return 1;
2647
2648 return 0;
2649}
2650
2651static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2652 struct perf_fork_event *fork_event)
2653{
2654 struct perf_counter *counter;
2655
2656 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2657 return;
2658
2659 rcu_read_lock();
2660 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2661 if (perf_counter_fork_match(counter))
2662 perf_counter_fork_output(counter, fork_event);
2663 }
2664 rcu_read_unlock();
2665}
2666
2667static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2668{
2669 struct perf_cpu_context *cpuctx;
2670 struct perf_counter_context *ctx;
2671
2672 cpuctx = &get_cpu_var(perf_cpu_context);
2673 perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
2674 put_cpu_var(perf_cpu_context);
2675
2676 rcu_read_lock();
2677 /*
2678	 * It doesn't really matter which of the child contexts the
2679	 * event ends up in.
2680 */
2681 ctx = rcu_dereference(current->perf_counter_ctxp);
2682 if (ctx)
2683 perf_counter_fork_ctx(ctx, fork_event);
2684 rcu_read_unlock();
2685}
2686
2687void perf_counter_fork(struct task_struct *task)
2688{
2689 struct perf_fork_event fork_event;
2690
2691 if (!atomic_read(&nr_comm_counters) &&
2692 !atomic_read(&nr_mmap_counters))
2693 return;
2694
2695 fork_event = (struct perf_fork_event){
2696 .task = task,
2697 .event = {
2698 .header = {
2699 .type = PERF_EVENT_FORK,
2700 .size = sizeof(fork_event.event),
2701 },
2702 },
2703 };
2704
2705 perf_counter_fork_event(&fork_event);
2706}
2707
2708/*
2709 * comm tracking
2710 */
2711
2712struct perf_comm_event {
2713 struct task_struct *task;
2714 char *comm;
2715 int comm_size;
2716
2717 struct {
2718 struct perf_event_header header;
2719
2720 u32 pid;
2721 u32 tid;
2722 } event;
2723};
2724
2725static void perf_counter_comm_output(struct perf_counter *counter,
2726 struct perf_comm_event *comm_event)
2727{
2728 struct perf_output_handle handle;
2729 int size = comm_event->event.header.size;
2730 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2731
2732 if (ret)
2733 return;
2734
2735 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
2736 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
2737
2738 perf_output_put(&handle, comm_event->event);
2739 perf_output_copy(&handle, comm_event->comm,
2740 comm_event->comm_size);
2741 perf_output_end(&handle);
2742}
2743
2744static int perf_counter_comm_match(struct perf_counter *counter)
2745{
2746 if (counter->attr.comm)
2747 return 1;
2748
2749 return 0;
2750}
2751
2752static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2753 struct perf_comm_event *comm_event)
2754{
2755 struct perf_counter *counter;
2756
2757 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2758 return;
2759
2760 rcu_read_lock();
2761 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2762 if (perf_counter_comm_match(counter))
2763 perf_counter_comm_output(counter, comm_event);
2764 }
2765 rcu_read_unlock();
2766}
2767
2768static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2769{
2770 struct perf_cpu_context *cpuctx;
2771 struct perf_counter_context *ctx;
2772 unsigned int size;
2773 char *comm = comm_event->task->comm;
2774
2775 size = ALIGN(strlen(comm)+1, sizeof(u64));
2776
2777 comm_event->comm = comm;
2778 comm_event->comm_size = size;
2779
2780 comm_event->event.header.size = sizeof(comm_event->event) + size;
2781
2782 cpuctx = &get_cpu_var(perf_cpu_context);
2783 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2784 put_cpu_var(perf_cpu_context);
2785
2786 rcu_read_lock();
2787 /*
2788	 * It doesn't really matter which of the child contexts the
2789	 * event ends up in.
2790 */
2791 ctx = rcu_dereference(current->perf_counter_ctxp);
2792 if (ctx)
2793 perf_counter_comm_ctx(ctx, comm_event);
2794 rcu_read_unlock();
2795}
2796
2797void perf_counter_comm(struct task_struct *task)
2798{
2799 struct perf_comm_event comm_event;
2800
2801 if (!atomic_read(&nr_comm_counters))
2802 return;
2803
2804 comm_event = (struct perf_comm_event){
2805 .task = task,
2806 .event = {
2807 .header = { .type = PERF_EVENT_COMM, },
2808 },
2809 };
2810
2811 perf_counter_comm_event(&comm_event);
2812}
2813
2814/*
2815 * mmap tracking
2816 */
2817
2818struct perf_mmap_event {
2819 struct vm_area_struct *vma;
2820
2821 const char *file_name;
2822 int file_size;
2823
2824 struct {
2825 struct perf_event_header header;
2826
2827 u32 pid;
2828 u32 tid;
2829 u64 start;
2830 u64 len;
2831 u64 pgoff;
2832 } event;
2833};
2834
2835static void perf_counter_mmap_output(struct perf_counter *counter,
2836 struct perf_mmap_event *mmap_event)
2837{
2838 struct perf_output_handle handle;
2839 int size = mmap_event->event.header.size;
2840 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2841
2842 if (ret)
2843 return;
2844
2845 mmap_event->event.pid = perf_counter_pid(counter, current);
2846 mmap_event->event.tid = perf_counter_tid(counter, current);
2847
2848 perf_output_put(&handle, mmap_event->event);
2849 perf_output_copy(&handle, mmap_event->file_name,
2850 mmap_event->file_size);
2851 perf_output_end(&handle);
2852}
2853
2854static int perf_counter_mmap_match(struct perf_counter *counter,
2855 struct perf_mmap_event *mmap_event)
2856{
2857 if (counter->attr.mmap)
2858 return 1;
2859
2860 return 0;
2861}
2862
2863static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2864 struct perf_mmap_event *mmap_event)
2865{
2866 struct perf_counter *counter;
2867
2868 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2869 return;
2870
2871 rcu_read_lock();
2872 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2873 if (perf_counter_mmap_match(counter, mmap_event))
2874 perf_counter_mmap_output(counter, mmap_event);
2875 }
2876 rcu_read_unlock();
2877}
2878
2879static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2880{
2881 struct perf_cpu_context *cpuctx;
2882 struct perf_counter_context *ctx;
2883 struct vm_area_struct *vma = mmap_event->vma;
2884 struct file *file = vma->vm_file;
2885 unsigned int size;
2886 char tmp[16];
2887 char *buf = NULL;
2888 const char *name;
2889
2890 if (file) {
2891 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2892 if (!buf) {
2893 name = strncpy(tmp, "//enomem", sizeof(tmp));
2894 goto got_name;
2895 }
2896 name = d_path(&file->f_path, buf, PATH_MAX);
2897 if (IS_ERR(name)) {
2898 name = strncpy(tmp, "//toolong", sizeof(tmp));
2899 goto got_name;
2900 }
2901 } else {
2902 name = arch_vma_name(mmap_event->vma);
2903 if (name)
2904 goto got_name;
2905
2906 if (!vma->vm_mm) {
2907 name = strncpy(tmp, "[vdso]", sizeof(tmp));
2908 goto got_name;
2909 }
2910
2911 name = strncpy(tmp, "//anon", sizeof(tmp));
2912 goto got_name;
2913 }
2914
2915got_name:
2916 size = ALIGN(strlen(name)+1, sizeof(u64));
2917
2918 mmap_event->file_name = name;
2919 mmap_event->file_size = size;
2920
2921 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2922
2923 cpuctx = &get_cpu_var(perf_cpu_context);
2924 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2925 put_cpu_var(perf_cpu_context);
2926
2927 rcu_read_lock();
2928 /*
2929	 * It doesn't really matter which of the child contexts the
2930	 * event ends up in.
2931 */
2932 ctx = rcu_dereference(current->perf_counter_ctxp);
2933 if (ctx)
2934 perf_counter_mmap_ctx(ctx, mmap_event);
2935 rcu_read_unlock();
2936
2937 kfree(buf);
2938}
2939
2940void __perf_counter_mmap(struct vm_area_struct *vma)
2941{
2942 struct perf_mmap_event mmap_event;
2943
2944 if (!atomic_read(&nr_mmap_counters))
2945 return;
2946
2947 mmap_event = (struct perf_mmap_event){
2948 .vma = vma,
2949 .event = {
2950 .header = { .type = PERF_EVENT_MMAP, },
2951 .start = vma->vm_start,
2952 .len = vma->vm_end - vma->vm_start,
2953 .pgoff = vma->vm_pgoff,
2954 },
2955 };
2956
2957 perf_counter_mmap_event(&mmap_event);
2958}
2959
2960/*
2961 * Log sample_period changes so that analyzing tools can re-normalize the
2962 * event flow.
2963 */
2964
2965struct freq_event {
2966 struct perf_event_header header;
2967 u64 time;
2968 u64 id;
2969 u64 period;
2970};
2971
2972static void perf_log_period(struct perf_counter *counter, u64 period)
2973{
2974 struct perf_output_handle handle;
2975 struct freq_event event;
2976 int ret;
2977
2978 if (counter->hw.sample_period == period)
2979 return;
2980
2981 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2982 return;
2983
2984 event = (struct freq_event) {
2985 .header = {
2986 .type = PERF_EVENT_PERIOD,
2987 .misc = 0,
2988 .size = sizeof(event),
2989 },
2990 .time = sched_clock(),
2991 .id = counter->id,
2992 .period = period,
2993 };
2994
2995 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2996 if (ret)
2997 return;
2998
2999 perf_output_put(&handle, event);
3000 perf_output_end(&handle);
3001}
3002
3003/*
3004 * IRQ throttle logging
3005 */
3006
3007static void perf_log_throttle(struct perf_counter *counter, int enable)
3008{
3009 struct perf_output_handle handle;
3010 int ret;
3011
3012 struct {
3013 struct perf_event_header header;
3014 u64 time;
3015 u64 id;
3016 } throttle_event = {
3017 .header = {
3018			.type = PERF_EVENT_THROTTLE + enable, /* +1 == unthrottle */
3019 .misc = 0,
3020 .size = sizeof(throttle_event),
3021 },
3022 .time = sched_clock(),
3023 .id = counter->id,
3024 };
3025
3026 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3027 if (ret)
3028 return;
3029
3030 perf_output_put(&handle, throttle_event);
3031 perf_output_end(&handle);
3032}
3033
3034/*
3035 * Generic counter overflow handling, sampling.
3036 */
3037
3038int perf_counter_overflow(struct perf_counter *counter, int nmi,
3039 struct perf_sample_data *data)
3040{
3041 int events = atomic_read(&counter->event_limit);
3042 int throttle = counter->pmu->unthrottle != NULL;
3043 struct hw_perf_counter *hwc = &counter->hw;
3044 int ret = 0;
3045
3046 if (!throttle) {
3047 hwc->interrupts++;
3048 } else {
3049 if (hwc->interrupts != MAX_INTERRUPTS) {
3050 hwc->interrupts++;
3051 if (HZ * hwc->interrupts >
3052 (u64)sysctl_perf_counter_sample_rate) {
3053 hwc->interrupts = MAX_INTERRUPTS;
3054 perf_log_throttle(counter, 0);
3055 ret = 1;
3056 }
3057 } else {
3058 /*
3059			 * Keep re-disabling the counter even though we disabled
3060			 * it on the previous pass - just in case we raced with
3061			 * a sched-in and the counter got enabled again:
3062 */
3063 ret = 1;
3064 }
3065 }
3066
3067 if (counter->attr.freq) {
3068 u64 now = sched_clock();
3069 s64 delta = now - hwc->freq_stamp;
3070
3071 hwc->freq_stamp = now;
3072
3073 if (delta > 0 && delta < TICK_NSEC)
3074 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3075 }
3076
3077 /*
3078 * XXX event_limit might not quite work as expected on inherited
3079 * counters
3080 */
3081
3082 counter->pending_kill = POLL_IN;
3083 if (events && atomic_dec_and_test(&counter->event_limit)) {
3084 ret = 1;
3085 counter->pending_kill = POLL_HUP;
3086 if (nmi) {
3087 counter->pending_disable = 1;
3088 perf_pending_queue(&counter->pending,
3089 perf_pending_counter);
3090 } else
3091 perf_counter_disable(counter);
3092 }
3093
3094 perf_counter_output(counter, nmi, data);
3095 return ret;
3096}
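
/*
 * Worked example of the throttle threshold above (illustrative only, not
 * part of the original file; HZ and the sysctl value are assumed).
 */
#if 0	/* never built */
	u64 sample_rate = 100000;	/* assumed sysctl_perf_counter_sample_rate */
	u64 hz = 1000;			/* assumed CONFIG_HZ */
	u64 interrupts = 137;		/* interrupts taken since the last tick */

	if (hz * interrupts > sample_rate) {
		/*
		 * 137000 > 100000: the counter is interrupting faster than
		 * the allowed 100 interrupts per tick, so it is throttled
		 * (hwc->interrupts = MAX_INTERRUPTS) until the next tick,
		 * where perf_ctx_adjust_freq() unthrottles it and widens
		 * the sample period via perf_adjust_period().
		 */
	}
#endif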
3097
3098/*
3099 * Generic software counter infrastructure
3100 */
3101
3102static void perf_swcounter_update(struct perf_counter *counter)
3103{
3104 struct hw_perf_counter *hwc = &counter->hw;
3105 u64 prev, now;
3106 s64 delta;
3107
3108again:
3109 prev = atomic64_read(&hwc->prev_count);
3110 now = atomic64_read(&hwc->count);
3111 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
3112 goto again;
3113
3114 delta = now - prev;
3115
3116 atomic64_add(delta, &counter->count);
3117 atomic64_sub(delta, &hwc->period_left);
3118}
3119
3120static void perf_swcounter_set_period(struct perf_counter *counter)
3121{
3122 struct hw_perf_counter *hwc = &counter->hw;
3123 s64 left = atomic64_read(&hwc->period_left);
3124 s64 period = hwc->sample_period;
3125
3126 if (unlikely(left <= -period)) {
3127 left = period;
3128 atomic64_set(&hwc->period_left, left);
3129 hwc->last_period = period;
3130 }
3131
3132 if (unlikely(left <= 0)) {
3133 left += period;
3134 atomic64_add(period, &hwc->period_left);
3135 hwc->last_period = period;
3136 }
3137
3138 atomic64_set(&hwc->prev_count, -left);
3139 atomic64_set(&hwc->count, -left);
3140}
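
/*
 * Worked example of the period bookkeeping above (illustrative only, not
 * part of the original file).  The software counter counts up from -left
 * so that atomic64_add_negative() in perf_swcounter_add() flips to
 * non-negative exactly when the period expires.
 */
#if 0	/* never built */
	s64 period = 100;		/* hwc->sample_period */
	s64 left = -30;			/* the previous sample overshot by 30 events */

	if (left <= -period)		/* false: not a whole period behind */
		left = period;
	if (left <= 0)			/* true */
		left += period;		/* left == 70 */

	/* hwc->prev_count = hwc->count = -70: the next overflow fires after 70 events */
#endif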
3141
3142static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3143{
3144 enum hrtimer_restart ret = HRTIMER_RESTART;
3145 struct perf_sample_data data;
3146 struct perf_counter *counter;
3147 u64 period;
3148
3149 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3150 counter->pmu->read(counter);
3151
3152 data.addr = 0;
3153 data.regs = get_irq_regs();
3154 /*
3155 * In case we exclude kernel IPs or are somehow not in interrupt
3156 * context, provide the next best thing, the user IP.
3157 */
3158 if ((counter->attr.exclude_kernel || !data.regs) &&
3159 !counter->attr.exclude_user)
3160 data.regs = task_pt_regs(current);
3161
3162 if (data.regs) {
3163 if (perf_counter_overflow(counter, 0, &data))
3164 ret = HRTIMER_NORESTART;
3165 }
3166
3167 period = max_t(u64, 10000, counter->hw.sample_period);
3168 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3169
3170 return ret;
3171}
3172
3173static void perf_swcounter_overflow(struct perf_counter *counter,
3174 int nmi, struct perf_sample_data *data)
3175{
3176 data->period = counter->hw.last_period;
3177
3178 perf_swcounter_update(counter);
3179 perf_swcounter_set_period(counter);
3180 if (perf_counter_overflow(counter, nmi, data))
3181 /* soft-disable the counter */
3182 ;
3183}
3184
3185static int perf_swcounter_is_counting(struct perf_counter *counter)
3186{
3187 struct perf_counter_context *ctx;
3188 unsigned long flags;
3189 int count;
3190
3191 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3192 return 1;
3193
3194 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3195 return 0;
3196
3197 /*
3198 * If the counter is inactive, it could be just because
3199 * its task is scheduled out, or because it's in a group
3200 * which could not go on the PMU. We want to count in
3201 * the first case but not the second. If the context is
3202 * currently active then an inactive software counter must
3203 * be the second case. If it's not currently active then
3204 * we need to know whether the counter was active when the
3205 * context was last active, which we can determine by
3206 * comparing counter->tstamp_stopped with ctx->time.
3207 *
3208 * We are within an RCU read-side critical section,
3209 * which protects the existence of *ctx.
3210 */
3211 ctx = counter->ctx;
3212 spin_lock_irqsave(&ctx->lock, flags);
3213 count = 1;
3214 /* Re-check state now we have the lock */
3215 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
3216 counter->ctx->is_active ||
3217 counter->tstamp_stopped < ctx->time)
3218 count = 0;
3219 spin_unlock_irqrestore(&ctx->lock, flags);
3220 return count;
3221}
3222
3223static int perf_swcounter_match(struct perf_counter *counter,
3224 enum perf_type_id type,
3225 u32 event, struct pt_regs *regs)
3226{
3227 if (!perf_swcounter_is_counting(counter))
3228 return 0;
3229
3230 if (counter->attr.type != type)
3231 return 0;
3232 if (counter->attr.config != event)
3233 return 0;
3234
3235 if (regs) {
3236 if (counter->attr.exclude_user && user_mode(regs))
3237 return 0;
3238
3239 if (counter->attr.exclude_kernel && !user_mode(regs))
3240 return 0;
3241 }
3242
3243 return 1;
3244}
3245
3246static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3247 int nmi, struct perf_sample_data *data)
3248{
3249 int neg = atomic64_add_negative(nr, &counter->hw.count);
3250
3251 if (counter->hw.sample_period && !neg && data->regs)
3252 perf_swcounter_overflow(counter, nmi, data);
3253}
3254
3255static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3256 enum perf_type_id type,
3257 u32 event, u64 nr, int nmi,
3258 struct perf_sample_data *data)
3259{
3260 struct perf_counter *counter;
3261
3262 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3263 return;
3264
3265 rcu_read_lock();
3266 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3267 if (perf_swcounter_match(counter, type, event, data->regs))
3268 perf_swcounter_add(counter, nr, nmi, data);
3269 }
3270 rcu_read_unlock();
3271}
3272
3273static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3274{
3275 if (in_nmi())
3276 return &cpuctx->recursion[3];
3277
3278 if (in_irq())
3279 return &cpuctx->recursion[2];
3280
3281 if (in_softirq())
3282 return &cpuctx->recursion[1];
3283
3284 return &cpuctx->recursion[0];
3285}
3286
3287static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3288 u64 nr, int nmi,
3289 struct perf_sample_data *data)
3290{
3291 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3292 int *recursion = perf_swcounter_recursion_context(cpuctx);
3293 struct perf_counter_context *ctx;
3294
3295 if (*recursion)
3296 goto out;
3297
3298 (*recursion)++;
3299 barrier();
3300
3301 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3302 nr, nmi, data);
3303 rcu_read_lock();
3304 /*
3305 * doesn't really matter which of the child contexts the
 3306	 * event ends up in.
3307 */
3308 ctx = rcu_dereference(current->perf_counter_ctxp);
3309 if (ctx)
3310 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3311 rcu_read_unlock();
3312
3313 barrier();
3314 (*recursion)--;
3315
3316out:
3317 put_cpu_var(perf_cpu_context);
3318}
3319
3320void
3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3322{
3323 struct perf_sample_data data = {
3324 .regs = regs,
3325 .addr = addr,
3326 };
3327
3328 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3329}
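
perf_swcounter_event() above is the entry point other kernel code uses to feed software events into the counter lists walked by do_perf_swcounter_event(). A minimal sketch of a caller follows; the wrapper function is hypothetical and for illustration only, while the argument order and the event id come from the definitions in this file:

/*
 * Hypothetical call site (illustration only): report one minor page
 * fault against the faulting task's context.  nmi == 0 because a fault
 * handler runs in process context; regs/addr describe the fault.
 */
static void example_report_minor_fault(struct pt_regs *regs,
				       unsigned long address)
{
	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address);
}
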
3330
3331static void perf_swcounter_read(struct perf_counter *counter)
3332{
3333 perf_swcounter_update(counter);
3334}
3335
3336static int perf_swcounter_enable(struct perf_counter *counter)
3337{
3338 perf_swcounter_set_period(counter);
3339 return 0;
3340}
3341
3342static void perf_swcounter_disable(struct perf_counter *counter)
3343{
3344 perf_swcounter_update(counter);
3345}
3346
3347static const struct pmu perf_ops_generic = {
3348 .enable = perf_swcounter_enable,
3349 .disable = perf_swcounter_disable,
3350 .read = perf_swcounter_read,
3351};
3352
3353/*
3354 * Software counter: cpu wall time clock
3355 */
3356
3357static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3358{
3359 int cpu = raw_smp_processor_id();
3360 s64 prev;
3361 u64 now;
3362
3363 now = cpu_clock(cpu);
3364 prev = atomic64_read(&counter->hw.prev_count);
3365 atomic64_set(&counter->hw.prev_count, now);
3366 atomic64_add(now - prev, &counter->count);
3367}
3368
3369static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3370{
3371 struct hw_perf_counter *hwc = &counter->hw;
3372 int cpu = raw_smp_processor_id();
3373
3374 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3375 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3376 hwc->hrtimer.function = perf_swcounter_hrtimer;
3377 if (hwc->sample_period) {
3378 u64 period = max_t(u64, 10000, hwc->sample_period);
3379 __hrtimer_start_range_ns(&hwc->hrtimer,
3380 ns_to_ktime(period), 0,
3381 HRTIMER_MODE_REL, 0);
3382 }
3383
3384 return 0;
3385}
3386
3387static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3388{
3389 if (counter->hw.sample_period)
3390 hrtimer_cancel(&counter->hw.hrtimer);
3391 cpu_clock_perf_counter_update(counter);
3392}
3393
3394static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3395{
3396 cpu_clock_perf_counter_update(counter);
3397}
3398
3399static const struct pmu perf_ops_cpu_clock = {
3400 .enable = cpu_clock_perf_counter_enable,
3401 .disable = cpu_clock_perf_counter_disable,
3402 .read = cpu_clock_perf_counter_read,
3403};
3404
3405/*
3406 * Software counter: task time clock
3407 */
3408
3409static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3410{
3411 u64 prev;
3412 s64 delta;
3413
3414 prev = atomic64_xchg(&counter->hw.prev_count, now);
3415 delta = now - prev;
3416 atomic64_add(delta, &counter->count);
3417}
3418
3419static int task_clock_perf_counter_enable(struct perf_counter *counter)
3420{
3421 struct hw_perf_counter *hwc = &counter->hw;
3422 u64 now;
3423
3424 now = counter->ctx->time;
3425
3426 atomic64_set(&hwc->prev_count, now);
3427 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3428 hwc->hrtimer.function = perf_swcounter_hrtimer;
3429 if (hwc->sample_period) {
3430 u64 period = max_t(u64, 10000, hwc->sample_period);
3431 __hrtimer_start_range_ns(&hwc->hrtimer,
3432 ns_to_ktime(period), 0,
3433 HRTIMER_MODE_REL, 0);
3434 }
3435
3436 return 0;
3437}
3438
3439static void task_clock_perf_counter_disable(struct perf_counter *counter)
3440{
3441 if (counter->hw.sample_period)
3442 hrtimer_cancel(&counter->hw.hrtimer);
3443 task_clock_perf_counter_update(counter, counter->ctx->time);
3444
3445}
3446
3447static void task_clock_perf_counter_read(struct perf_counter *counter)
3448{
3449 u64 time;
3450
3451 if (!in_nmi()) {
3452 update_context_time(counter->ctx);
3453 time = counter->ctx->time;
3454 } else {
3455 u64 now = perf_clock();
3456 u64 delta = now - counter->ctx->timestamp;
3457 time = counter->ctx->time + delta;
3458 }
3459
3460 task_clock_perf_counter_update(counter, time);
3461}
3462
3463static const struct pmu perf_ops_task_clock = {
3464 .enable = task_clock_perf_counter_enable,
3465 .disable = task_clock_perf_counter_disable,
3466 .read = task_clock_perf_counter_read,
3467};
3468
3469#ifdef CONFIG_EVENT_PROFILE
3470void perf_tpcounter_event(int event_id)
3471{
3472 struct perf_sample_data data = {
 3473		.regs = get_irq_regs(),
3474 .addr = 0,
3475 };
3476
3477 if (!data.regs)
3478 data.regs = task_pt_regs(current);
3479
3480 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
3481}
3482EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3483
3484extern int ftrace_profile_enable(int);
3485extern void ftrace_profile_disable(int);
3486
3487static void tp_perf_counter_destroy(struct perf_counter *counter)
3488{
3489 ftrace_profile_disable(perf_event_id(&counter->attr));
3490}
3491
3492static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3493{
3494 int event_id = perf_event_id(&counter->attr);
3495 int ret;
3496
3497 ret = ftrace_profile_enable(event_id);
3498 if (ret)
3499 return NULL;
3500
3501 counter->destroy = tp_perf_counter_destroy;
3502
3503 return &perf_ops_generic;
3504}
3505#else
3506static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3507{
3508 return NULL;
3509}
3510#endif
3511
3512static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3513{
3514 const struct pmu *pmu = NULL;
3515
3516 /*
3517 * Software counters (currently) can't in general distinguish
3518 * between user, kernel and hypervisor events.
3519 * However, context switches and cpu migrations are considered
3520 * to be kernel events, and page faults are never hypervisor
3521 * events.
3522 */
3523 switch (counter->attr.config) {
3524 case PERF_COUNT_SW_CPU_CLOCK:
3525 pmu = &perf_ops_cpu_clock;
3526
3527 break;
3528 case PERF_COUNT_SW_TASK_CLOCK:
3529 /*
3530 * If the user instantiates this as a per-cpu counter,
3531 * use the cpu_clock counter instead.
3532 */
3533 if (counter->ctx->task)
3534 pmu = &perf_ops_task_clock;
3535 else
3536 pmu = &perf_ops_cpu_clock;
3537
3538 break;
3539 case PERF_COUNT_SW_PAGE_FAULTS:
3540 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3541 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3542 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3543 case PERF_COUNT_SW_CPU_MIGRATIONS:
3544 pmu = &perf_ops_generic;
3545 break;
3546 }
3547
3548 return pmu;
3549}
3550
3551/*
3552 * Allocate and initialize a counter structure
3553 */
3554static struct perf_counter *
3555perf_counter_alloc(struct perf_counter_attr *attr,
3556 int cpu,
3557 struct perf_counter_context *ctx,
3558 struct perf_counter *group_leader,
3559 gfp_t gfpflags)
3560{
3561 const struct pmu *pmu;
3562 struct perf_counter *counter;
3563 struct hw_perf_counter *hwc;
3564 long err;
3565
3566 counter = kzalloc(sizeof(*counter), gfpflags);
3567 if (!counter)
3568 return ERR_PTR(-ENOMEM);
3569
3570 /*
3571 * Single counters are their own group leaders, with an
3572 * empty sibling list:
3573 */
3574 if (!group_leader)
3575 group_leader = counter;
3576
3577 mutex_init(&counter->child_mutex);
3578 INIT_LIST_HEAD(&counter->child_list);
3579
3580 INIT_LIST_HEAD(&counter->list_entry);
3581 INIT_LIST_HEAD(&counter->event_entry);
3582 INIT_LIST_HEAD(&counter->sibling_list);
3583 init_waitqueue_head(&counter->waitq);
3584
3585 mutex_init(&counter->mmap_mutex);
3586
3587 counter->cpu = cpu;
3588 counter->attr = *attr;
3589 counter->group_leader = group_leader;
3590 counter->pmu = NULL;
3591 counter->ctx = ctx;
3592 counter->oncpu = -1;
3593
3594 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3595 counter->id = atomic64_inc_return(&perf_counter_id);
3596
3597 counter->state = PERF_COUNTER_STATE_INACTIVE;
3598
3599 if (attr->disabled)
3600 counter->state = PERF_COUNTER_STATE_OFF;
3601
3602 pmu = NULL;
3603
3604 hwc = &counter->hw;
3605 hwc->sample_period = attr->sample_period;
3606 if (attr->freq && attr->sample_freq)
3607 hwc->sample_period = 1;
3608
3609 atomic64_set(&hwc->period_left, hwc->sample_period);
3610
3611 /*
3612 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
3613 */
3614 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
3615 goto done;
3616
3617 switch (attr->type) {
3618 case PERF_TYPE_RAW:
3619 case PERF_TYPE_HARDWARE:
3620 case PERF_TYPE_HW_CACHE:
3621 pmu = hw_perf_counter_init(counter);
3622 break;
3623
3624 case PERF_TYPE_SOFTWARE:
3625 pmu = sw_perf_counter_init(counter);
3626 break;
3627
3628 case PERF_TYPE_TRACEPOINT:
3629 pmu = tp_perf_counter_init(counter);
3630 break;
3631
3632 default:
3633 break;
3634 }
3635done:
3636 err = 0;
3637 if (!pmu)
3638 err = -EINVAL;
3639 else if (IS_ERR(pmu))
3640 err = PTR_ERR(pmu);
3641
3642 if (err) {
3643 if (counter->ns)
3644 put_pid_ns(counter->ns);
3645 kfree(counter);
3646 return ERR_PTR(err);
3647 }
3648
3649 counter->pmu = pmu;
3650
3651 atomic_inc(&nr_counters);
3652 if (counter->attr.mmap)
3653 atomic_inc(&nr_mmap_counters);
3654 if (counter->attr.comm)
3655 atomic_inc(&nr_comm_counters);
3656
3657 return counter;
3658}
3659
3660static int perf_copy_attr(struct perf_counter_attr __user *uattr,
3661 struct perf_counter_attr *attr)
3662{
3663 int ret;
3664 u32 size;
3665
3666 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
3667 return -EFAULT;
3668
3669 /*
3670 * zero the full structure, so that a short copy will be nice.
3671 */
3672 memset(attr, 0, sizeof(*attr));
3673
3674 ret = get_user(size, &uattr->size);
3675 if (ret)
3676 return ret;
3677
3678 if (size > PAGE_SIZE) /* silly large */
3679 goto err_size;
3680
3681 if (!size) /* abi compat */
3682 size = PERF_ATTR_SIZE_VER0;
3683
3684 if (size < PERF_ATTR_SIZE_VER0)
3685 goto err_size;
3686
3687 /*
3688 * If we're handed a bigger struct than we know of,
3689 * ensure all the unknown bits are 0.
3690 */
3691 if (size > sizeof(*attr)) {
3692 unsigned long val;
3693 unsigned long __user *addr;
3694 unsigned long __user *end;
3695
3696 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
3697 sizeof(unsigned long));
3698 end = PTR_ALIGN((void __user *)uattr + size,
3699 sizeof(unsigned long));
3700
3701 for (; addr < end; addr += sizeof(unsigned long)) {
3702 ret = get_user(val, addr);
3703 if (ret)
3704 return ret;
3705 if (val)
3706 goto err_size;
3707 }
3708 }
3709
3710 ret = copy_from_user(attr, uattr, size);
3711 if (ret)
3712 return -EFAULT;
3713
3714 /*
3715 * If the type exists, the corresponding creation will verify
3716 * the attr->config.
3717 */
3718 if (attr->type >= PERF_TYPE_MAX)
3719 return -EINVAL;
3720
3721 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
3722 return -EINVAL;
3723
3724 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
3725 return -EINVAL;
3726
3727 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
3728 return -EINVAL;
3729
3730out:
3731 return ret;
3732
3733err_size:
3734 put_user(sizeof(*attr), &uattr->size);
3735 ret = -E2BIG;
3736 goto out;
3737}
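
perf_copy_attr() above is what makes the attr structure forward- and backward-compatible: userspace states the size it was built against, and any tail bytes beyond what this kernel knows about must be zero. The following userspace-side sketch is illustrative only; the helper name is made up, while the field and constant names follow <linux/perf_counter.h> as merged here:

/*
 * Illustrative userspace sketch (not part of this file).  Zeroing the
 * whole structure first is what keeps unknown tail bytes at 0, and
 * attr->size tells perf_copy_attr() which ABI revision we were built
 * against.
 */
#include <string.h>
#include <linux/perf_counter.h>

static void example_fill_attr(struct perf_counter_attr *attr)
{
	memset(attr, 0, sizeof(*attr));		/* short copies stay zero-padded */
	attr->size = sizeof(*attr);		/* our view of the ABI */
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->disabled = 1;			/* enable later via the fd */
}
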
3738
3739/**
3740 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3741 *
3742 * @attr_uptr: event type attributes for monitoring/sampling
3743 * @pid: target pid
3744 * @cpu: target cpu
3745 * @group_fd: group leader counter fd
3746 */
3747SYSCALL_DEFINE5(perf_counter_open,
3748 struct perf_counter_attr __user *, attr_uptr,
3749 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3750{
3751 struct perf_counter *counter, *group_leader;
3752 struct perf_counter_attr attr;
3753 struct perf_counter_context *ctx;
3754 struct file *counter_file = NULL;
3755 struct file *group_file = NULL;
3756 int fput_needed = 0;
3757 int fput_needed2 = 0;
3758 int ret;
3759
3760 /* for future expandability... */
3761 if (flags)
3762 return -EINVAL;
3763
3764 ret = perf_copy_attr(attr_uptr, &attr);
3765 if (ret)
3766 return ret;
3767
3768 if (!attr.exclude_kernel) {
3769 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
3770 return -EACCES;
3771 }
3772
3773 if (attr.freq) {
3774 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
3775 return -EINVAL;
3776 }
3777
3778 /*
3779 * Get the target context (task or percpu):
3780 */
3781 ctx = find_get_context(pid, cpu);
3782 if (IS_ERR(ctx))
3783 return PTR_ERR(ctx);
3784
3785 /*
3786 * Look up the group leader (we will attach this counter to it):
3787 */
3788 group_leader = NULL;
3789 if (group_fd != -1) {
3790 ret = -EINVAL;
3791 group_file = fget_light(group_fd, &fput_needed);
3792 if (!group_file)
3793 goto err_put_context;
3794 if (group_file->f_op != &perf_fops)
3795 goto err_put_context;
3796
3797 group_leader = group_file->private_data;
3798 /*
3799 * Do not allow a recursive hierarchy (this new sibling
3800 * becoming part of another group-sibling):
3801 */
3802 if (group_leader->group_leader != group_leader)
3803 goto err_put_context;
3804 /*
 3805	 * Do not allow attaching to a group in a different
3806 * task or CPU context:
3807 */
3808 if (group_leader->ctx != ctx)
3809 goto err_put_context;
3810 /*
3811 * Only a group leader can be exclusive or pinned
3812 */
3813 if (attr.exclusive || attr.pinned)
3814 goto err_put_context;
3815 }
3816
3817 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3818 GFP_KERNEL);
3819 ret = PTR_ERR(counter);
3820 if (IS_ERR(counter))
3821 goto err_put_context;
3822
3823 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3824 if (ret < 0)
3825 goto err_free_put_context;
3826
3827 counter_file = fget_light(ret, &fput_needed2);
3828 if (!counter_file)
3829 goto err_free_put_context;
3830
3831 counter->filp = counter_file;
3832 WARN_ON_ONCE(ctx->parent_ctx);
3833 mutex_lock(&ctx->mutex);
3834 perf_install_in_context(ctx, counter, cpu);
3835 ++ctx->generation;
3836 mutex_unlock(&ctx->mutex);
3837
3838 counter->owner = current;
3839 get_task_struct(current);
3840 mutex_lock(&current->perf_counter_mutex);
3841 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
3842 mutex_unlock(&current->perf_counter_mutex);
3843
3844 fput_light(counter_file, fput_needed2);
3845
3846out_fput:
3847 fput_light(group_file, fput_needed);
3848
3849 return ret;
3850
3851err_free_put_context:
3852 kfree(counter);
3853
3854err_put_context:
3855 put_ctx(ctx);
3856
3857 goto out_fput;
3858}
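
Taken together with the attr handling above, invoking the syscall from userspace reduces to a single syscall(2) wrapper. The sketch below is illustrative only and assumes the architecture defines __NR_perf_counter_open:

/*
 * Illustrative userspace wrapper (not part of this file).  Assumes
 * __NR_perf_counter_open is wired up in <asm/unistd.h>.
 */
#include <unistd.h>
#include <sys/syscall.h>

static int example_perf_counter_open(struct perf_counter_attr *attr,
				     pid_t pid, int cpu, int group_fd,
				     unsigned long flags)
{
	return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
}

/*
 * Typical call: fd = example_perf_counter_open(&attr, 0, -1, -1, 0);
 * pid == 0 selects the current task, cpu == -1 means any CPU, and
 * group_fd == -1 creates a new group leader; flags must currently be 0.
 */
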
3859
3860/*
3861 * inherit a counter from parent task to child task:
3862 */
3863static struct perf_counter *
3864inherit_counter(struct perf_counter *parent_counter,
3865 struct task_struct *parent,
3866 struct perf_counter_context *parent_ctx,
3867 struct task_struct *child,
3868 struct perf_counter *group_leader,
3869 struct perf_counter_context *child_ctx)
3870{
3871 struct perf_counter *child_counter;
3872
3873 /*
3874 * Instead of creating recursive hierarchies of counters,
3875 * we link inherited counters back to the original parent,
3876 * which has a filp for sure, which we use as the reference
3877 * count:
3878 */
3879 if (parent_counter->parent)
3880 parent_counter = parent_counter->parent;
3881
3882 child_counter = perf_counter_alloc(&parent_counter->attr,
3883 parent_counter->cpu, child_ctx,
3884 group_leader, GFP_KERNEL);
3885 if (IS_ERR(child_counter))
3886 return child_counter;
3887 get_ctx(child_ctx);
3888
3889 /*
3890 * Make the child state follow the state of the parent counter,
3891 * not its attr.disabled bit. We hold the parent's mutex,
3892 * so we won't race with perf_counter_{en, dis}able_family.
3893 */
3894 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3895 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3896 else
3897 child_counter->state = PERF_COUNTER_STATE_OFF;
3898
3899 if (parent_counter->attr.freq)
3900 child_counter->hw.sample_period = parent_counter->hw.sample_period;
3901
3902 /*
3903 * Link it up in the child's context:
3904 */
3905 add_counter_to_ctx(child_counter, child_ctx);
3906
3907 child_counter->parent = parent_counter;
3908 /*
3909 * inherit into child's child as well:
3910 */
3911 child_counter->attr.inherit = 1;
3912
3913 /*
3914 * Get a reference to the parent filp - we will fput it
3915 * when the child counter exits. This is safe to do because
3916 * we are in the parent and we know that the filp still
3917 * exists and has a nonzero count:
3918 */
3919 atomic_long_inc(&parent_counter->filp->f_count);
3920
3921 /*
3922 * Link this into the parent counter's child list
3923 */
3924 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3925 mutex_lock(&parent_counter->child_mutex);
3926 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3927 mutex_unlock(&parent_counter->child_mutex);
3928
3929 return child_counter;
3930}
3931
3932static int inherit_group(struct perf_counter *parent_counter,
3933 struct task_struct *parent,
3934 struct perf_counter_context *parent_ctx,
3935 struct task_struct *child,
3936 struct perf_counter_context *child_ctx)
3937{
3938 struct perf_counter *leader;
3939 struct perf_counter *sub;
3940 struct perf_counter *child_ctr;
3941
3942 leader = inherit_counter(parent_counter, parent, parent_ctx,
3943 child, NULL, child_ctx);
3944 if (IS_ERR(leader))
3945 return PTR_ERR(leader);
3946 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3947 child_ctr = inherit_counter(sub, parent, parent_ctx,
3948 child, leader, child_ctx);
3949 if (IS_ERR(child_ctr))
3950 return PTR_ERR(child_ctr);
3951 }
3952 return 0;
3953}
3954
3955static void sync_child_counter(struct perf_counter *child_counter,
3956 struct perf_counter *parent_counter)
3957{
3958 u64 child_val;
3959
3960 child_val = atomic64_read(&child_counter->count);
3961
3962 /*
3963 * Add back the child's count to the parent's count:
3964 */
3965 atomic64_add(child_val, &parent_counter->count);
3966 atomic64_add(child_counter->total_time_enabled,
3967 &parent_counter->child_total_time_enabled);
3968 atomic64_add(child_counter->total_time_running,
3969 &parent_counter->child_total_time_running);
3970
3971 /*
3972 * Remove this counter from the parent's list
3973 */
3974 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3975 mutex_lock(&parent_counter->child_mutex);
3976 list_del_init(&child_counter->child_list);
3977 mutex_unlock(&parent_counter->child_mutex);
3978
3979 /*
3980 * Release the parent counter, if this was the last
3981 * reference to it.
3982 */
3983 fput(parent_counter->filp);
3984}
3985
3986static void
3987__perf_counter_exit_task(struct perf_counter *child_counter,
3988 struct perf_counter_context *child_ctx)
3989{
3990 struct perf_counter *parent_counter;
3991
3992 update_counter_times(child_counter);
3993 perf_counter_remove_from_context(child_counter);
3994
3995 parent_counter = child_counter->parent;
3996 /*
3997 * It can happen that parent exits first, and has counters
3998 * that are still around due to the child reference. These
3999 * counters need to be zapped - but otherwise linger.
4000 */
4001 if (parent_counter) {
4002 sync_child_counter(child_counter, parent_counter);
4003 free_counter(child_counter);
4004 }
4005}
4006
4007/*
4008 * When a child task exits, feed back counter values to parent counters.
4009 */
4010void perf_counter_exit_task(struct task_struct *child)
4011{
4012 struct perf_counter *child_counter, *tmp;
4013 struct perf_counter_context *child_ctx;
4014 unsigned long flags;
4015
4016 if (likely(!child->perf_counter_ctxp))
4017 return;
4018
4019 local_irq_save(flags);
4020 /*
4021 * We can't reschedule here because interrupts are disabled,
4022 * and either child is current or it is a task that can't be
4023 * scheduled, so we are now safe from rescheduling changing
4024 * our context.
4025 */
4026 child_ctx = child->perf_counter_ctxp;
4027 __perf_counter_task_sched_out(child_ctx);
4028
4029 /*
4030 * Take the context lock here so that if find_get_context is
4031 * reading child->perf_counter_ctxp, we wait until it has
4032 * incremented the context's refcount before we do put_ctx below.
4033 */
4034 spin_lock(&child_ctx->lock);
4035 child->perf_counter_ctxp = NULL;
4036 if (child_ctx->parent_ctx) {
4037 /*
4038 * This context is a clone; unclone it so it can't get
4039 * swapped to another process while we're removing all
4040 * the counters from it.
4041 */
4042 put_ctx(child_ctx->parent_ctx);
4043 child_ctx->parent_ctx = NULL;
4044 }
4045 spin_unlock(&child_ctx->lock);
4046 local_irq_restore(flags);
4047
4048 /*
4049 * We can recurse on the same lock type through:
4050 *
4051 * __perf_counter_exit_task()
4052 * sync_child_counter()
4053 * fput(parent_counter->filp)
4054 * perf_release()
4055 * mutex_lock(&ctx->mutex)
4056 *
 4057 * But since it's the parent context it won't be the same instance.
4058 */
4059 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4060
4061again:
4062 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4063 list_entry)
4064 __perf_counter_exit_task(child_counter, child_ctx);
4065
4066 /*
4067 * If the last counter was a group counter, it will have appended all
4068 * its siblings to the list, but we obtained 'tmp' before that which
4069 * will still point to the list head terminating the iteration.
4070 */
4071 if (!list_empty(&child_ctx->counter_list))
4072 goto again;
4073
4074 mutex_unlock(&child_ctx->mutex);
4075
4076 put_ctx(child_ctx);
4077}
4078
4079/*
4080 * free an unexposed, unused context as created by inheritance by
 4081 * perf_counter_init_task() below, used by fork() in case of failure.
4082 */
4083void perf_counter_free_task(struct task_struct *task)
4084{
4085 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4086 struct perf_counter *counter, *tmp;
4087
4088 if (!ctx)
4089 return;
4090
4091 mutex_lock(&ctx->mutex);
4092again:
4093 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4094 struct perf_counter *parent = counter->parent;
4095
4096 if (WARN_ON_ONCE(!parent))
4097 continue;
4098
4099 mutex_lock(&parent->child_mutex);
4100 list_del_init(&counter->child_list);
4101 mutex_unlock(&parent->child_mutex);
4102
4103 fput(parent->filp);
4104
4105 list_del_counter(counter, ctx);
4106 free_counter(counter);
4107 }
4108
4109 if (!list_empty(&ctx->counter_list))
4110 goto again;
4111
4112 mutex_unlock(&ctx->mutex);
4113
4114 put_ctx(ctx);
4115}
4116
4117/*
4118 * Initialize the perf_counter context in task_struct
4119 */
4120int perf_counter_init_task(struct task_struct *child)
4121{
4122 struct perf_counter_context *child_ctx, *parent_ctx;
4123 struct perf_counter_context *cloned_ctx;
4124 struct perf_counter *counter;
4125 struct task_struct *parent = current;
4126 int inherited_all = 1;
4127 int ret = 0;
4128
4129 child->perf_counter_ctxp = NULL;
4130
4131 mutex_init(&child->perf_counter_mutex);
4132 INIT_LIST_HEAD(&child->perf_counter_list);
4133
4134 if (likely(!parent->perf_counter_ctxp))
4135 return 0;
4136
4137 /*
4138 * This is executed from the parent task context, so inherit
4139 * counters that have been marked for cloning.
4140 * First allocate and initialize a context for the child.
4141 */
4142
4143 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4144 if (!child_ctx)
4145 return -ENOMEM;
4146
4147 __perf_counter_init_context(child_ctx, child);
4148 child->perf_counter_ctxp = child_ctx;
4149 get_task_struct(child);
4150
4151 /*
4152 * If the parent's context is a clone, pin it so it won't get
4153 * swapped under us.
4154 */
4155 parent_ctx = perf_pin_task_context(parent);
4156
4157 /*
4158 * No need to check if parent_ctx != NULL here; since we saw
4159 * it non-NULL earlier, the only reason for it to become NULL
4160 * is if we exit, and since we're currently in the middle of
4161 * a fork we can't be exiting at the same time.
4162 */
4163
4164 /*
4165 * Lock the parent list. No need to lock the child - not PID
4166 * hashed yet and not running, so nobody can access it.
4167 */
4168 mutex_lock(&parent_ctx->mutex);
4169
4170 /*
 4171	 * We don't have to disable NMIs - we are only looking at
4172 * the list, not manipulating it:
4173 */
4174 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4175 if (counter != counter->group_leader)
4176 continue;
4177
4178 if (!counter->attr.inherit) {
4179 inherited_all = 0;
4180 continue;
4181 }
4182
4183 ret = inherit_group(counter, parent, parent_ctx,
4184 child, child_ctx);
4185 if (ret) {
4186 inherited_all = 0;
4187 break;
4188 }
4189 }
4190
4191 if (inherited_all) {
4192 /*
4193 * Mark the child context as a clone of the parent
4194 * context, or of whatever the parent is a clone of.
4195 * Note that if the parent is a clone, it could get
4196 * uncloned at any point, but that doesn't matter
4197 * because the list of counters and the generation
4198 * count can't have changed since we took the mutex.
4199 */
4200 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4201 if (cloned_ctx) {
4202 child_ctx->parent_ctx = cloned_ctx;
4203 child_ctx->parent_gen = parent_ctx->parent_gen;
4204 } else {
4205 child_ctx->parent_ctx = parent_ctx;
4206 child_ctx->parent_gen = parent_ctx->generation;
4207 }
4208 get_ctx(child_ctx->parent_ctx);
4209 }
4210
4211 mutex_unlock(&parent_ctx->mutex);
4212
4213 perf_unpin_context(parent_ctx);
4214
4215 return ret;
4216}
4217
4218static void __cpuinit perf_counter_init_cpu(int cpu)
4219{
4220 struct perf_cpu_context *cpuctx;
4221
4222 cpuctx = &per_cpu(perf_cpu_context, cpu);
4223 __perf_counter_init_context(&cpuctx->ctx, NULL);
4224
4225 spin_lock(&perf_resource_lock);
4226 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4227 spin_unlock(&perf_resource_lock);
4228
4229 hw_perf_counter_setup(cpu);
4230}
4231
4232#ifdef CONFIG_HOTPLUG_CPU
4233static void __perf_counter_exit_cpu(void *info)
4234{
4235 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4236 struct perf_counter_context *ctx = &cpuctx->ctx;
4237 struct perf_counter *counter, *tmp;
4238
4239 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4240 __perf_counter_remove_from_context(counter);
4241}
4242static void perf_counter_exit_cpu(int cpu)
4243{
4244 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4245 struct perf_counter_context *ctx = &cpuctx->ctx;
4246
4247 mutex_lock(&ctx->mutex);
4248 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4249 mutex_unlock(&ctx->mutex);
4250}
4251#else
4252static inline void perf_counter_exit_cpu(int cpu) { }
4253#endif
4254
4255static int __cpuinit
4256perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4257{
4258 unsigned int cpu = (long)hcpu;
4259
4260 switch (action) {
4261
4262 case CPU_UP_PREPARE:
4263 case CPU_UP_PREPARE_FROZEN:
4264 perf_counter_init_cpu(cpu);
4265 break;
4266
4267 case CPU_DOWN_PREPARE:
4268 case CPU_DOWN_PREPARE_FROZEN:
4269 perf_counter_exit_cpu(cpu);
4270 break;
4271
4272 default:
4273 break;
4274 }
4275
4276 return NOTIFY_OK;
4277}
4278
4279/*
4280 * This has to have a higher priority than migration_notifier in sched.c.
4281 */
4282static struct notifier_block __cpuinitdata perf_cpu_nb = {
4283 .notifier_call = perf_cpu_notify,
4284 .priority = 20,
4285};
4286
4287void __init perf_counter_init(void)
4288{
4289 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4290 (void *)(long)smp_processor_id());
4291 register_cpu_notifier(&perf_cpu_nb);
4292}
4293
4294static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4295{
4296 return sprintf(buf, "%d\n", perf_reserved_percpu);
4297}
4298
4299static ssize_t
4300perf_set_reserve_percpu(struct sysdev_class *class,
4301 const char *buf,
4302 size_t count)
4303{
4304 struct perf_cpu_context *cpuctx;
4305 unsigned long val;
4306 int err, cpu, mpt;
4307
4308 err = strict_strtoul(buf, 10, &val);
4309 if (err)
4310 return err;
4311 if (val > perf_max_counters)
4312 return -EINVAL;
4313
4314 spin_lock(&perf_resource_lock);
4315 perf_reserved_percpu = val;
4316 for_each_online_cpu(cpu) {
4317 cpuctx = &per_cpu(perf_cpu_context, cpu);
4318 spin_lock_irq(&cpuctx->ctx.lock);
4319 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4320 perf_max_counters - perf_reserved_percpu);
4321 cpuctx->max_pertask = mpt;
4322 spin_unlock_irq(&cpuctx->ctx.lock);
4323 }
4324 spin_unlock(&perf_resource_lock);
4325
4326 return count;
4327}
4328
4329static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4330{
4331 return sprintf(buf, "%d\n", perf_overcommit);
4332}
4333
4334static ssize_t
4335perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4336{
4337 unsigned long val;
4338 int err;
4339
4340 err = strict_strtoul(buf, 10, &val);
4341 if (err)
4342 return err;
4343 if (val > 1)
4344 return -EINVAL;
4345
4346 spin_lock(&perf_resource_lock);
4347 perf_overcommit = val;
4348 spin_unlock(&perf_resource_lock);
4349
4350 return count;
4351}
4352
4353static SYSDEV_CLASS_ATTR(
4354 reserve_percpu,
4355 0644,
4356 perf_show_reserve_percpu,
4357 perf_set_reserve_percpu
4358 );
4359
4360static SYSDEV_CLASS_ATTR(
4361 overcommit,
4362 0644,
4363 perf_show_overcommit,
4364 perf_set_overcommit
4365 );
4366
4367static struct attribute *perfclass_attrs[] = {
4368 &attr_reserve_percpu.attr,
4369 &attr_overcommit.attr,
4370 NULL
4371};
4372
4373static struct attribute_group perfclass_attr_group = {
4374 .attrs = perfclass_attrs,
4375 .name = "perf_counters",
4376};
4377
4378static int __init perf_counter_sysfs_init(void)
4379{
4380 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4381 &perfclass_attr_group);
4382}
4383device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index b2e5f78fd281..31310b5d3f50 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -378,26 +378,15 @@ EXPORT_SYMBOL(pid_task);
378/* 378/*
379 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 379 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
380 */ 380 */
381struct task_struct *find_task_by_pid_type_ns(int type, int nr, 381struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
382 struct pid_namespace *ns)
383{ 382{
384 return pid_task(find_pid_ns(nr, ns), type); 383 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
385} 384}
386 385
387EXPORT_SYMBOL(find_task_by_pid_type_ns);
388
389struct task_struct *find_task_by_vpid(pid_t vnr) 386struct task_struct *find_task_by_vpid(pid_t vnr)
390{ 387{
391 return find_task_by_pid_type_ns(PIDTYPE_PID, vnr, 388 return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
392 current->nsproxy->pid_ns);
393}
394EXPORT_SYMBOL(find_task_by_vpid);
395
396struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
397{
398 return find_task_by_pid_type_ns(PIDTYPE_PID, nr, ns);
399} 389}
400EXPORT_SYMBOL(find_task_by_pid_ns);
401 390
402struct pid *get_task_pid(struct task_struct *task, enum pid_type type) 391struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
403{ 392{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 2d1001b4858d..821722ae58a7 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -67,9 +67,10 @@ err_alloc:
67 return NULL; 67 return NULL;
68} 68}
69 69
70static struct pid_namespace *create_pid_namespace(unsigned int level) 70static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns)
71{ 71{
72 struct pid_namespace *ns; 72 struct pid_namespace *ns;
73 unsigned int level = parent_pid_ns->level + 1;
73 int i; 74 int i;
74 75
75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); 76 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -86,6 +87,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
86 87
87 kref_init(&ns->kref); 88 kref_init(&ns->kref);
88 ns->level = level; 89 ns->level = level;
90 ns->parent = get_pid_ns(parent_pid_ns);
89 91
90 set_bit(0, ns->pidmap[0].page); 92 set_bit(0, ns->pidmap[0].page);
91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 93 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -114,25 +116,11 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
114 116
115struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) 117struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
116{ 118{
117 struct pid_namespace *new_ns;
118
119 BUG_ON(!old_ns);
120 new_ns = get_pid_ns(old_ns);
121 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
122 goto out; 120 return get_pid_ns(old_ns);
123
124 new_ns = ERR_PTR(-EINVAL);
125 if (flags & CLONE_THREAD) 121 if (flags & CLONE_THREAD)
126 goto out_put; 122 return ERR_PTR(-EINVAL);
127 123 return create_pid_namespace(old_ns);
128 new_ns = create_pid_namespace(old_ns->level + 1);
129 if (!IS_ERR(new_ns))
130 new_ns->parent = get_pid_ns(old_ns);
131
132out_put:
133 put_pid_ns(old_ns);
134out:
135 return new_ns;
136} 124}
137 125
138void free_pid_ns(struct kref *kref) 126void free_pid_ns(struct kref *kref)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 23bd4daeb96b..72067cbdb37f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -116,9 +116,13 @@ config SUSPEND_FREEZER
116 116
117 Turning OFF this setting is NOT recommended! If in doubt, say Y. 117 Turning OFF this setting is NOT recommended! If in doubt, say Y.
118 118
119config HIBERNATION_NVS
120 bool
121
119config HIBERNATION 122config HIBERNATION
120 bool "Hibernation (aka 'suspend to disk')" 123 bool "Hibernation (aka 'suspend to disk')"
121 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 124 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
125 select HIBERNATION_NVS if HAS_IOMEM
122 ---help--- 126 ---help---
123 Enable the suspend to disk (STD) functionality, which is usually 127 Enable the suspend to disk (STD) functionality, which is usually
124 called "hibernation" in user interfaces. STD checkpoints the 128 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 720ea4f781bd..c3b81c30e5d5 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -6,6 +6,9 @@ endif
6obj-$(CONFIG_PM) += main.o 6obj-$(CONFIG_PM) += main.o
7obj-$(CONFIG_PM_SLEEP) += console.o 7obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_HIBERNATION) += swsusp.o disk.o snapshot.o swap.o user.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += swsusp.o hibernate.o snapshot.o swap.o user.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
10 13
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/hibernate.c
index e71ca9cd81b2..81d2e7464893 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/hibernate.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * kernel/power/disk.c - Suspend-to-disk support. 2 * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
7 * 8 *
8 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
9 *
10 */ 10 */
11 11
12#include <linux/suspend.h> 12#include <linux/suspend.h>
@@ -215,19 +215,17 @@ static int create_image(int platform_mode)
215 if (error) 215 if (error)
216 return error; 216 return error;
217 217
218 device_pm_lock(); 218 /* At this point, dpm_suspend_start() has been called, but *not*
219 219 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
220 /* At this point, device_suspend() has been called, but *not*
221 * device_power_down(). We *must* call device_power_down() now.
222 * Otherwise, drivers for some devices (e.g. interrupt controllers) 220 * Otherwise, drivers for some devices (e.g. interrupt controllers)
223 * become desynchronized with the actual state of the hardware 221 * become desynchronized with the actual state of the hardware
224 * at resume time, and evil weirdness ensues. 222 * at resume time, and evil weirdness ensues.
225 */ 223 */
226 error = device_power_down(PMSG_FREEZE); 224 error = dpm_suspend_noirq(PMSG_FREEZE);
227 if (error) { 225 if (error) {
228 printk(KERN_ERR "PM: Some devices failed to power down, " 226 printk(KERN_ERR "PM: Some devices failed to power down, "
229 "aborting hibernation\n"); 227 "aborting hibernation\n");
230 goto Unlock; 228 return error;
231 } 229 }
232 230
233 error = platform_pre_snapshot(platform_mode); 231 error = platform_pre_snapshot(platform_mode);
@@ -241,9 +239,9 @@ static int create_image(int platform_mode)
241 239
242 local_irq_disable(); 240 local_irq_disable();
243 241
244 sysdev_suspend(PMSG_FREEZE); 242 error = sysdev_suspend(PMSG_FREEZE);
245 if (error) { 243 if (error) {
246 printk(KERN_ERR "PM: Some devices failed to power down, " 244 printk(KERN_ERR "PM: Some system devices failed to power down, "
247 "aborting hibernation\n"); 245 "aborting hibernation\n");
248 goto Enable_irqs; 246 goto Enable_irqs;
249 } 247 }
@@ -264,7 +262,7 @@ static int create_image(int platform_mode)
264 262
265 Power_up: 263 Power_up:
266 sysdev_resume(); 264 sysdev_resume();
267 /* NOTE: device_power_up() is just a resume() for devices 265 /* NOTE: dpm_resume_noirq() is just a resume() for devices
268 * that suspended with irqs off ... no overall powerup. 266 * that suspended with irqs off ... no overall powerup.
269 */ 267 */
270 268
@@ -277,12 +275,9 @@ static int create_image(int platform_mode)
277 Platform_finish: 275 Platform_finish:
278 platform_finish(platform_mode); 276 platform_finish(platform_mode);
279 277
280 device_power_up(in_suspend ? 278 dpm_resume_noirq(in_suspend ?
281 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 279 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
282 280
283 Unlock:
284 device_pm_unlock();
285
286 return error; 281 return error;
287} 282}
288 283
@@ -309,7 +304,7 @@ int hibernation_snapshot(int platform_mode)
309 goto Close; 304 goto Close;
310 305
311 suspend_console(); 306 suspend_console();
312 error = device_suspend(PMSG_FREEZE); 307 error = dpm_suspend_start(PMSG_FREEZE);
313 if (error) 308 if (error)
314 goto Recover_platform; 309 goto Recover_platform;
315 310
@@ -320,7 +315,7 @@ int hibernation_snapshot(int platform_mode)
320 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
321 316
322 Resume_devices: 317 Resume_devices:
323 device_resume(in_suspend ? 318 dpm_resume_end(in_suspend ?
324 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
325 resume_console(); 320 resume_console();
326 Close: 321 Close:
@@ -344,13 +339,11 @@ static int resume_target_kernel(bool platform_mode)
344{ 339{
345 int error; 340 int error;
346 341
347 device_pm_lock(); 342 error = dpm_suspend_noirq(PMSG_QUIESCE);
348
349 error = device_power_down(PMSG_QUIESCE);
350 if (error) { 343 if (error) {
351 printk(KERN_ERR "PM: Some devices failed to power down, " 344 printk(KERN_ERR "PM: Some devices failed to power down, "
352 "aborting resume\n"); 345 "aborting resume\n");
353 goto Unlock; 346 return error;
354 } 347 }
355 348
356 error = platform_pre_restore(platform_mode); 349 error = platform_pre_restore(platform_mode);
@@ -401,10 +394,7 @@ static int resume_target_kernel(bool platform_mode)
401 Cleanup: 394 Cleanup:
402 platform_restore_cleanup(platform_mode); 395 platform_restore_cleanup(platform_mode);
403 396
404 device_power_up(PMSG_RECOVER); 397 dpm_resume_noirq(PMSG_RECOVER);
405
406 Unlock:
407 device_pm_unlock();
408 398
409 return error; 399 return error;
410} 400}
@@ -424,10 +414,10 @@ int hibernation_restore(int platform_mode)
424 414
425 pm_prepare_console(); 415 pm_prepare_console();
426 suspend_console(); 416 suspend_console();
427 error = device_suspend(PMSG_QUIESCE); 417 error = dpm_suspend_start(PMSG_QUIESCE);
428 if (!error) { 418 if (!error) {
429 error = resume_target_kernel(platform_mode); 419 error = resume_target_kernel(platform_mode);
430 device_resume(PMSG_RECOVER); 420 dpm_resume_end(PMSG_RECOVER);
431 } 421 }
432 resume_console(); 422 resume_console();
433 pm_restore_console(); 423 pm_restore_console();
@@ -457,18 +447,16 @@ int hibernation_platform_enter(void)
457 447
458 entering_platform_hibernation = true; 448 entering_platform_hibernation = true;
459 suspend_console(); 449 suspend_console();
460 error = device_suspend(PMSG_HIBERNATE); 450 error = dpm_suspend_start(PMSG_HIBERNATE);
461 if (error) { 451 if (error) {
462 if (hibernation_ops->recover) 452 if (hibernation_ops->recover)
463 hibernation_ops->recover(); 453 hibernation_ops->recover();
464 goto Resume_devices; 454 goto Resume_devices;
465 } 455 }
466 456
467 device_pm_lock(); 457 error = dpm_suspend_noirq(PMSG_HIBERNATE);
468
469 error = device_power_down(PMSG_HIBERNATE);
470 if (error) 458 if (error)
471 goto Unlock; 459 goto Resume_devices;
472 460
473 error = hibernation_ops->prepare(); 461 error = hibernation_ops->prepare();
474 if (error) 462 if (error)
@@ -491,14 +479,11 @@ int hibernation_platform_enter(void)
491 Platofrm_finish: 479 Platofrm_finish:
492 hibernation_ops->finish(); 480 hibernation_ops->finish();
493 481
494 device_power_up(PMSG_RESTORE); 482 dpm_suspend_noirq(PMSG_RESTORE);
495
496 Unlock:
497 device_pm_unlock();
498 483
499 Resume_devices: 484 Resume_devices:
500 entering_platform_hibernation = false; 485 entering_platform_hibernation = false;
501 device_resume(PMSG_RESTORE); 486 dpm_resume_end(PMSG_RESTORE);
502 resume_console(); 487 resume_console();
503 488
504 Close: 489 Close:
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c
new file mode 100644
index 000000000000..39ac698ef836
--- /dev/null
+++ b/kernel/power/hibernate_nvs.c
@@ -0,0 +1,135 @@
1/*
2 * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
3 *
4 * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/io.h>
10#include <linux/kernel.h>
11#include <linux/list.h>
12#include <linux/mm.h>
13#include <linux/suspend.h>
14
15/*
16 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that.
19 */
20
21struct nvs_page {
22 unsigned long phys_start;
23 unsigned int size;
24 void *kaddr;
25 void *data;
26 struct list_head node;
27};
28
29static LIST_HEAD(nvs_list);
30
31/**
32 * hibernate_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region
34 * @size - size of the region
35 *
36 * The NVS region need not be page-aligned (both ends) and we arrange
37 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages.
39 */
40int hibernate_nvs_register(unsigned long start, unsigned long size)
41{
42 struct nvs_page *entry, *next;
43
44 while (size > 0) {
45 unsigned int nr_bytes;
46
47 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
48 if (!entry)
49 goto Error;
50
51 list_add_tail(&entry->node, &nvs_list);
52 entry->phys_start = start;
53 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
54 entry->size = (size < nr_bytes) ? size : nr_bytes;
55
56 start += entry->size;
57 size -= entry->size;
58 }
59 return 0;
60
61 Error:
62 list_for_each_entry_safe(entry, next, &nvs_list, node) {
63 list_del(&entry->node);
64 kfree(entry);
65 }
66 return -ENOMEM;
67}
68
69/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions
71 */
72void hibernate_nvs_free(void)
73{
74 struct nvs_page *entry;
75
76 list_for_each_entry(entry, &nvs_list, node)
77 if (entry->data) {
78 free_page((unsigned long)entry->data);
79 entry->data = NULL;
80 if (entry->kaddr) {
81 iounmap(entry->kaddr);
82 entry->kaddr = NULL;
83 }
84 }
85}
86
87/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
89 */
90int hibernate_nvs_alloc(void)
91{
92 struct nvs_page *entry;
93
94 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) {
97 hibernate_nvs_free();
98 return -ENOMEM;
99 }
100 }
101 return 0;
102}
103
104/**
105 * hibernate_nvs_save - save NVS memory regions
106 */
107void hibernate_nvs_save(void)
108{
109 struct nvs_page *entry;
110
111 printk(KERN_INFO "PM: Saving platform NVS memory\n");
112
113 list_for_each_entry(entry, &nvs_list, node)
114 if (entry->data) {
115 entry->kaddr = ioremap(entry->phys_start, entry->size);
116 memcpy(entry->data, entry->kaddr, entry->size);
117 }
118}
119
120/**
121 * hibernate_nvs_restore - restore NVS memory regions
122 *
123 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region.
125 */
126void hibernate_nvs_restore(void)
127{
128 struct nvs_page *entry;
129
130 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
131
132 list_for_each_entry(entry, &nvs_list, node)
133 if (entry->data)
134 memcpy(entry->kaddr, entry->data, entry->size);
135}
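
The four entry points in this new file imply a fixed call order across a hibernation cycle, which the individual kerneldoc comments only hint at. The sketch below shows that order; the caller is hypothetical, and the real call sites are expected to live in platform code (e.g. ACPI) and the hibernation core:

/*
 * Hypothetical caller, for illustration only: the order in which the
 * NVS helpers above are meant to be used across a hibernation cycle.
 */
static int example_nvs_cycle(unsigned long phys_start, unsigned long size)
{
	int error;

	error = hibernate_nvs_register(phys_start, size); /* registration, boot time */
	if (error)
		return error;

	error = hibernate_nvs_alloc();		/* before the image is created */
	if (error)
		return error;

	hibernate_nvs_save();			/* just before the snapshot */
	/* ... power off, resume kernel restores the image ... */
	hibernate_nvs_restore();		/* with interrupts still disabled */
	hibernate_nvs_free();			/* once devices are back up */

	return 0;
}
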
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f99ed6a75eac..f710e36930cc 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,20 +8,9 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/module.h>
12#include <linux/suspend.h>
13#include <linux/kobject.h> 11#include <linux/kobject.h>
14#include <linux/string.h> 12#include <linux/string.h>
15#include <linux/delay.h>
16#include <linux/errno.h>
17#include <linux/kmod.h>
18#include <linux/init.h>
19#include <linux/console.h>
20#include <linux/cpu.h>
21#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
22#include <linux/freezer.h>
23#include <linux/vmstat.h>
24#include <linux/syscalls.h>
25 14
26#include "power.h" 15#include "power.h"
27 16
@@ -119,378 +108,6 @@ power_attr(pm_test);
119 108
120#endif /* CONFIG_PM_SLEEP */ 109#endif /* CONFIG_PM_SLEEP */
121 110
122#ifdef CONFIG_SUSPEND
123
124static int suspend_test(int level)
125{
126#ifdef CONFIG_PM_DEBUG
127 if (pm_test_level == level) {
128 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
129 mdelay(5000);
130 return 1;
131 }
132#endif /* !CONFIG_PM_DEBUG */
133 return 0;
134}
135
136#ifdef CONFIG_PM_TEST_SUSPEND
137
138/*
139 * We test the system suspend code by setting an RTC wakealarm a short
140 * time in the future, then suspending. Suspending the devices won't
141 * normally take long ... some systems only need a few milliseconds.
142 *
143 * The time it takes is system-specific though, so when we test this
144 * during system bootup we allow a LOT of time.
145 */
146#define TEST_SUSPEND_SECONDS 5
147
148static unsigned long suspend_test_start_time;
149
150static void suspend_test_start(void)
151{
152 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
153 * What we want is a hardware counter that will work correctly even
154 * during the irqs-are-off stages of the suspend/resume cycle...
155 */
156 suspend_test_start_time = jiffies;
157}
158
159static void suspend_test_finish(const char *label)
160{
161 long nj = jiffies - suspend_test_start_time;
162 unsigned msec;
163
164 msec = jiffies_to_msecs(abs(nj));
165 pr_info("PM: %s took %d.%03d seconds\n", label,
166 msec / 1000, msec % 1000);
167
168 /* Warning on suspend means the RTC alarm period needs to be
169 * larger -- the system was sooo slooowwww to suspend that the
170 * alarm (should have) fired before the system went to sleep!
171 *
172 * Warning on either suspend or resume also means the system
173 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk...
175 */
176 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
177}
178
179#else
180
181static void suspend_test_start(void)
182{
183}
184
185static void suspend_test_finish(const char *label)
186{
187}
188
189#endif
190
191/* This is just an arbitrary number */
192#define FREE_PAGE_NUMBER (100)
193
194static struct platform_suspend_ops *suspend_ops;
195
196/**
197 * suspend_set_ops - Set the global suspend method table.
198 * @ops: Pointer to ops structure.
199 */
200
201void suspend_set_ops(struct platform_suspend_ops *ops)
202{
203 mutex_lock(&pm_mutex);
204 suspend_ops = ops;
205 mutex_unlock(&pm_mutex);
206}
207
208/**
209 * suspend_valid_only_mem - generic memory-only valid callback
210 *
211 * Platform drivers that implement mem suspend only and only need
212 * to check for that in their .valid callback can use this instead
213 * of rolling their own .valid callback.
214 */
215int suspend_valid_only_mem(suspend_state_t state)
216{
217 return state == PM_SUSPEND_MEM;
218}
219
220/**
221 * suspend_prepare - Do prep work before entering low-power state.
222 *
223 * This is common code that is called for each state that we're entering.
224 * Run suspend notifiers, allocate a console and stop all processes.
225 */
226static int suspend_prepare(void)
227{
228 int error;
229 unsigned int free_pages;
230
231 if (!suspend_ops || !suspend_ops->enter)
232 return -EPERM;
233
234 pm_prepare_console();
235
236 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
237 if (error)
238 goto Finish;
239
240 error = usermodehelper_disable();
241 if (error)
242 goto Finish;
243
244 if (suspend_freeze_processes()) {
245 error = -EAGAIN;
246 goto Thaw;
247 }
248
249 free_pages = global_page_state(NR_FREE_PAGES);
250 if (free_pages < FREE_PAGE_NUMBER) {
251 pr_debug("PM: free some memory\n");
252 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
253 if (nr_free_pages() < FREE_PAGE_NUMBER) {
254 error = -ENOMEM;
255 printk(KERN_ERR "PM: No enough memory\n");
256 }
257 }
258 if (!error)
259 return 0;
260
261 Thaw:
262 suspend_thaw_processes();
263 usermodehelper_enable();
264 Finish:
265 pm_notifier_call_chain(PM_POST_SUSPEND);
266 pm_restore_console();
267 return error;
268}
269
270/* default implementation */
271void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
272{
273 local_irq_disable();
274}
275
276/* default implementation */
277void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
278{
279 local_irq_enable();
280}
281
282/**
283 * suspend_enter - enter the desired system sleep state.
284 * @state: state to enter
285 *
286 * This function should be called after devices have been suspended.
287 */
288static int suspend_enter(suspend_state_t state)
289{
290 int error;
291
292 device_pm_lock();
293
294 if (suspend_ops->prepare) {
295 error = suspend_ops->prepare();
296 if (error)
297 goto Done;
298 }
299
300 error = device_power_down(PMSG_SUSPEND);
301 if (error) {
302 printk(KERN_ERR "PM: Some devices failed to power down\n");
303 goto Platfrom_finish;
304 }
305
306 if (suspend_ops->prepare_late) {
307 error = suspend_ops->prepare_late();
308 if (error)
309 goto Power_up_devices;
310 }
311
312 if (suspend_test(TEST_PLATFORM))
313 goto Platform_wake;
314
315 error = disable_nonboot_cpus();
316 if (error || suspend_test(TEST_CPUS))
317 goto Enable_cpus;
318
319 arch_suspend_disable_irqs();
320 BUG_ON(!irqs_disabled());
321
322 error = sysdev_suspend(PMSG_SUSPEND);
323 if (!error) {
324 if (!suspend_test(TEST_CORE))
325 error = suspend_ops->enter(state);
326 sysdev_resume();
327 }
328
329 arch_suspend_enable_irqs();
330 BUG_ON(irqs_disabled());
331
332 Enable_cpus:
333 enable_nonboot_cpus();
334
335 Platform_wake:
336 if (suspend_ops->wake)
337 suspend_ops->wake();
338
339 Power_up_devices:
340 device_power_up(PMSG_RESUME);
341
342 Platfrom_finish:
343 if (suspend_ops->finish)
344 suspend_ops->finish();
345
346 Done:
347 device_pm_unlock();
348
349 return error;
350}
351
352/**
353 * suspend_devices_and_enter - suspend devices and enter the desired system
354 * sleep state.
355 * @state: state to enter
356 */
357int suspend_devices_and_enter(suspend_state_t state)
358{
359 int error;
360
361 if (!suspend_ops)
362 return -ENOSYS;
363
364 if (suspend_ops->begin) {
365 error = suspend_ops->begin(state);
366 if (error)
367 goto Close;
368 }
369 suspend_console();
370 suspend_test_start();
371 error = device_suspend(PMSG_SUSPEND);
372 if (error) {
373 printk(KERN_ERR "PM: Some devices failed to suspend\n");
374 goto Recover_platform;
375 }
376 suspend_test_finish("suspend devices");
377 if (suspend_test(TEST_DEVICES))
378 goto Recover_platform;
379
380 suspend_enter(state);
381
382 Resume_devices:
383 suspend_test_start();
384 device_resume(PMSG_RESUME);
385 suspend_test_finish("resume devices");
386 resume_console();
387 Close:
388 if (suspend_ops->end)
389 suspend_ops->end();
390 return error;
391
392 Recover_platform:
393 if (suspend_ops->recover)
394 suspend_ops->recover();
395 goto Resume_devices;
396}
397
398/**
399 * suspend_finish - Do final work before exiting suspend sequence.
400 *
401 * Call platform code to clean up, restart processes, and free the
402 * console that we've allocated. This is not called for suspend-to-disk.
403 */
404static void suspend_finish(void)
405{
406 suspend_thaw_processes();
407 usermodehelper_enable();
408 pm_notifier_call_chain(PM_POST_SUSPEND);
409 pm_restore_console();
410}
411
412
413
414
415static const char * const pm_states[PM_SUSPEND_MAX] = {
416 [PM_SUSPEND_STANDBY] = "standby",
417 [PM_SUSPEND_MEM] = "mem",
418};
419
420static inline int valid_state(suspend_state_t state)
421{
422 /* All states need lowlevel support and need to be valid
423 * to the lowlevel implementation, no valid callback
424 * implies that none are valid. */
425 if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
426 return 0;
427 return 1;
428}
429
430
431/**
432 * enter_state - Do common work of entering low-power state.
433 * @state: pm_state structure for state we're entering.
434 *
435 * Make sure we're the only ones trying to enter a sleep state. Fail
436 * if someone has beat us to it, since we don't want anything weird to
437 * happen when we wake up.
438 * Then, do the setup for suspend, enter the state, and clean up (after
439 * we've woken up).
440 */
441static int enter_state(suspend_state_t state)
442{
443 int error;
444
445 if (!valid_state(state))
446 return -ENODEV;
447
448 if (!mutex_trylock(&pm_mutex))
449 return -EBUSY;
450
451 printk(KERN_INFO "PM: Syncing filesystems ... ");
452 sys_sync();
453 printk("done.\n");
454
455 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
456 error = suspend_prepare();
457 if (error)
458 goto Unlock;
459
460 if (suspend_test(TEST_FREEZER))
461 goto Finish;
462
463 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
464 error = suspend_devices_and_enter(state);
465
466 Finish:
467 pr_debug("PM: Finishing wakeup.\n");
468 suspend_finish();
469 Unlock:
470 mutex_unlock(&pm_mutex);
471 return error;
472}
473
474
475/**
476 * pm_suspend - Externally visible function for suspending system.
477 * @state: Enumerated value of state to enter.
478 *
479 * Determine whether or not value is within range, get state
480 * structure, and enter (above).
481 */
482
483int pm_suspend(suspend_state_t state)
484{
485 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
486 return enter_state(state);
487 return -EINVAL;
488}
489
490EXPORT_SYMBOL(pm_suspend);
491
492#endif /* CONFIG_SUSPEND */
493
494struct kobject *power_kobj; 111struct kobject *power_kobj;
495 112
496/** 113/**
@@ -503,7 +120,6 @@ struct kobject *power_kobj;
503 * store() accepts one of those strings, translates it into the 120 * store() accepts one of those strings, translates it into the
504 * proper enumerated value, and initiates a suspend transition. 121 * proper enumerated value, and initiates a suspend transition.
505 */ 122 */
506
507static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, 123static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
508 char *buf) 124 char *buf)
509{ 125{
@@ -601,7 +217,6 @@ static struct attribute_group attr_group = {
601 .attrs = g, 217 .attrs = g,
602}; 218};
603 219
604
605static int __init pm_init(void) 220static int __init pm_init(void)
606{ 221{
607 power_kobj = kobject_create_and_add("power", NULL); 222 power_kobj = kobject_create_and_add("power", NULL);
@@ -611,144 +226,3 @@ static int __init pm_init(void)
611} 226}
612 227
613core_initcall(pm_init); 228core_initcall(pm_init);
614
615
616#ifdef CONFIG_PM_TEST_SUSPEND
617
618#include <linux/rtc.h>
619
620/*
621 * To test system suspend, we need a hands-off mechanism to resume the
622 * system. RTCs wake alarms are a common self-contained mechanism.
623 */
624
625static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
626{
627 static char err_readtime[] __initdata =
628 KERN_ERR "PM: can't read %s time, err %d\n";
629 static char err_wakealarm [] __initdata =
630 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
631 static char err_suspend[] __initdata =
632 KERN_ERR "PM: suspend test failed, error %d\n";
633 static char info_test[] __initdata =
634 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
635
636 unsigned long now;
637 struct rtc_wkalrm alm;
638 int status;
639
640 /* this may fail if the RTC hasn't been initialized */
641 status = rtc_read_time(rtc, &alm.time);
642 if (status < 0) {
643 printk(err_readtime, dev_name(&rtc->dev), status);
644 return;
645 }
646 rtc_tm_to_time(&alm.time, &now);
647
648 memset(&alm, 0, sizeof alm);
649 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
650 alm.enabled = true;
651
652 status = rtc_set_alarm(rtc, &alm);
653 if (status < 0) {
654 printk(err_wakealarm, dev_name(&rtc->dev), status);
655 return;
656 }
657
658 if (state == PM_SUSPEND_MEM) {
659 printk(info_test, pm_states[state]);
660 status = pm_suspend(state);
661 if (status == -ENODEV)
662 state = PM_SUSPEND_STANDBY;
663 }
664 if (state == PM_SUSPEND_STANDBY) {
665 printk(info_test, pm_states[state]);
666 status = pm_suspend(state);
667 }
668 if (status < 0)
669 printk(err_suspend, status);
670
671 /* Some platforms can't detect that the alarm triggered the
672 * wakeup, or (accordingly) disable it afterwards.
673 * It's supposed to give oneshot behavior; cope.
674 */
675 alm.enabled = false;
676 rtc_set_alarm(rtc, &alm);
677}
678
679static int __init has_wakealarm(struct device *dev, void *name_ptr)
680{
681 struct rtc_device *candidate = to_rtc_device(dev);
682
683 if (!candidate->ops->set_alarm)
684 return 0;
685 if (!device_may_wakeup(candidate->dev.parent))
686 return 0;
687
688 *(const char **)name_ptr = dev_name(dev);
689 return 1;
690}
691
692/*
693 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
694 * at startup time. They're normally disabled, for faster boot and because
695 * we can't know which states really work on this particular system.
696 */
697static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
698
699static char warn_bad_state[] __initdata =
700 KERN_WARNING "PM: can't test '%s' suspend state\n";
701
702static int __init setup_test_suspend(char *value)
703{
704 unsigned i;
705
706 /* "=mem" ==> "mem" */
707 value++;
708 for (i = 0; i < PM_SUSPEND_MAX; i++) {
709 if (!pm_states[i])
710 continue;
711 if (strcmp(pm_states[i], value) != 0)
712 continue;
713 test_state = (__force suspend_state_t) i;
714 return 0;
715 }
716 printk(warn_bad_state, value);
717 return 0;
718}
719__setup("test_suspend", setup_test_suspend);
720
721static int __init test_suspend(void)
722{
723 static char warn_no_rtc[] __initdata =
724 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
725
726 char *pony = NULL;
727 struct rtc_device *rtc = NULL;
728
729 /* PM is initialized by now; is that state testable? */
730 if (test_state == PM_SUSPEND_ON)
731 goto done;
732 if (!valid_state(test_state)) {
733 printk(warn_bad_state, pm_states[test_state]);
734 goto done;
735 }
736
737 /* RTCs have initialized by now too ... can we use one? */
738 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
739 if (pony)
740 rtc = rtc_class_open(pony);
741 if (!rtc) {
742 printk(warn_no_rtc);
743 goto done;
744 }
745
746 /* go for it */
747 test_wakealarm(rtc, test_state);
748 rtc_class_close(rtc);
749done:
750 return 0;
751}
752late_initcall(test_suspend);
753
754#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7a3afb..26d5a26f82e3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -45,7 +45,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
45 */ 45 */
46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 46#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
47 47
48/* kernel/power/disk.c */ 48/* kernel/power/hibernate.c */
49extern int hibernation_snapshot(int platform_mode); 49extern int hibernation_snapshot(int platform_mode);
50extern int hibernation_restore(int platform_mode); 50extern int hibernation_restore(int platform_mode);
51extern int hibernation_platform_enter(void); 51extern int hibernation_platform_enter(void);
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern unsigned int count_data_pages(void); 77extern int swsusp_shrink_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
@@ -147,9 +147,8 @@ extern int swsusp_swap_in_use(void);
147 */ 147 */
148#define SF_PLATFORM_MODE 1 148#define SF_PLATFORM_MODE 1
149 149
150/* kernel/power/disk.c */ 150/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 151extern int swsusp_check(void);
152extern int swsusp_shrink_memory(void);
153extern void swsusp_free(void); 152extern void swsusp_free(void);
154extern int swsusp_read(unsigned int *flags_p); 153extern int swsusp_read(unsigned int *flags_p);
155extern int swsusp_write(unsigned int flags); 154extern int swsusp_write(unsigned int flags);
@@ -161,22 +160,36 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
161 unsigned int, char *); 160 unsigned int, char *);
162 161
163#ifdef CONFIG_SUSPEND 162#ifdef CONFIG_SUSPEND
164/* kernel/power/main.c */ 163/* kernel/power/suspend.c */
164extern const char *const pm_states[];
165
166extern bool valid_state(suspend_state_t state);
165extern int suspend_devices_and_enter(suspend_state_t state); 167extern int suspend_devices_and_enter(suspend_state_t state);
168extern int enter_state(suspend_state_t state);
166#else /* !CONFIG_SUSPEND */ 169#else /* !CONFIG_SUSPEND */
167static inline int suspend_devices_and_enter(suspend_state_t state) 170static inline int suspend_devices_and_enter(suspend_state_t state)
168{ 171{
169 return -ENOSYS; 172 return -ENOSYS;
170} 173}
174static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
175static inline bool valid_state(suspend_state_t state) { return false; }
171#endif /* !CONFIG_SUSPEND */ 176#endif /* !CONFIG_SUSPEND */
172 177
178#ifdef CONFIG_PM_TEST_SUSPEND
179/* kernel/power/suspend_test.c */
180extern void suspend_test_start(void);
181extern void suspend_test_finish(const char *label);
182#else /* !CONFIG_PM_TEST_SUSPEND */
183static inline void suspend_test_start(void) {}
184static inline void suspend_test_finish(const char *label) {}
185#endif /* !CONFIG_PM_TEST_SUSPEND */
186
173#ifdef CONFIG_PM_SLEEP 187#ifdef CONFIG_PM_SLEEP
174/* kernel/power/main.c */ 188/* kernel/power/main.c */
175extern int pm_notifier_call_chain(unsigned long val); 189extern int pm_notifier_call_chain(unsigned long val);
176#endif 190#endif
177 191
178#ifdef CONFIG_HIGHMEM 192#ifdef CONFIG_HIGHMEM
179unsigned int count_highmem_pages(void);
180int restore_highmem(void); 193int restore_highmem(void);
181#else 194#else
182static inline unsigned int count_highmem_pages(void) { return 0; } 195static inline unsigned int count_highmem_pages(void) { return 0; }
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 97890831e1b5..e8b337006276 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -34,7 +34,7 @@ static struct sysrq_key_op sysrq_poweroff_op = {
34 .handler = handle_poweroff, 34 .handler = handle_poweroff,
35 .help_msg = "powerOff", 35 .help_msg = "powerOff",
36 .action_msg = "Power Off", 36 .action_msg = "Power Off",
37 .enable_mask = SYSRQ_ENABLE_BOOT, 37 .enable_mask = SYSRQ_ENABLE_BOOT,
38}; 38};
39 39
40static int pm_sysrq_init(void) 40static int pm_sysrq_init(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca634019497a..da2072d73811 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -117,9 +117,12 @@ int freeze_processes(void)
117 if (error) 117 if (error)
118 goto Exit; 118 goto Exit;
119 printk("done."); 119 printk("done.");
120
121 oom_killer_disable();
120 Exit: 122 Exit:
121 BUG_ON(in_atomic()); 123 BUG_ON(in_atomic());
122 printk("\n"); 124 printk("\n");
125
123 return error; 126 return error;
124} 127}
125 128
@@ -145,6 +148,8 @@ static void thaw_tasks(bool nosig_only)
145 148
146void thaw_processes(void) 149void thaw_processes(void)
147{ 150{
151 oom_killer_enable();
152
148 printk("Restarting tasks ... "); 153 printk("Restarting tasks ... ");
149 thaw_tasks(true); 154 thaw_tasks(true);
150 thaw_tasks(false); 155 thaw_tasks(false);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a819f9..523a451b45d3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -39,6 +39,14 @@ static int swsusp_page_is_free(struct page *);
39static void swsusp_set_page_forbidden(struct page *); 39static void swsusp_set_page_forbidden(struct page *);
40static void swsusp_unset_page_forbidden(struct page *); 40static void swsusp_unset_page_forbidden(struct page *);
41 41
42/*
43 * Preferred image size in bytes (tunable via /sys/power/image_size).
44 * When it is set to N, swsusp will do its best to ensure the image
45 * size will not exceed N bytes, but if that is impossible, it will
46 * try to create the smallest image possible.
47 */
48unsigned long image_size = 500 * 1024 * 1024;
49
42/* List of PBEs needed for restoring the pages that were allocated before 50/* List of PBEs needed for restoring the pages that were allocated before
43 * the suspend and included in the suspend image, but have also been 51 * the suspend and included in the suspend image, but have also been
44 * allocated by the "resume" kernel, so their contents cannot be written 52 * allocated by the "resume" kernel, so their contents cannot be written
@@ -840,7 +848,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
840 * pages. 848 * pages.
841 */ 849 */
842 850
843unsigned int count_highmem_pages(void) 851static unsigned int count_highmem_pages(void)
844{ 852{
845 struct zone *zone; 853 struct zone *zone;
846 unsigned int n = 0; 854 unsigned int n = 0;
@@ -902,7 +910,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
902 * pages. 910 * pages.
903 */ 911 */
904 912
905unsigned int count_data_pages(void) 913static unsigned int count_data_pages(void)
906{ 914{
907 struct zone *zone; 915 struct zone *zone;
908 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
@@ -1058,6 +1066,74 @@ void swsusp_free(void)
1058 buffer = NULL; 1066 buffer = NULL;
1059} 1067}
1060 1068
1069/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed
1071 *
1072 * ... but do not OOM-kill anyone
1073 *
1074 * Notice: all userland should be stopped before it is called, or
1075 * livelock is possible.
1076 */
1077
1078#define SHRINK_BITE 10000
1079static inline unsigned long __shrink_memory(long tmp)
1080{
1081 if (tmp > SHRINK_BITE)
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084}
1085
1086int swsusp_shrink_memory(void)
1087{
1088 long tmp;
1089 struct zone *zone;
1090 unsigned long pages = 0;
1091 unsigned int i = 0;
1092 char *p = "-\\|/";
1093 struct timeval start, stop;
1094
1095 printk(KERN_INFO "PM: Shrinking memory... ");
1096 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114
1115 if (highmem_size < 0)
1116 highmem_size = 0;
1117
1118 tmp += highmem_size;
1119 if (tmp > 0) {
1120 tmp = __shrink_memory(tmp);
1121 if (!tmp)
1122 return -ENOMEM;
1123 pages += tmp;
1124 } else if (size > image_size / PAGE_SIZE) {
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
1126 pages += tmp;
1127 }
1128 printk("\b%c", p[i++%4]);
1129 } while (tmp > 0);
1130 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed");
1133
1134 return 0;
1135}
1136
1061#ifdef CONFIG_HIGHMEM 1137#ifdef CONFIG_HIGHMEM
1062/** 1138/**
1063 * count_pages_for_highmem - compute the number of non-highmem pages 1139 * count_pages_for_highmem - compute the number of non-highmem pages
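
A quick sanity check on the image_size default added above: with 500 * 1024 * 1024 bytes and an assumed 4 KiB page size, the "size > image_size / PAGE_SIZE" test compares the projected image against 524288000 / 4096 = 128000 pages. swsusp_shrink_memory() then reclaims in SHRINK_BITE chunks (at most 10000 pages per shrink_all_memory() call) until its accounting says the image fits, returning -ENOMEM if a needed bite frees nothing.
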
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 000000000000..6f10dfc2d3e9
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,300 @@
1/*
2 * kernel/power/suspend.c - Suspend to RAM and standby functionality.
3 *
4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
7 *
8 * This file is released under the GPLv2.
9 */
10
11#include <linux/string.h>
12#include <linux/delay.h>
13#include <linux/errno.h>
14#include <linux/init.h>
15#include <linux/console.h>
16#include <linux/cpu.h>
17#include <linux/syscalls.h>
18
19#include "power.h"
20
21const char *const pm_states[PM_SUSPEND_MAX] = {
22 [PM_SUSPEND_STANDBY] = "standby",
23 [PM_SUSPEND_MEM] = "mem",
24};
25
26static struct platform_suspend_ops *suspend_ops;
27
28/**
29 * suspend_set_ops - Set the global suspend method table.
30 * @ops: Pointer to ops structure.
31 */
32void suspend_set_ops(struct platform_suspend_ops *ops)
33{
34 mutex_lock(&pm_mutex);
35 suspend_ops = ops;
36 mutex_unlock(&pm_mutex);
37}
38
39bool valid_state(suspend_state_t state)
40{
41 /*
42 * All states need lowlevel support and need to be valid to the lowlevel
43 * implementation, no valid callback implies that none are valid.
44 */
45 return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
46}
47
48/**
49 * suspend_valid_only_mem - generic memory-only valid callback
50 *
51 * Platform drivers that implement mem suspend only and only need
52 * to check for that in their .valid callback can use this instead
53 * of rolling their own .valid callback.
54 */
55int suspend_valid_only_mem(suspend_state_t state)
56{
57 return state == PM_SUSPEND_MEM;
58}
59
60static int suspend_test(int level)
61{
62#ifdef CONFIG_PM_DEBUG
63 if (pm_test_level == level) {
64 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
65 mdelay(5000);
66 return 1;
67 }
68#endif /* !CONFIG_PM_DEBUG */
69 return 0;
70}
71
72/**
73 * suspend_prepare - Do prep work before entering low-power state.
74 *
75 * This is common code that is called for each state that we're entering.
76 * Run suspend notifiers, allocate a console and stop all processes.
77 */
78static int suspend_prepare(void)
79{
80 int error;
81
82 if (!suspend_ops || !suspend_ops->enter)
83 return -EPERM;
84
85 pm_prepare_console();
86
87 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
88 if (error)
89 goto Finish;
90
91 error = usermodehelper_disable();
92 if (error)
93 goto Finish;
94
95 error = suspend_freeze_processes();
96 if (!error)
97 return 0;
98
99 suspend_thaw_processes();
100 usermodehelper_enable();
101 Finish:
102 pm_notifier_call_chain(PM_POST_SUSPEND);
103 pm_restore_console();
104 return error;
105}
106
107/* default implementation */
108void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
109{
110 local_irq_disable();
111}
112
113/* default implementation */
114void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
115{
116 local_irq_enable();
117}
118
119/**
120 * suspend_enter - enter the desired system sleep state.
121 * @state: state to enter
122 *
123 * This function should be called after devices have been suspended.
124 */
125static int suspend_enter(suspend_state_t state)
126{
127 int error;
128
129 if (suspend_ops->prepare) {
130 error = suspend_ops->prepare();
131 if (error)
132 return error;
133 }
134
135 error = dpm_suspend_noirq(PMSG_SUSPEND);
136 if (error) {
137 printk(KERN_ERR "PM: Some devices failed to power down\n");
138 		goto Platform_finish;
139 }
140
141 if (suspend_ops->prepare_late) {
142 error = suspend_ops->prepare_late();
143 if (error)
144 goto Power_up_devices;
145 }
146
147 if (suspend_test(TEST_PLATFORM))
148 goto Platform_wake;
149
150 error = disable_nonboot_cpus();
151 if (error || suspend_test(TEST_CPUS))
152 goto Enable_cpus;
153
154 arch_suspend_disable_irqs();
155 BUG_ON(!irqs_disabled());
156
157 error = sysdev_suspend(PMSG_SUSPEND);
158 if (!error) {
159 if (!suspend_test(TEST_CORE))
160 error = suspend_ops->enter(state);
161 sysdev_resume();
162 }
163
164 arch_suspend_enable_irqs();
165 BUG_ON(irqs_disabled());
166
167 Enable_cpus:
168 enable_nonboot_cpus();
169
170 Platform_wake:
171 if (suspend_ops->wake)
172 suspend_ops->wake();
173
174 Power_up_devices:
175 dpm_resume_noirq(PMSG_RESUME);
176
177 Platform_finish:
178 if (suspend_ops->finish)
179 suspend_ops->finish();
180
181 return error;
182}
183
184/**
185 * suspend_devices_and_enter - suspend devices and enter the desired system
186 * sleep state.
187 * @state: state to enter
188 */
189int suspend_devices_and_enter(suspend_state_t state)
190{
191 int error;
192
193 if (!suspend_ops)
194 return -ENOSYS;
195
196 if (suspend_ops->begin) {
197 error = suspend_ops->begin(state);
198 if (error)
199 goto Close;
200 }
201 suspend_console();
202 suspend_test_start();
203 error = dpm_suspend_start(PMSG_SUSPEND);
204 if (error) {
205 printk(KERN_ERR "PM: Some devices failed to suspend\n");
206 goto Recover_platform;
207 }
208 suspend_test_finish("suspend devices");
209 if (suspend_test(TEST_DEVICES))
210 goto Recover_platform;
211
212 	error = suspend_enter(state);
213
214 Resume_devices:
215 suspend_test_start();
216 dpm_resume_end(PMSG_RESUME);
217 suspend_test_finish("resume devices");
218 resume_console();
219 Close:
220 if (suspend_ops->end)
221 suspend_ops->end();
222 return error;
223
224 Recover_platform:
225 if (suspend_ops->recover)
226 suspend_ops->recover();
227 goto Resume_devices;
228}
229
230/**
231 * suspend_finish - Do final work before exiting suspend sequence.
232 *
233 * Call platform code to clean up, restart processes, and free the
234 * console that we've allocated. This is not called for suspend-to-disk.
235 */
236static void suspend_finish(void)
237{
238 suspend_thaw_processes();
239 usermodehelper_enable();
240 pm_notifier_call_chain(PM_POST_SUSPEND);
241 pm_restore_console();
242}
243
244/**
245 * enter_state - Do common work of entering low-power state.
246 * @state: pm_state structure for state we're entering.
247 *
248 * Make sure we're the only ones trying to enter a sleep state. Fail
249 * if someone has beaten us to it, since we don't want anything weird to
250 * happen when we wake up.
251 * Then, do the setup for suspend, enter the state, and cleanup (after
252 * we've woken up).
253 */
254int enter_state(suspend_state_t state)
255{
256 int error;
257
258 if (!valid_state(state))
259 return -ENODEV;
260
261 if (!mutex_trylock(&pm_mutex))
262 return -EBUSY;
263
264 printk(KERN_INFO "PM: Syncing filesystems ... ");
265 sys_sync();
266 printk("done.\n");
267
268 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
269 error = suspend_prepare();
270 if (error)
271 goto Unlock;
272
273 if (suspend_test(TEST_FREEZER))
274 goto Finish;
275
276 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
277 error = suspend_devices_and_enter(state);
278
279 Finish:
280 pr_debug("PM: Finishing wakeup.\n");
281 suspend_finish();
282 Unlock:
283 mutex_unlock(&pm_mutex);
284 return error;
285}
286
287/**
288 * pm_suspend - Externally visible function for suspending system.
289 * @state: Enumerated value of state to enter.
290 *
291 * Determine whether or not value is within range, get state
292 * structure, and enter (above).
293 */
294int pm_suspend(suspend_state_t state)
295{
296 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
297 return enter_state(state);
298 return -EINVAL;
299}
300EXPORT_SYMBOL(pm_suspend);
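
For context on how the new kernel/power/suspend.c gets used: a platform supplies the suspend_ops table that suspend_enter() and friends call back into. A minimal sketch, assuming a hypothetical "myplat" platform (suspend_set_ops(), suspend_valid_only_mem() and the platform_suspend_ops callbacks are the interfaces shown above; everything named myplat_* is made up):

	#include <linux/suspend.h>

	/* hypothetical hook: program the platform's sleep state, 0 on success */
	static int myplat_suspend_enter(suspend_state_t state)
	{
		return 0;
	}

	static struct platform_suspend_ops myplat_suspend_ops = {
		.valid = suspend_valid_only_mem,	/* only "mem" is supported */
		.enter = myplat_suspend_enter,		/* called from suspend_enter() */
	};

	static int __init myplat_pm_init(void)
	{
		suspend_set_ops(&myplat_suspend_ops);
		return 0;
	}
	arch_initcall(myplat_pm_init);
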
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 000000000000..17d8bb1acf9c
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,187 @@
1/*
2 * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
3 *
4 * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
5 *
6 * This file is released under the GPLv2.
7 */
8
9#include <linux/init.h>
10#include <linux/rtc.h>
11
12#include "power.h"
13
14/*
15 * We test the system suspend code by setting an RTC wakealarm a short
16 * time in the future, then suspending. Suspending the devices won't
17 * normally take long ... some systems only need a few milliseconds.
18 *
19 * The time it takes is system-specific though, so when we test this
20 * during system bootup we allow a LOT of time.
21 */
22#define TEST_SUSPEND_SECONDS 5
23
24static unsigned long suspend_test_start_time;
25
26void suspend_test_start(void)
27{
28 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
29 * What we want is a hardware counter that will work correctly even
30 * during the irqs-are-off stages of the suspend/resume cycle...
31 */
32 suspend_test_start_time = jiffies;
33}
34
35void suspend_test_finish(const char *label)
36{
37 long nj = jiffies - suspend_test_start_time;
38 unsigned msec;
39
40 msec = jiffies_to_msecs(abs(nj));
41 pr_info("PM: %s took %d.%03d seconds\n", label,
42 msec / 1000, msec % 1000);
43
44 /* Warning on suspend means the RTC alarm period needs to be
45 * larger -- the system was sooo slooowwww to suspend that the
46 * alarm (should have) fired before the system went to sleep!
47 *
48 * Warning on either suspend or resume also means the system
49 * has some performance issues. The stack dump of a WARN_ON
50 * is more likely to get the right attention than a printk...
51 */
52 WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
53}
54
55/*
56 * To test system suspend, we need a hands-off mechanism to resume the
57 * system. RTCs wake alarms are a common self-contained mechanism.
58 */
59
60static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
61{
62 static char err_readtime[] __initdata =
63 KERN_ERR "PM: can't read %s time, err %d\n";
64 static char err_wakealarm [] __initdata =
65 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
66 static char err_suspend[] __initdata =
67 KERN_ERR "PM: suspend test failed, error %d\n";
68 static char info_test[] __initdata =
69 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
70
71 unsigned long now;
72 struct rtc_wkalrm alm;
73 int status;
74
75 /* this may fail if the RTC hasn't been initialized */
76 status = rtc_read_time(rtc, &alm.time);
77 if (status < 0) {
78 printk(err_readtime, dev_name(&rtc->dev), status);
79 return;
80 }
81 rtc_tm_to_time(&alm.time, &now);
82
83 memset(&alm, 0, sizeof alm);
84 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
85 alm.enabled = true;
86
87 status = rtc_set_alarm(rtc, &alm);
88 if (status < 0) {
89 printk(err_wakealarm, dev_name(&rtc->dev), status);
90 return;
91 }
92
93 if (state == PM_SUSPEND_MEM) {
94 printk(info_test, pm_states[state]);
95 status = pm_suspend(state);
96 if (status == -ENODEV)
97 state = PM_SUSPEND_STANDBY;
98 }
99 if (state == PM_SUSPEND_STANDBY) {
100 printk(info_test, pm_states[state]);
101 status = pm_suspend(state);
102 }
103 if (status < 0)
104 printk(err_suspend, status);
105
106 /* Some platforms can't detect that the alarm triggered the
107 * wakeup, or (accordingly) disable it afterwards.
108 * It's supposed to give oneshot behavior; cope.
109 */
110 alm.enabled = false;
111 rtc_set_alarm(rtc, &alm);
112}
113
114static int __init has_wakealarm(struct device *dev, void *name_ptr)
115{
116 struct rtc_device *candidate = to_rtc_device(dev);
117
118 if (!candidate->ops->set_alarm)
119 return 0;
120 if (!device_may_wakeup(candidate->dev.parent))
121 return 0;
122
123 *(const char **)name_ptr = dev_name(dev);
124 return 1;
125}
126
127/*
128 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
129 * at startup time. They're normally disabled, for faster boot and because
130 * we can't know which states really work on this particular system.
131 */
132static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
133
134static char warn_bad_state[] __initdata =
135 KERN_WARNING "PM: can't test '%s' suspend state\n";
136
137static int __init setup_test_suspend(char *value)
138{
139 unsigned i;
140
141 /* "=mem" ==> "mem" */
142 value++;
143 for (i = 0; i < PM_SUSPEND_MAX; i++) {
144 if (!pm_states[i])
145 continue;
146 if (strcmp(pm_states[i], value) != 0)
147 continue;
148 test_state = (__force suspend_state_t) i;
149 return 0;
150 }
151 printk(warn_bad_state, value);
152 return 0;
153}
154__setup("test_suspend", setup_test_suspend);
155
156static int __init test_suspend(void)
157{
158 static char warn_no_rtc[] __initdata =
159 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
160
161 char *pony = NULL;
162 struct rtc_device *rtc = NULL;
163
164 /* PM is initialized by now; is that state testable? */
165 if (test_state == PM_SUSPEND_ON)
166 goto done;
167 if (!valid_state(test_state)) {
168 printk(warn_bad_state, pm_states[test_state]);
169 goto done;
170 }
171
172 /* RTCs have initialized by now too ... can we use one? */
173 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
174 if (pony)
175 rtc = rtc_class_open(pony);
176 if (!rtc) {
177 printk(warn_no_rtc);
178 goto done;
179 }
180
181 /* go for it */
182 test_wakealarm(rtc, test_state);
183 rtc_class_close(rtc);
184done:
185 return 0;
186}
187late_initcall(test_suspend);
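
None of the code in this new file runs unless the kernel is booted with a parameter such as "test_suspend=mem": setup_test_suspend() strips the leading '=' and matches the remainder against pm_states[], and test_suspend() then arms an RTC wakealarm TEST_SUSPEND_SECONDS ahead and calls pm_suspend() at late_initcall time, falling back from "mem" to "standby" if the former returns -ENODEV.
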
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 78c35047586d..6a07f4dbf2f8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -55,14 +55,6 @@
55 55
56#include "power.h" 56#include "power.h"
57 57
58/*
59 * Preferred image size in bytes (tunable via /sys/power/image_size).
60 * When it is set to N, swsusp will do its best to ensure the image
61 * size will not exceed N bytes, but if that is impossible, it will
62 * try to create the smallest image possible.
63 */
64unsigned long image_size = 500 * 1024 * 1024;
65
66int in_suspend __nosavedata = 0; 58int in_suspend __nosavedata = 0;
67 59
68/** 60/**
@@ -194,193 +186,3 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
194 centisecs / 100, centisecs % 100, 186 centisecs / 100, centisecs % 100,
195 kps / 1000, (kps % 1000) / 10); 187 kps / 1000, (kps % 1000) / 10);
196} 188}
197
198/**
199 * swsusp_shrink_memory - Try to free as much memory as needed
200 *
201 * ... but do not OOM-kill anyone
202 *
203 * Notice: all userland should be stopped before it is called, or
204 * livelock is possible.
205 */
206
207#define SHRINK_BITE 10000
208static inline unsigned long __shrink_memory(long tmp)
209{
210 if (tmp > SHRINK_BITE)
211 tmp = SHRINK_BITE;
212 return shrink_all_memory(tmp);
213}
214
215int swsusp_shrink_memory(void)
216{
217 long tmp;
218 struct zone *zone;
219 unsigned long pages = 0;
220 unsigned int i = 0;
221 char *p = "-\\|/";
222 struct timeval start, stop;
223
224 printk(KERN_INFO "PM: Shrinking memory... ");
225 do_gettimeofday(&start);
226 do {
227 long size, highmem_size;
228
229 highmem_size = count_highmem_pages();
230 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
231 tmp = size;
232 size += highmem_size;
233 for_each_populated_zone(zone) {
234 tmp += snapshot_additional_pages(zone);
235 if (is_highmem(zone)) {
236 highmem_size -=
237 zone_page_state(zone, NR_FREE_PAGES);
238 } else {
239 tmp -= zone_page_state(zone, NR_FREE_PAGES);
240 tmp += zone->lowmem_reserve[ZONE_NORMAL];
241 }
242 }
243
244 if (highmem_size < 0)
245 highmem_size = 0;
246
247 tmp += highmem_size;
248 if (tmp > 0) {
249 tmp = __shrink_memory(tmp);
250 if (!tmp)
251 return -ENOMEM;
252 pages += tmp;
253 } else if (size > image_size / PAGE_SIZE) {
254 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
255 pages += tmp;
256 }
257 printk("\b%c", p[i++%4]);
258 } while (tmp > 0);
259 do_gettimeofday(&stop);
260 printk("\bdone (%lu pages freed)\n", pages);
261 swsusp_show_speed(&start, &stop, pages, "Freed");
262
263 return 0;
264}
265
266/*
267 * Platforms, like ACPI, may want us to save some memory used by them during
268 * hibernation and to restore the contents of this memory during the subsequent
269 * resume. The code below implements a mechanism allowing us to do that.
270 */
271
272struct nvs_page {
273 unsigned long phys_start;
274 unsigned int size;
275 void *kaddr;
276 void *data;
277 struct list_head node;
278};
279
280static LIST_HEAD(nvs_list);
281
282/**
283 * hibernate_nvs_register - register platform NVS memory region to save
284 * @start - physical address of the region
285 * @size - size of the region
286 *
287 * The NVS region need not be page-aligned (both ends) and we arrange
288 * things so that the data from page-aligned addresses in this region will
289 * be copied into separate RAM pages.
290 */
291int hibernate_nvs_register(unsigned long start, unsigned long size)
292{
293 struct nvs_page *entry, *next;
294
295 while (size > 0) {
296 unsigned int nr_bytes;
297
298 entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
299 if (!entry)
300 goto Error;
301
302 list_add_tail(&entry->node, &nvs_list);
303 entry->phys_start = start;
304 nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
305 entry->size = (size < nr_bytes) ? size : nr_bytes;
306
307 start += entry->size;
308 size -= entry->size;
309 }
310 return 0;
311
312 Error:
313 list_for_each_entry_safe(entry, next, &nvs_list, node) {
314 list_del(&entry->node);
315 kfree(entry);
316 }
317 return -ENOMEM;
318}
319
320/**
321 * hibernate_nvs_free - free data pages allocated for saving NVS regions
322 */
323void hibernate_nvs_free(void)
324{
325 struct nvs_page *entry;
326
327 list_for_each_entry(entry, &nvs_list, node)
328 if (entry->data) {
329 free_page((unsigned long)entry->data);
330 entry->data = NULL;
331 if (entry->kaddr) {
332 iounmap(entry->kaddr);
333 entry->kaddr = NULL;
334 }
335 }
336}
337
338/**
339 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions
340 */
341int hibernate_nvs_alloc(void)
342{
343 struct nvs_page *entry;
344
345 list_for_each_entry(entry, &nvs_list, node) {
346 entry->data = (void *)__get_free_page(GFP_KERNEL);
347 if (!entry->data) {
348 hibernate_nvs_free();
349 return -ENOMEM;
350 }
351 }
352 return 0;
353}
354
355/**
356 * hibernate_nvs_save - save NVS memory regions
357 */
358void hibernate_nvs_save(void)
359{
360 struct nvs_page *entry;
361
362 printk(KERN_INFO "PM: Saving platform NVS memory\n");
363
364 list_for_each_entry(entry, &nvs_list, node)
365 if (entry->data) {
366 entry->kaddr = ioremap(entry->phys_start, entry->size);
367 memcpy(entry->data, entry->kaddr, entry->size);
368 }
369}
370
371/**
372 * hibernate_nvs_restore - restore NVS memory regions
373 *
374 * This function is going to be called with interrupts disabled, so it
375 * cannot iounmap the virtual addresses used to access the NVS region.
376 */
377void hibernate_nvs_restore(void)
378{
379 struct nvs_page *entry;
380
381 printk(KERN_INFO "PM: Restoring platform NVS memory\n");
382
383 list_for_each_entry(entry, &nvs_list, node)
384 if (entry->data)
385 memcpy(entry->kaddr, entry->data, entry->size);
386}
diff --git a/kernel/printk.c b/kernel/printk.c
index 5052b5497c67..b4d97b54c1ec 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -687,20 +687,35 @@ asmlinkage int vprintk(const char *fmt, va_list args)
687 sizeof(printk_buf) - printed_len, fmt, args); 687 sizeof(printk_buf) - printed_len, fmt, args);
688 688
689 689
690 p = printk_buf;
691
692 /* Do we have a loglevel in the string? */
693 if (p[0] == '<') {
694 unsigned char c = p[1];
695 if (c && p[2] == '>') {
696 switch (c) {
697 case '0' ... '7': /* loglevel */
698 current_log_level = c - '0';
699 /* Fallthrough - make sure we're on a new line */
700 case 'd': /* KERN_DEFAULT */
701 if (!new_text_line) {
702 emit_log_char('\n');
703 new_text_line = 1;
704 }
705 /* Fallthrough - skip the loglevel */
706 case 'c': /* KERN_CONT */
707 p += 3;
708 break;
709 }
710 }
711 }
712
690 /* 713 /*
691 * Copy the output into log_buf. If the caller didn't provide 714 * Copy the output into log_buf. If the caller didn't provide
692 * appropriate log level tags, we insert them here 715 * appropriate log level tags, we insert them here
693 */ 716 */
694 for (p = printk_buf; *p; p++) { 717 for ( ; *p; p++) {
695 if (new_text_line) { 718 if (new_text_line) {
696 /* If a token, set current_log_level and skip over */
697 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
698 p[2] == '>') {
699 current_log_level = p[1] - '0';
700 p += 3;
701 printed_len -= 3;
702 }
703
704 /* Always output the token */ 719 /* Always output the token */
705 emit_log_char('<'); 720 emit_log_char('<');
706 emit_log_char(current_log_level + '0'); 721 emit_log_char(current_log_level + '0');
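
For reference, the prefixes the reworked parser recognises are what the KERN_* macros expand to in this kernel; a short sketch of the three cases (the messages themselves are made up):

	printk(KERN_WARNING "PM: something looks wrong\n");	/* "<4>" sets the loglevel and starts a new line */
	printk(KERN_CONT "... still the same line\n");		/* "<c>" continues the current line */
	printk(KERN_DEFAULT "fresh line, default level\n");	/* "<d>" forces a new line at the default level */
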
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae..69911b5745eb 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
111 /* only text is profiled */ 111 /* only text is profiled */
112 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
113 buffer_bytes = prof_len*sizeof(atomic_t); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 alloc_bootmem_cpumask_var(&prof_cpu_mask);
117 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
118 return 0;
119 }
120 114
121 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) 115 if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
122 return -ENOMEM; 116 return -ENOMEM;
@@ -371,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
371 node = cpu_to_node(cpu); 365 node = cpu_to_node(cpu);
372 per_cpu(cpu_profile_flip, cpu) = 0; 366 per_cpu(cpu_profile_flip, cpu) = 0;
373 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 367 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
374 page = alloc_pages_node(node, 368 page = alloc_pages_exact_node(node,
375 GFP_KERNEL | __GFP_ZERO, 369 GFP_KERNEL | __GFP_ZERO,
376 0); 370 0);
377 if (!page) 371 if (!page)
@@ -379,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
379 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 373 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
380 } 374 }
381 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 375 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
382 page = alloc_pages_node(node, 376 page = alloc_pages_exact_node(node,
383 GFP_KERNEL | __GFP_ZERO, 377 GFP_KERNEL | __GFP_ZERO,
384 0); 378 0);
385 if (!page) 379 if (!page)
@@ -570,14 +564,14 @@ static int create_hash_tables(void)
570 int node = cpu_to_node(cpu); 564 int node = cpu_to_node(cpu);
571 struct page *page; 565 struct page *page;
572 566
573 page = alloc_pages_node(node, 567 page = alloc_pages_exact_node(node,
574 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 568 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
575 0); 569 0);
576 if (!page) 570 if (!page)
577 goto out_cleanup; 571 goto out_cleanup;
578 per_cpu(cpu_profile_hits, cpu)[1] 572 per_cpu(cpu_profile_hits, cpu)[1]
579 = (struct profile_hit *)page_address(page); 573 = (struct profile_hit *)page_address(page);
580 page = alloc_pages_node(node, 574 page = alloc_pages_exact_node(node,
581 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 575 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
582 0); 576 0);
583 if (!page) 577 if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0692ab5a0d67..61c78b2c07ba 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -25,16 +25,6 @@
25 25
26 26
27/* 27/*
28 * Initialize a new task whose father had been ptraced.
29 *
30 * Called from copy_process().
31 */
32void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
33{
34 arch_ptrace_fork(child, clone_flags);
35}
36
37/*
38 * ptrace a task: make the debugger its new parent and 28 * ptrace a task: make the debugger its new parent and
39 * move it to the ptrace list. 29 * move it to the ptrace list.
40 * 30 *
@@ -177,66 +167,82 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
177int ptrace_attach(struct task_struct *task) 167int ptrace_attach(struct task_struct *task)
178{ 168{
179 int retval; 169 int retval;
180 unsigned long flags;
181 170
182 audit_ptrace(task); 171 audit_ptrace(task);
183 172
184 retval = -EPERM; 173 retval = -EPERM;
174 if (unlikely(task->flags & PF_KTHREAD))
175 goto out;
185 if (same_thread_group(task, current)) 176 if (same_thread_group(task, current))
186 goto out; 177 goto out;
187 178
188 /* Protect exec's credential calculations against our interference; 179 /*
189 * SUID, SGID and LSM creds get determined differently under ptrace. 180 * Protect exec's credential calculations against our interference;
181 * SUID, SGID and LSM creds get determined differently
182 * under ptrace.
190 */ 183 */
191 retval = mutex_lock_interruptible(&task->cred_exec_mutex); 184 retval = mutex_lock_interruptible(&task->cred_guard_mutex);
192 if (retval < 0) 185 if (retval < 0)
193 goto out; 186 goto out;
194 187
195 retval = -EPERM;
196repeat:
197 /*
198 * Nasty, nasty.
199 *
200 * We want to hold both the task-lock and the
201 * tasklist_lock for writing at the same time.
202 * But that's against the rules (tasklist_lock
203 * is taken for reading by interrupts on other
204 * cpu's that may have task_lock).
205 */
206 task_lock(task); 188 task_lock(task);
207 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
208 task_unlock(task);
209 do {
210 cpu_relax();
211 } while (!write_can_lock(&tasklist_lock));
212 goto repeat;
213 }
214
215 if (!task->mm)
216 goto bad;
217 /* the same process cannot be attached many times */
218 if (task->ptrace & PT_PTRACED)
219 goto bad;
220 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); 189 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
190 task_unlock(task);
221 if (retval) 191 if (retval)
222 goto bad; 192 goto unlock_creds;
193
194 write_lock_irq(&tasklist_lock);
195 retval = -EPERM;
196 if (unlikely(task->exit_state))
197 goto unlock_tasklist;
198 if (task->ptrace)
199 goto unlock_tasklist;
223 200
224 /* Go */ 201 task->ptrace = PT_PTRACED;
225 task->ptrace |= PT_PTRACED;
226 if (capable(CAP_SYS_PTRACE)) 202 if (capable(CAP_SYS_PTRACE))
227 task->ptrace |= PT_PTRACE_CAP; 203 task->ptrace |= PT_PTRACE_CAP;
228 204
229 __ptrace_link(task, current); 205 __ptrace_link(task, current);
230
231 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); 206 send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
232bad: 207
233 write_unlock_irqrestore(&tasklist_lock, flags); 208 retval = 0;
234 task_unlock(task); 209unlock_tasklist:
235 mutex_unlock(&task->cred_exec_mutex); 210 write_unlock_irq(&tasklist_lock);
211unlock_creds:
212 mutex_unlock(&task->cred_guard_mutex);
236out: 213out:
237 return retval; 214 return retval;
238} 215}
239 216
217/**
218 * ptrace_traceme -- helper for PTRACE_TRACEME
219 *
220 * Performs checks and sets PT_PTRACED.
221 * Should be used by all ptrace implementations for PTRACE_TRACEME.
222 */
223int ptrace_traceme(void)
224{
225 int ret = -EPERM;
226
227 write_lock_irq(&tasklist_lock);
228 /* Are we already being traced? */
229 if (!current->ptrace) {
230 ret = security_ptrace_traceme(current->parent);
231 /*
232 * Check PF_EXITING to ensure ->real_parent has not passed
233 * exit_ptrace(). Otherwise we don't report the error but
234 * pretend ->real_parent untraces us right after return.
235 */
236 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
237 current->ptrace = PT_PTRACED;
238 __ptrace_link(current, current->real_parent);
239 }
240 }
241 write_unlock_irq(&tasklist_lock);
242
243 return ret;
244}
245
240/* 246/*
241 * Called with irqs disabled, returns true if childs should reap themselves. 247 * Called with irqs disabled, returns true if childs should reap themselves.
242 */ 248 */
@@ -304,6 +310,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
304 if (child->ptrace) { 310 if (child->ptrace) {
305 child->exit_code = data; 311 child->exit_code = data;
306 dead = __ptrace_detach(current, child); 312 dead = __ptrace_detach(current, child);
313 if (!child->exit_state)
314 wake_up_process(child);
307 } 315 }
308 write_unlock_irq(&tasklist_lock); 316 write_unlock_irq(&tasklist_lock);
309 317
@@ -416,37 +424,33 @@ static int ptrace_setoptions(struct task_struct *child, long data)
416 424
417static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 425static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
418{ 426{
427 unsigned long flags;
419 int error = -ESRCH; 428 int error = -ESRCH;
420 429
421 read_lock(&tasklist_lock); 430 if (lock_task_sighand(child, &flags)) {
422 if (likely(child->sighand != NULL)) {
423 error = -EINVAL; 431 error = -EINVAL;
424 spin_lock_irq(&child->sighand->siglock);
425 if (likely(child->last_siginfo != NULL)) { 432 if (likely(child->last_siginfo != NULL)) {
426 *info = *child->last_siginfo; 433 *info = *child->last_siginfo;
427 error = 0; 434 error = 0;
428 } 435 }
429 spin_unlock_irq(&child->sighand->siglock); 436 unlock_task_sighand(child, &flags);
430 } 437 }
431 read_unlock(&tasklist_lock);
432 return error; 438 return error;
433} 439}
434 440
435static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) 441static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
436{ 442{
443 unsigned long flags;
437 int error = -ESRCH; 444 int error = -ESRCH;
438 445
439 read_lock(&tasklist_lock); 446 if (lock_task_sighand(child, &flags)) {
440 if (likely(child->sighand != NULL)) {
441 error = -EINVAL; 447 error = -EINVAL;
442 spin_lock_irq(&child->sighand->siglock);
443 if (likely(child->last_siginfo != NULL)) { 448 if (likely(child->last_siginfo != NULL)) {
444 *child->last_siginfo = *info; 449 *child->last_siginfo = *info;
445 error = 0; 450 error = 0;
446 } 451 }
447 spin_unlock_irq(&child->sighand->siglock); 452 unlock_task_sighand(child, &flags);
448 } 453 }
449 read_unlock(&tasklist_lock);
450 return error; 454 return error;
451} 455}
452 456
@@ -573,72 +577,16 @@ int ptrace_request(struct task_struct *child, long request,
573 return ret; 577 return ret;
574} 578}
575 579
576/** 580static struct task_struct *ptrace_get_task_struct(pid_t pid)
577 * ptrace_traceme -- helper for PTRACE_TRACEME
578 *
579 * Performs checks and sets PT_PTRACED.
580 * Should be used by all ptrace implementations for PTRACE_TRACEME.
581 */
582int ptrace_traceme(void)
583{
584 int ret = -EPERM;
585
586 /*
587 * Are we already being traced?
588 */
589repeat:
590 task_lock(current);
591 if (!(current->ptrace & PT_PTRACED)) {
592 /*
593 * See ptrace_attach() comments about the locking here.
594 */
595 unsigned long flags;
596 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
597 task_unlock(current);
598 do {
599 cpu_relax();
600 } while (!write_can_lock(&tasklist_lock));
601 goto repeat;
602 }
603
604 ret = security_ptrace_traceme(current->parent);
605
606 /*
607 * Check PF_EXITING to ensure ->real_parent has not passed
608 * exit_ptrace(). Otherwise we don't report the error but
609 * pretend ->real_parent untraces us right after return.
610 */
611 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
612 current->ptrace |= PT_PTRACED;
613 __ptrace_link(current, current->real_parent);
614 }
615
616 write_unlock_irqrestore(&tasklist_lock, flags);
617 }
618 task_unlock(current);
619 return ret;
620}
621
622/**
623 * ptrace_get_task_struct -- grab a task struct reference for ptrace
624 * @pid: process id to grab a task_struct reference of
625 *
626 * This function is a helper for ptrace implementations. It checks
627 * permissions and then grabs a task struct for use of the actual
628 * ptrace implementation.
629 *
630 * Returns the task_struct for @pid or an ERR_PTR() on failure.
631 */
632struct task_struct *ptrace_get_task_struct(pid_t pid)
633{ 581{
634 struct task_struct *child; 582 struct task_struct *child;
635 583
636 read_lock(&tasklist_lock); 584 rcu_read_lock();
637 child = find_task_by_vpid(pid); 585 child = find_task_by_vpid(pid);
638 if (child) 586 if (child)
639 get_task_struct(child); 587 get_task_struct(child);
588 rcu_read_unlock();
640 589
641 read_unlock(&tasklist_lock);
642 if (!child) 590 if (!child)
643 return ERR_PTR(-ESRCH); 591 return ERR_PTR(-ESRCH);
644 return child; 592 return child;
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d3..beb0e659adcc 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
1356 1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; 1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); 1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; 1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq, 1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, 1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret); 1362 ret);
1363 1363
1364 /*
1365 * Signals would prevent us from sleeping, and we cannot
1366 * do much with them in any case. So flush them.
1367 */
1368 if (ret)
1369 flush_signals(current);
1370 couldsleepnext = 0; 1364 couldsleepnext = 0;
1371 1365
1372 } while (!kthread_should_stop()); 1366 } while (!kthread_should_stop());
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9b..0dccfbba6d26 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1259 check_cpu_stall(rsp, rdp); 1259 check_cpu_stall(rsp, rdp);
1260 1260
1261 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1261 /* Is the RCU core waiting for a quiescent state from this CPU? */
1262 if (rdp->qs_pending) 1262 if (rdp->qs_pending) {
1263 rdp->n_rp_qs_pending++;
1263 return 1; 1264 return 1;
1265 }
1264 1266
1265 /* Does this CPU have callbacks ready to invoke? */ 1267 /* Does this CPU have callbacks ready to invoke? */
1266 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1268 if (cpu_has_callbacks_ready_to_invoke(rdp)) {
1269 rdp->n_rp_cb_ready++;
1267 return 1; 1270 return 1;
1271 }
1268 1272
1269 /* Has RCU gone idle with this CPU needing another grace period? */ 1273 /* Has RCU gone idle with this CPU needing another grace period? */
1270 if (cpu_needs_another_gp(rsp, rdp)) 1274 if (cpu_needs_another_gp(rsp, rdp)) {
1275 rdp->n_rp_cpu_needs_gp++;
1271 return 1; 1276 return 1;
1277 }
1272 1278
1273 /* Has another RCU grace period completed? */ 1279 /* Has another RCU grace period completed? */
1274 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ 1280 if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
1281 rdp->n_rp_gp_completed++;
1275 return 1; 1282 return 1;
1283 }
1276 1284
1277 /* Has a new RCU grace period started? */ 1285 /* Has a new RCU grace period started? */
1278 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ 1286 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
1287 rdp->n_rp_gp_started++;
1279 return 1; 1288 return 1;
1289 }
1280 1290
1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1291 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1292 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) 1293 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1294 rdp->n_rp_need_fqs++;
1284 return 1; 1295 return 1;
1296 }
1285 1297
1286 /* nothing to do */ 1298 /* nothing to do */
1299 rdp->n_rp_need_nothing++;
1287 return 0; 1300 return 0;
1288} 1301}
1289 1302
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba9404..fe1dcdbf1ca3 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
213 .release = single_release, 213 .release = single_release,
214}; 214};
215 215
216static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; 216static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
217{
218 seq_printf(m, "%3d%cnp=%ld "
219 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
220 rdp->cpu,
221 cpu_is_offline(rdp->cpu) ? '!' : ' ',
222 rdp->n_rcu_pending,
223 rdp->n_rp_qs_pending,
224 rdp->n_rp_cb_ready,
225 rdp->n_rp_cpu_needs_gp,
226 rdp->n_rp_gp_completed,
227 rdp->n_rp_gp_started,
228 rdp->n_rp_need_fqs,
229 rdp->n_rp_need_nothing);
230}
231
232static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
233{
234 int cpu;
235 struct rcu_data *rdp;
236
237 for_each_possible_cpu(cpu) {
238 rdp = rsp->rda[cpu];
239 if (rdp->beenonline)
240 print_one_rcu_pending(m, rdp);
241 }
242}
243
244static int show_rcu_pending(struct seq_file *m, void *unused)
245{
246 seq_puts(m, "rcu:\n");
247 print_rcu_pendings(m, &rcu_state);
248 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state);
250 return 0;
251}
252
253static int rcu_pending_open(struct inode *inode, struct file *file)
254{
255 return single_open(file, show_rcu_pending, NULL);
256}
257
258static struct file_operations rcu_pending_fops = {
259 .owner = THIS_MODULE,
260 .open = rcu_pending_open,
261 .read = seq_read,
262 .llseek = seq_lseek,
263 .release = single_release,
264};
265
266static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272
217static int __init rcuclassic_trace_init(void) 273static int __init rcuclassic_trace_init(void)
218{ 274{
219 rcudir = debugfs_create_dir("rcu", NULL); 275 rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
238 NULL, &rcuhier_fops); 294 NULL, &rcuhier_fops);
239 if (!hierdir) 295 if (!hierdir)
240 goto free_out; 296 goto free_out;
297
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir)
301 goto free_out;
241 return 0; 302 return 0;
242free_out: 303free_out:
243 if (datadir) 304 if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
257 debugfs_remove(datadir_csv); 318 debugfs_remove(datadir_csv);
258 debugfs_remove(gpdir); 319 debugfs_remove(gpdir);
259 debugfs_remove(hierdir); 320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
260 debugfs_remove(rcudir); 322 debugfs_remove(rcudir);
261} 323}
262 324
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bf8e7534c803..e1338f074314 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -18,7 +18,7 @@
18void res_counter_init(struct res_counter *counter, struct res_counter *parent) 18void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = (unsigned long long)LLONG_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->parent = parent; 22 counter->parent = parent;
23} 23}
24 24
@@ -133,6 +133,16 @@ int res_counter_memparse_write_strategy(const char *buf,
133 unsigned long long *res) 133 unsigned long long *res)
134{ 134{
135 char *end; 135 char *end;
136
137 /* return RESOURCE_MAX(unlimited) if "-1" is specified */
138 if (*buf == '-') {
139 *res = simple_strtoull(buf + 1, &end, 10);
140 if (*res != 1 || *end != '\0')
141 return -EINVAL;
142 *res = RESOURCE_MAX;
143 return 0;
144 }
145
136 /* FIXME - make memparse() take const char* args */ 146 /* FIXME - make memparse() take const char* args */
137 *res = memparse((char *)buf, &end); 147 *res = memparse((char *)buf, &end);
138 if (*end != '\0') 148 if (*end != '\0')
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..fcd107a78c5a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
300 * assigned pending owner [which might not have taken the 300 * assigned pending owner [which might not have taken the
301 * lock yet]: 301 * lock yet]:
302 */ 302 */
303static inline int try_to_steal_lock(struct rt_mutex *lock) 303static inline int try_to_steal_lock(struct rt_mutex *lock,
304 struct task_struct *task)
304{ 305{
305 struct task_struct *pendowner = rt_mutex_owner(lock); 306 struct task_struct *pendowner = rt_mutex_owner(lock);
306 struct rt_mutex_waiter *next; 307 struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
309 if (!rt_mutex_owner_pending(lock)) 310 if (!rt_mutex_owner_pending(lock))
310 return 0; 311 return 0;
311 312
312 if (pendowner == current) 313 if (pendowner == task)
313 return 1; 314 return 1;
314 315
315 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 spin_lock_irqsave(&pendowner->pi_lock, flags);
316 if (current->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
317 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
318 return 0; 319 return 0;
319 } 320 }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
338 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
339 * enqueued on the pending owners pi_waiters queue. So 340 * enqueued on the pending owners pi_waiters queue. So
340 * we have to enqueue this waiter into 341 * we have to enqueue this waiter into
341 * current->pi_waiters list. This covers the case, 342 * task->pi_waiters list. This covers the case,
342 * where current is boosted because it holds another 343 * where task is boosted because it holds another
343 * lock and gets unboosted because the booster is 344 * lock and gets unboosted because the booster is
344 * interrupted, so we would delay a waiter with higher 345 * interrupted, so we would delay a waiter with higher
345 * priority as current->normal_prio. 346 * priority as task->normal_prio.
346 * 347 *
347 * Note: in the rare case of a SCHED_OTHER task changing 348 * Note: in the rare case of a SCHED_OTHER task changing
348 * its priority and thus stealing the lock, next->task 349 * its priority and thus stealing the lock, next->task
349 * might be current: 350 * might be task:
350 */ 351 */
351 if (likely(next->task != current)) { 352 if (likely(next->task != task)) {
352 spin_lock_irqsave(&current->pi_lock, flags); 353 spin_lock_irqsave(&task->pi_lock, flags);
353 plist_add(&next->pi_list_entry, &current->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
354 __rt_mutex_adjust_prio(current); 355 __rt_mutex_adjust_prio(task);
355 spin_unlock_irqrestore(&current->pi_lock, flags); 356 spin_unlock_irqrestore(&task->pi_lock, flags);
356 } 357 }
357 return 1; 358 return 1;
358} 359}
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
389 */ 390 */
390 mark_rt_mutex_waiters(lock); 391 mark_rt_mutex_waiters(lock);
391 392
392 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) 393 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
393 return 0; 394 return 0;
394 395
395 /* We got the lock. */ 396 /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
411 */ 412 */
412static int task_blocks_on_rt_mutex(struct rt_mutex *lock, 413static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
413 struct rt_mutex_waiter *waiter, 414 struct rt_mutex_waiter *waiter,
415 struct task_struct *task,
414 int detect_deadlock) 416 int detect_deadlock)
415{ 417{
416 struct task_struct *owner = rt_mutex_owner(lock); 418 struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
418 unsigned long flags; 420 unsigned long flags;
419 int chain_walk = 0, res; 421 int chain_walk = 0, res;
420 422
421 spin_lock_irqsave(&current->pi_lock, flags); 423 spin_lock_irqsave(&task->pi_lock, flags);
422 __rt_mutex_adjust_prio(current); 424 __rt_mutex_adjust_prio(task);
423 waiter->task = current; 425 waiter->task = task;
424 waiter->lock = lock; 426 waiter->lock = lock;
425 plist_node_init(&waiter->list_entry, current->prio); 427 plist_node_init(&waiter->list_entry, task->prio);
426 plist_node_init(&waiter->pi_list_entry, current->prio); 428 plist_node_init(&waiter->pi_list_entry, task->prio);
427 429
428 /* Get the top priority waiter on the lock */ 430 /* Get the top priority waiter on the lock */
429 if (rt_mutex_has_waiters(lock)) 431 if (rt_mutex_has_waiters(lock))
430 top_waiter = rt_mutex_top_waiter(lock); 432 top_waiter = rt_mutex_top_waiter(lock);
431 plist_add(&waiter->list_entry, &lock->wait_list); 433 plist_add(&waiter->list_entry, &lock->wait_list);
432 434
433 current->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
434 436
435 spin_unlock_irqrestore(&current->pi_lock, flags); 437 spin_unlock_irqrestore(&task->pi_lock, flags);
436 438
437 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
438 spin_lock_irqsave(&owner->pi_lock, flags); 440 spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
460 spin_unlock(&lock->wait_lock); 462 spin_unlock(&lock->wait_lock);
461 463
462 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
463 current); 465 task);
464 466
465 spin_lock(&lock->wait_lock); 467 spin_lock(&lock->wait_lock);
466 468
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
605 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); 607 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
606} 608}
607 609
608/* 610/**
609 * Slow path lock function: 611 * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
612 * @lock: the rt_mutex to take
613 * @state: the state the task should block in (TASK_INTERRUPTIBLE
614 * or TASK_UNINTERRUPTIBLE)
615 * @timeout: the pre-initialized and started timer, or NULL for none
616 * @waiter: the pre-initialized rt_mutex_waiter
617 * @detect_deadlock: passed to task_blocks_on_rt_mutex
618 *
619 * lock->wait_lock must be held by the caller.
610 */ 620 */
611static int __sched 621static int __sched
612rt_mutex_slowlock(struct rt_mutex *lock, int state, 622__rt_mutex_slowlock(struct rt_mutex *lock, int state,
613 struct hrtimer_sleeper *timeout, 623 struct hrtimer_sleeper *timeout,
614 int detect_deadlock) 624 struct rt_mutex_waiter *waiter,
625 int detect_deadlock)
615{ 626{
616 struct rt_mutex_waiter waiter;
617 int ret = 0; 627 int ret = 0;
618 628
619 debug_rt_mutex_init_waiter(&waiter);
620 waiter.task = NULL;
621
622 spin_lock(&lock->wait_lock);
623
624 /* Try to acquire the lock again: */
625 if (try_to_take_rt_mutex(lock)) {
626 spin_unlock(&lock->wait_lock);
627 return 0;
628 }
629
630 set_current_state(state);
631
632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) {
634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 if (!hrtimer_active(&timeout->timer))
636 timeout->task = NULL;
637 }
638
639 for (;;) { 629 for (;;) {
640 /* Try to acquire the lock: */ 630 /* Try to acquire the lock: */
641 if (try_to_take_rt_mutex(lock)) 631 if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
656 } 646 }
657 647
658 /* 648 /*
659 * waiter.task is NULL the first time we come here and 649 * waiter->task is NULL the first time we come here and
660 * when we have been woken up by the previous owner 650 * when we have been woken up by the previous owner
661 * but the lock got stolen by a higher prio task. 651 * but the lock got stolen by a higher prio task.
662 */ 652 */
663 if (!waiter.task) { 653 if (!waiter->task) {
664 ret = task_blocks_on_rt_mutex(lock, &waiter, 654 ret = task_blocks_on_rt_mutex(lock, waiter, current,
665 detect_deadlock); 655 detect_deadlock);
666 /* 656 /*
667 * If we got woken up by the owner then start loop 657 * If we got woken up by the owner then start loop
668 * all over without going into schedule to try 658 * all over without going into schedule to try
669 * to get the lock now: 659 * to get the lock now:
670 */ 660 */
671 if (unlikely(!waiter.task)) { 661 if (unlikely(!waiter->task)) {
672 /* 662 /*
673 * Reset the return value. We might 663 * Reset the return value. We might
674 * have returned with -EDEADLK and the 664 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
684 674
685 spin_unlock(&lock->wait_lock); 675 spin_unlock(&lock->wait_lock);
686 676
687 debug_rt_mutex_print_deadlock(&waiter); 677 debug_rt_mutex_print_deadlock(waiter);
688 678
689 if (waiter.task) 679 if (waiter->task)
690 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
691 681
692 spin_lock(&lock->wait_lock); 682 spin_lock(&lock->wait_lock);
693 set_current_state(state); 683 set_current_state(state);
694 } 684 }
695 685
686 return ret;
687}
688
689/*
690 * Slow path lock function:
691 */
692static int __sched
693rt_mutex_slowlock(struct rt_mutex *lock, int state,
694 struct hrtimer_sleeper *timeout,
695 int detect_deadlock)
696{
697 struct rt_mutex_waiter waiter;
698 int ret = 0;
699
700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL;
702
703 spin_lock(&lock->wait_lock);
704
705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock);
708 return 0;
709 }
710
711 set_current_state(state);
712
713 /* Setup the timer, when timeout != NULL */
714 if (unlikely(timeout)) {
715 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
716 if (!hrtimer_active(&timeout->timer))
717 timeout->task = NULL;
718 }
719
720 ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
721 detect_deadlock);
722
696 set_current_state(TASK_RUNNING); 723 set_current_state(TASK_RUNNING);
697 724
698 if (unlikely(waiter.task)) 725 if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 891EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 892
866/** 893/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 894 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 895 * the timeout structure is provided
869 * by the caller 896 * by the caller
870 * 897 *
871 * @lock: the rt_mutex to be locked 898 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 899 * @timeout: timeout structure or NULL (no timeout)
@@ -875,7 +902,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
875 * Returns: 902 * Returns:
876 * 0 on success 903 * 0 on success
877 * -EINTR when interrupted by a signal 904 * -EINTR when interrupted by a signal
878 * -ETIMEOUT when the timeout expired 905 * -ETIMEDOUT when the timeout expired
879 * -EDEADLK when the lock would deadlock (when deadlock detection is on) 906 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
880 */ 907 */
881int 908int
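As a rough illustration of the return codes documented above, a caller might drive rt_mutex_timed_lock() along these lines. This is only a sketch, assuming the 2.6.30-era hrtimer_sleeper API (two-argument hrtimer_init_sleeper()); the helper name is made up.

#include <linux/rtmutex.h>
#include <linux/hrtimer.h>
#include <linux/sched.h>

/* Sketch: take @lock, giving up at the absolute @deadline. */
static int example_timed_lock(struct rt_mutex *lock, ktime_t deadline)
{
        struct hrtimer_sleeper to;
        int ret;

        hrtimer_init_on_stack(&to.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        hrtimer_init_sleeper(&to, current);
        hrtimer_set_expires(&to.timer, deadline);

        /* 0, -EINTR, -ETIMEDOUT or -EDEADLK, as documented above */
        ret = rt_mutex_timed_lock(lock, &to, 0);

        destroy_hrtimer_on_stack(&to.timer);
        return ret;
}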
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 940}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 941EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 942
916/*** 943/**
917 * rt_mutex_destroy - mark a mutex unusable 944 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 945 * @lock: the mutex to be destroyed
919 * 946 *
@@ -986,6 +1013,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
986} 1013}
987 1014
988/** 1015/**
1016 * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
1017 * @lock: the rt_mutex to take
1018 * @waiter: the pre-initialized rt_mutex_waiter
1019 * @task: the task to prepare
1020 * @detect_deadlock: perform deadlock detection (1) or not (0)
1021 *
1022 * Returns:
1023 * 0 - task blocked on lock
1024 * 1 - acquired the lock for task, caller should wake it up
1025 * <0 - error
1026 *
1027 * Special API call for FUTEX_REQUEUE_PI support.
1028 */
1029int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1030 struct rt_mutex_waiter *waiter,
1031 struct task_struct *task, int detect_deadlock)
1032{
1033 int ret;
1034
1035 spin_lock(&lock->wait_lock);
1036
1037 mark_rt_mutex_waiters(lock);
1038
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0);
1044
1045 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1;
1047 }
1048
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050
1051
1052 if (ret && !waiter->task) {
1053 /*
1054 * Reset the return value. We might have
1055 * returned with -EDEADLK and the owner
1056 * released the lock while we were walking the
1057 * pi chain. Let the waiter sort it out.
1058 */
1059 ret = 0;
1060 }
1061 spin_unlock(&lock->wait_lock);
1062
1063 debug_rt_mutex_print_deadlock(waiter);
1064
1065 return ret;
1066}
1067
1068/**
989 * rt_mutex_next_owner - return the next owner of the lock 1069 * rt_mutex_next_owner - return the next owner of the lock
990 * 1070 *
991 * @lock: the rt lock query 1071 * @lock: the rt lock query
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
1004 1084
1005 return rt_mutex_top_waiter(lock)->task; 1085 return rt_mutex_top_waiter(lock)->task;
1006} 1086}
1087
1088/**
1089 * rt_mutex_finish_proxy_lock() - Complete lock acquisition
1090 * @lock: the rt_mutex we were woken on
1091 * @to: the timeout, NULL if none. The hrtimer should already have
1092 * been started.
1093 * @waiter: the pre-initialized rt_mutex_waiter
1094 * @detect_deadlock: perform deadlock detection (1) or not (0)
1095 *
1096 * Complete the lock acquisition started on our behalf by another thread.
1097 *
1098 * Returns:
1099 * 0 - success
1100 * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
1101 *
1102 * Special API call for PI-futex requeue support
1103 */
1104int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1105 struct hrtimer_sleeper *to,
1106 struct rt_mutex_waiter *waiter,
1107 int detect_deadlock)
1108{
1109 int ret;
1110
1111 spin_lock(&lock->wait_lock);
1112
1113 set_current_state(TASK_INTERRUPTIBLE);
1114
1115 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
1116 detect_deadlock);
1117
1118 set_current_state(TASK_RUNNING);
1119
1120 if (unlikely(waiter->task))
1121 remove_waiter(lock, waiter);
1122
1123 /*
1124 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
1125 * have to fix that up.
1126 */
1127 fixup_rt_mutex_waiters(lock);
1128
1129 spin_unlock(&lock->wait_lock);
1130
1131 /*
1132 * Readjust priority when we did not get the lock. We might have been
1133 * the pending owner and boosted. Since we did not take the lock, the
1134 * PI boost has to go.
1135 */
1136 if (unlikely(ret))
1137 rt_mutex_adjust_prio(current);
1138
1139 return ret;
1140}
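To tie the two new proxy-lock entry points together: the requeue side blocks (or acquires) the lock on behalf of the waiter from the waker's context, and the woken task later completes the acquisition itself. A minimal sketch under that reading; the example_* names are hypothetical and not part of the patch.

#include <linux/rtmutex.h>
#include "rtmutex_common.h"

/* Requeue side: runs in the waker's context on behalf of @task. */
static int example_start(struct rt_mutex *pi_mutex,
                         struct rt_mutex_waiter *waiter,
                         struct task_struct *task)
{
        int ret = rt_mutex_start_proxy_lock(pi_mutex, waiter, task, 0);

        if (ret == 1)
                return 1;       /* lock taken for @task: caller wakes it up */
        if (ret < 0)
                return ret;     /* e.g. -EDEADLK */
        return 0;               /* @task is now blocked on pi_mutex */
}

/* Waiter side: runs in @task's own context after it wakes up. */
static int example_finish(struct rt_mutex *pi_mutex,
                          struct hrtimer_sleeper *to,
                          struct rt_mutex_waiter *waiter)
{
        /* 0 on success, otherwise -EINTR, -ETIMEDOUT or -EDEADLK */
        return rt_mutex_finish_proxy_lock(pi_mutex, to, waiter, 0);
}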
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea..97a2f81866af 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 122 struct task_struct *proxy_owner);
123extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
124 struct rt_mutex_waiter *waiter,
125 struct task_struct *task,
126 int detect_deadlock);
127extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
128 struct hrtimer_sleeper *to,
129 struct rt_mutex_waiter *waiter,
130 int detect_deadlock);
123 131
124#ifdef CONFIG_DEBUG_RT_MUTEXES 132#ifdef CONFIG_DEBUG_RT_MUTEXES
125# include "rtmutex-debug.h" 133# include "rtmutex-debug.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..7c9098d186e6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -68,17 +69,18 @@
68#include <linux/pagemap.h> 69#include <linux/pagemap.h>
69#include <linux/hrtimer.h> 70#include <linux/hrtimer.h>
70#include <linux/tick.h> 71#include <linux/tick.h>
71#include <linux/bootmem.h>
72#include <linux/debugfs.h> 72#include <linux/debugfs.h>
73#include <linux/ctype.h> 73#include <linux/ctype.h>
74#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
76 75
77#include <asm/tlb.h> 76#include <asm/tlb.h>
78#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
79 78
80#include "sched_cpupri.h" 79#include "sched_cpupri.h"
81 80
81#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h>
83
82/* 84/*
83 * Convert user-nice values [ -20 ... 0 ... 19 ] 85 * Convert user-nice values [ -20 ... 0 ... 19 ]
84 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 86 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
118 */ 120 */
119#define RUNTIME_INF ((u64)~0ULL) 121#define RUNTIME_INF ((u64)~0ULL)
120 122
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
127#ifdef CONFIG_SMP 123#ifdef CONFIG_SMP
128 124
129static void double_rq_lock(struct rq *rq1, struct rq *rq2); 125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 240 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft)); 241 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 242 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0); 243 HRTIMER_MODE_ABS_PINNED, 0);
248 } 244 }
249 spin_unlock(&rt_b->rt_runtime_lock); 245 spin_unlock(&rt_b->rt_runtime_lock);
250} 246}
@@ -584,6 +580,7 @@ struct rq {
584 struct load_weight load; 580 struct load_weight load;
585 unsigned long nr_load_updates; 581 unsigned long nr_load_updates;
586 u64 nr_switches; 582 u64 nr_switches;
583 u64 nr_migrations_in;
587 584
588 struct cfs_rq cfs; 585 struct cfs_rq cfs;
589 struct rt_rq rt; 586 struct rt_rq rt;
@@ -630,6 +627,10 @@ struct rq {
630 struct list_head migration_queue; 627 struct list_head migration_queue;
631#endif 628#endif
632 629
630 /* calc_load related fields */
631 unsigned long calc_load_update;
632 long calc_load_active;
633
633#ifdef CONFIG_SCHED_HRTICK 634#ifdef CONFIG_SCHED_HRTICK
634#ifdef CONFIG_SMP 635#ifdef CONFIG_SMP
635 int hrtick_csd_pending; 636 int hrtick_csd_pending;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 693#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 695
695static inline void update_rq_clock(struct rq *rq) 696inline void update_rq_clock(struct rq *rq)
696{ 697{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 698 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 699}
@@ -1154,7 +1155,7 @@ static __init void init_hrtick(void)
1154static void hrtick_start(struct rq *rq, u64 delay) 1155static void hrtick_start(struct rq *rq, u64 delay)
1155{ 1156{
1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1157 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0); 1158 HRTIMER_MODE_REL_PINNED, 0);
1158} 1159}
1159 1160
1160static inline void init_hrtick(void) 1161static inline void init_hrtick(void)
@@ -1728,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1728} 1729}
1729#endif 1730#endif
1730 1731
1732static void calc_load_account_active(struct rq *this_rq);
1733
1731#include "sched_stats.h" 1734#include "sched_stats.h"
1732#include "sched_idletask.c" 1735#include "sched_idletask.c"
1733#include "sched_fair.c" 1736#include "sched_fair.c"
@@ -1958,7 +1961,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1958 1961
1959 clock_offset = old_rq->clock - new_rq->clock; 1962 clock_offset = old_rq->clock - new_rq->clock;
1960 1963
1961 trace_sched_migrate_task(p, task_cpu(p), new_cpu); 1964 trace_sched_migrate_task(p, new_cpu);
1962 1965
1963#ifdef CONFIG_SCHEDSTATS 1966#ifdef CONFIG_SCHEDSTATS
1964 if (p->se.wait_start) 1967 if (p->se.wait_start)
@@ -1967,12 +1970,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1970 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1971 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1972 p->se.block_start -= clock_offset;
1973#endif
1970 if (old_cpu != new_cpu) { 1974 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1975 p->se.nr_migrations++;
1976 new_rq->nr_migrations_in++;
1977#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1978 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1979 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1980#endif
1981 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1982 1, 1, NULL, 0);
1983 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1984 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1985 new_cfsrq->min_vruntime;
1978 1986
@@ -2015,6 +2023,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2015} 2023}
2016 2024
2017/* 2025/*
2026 * wait_task_context_switch - wait for a thread to complete at least one
2027 * context switch.
2028 *
2029 * @p must not be current.
2030 */
2031void wait_task_context_switch(struct task_struct *p)
2032{
2033 unsigned long nvcsw, nivcsw, flags;
2034 int running;
2035 struct rq *rq;
2036
2037 nvcsw = p->nvcsw;
2038 nivcsw = p->nivcsw;
2039 for (;;) {
2040 /*
2041 * The runqueue is assigned before the actual context
2042 * switch. We need to take the runqueue lock.
2043 *
2044 * We could check initially without the lock but it is
2045 * very likely that we need to take the lock in every
2046 * iteration.
2047 */
2048 rq = task_rq_lock(p, &flags);
2049 running = task_running(rq, p);
2050 task_rq_unlock(rq, &flags);
2051
2052 if (likely(!running))
2053 break;
2054 /*
2055 * The switch count is incremented before the actual
2056 * context switch. We thus wait for two switches to be
2057 * sure at least one completed.
2058 */
2059 if ((p->nvcsw - nvcsw) > 1)
2060 break;
2061 if ((p->nivcsw - nivcsw) > 1)
2062 break;
2063
2064 cpu_relax();
2065 }
2066}
2067
2068/*
2018 * wait_task_inactive - wait for a thread to unschedule. 2069 * wait_task_inactive - wait for a thread to unschedule.
2019 * 2070 *
2020 * If @match_state is nonzero, it's the @p->state value just checked and 2071 * If @match_state is nonzero, it's the @p->state value just checked and
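wait_task_context_switch() only gives a "scheduled at least once" guarantee and must not be called on current. A minimal sketch of a tracer-style caller relying on that (hypothetical helper, not from this patch):

#include <linux/kernel.h>
#include <linux/sched.h>

/*
 * Sketch: after flagging @child for tracing, wait until it has been
 * through at least one context switch before inspecting its state.
 */
static void example_sync_with_child(struct task_struct *child)
{
        if (WARN_ON(child == current))
                return;

        wait_task_context_switch(child);
}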
@@ -2142,6 +2193,7 @@ void kick_process(struct task_struct *p)
2142 smp_send_reschedule(cpu); 2193 smp_send_reschedule(cpu);
2143 preempt_enable(); 2194 preempt_enable();
2144} 2195}
2196EXPORT_SYMBOL_GPL(kick_process);
2145 2197
2146/* 2198/*
2147 * Return a low guess at the load of a migration-source cpu weighted 2199 * Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2376,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2376
2325#endif /* CONFIG_SMP */ 2377#endif /* CONFIG_SMP */
2326 2378
2379/**
2380 * task_oncpu_function_call - call a function on the cpu on which a task runs
2381 * @p: the task to evaluate
2382 * @func: the function to be called
2383 * @info: the function call argument
2384 *
 2385 * Calls the function @func when the task is currently running. This might
 2386 * be on the current CPU, in which case the function is called directly.
2387 */
2388void task_oncpu_function_call(struct task_struct *p,
2389 void (*func) (void *info), void *info)
2390{
2391 int cpu;
2392
2393 preempt_disable();
2394 cpu = task_cpu(p);
2395 if (task_curr(p))
2396 smp_call_function_single(cpu, func, info, 1);
2397 preempt_enable();
2398}
2399
2327/*** 2400/***
2328 * try_to_wake_up - wake up a thread 2401 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2402 * @p: the to-be-woken-up thread
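A rough usage sketch for the helper above (the callback and counter are made up; in this series the real user is the perf_counter code). The call is synchronous because smp_call_function_single() is invoked with wait=1, so passing a stack variable is safe.

#include <linux/sched.h>
#include <linux/smp.h>

static void example_on_task_cpu(void *info)
{
        int *hits = info;

        (*hits)++;              /* executes on the CPU where the task runs */
}

static int example_count_hit(struct task_struct *p)
{
        int hits = 0;

        /* Does nothing if @p is not currently running anywhere. */
        task_oncpu_function_call(p, example_on_task_cpu, &hits);
        return hits;
}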
@@ -2458,6 +2531,17 @@ out:
2458 return success; 2531 return success;
2459} 2532}
2460 2533
2534/**
2535 * wake_up_process - Wake up a specific process
2536 * @p: The process to be woken up.
2537 *
2538 * Attempt to wake up the nominated process and move it to the set of runnable
2539 * processes. Returns 1 if the process was woken up, 0 if it was already
2540 * running.
2541 *
2542 * It may be assumed that this function implies a write memory barrier before
2543 * changing the task state if and only if any tasks are woken up.
2544 */
2461int wake_up_process(struct task_struct *p) 2545int wake_up_process(struct task_struct *p)
2462{ 2546{
2463 return try_to_wake_up(p, TASK_ALL, 0); 2547 return try_to_wake_up(p, TASK_ALL, 0);
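The barrier note added above is what makes the usual "publish the condition, then wake" pattern safe. A minimal sketch under that assumption; the worker task and flag are hypothetical.

#include <linux/sched.h>

static struct task_struct *example_worker;      /* hypothetical kthread */
static int example_work_ready;

static void example_queue_work(void)
{
        /* Publish the condition first ... */
        example_work_ready = 1;

        /*
         * ... then wake. Per the comment above, the wakeup implies a
         * write memory barrier before the task state change iff the
         * worker is actually woken, so it cannot miss the flag.
         */
        wake_up_process(example_worker);
}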
@@ -2480,6 +2564,7 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2564 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2565 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2566 p->se.prev_sum_exec_runtime = 0;
2567 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2568 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2569 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2570 p->se.start_runtime = 0;
@@ -2710,6 +2795,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2795 */
2711 prev_state = prev->state; 2796 prev_state = prev->state;
2712 finish_arch_switch(prev); 2797 finish_arch_switch(prev);
2798 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2799 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2800#ifdef CONFIG_SMP
2715 if (post_schedule) 2801 if (post_schedule)
@@ -2766,7 +2852,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2766 * combine the page table reload and the switch backend into 2852 * combine the page table reload and the switch backend into
2767 * one hypercall. 2853 * one hypercall.
2768 */ 2854 */
2769 arch_enter_lazy_cpu_mode(); 2855 arch_start_context_switch(prev);
2770 2856
2771 if (unlikely(!mm)) { 2857 if (unlikely(!mm)) {
2772 next->active_mm = oldmm; 2858 next->active_mm = oldmm;
@@ -2856,19 +2942,81 @@ unsigned long nr_iowait(void)
2856 return sum; 2942 return sum;
2857} 2943}
2858 2944
2859unsigned long nr_active(void) 2945/* Variables and functions for calc_load */
2946static atomic_long_t calc_load_tasks;
2947static unsigned long calc_load_update;
2948unsigned long avenrun[3];
2949EXPORT_SYMBOL(avenrun);
2950
2951/**
2952 * get_avenrun - get the load average array
2953 * @loads: pointer to dest load array
2954 * @offset: offset to add
2955 * @shift: shift count to shift the result left
2956 *
2957 * These values are estimates at best, so no need for locking.
2958 */
2959void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2860{ 2960{
2861 unsigned long i, running = 0, uninterruptible = 0; 2961 loads[0] = (avenrun[0] + offset) << shift;
2962 loads[1] = (avenrun[1] + offset) << shift;
2963 loads[2] = (avenrun[2] + offset) << shift;
2964}
2862 2965
2863 for_each_online_cpu(i) { 2966static unsigned long
2864 running += cpu_rq(i)->nr_running; 2967calc_load(unsigned long load, unsigned long exp, unsigned long active)
2865 uninterruptible += cpu_rq(i)->nr_uninterruptible; 2968{
2866 } 2969 load *= exp;
2970 load += active * (FIXED_1 - exp);
2971 return load >> FSHIFT;
2972}
2973
2974/*
 2975 * calc_global_load - update the avenrun load estimates 10 ticks after the
2976 * CPUs have updated calc_load_tasks.
2977 */
2978void calc_global_load(void)
2979{
2980 unsigned long upd = calc_load_update + 10;
2981 long active;
2982
2983 if (time_before(jiffies, upd))
2984 return;
2985
2986 active = atomic_long_read(&calc_load_tasks);
2987 active = active > 0 ? active * FIXED_1 : 0;
2988
2989 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2990 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2991 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2992
2993 calc_load_update += LOAD_FREQ;
2994}
2995
2996/*
2997 * Either called from update_cpu_load() or from a cpu going idle
2998 */
2999static void calc_load_account_active(struct rq *this_rq)
3000{
3001 long nr_active, delta;
3002
3003 nr_active = this_rq->nr_running;
3004 nr_active += (long) this_rq->nr_uninterruptible;
2867 3005
2868 if (unlikely((long)uninterruptible < 0)) 3006 if (nr_active != this_rq->calc_load_active) {
2869 uninterruptible = 0; 3007 delta = nr_active - this_rq->calc_load_active;
3008 this_rq->calc_load_active = nr_active;
3009 atomic_long_add(delta, &calc_load_tasks);
3010 }
3011}
2870 3012
2871 return running + uninterruptible; 3013/*
3014 * Externally visible per-cpu scheduler statistics:
3015 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3016 */
3017u64 cpu_nr_migrations(int cpu)
3018{
3019 return cpu_rq(cpu)->nr_migrations_in;
2872} 3020}
2873 3021
2874/* 3022/*
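The fixed-point update used by calc_load()/calc_global_load() is easy to check outside the kernel. A small stand-alone sketch using the FSHIFT/EXP_* constants as defined in <linux/sched.h> of this era (treat the exact values as an assumption):

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
#define EXP_1    1884                   /* 1/exp(5sec/1min)   */
#define EXP_5    2014                   /* 1/exp(5sec/5min)   */
#define EXP_15   2037                   /* 1/exp(5sec/15min)  */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 3 * FIXED_1;     /* pretend 3 runnable tasks */
        int i;

        /* one update per LOAD_FREQ interval (~5 s): simulate 2 minutes */
        for (i = 0; i < 24; i++) {
                avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                avenrun[2] = calc_load(avenrun[2], EXP_15, active);
        }

        printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
               avenrun[0] >> FSHIFT, (avenrun[0] & (FIXED_1 - 1)) * 100 / FIXED_1,
               avenrun[1] >> FSHIFT, (avenrun[1] & (FIXED_1 - 1)) * 100 / FIXED_1,
               avenrun[2] >> FSHIFT, (avenrun[2] & (FIXED_1 - 1)) * 100 / FIXED_1);
        return 0;
}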
@@ -2899,6 +3047,11 @@ static void update_cpu_load(struct rq *this_rq)
2899 new_load += scale-1; 3047 new_load += scale-1;
2900 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3048 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2901 } 3049 }
3050
3051 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3052 this_rq->calc_load_update += LOAD_FREQ;
3053 calc_load_account_active(this_rq);
3054 }
2902} 3055}
2903 3056
2904#ifdef CONFIG_SMP 3057#ifdef CONFIG_SMP
@@ -4240,10 +4393,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4240static struct { 4393static struct {
4241 atomic_t load_balancer; 4394 atomic_t load_balancer;
4242 cpumask_var_t cpu_mask; 4395 cpumask_var_t cpu_mask;
4396 cpumask_var_t ilb_grp_nohz_mask;
4243} nohz ____cacheline_aligned = { 4397} nohz ____cacheline_aligned = {
4244 .load_balancer = ATOMIC_INIT(-1), 4398 .load_balancer = ATOMIC_INIT(-1),
4245}; 4399};
4246 4400
4401int get_nohz_load_balancer(void)
4402{
4403 return atomic_read(&nohz.load_balancer);
4404}
4405
4406#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4407/**
4408 * lowest_flag_domain - Return lowest sched_domain containing flag.
4409 * @cpu: The cpu whose lowest level of sched domain is to
4410 * be returned.
4411 * @flag: The flag to check for the lowest sched_domain
4412 * for the given cpu.
4413 *
4414 * Returns the lowest sched_domain of a cpu which contains the given flag.
4415 */
4416static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4417{
4418 struct sched_domain *sd;
4419
4420 for_each_domain(cpu, sd)
4421 if (sd && (sd->flags & flag))
4422 break;
4423
4424 return sd;
4425}
4426
4427/**
4428 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4429 * @cpu: The cpu whose domains we're iterating over.
4430 * @sd: variable holding the value of the power_savings_sd
4431 * for cpu.
4432 * @flag: The flag to filter the sched_domains to be iterated.
4433 *
4434 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4435 * set, starting from the lowest sched_domain to the highest.
4436 */
4437#define for_each_flag_domain(cpu, sd, flag) \
4438 for (sd = lowest_flag_domain(cpu, flag); \
4439 (sd && (sd->flags & flag)); sd = sd->parent)
4440
4441/**
4442 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4443 * @ilb_group: group to be checked for semi-idleness
4444 *
4445 * Returns: 1 if the group is semi-idle. 0 otherwise.
4446 *
 4447 * We define a sched_group to be semi-idle if it has at least one idle CPU
 4448 * and at least one non-idle CPU. This helper function checks if the given
4449 * sched_group is semi-idle or not.
4450 */
4451static inline int is_semi_idle_group(struct sched_group *ilb_group)
4452{
4453 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4454 sched_group_cpus(ilb_group));
4455
4456 /*
 4457 * A sched_group is semi-idle when it has at least one busy CPU
 4458 * and at least one idle CPU.
4459 */
4460 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4461 return 0;
4462
4463 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4464 return 0;
4465
4466 return 1;
4467}
4468/**
4469 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4470 * @cpu: The cpu which is nominating a new idle_load_balancer.
4471 *
 4472 * Returns: the id of the idle load balancer if it exists,
 4473 * else a value >= nr_cpu_ids.
4474 *
4475 * This algorithm picks the idle load balancer such that it belongs to a
4476 * semi-idle powersavings sched_domain. The idea is to try and avoid
4477 * completely idle packages/cores just for the purpose of idle load balancing
 4478 * when there are other idle CPUs which are better suited for that job.
4479 */
4480static int find_new_ilb(int cpu)
4481{
4482 struct sched_domain *sd;
4483 struct sched_group *ilb_group;
4484
4485 /*
4486 * Have idle load balancer selection from semi-idle packages only
4487 * when power-aware load balancing is enabled
4488 */
4489 if (!(sched_smt_power_savings || sched_mc_power_savings))
4490 goto out_done;
4491
4492 /*
4493 * Optimize for the case when we have no idle CPUs or only one
4494 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4495 */
4496 if (cpumask_weight(nohz.cpu_mask) < 2)
4497 goto out_done;
4498
4499 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4500 ilb_group = sd->groups;
4501
4502 do {
4503 if (is_semi_idle_group(ilb_group))
4504 return cpumask_first(nohz.ilb_grp_nohz_mask);
4505
4506 ilb_group = ilb_group->next;
4507
4508 } while (ilb_group != sd->groups);
4509 }
4510
4511out_done:
4512 return cpumask_first(nohz.cpu_mask);
4513}
4514#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4515static inline int find_new_ilb(int call_cpu)
4516{
4517 return cpumask_first(nohz.cpu_mask);
4518}
4519#endif
4520
4247/* 4521/*
4248 * This routine will try to nominate the ilb (idle load balancing) 4522 * This routine will try to nominate the ilb (idle load balancing)
4249 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4523 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
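Stripped of the cpumask plumbing, the semi-idle test above reduces to "the group's idle CPUs form a non-empty, proper subset of the group". A stand-alone sketch of just that predicate, with plain bitmasks standing in for cpumasks (illustrative only):

#include <stdio.h>

/* Bit i set == CPU i belongs to the group / has its tick stopped. */
static int is_semi_idle(unsigned long group_cpus, unsigned long idle_cpus)
{
        unsigned long idle_in_group = group_cpus & idle_cpus;

        return idle_in_group != 0 && idle_in_group != group_cpus;
}

int main(void)
{
        /* group = CPUs 0-3, CPUs 1 and 2 idle -> semi-idle */
        printf("%d\n", is_semi_idle(0xfUL, 0x6UL));     /* prints 1 */
        /* whole group idle -> not semi-idle */
        printf("%d\n", is_semi_idle(0xfUL, 0xfUL));     /* prints 0 */
        return 0;
}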
@@ -4298,8 +4572,24 @@ int select_nohz_load_balancer(int stop_tick)
4298 /* make me the ilb owner */ 4572 /* make me the ilb owner */
4299 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4573 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4300 return 1; 4574 return 1;
4301 } else if (atomic_read(&nohz.load_balancer) == cpu) 4575 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4576 int new_ilb;
4577
4578 if (!(sched_smt_power_savings ||
4579 sched_mc_power_savings))
4580 return 1;
4581 /*
4582 * Check to see if there is a more power-efficient
4583 * ilb.
4584 */
4585 new_ilb = find_new_ilb(cpu);
4586 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4587 atomic_set(&nohz.load_balancer, -1);
4588 resched_cpu(new_ilb);
4589 return 0;
4590 }
4302 return 1; 4591 return 1;
4592 }
4303 } else { 4593 } else {
4304 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4594 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4305 return 0; 4595 return 0;
@@ -4468,15 +4758,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4468 } 4758 }
4469 4759
4470 if (atomic_read(&nohz.load_balancer) == -1) { 4760 if (atomic_read(&nohz.load_balancer) == -1) {
4471 /* 4761 int ilb = find_new_ilb(cpu);
4472 * simple selection for now: Nominate the
4473 * first cpu in the nohz list to be the next
4474 * ilb owner.
4475 *
4476 * TBD: Traverse the sched domains and nominate
4477 * the nearest cpu in the nohz.cpu_mask.
4478 */
4479 int ilb = cpumask_first(nohz.cpu_mask);
4480 4762
4481 if (ilb < nr_cpu_ids) 4763 if (ilb < nr_cpu_ids)
4482 resched_cpu(ilb); 4764 resched_cpu(ilb);
@@ -4840,6 +5122,8 @@ void scheduler_tick(void)
4840 curr->sched_class->task_tick(rq, curr, 0); 5122 curr->sched_class->task_tick(rq, curr, 0);
4841 spin_unlock(&rq->lock); 5123 spin_unlock(&rq->lock);
4842 5124
5125 perf_counter_task_tick(curr, cpu);
5126
4843#ifdef CONFIG_SMP 5127#ifdef CONFIG_SMP
4844 rq->idle_at_tick = idle_cpu(cpu); 5128 rq->idle_at_tick = idle_cpu(cpu);
4845 trigger_load_balance(rq, cpu); 5129 trigger_load_balance(rq, cpu);
@@ -5007,13 +5291,15 @@ pick_next_task(struct rq *rq)
5007/* 5291/*
5008 * schedule() is the main scheduler function. 5292 * schedule() is the main scheduler function.
5009 */ 5293 */
5010asmlinkage void __sched __schedule(void) 5294asmlinkage void __sched schedule(void)
5011{ 5295{
5012 struct task_struct *prev, *next; 5296 struct task_struct *prev, *next;
5013 unsigned long *switch_count; 5297 unsigned long *switch_count;
5014 struct rq *rq; 5298 struct rq *rq;
5015 int cpu; 5299 int cpu;
5016 5300
5301need_resched:
5302 preempt_disable();
5017 cpu = smp_processor_id(); 5303 cpu = smp_processor_id();
5018 rq = cpu_rq(cpu); 5304 rq = cpu_rq(cpu);
5019 rcu_qsctr_inc(cpu); 5305 rcu_qsctr_inc(cpu);
@@ -5053,6 +5339,7 @@ need_resched_nonpreemptible:
5053 5339
5054 if (likely(prev != next)) { 5340 if (likely(prev != next)) {
5055 sched_info_switch(prev, next); 5341 sched_info_switch(prev, next);
5342 perf_counter_task_sched_out(prev, next, cpu);
5056 5343
5057 rq->nr_switches++; 5344 rq->nr_switches++;
5058 rq->curr = next; 5345 rq->curr = next;
@@ -5070,15 +5357,9 @@ need_resched_nonpreemptible:
5070 5357
5071 if (unlikely(reacquire_kernel_lock(current) < 0)) 5358 if (unlikely(reacquire_kernel_lock(current) < 0))
5072 goto need_resched_nonpreemptible; 5359 goto need_resched_nonpreemptible;
5073}
5074 5360
5075asmlinkage void __sched schedule(void)
5076{
5077need_resched:
5078 preempt_disable();
5079 __schedule();
5080 preempt_enable_no_resched(); 5361 preempt_enable_no_resched();
5081 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5362 if (need_resched())
5082 goto need_resched; 5363 goto need_resched;
5083} 5364}
5084EXPORT_SYMBOL(schedule); 5365EXPORT_SYMBOL(schedule);
@@ -5221,7 +5502,7 @@ EXPORT_SYMBOL(default_wake_function);
5221 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5502 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5222 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5503 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5223 */ 5504 */
5224void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5505static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5225 int nr_exclusive, int sync, void *key) 5506 int nr_exclusive, int sync, void *key)
5226{ 5507{
5227 wait_queue_t *curr, *next; 5508 wait_queue_t *curr, *next;
@@ -5241,6 +5522,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5241 * @mode: which threads 5522 * @mode: which threads
5242 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5523 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5243 * @key: is directly passed to the wakeup function 5524 * @key: is directly passed to the wakeup function
5525 *
5526 * It may be assumed that this function implies a write memory barrier before
5527 * changing the task state if and only if any tasks are woken up.
5244 */ 5528 */
5245void __wake_up(wait_queue_head_t *q, unsigned int mode, 5529void __wake_up(wait_queue_head_t *q, unsigned int mode,
5246 int nr_exclusive, void *key) 5530 int nr_exclusive, void *key)
@@ -5279,6 +5563,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5279 * with each other. This can prevent needless bouncing between CPUs. 5563 * with each other. This can prevent needless bouncing between CPUs.
5280 * 5564 *
5281 * On UP it can prevent extra preemption. 5565 * On UP it can prevent extra preemption.
5566 *
5567 * It may be assumed that this function implies a write memory barrier before
5568 * changing the task state if and only if any tasks are woken up.
5282 */ 5569 */
5283void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5570void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5284 int nr_exclusive, void *key) 5571 int nr_exclusive, void *key)
@@ -5315,6 +5602,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5315 * awakened in the same order in which they were queued. 5602 * awakened in the same order in which they were queued.
5316 * 5603 *
5317 * See also complete_all(), wait_for_completion() and related routines. 5604 * See also complete_all(), wait_for_completion() and related routines.
5605 *
5606 * It may be assumed that this function implies a write memory barrier before
5607 * changing the task state if and only if any tasks are woken up.
5318 */ 5608 */
5319void complete(struct completion *x) 5609void complete(struct completion *x)
5320{ 5610{
@@ -5332,6 +5622,9 @@ EXPORT_SYMBOL(complete);
5332 * @x: holds the state of this particular completion 5622 * @x: holds the state of this particular completion
5333 * 5623 *
5334 * This will wake up all threads waiting on this particular completion event. 5624 * This will wake up all threads waiting on this particular completion event.
5625 *
5626 * It may be assumed that this function implies a write memory barrier before
5627 * changing the task state if and only if any tasks are woken up.
5335 */ 5628 */
5336void complete_all(struct completion *x) 5629void complete_all(struct completion *x)
5337{ 5630{
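A short sketch of the pattern the new barrier comments are about, using the completion API (illustrative kernel-style code, not from this patch):

#include <linux/completion.h>

static DECLARE_COMPLETION(example_done);
static int example_result;

static void example_producer(void)
{
        example_result = 42;            /* publish the data ...             */
        complete(&example_done);        /* ... implied write barrier before
                                         * the waiter's state is changed    */
}

static int example_consumer(void)
{
        wait_for_completion(&example_done);
        return example_result;          /* guaranteed to observe 42 */
}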
@@ -6490,8 +6783,9 @@ void sched_show_task(struct task_struct *p)
6490#ifdef CONFIG_DEBUG_STACK_USAGE 6783#ifdef CONFIG_DEBUG_STACK_USAGE
6491 free = stack_not_used(p); 6784 free = stack_not_used(p);
6492#endif 6785#endif
6493 printk(KERN_CONT "%5lu %5d %6d\n", free, 6786 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6494 task_pid_nr(p), task_pid_nr(p->real_parent)); 6787 task_pid_nr(p), task_pid_nr(p->real_parent),
6788 (unsigned long)task_thread_info(p)->flags);
6495 6789
6496 show_stack(p, NULL); 6790 show_stack(p, NULL);
6497} 6791}
@@ -6752,7 +7046,7 @@ static int migration_thread(void *data)
6752 7046
6753 if (cpu_is_offline(cpu)) { 7047 if (cpu_is_offline(cpu)) {
6754 spin_unlock_irq(&rq->lock); 7048 spin_unlock_irq(&rq->lock);
6755 goto wait_to_die; 7049 break;
6756 } 7050 }
6757 7051
6758 if (rq->active_balance) { 7052 if (rq->active_balance) {
@@ -6778,16 +7072,7 @@ static int migration_thread(void *data)
6778 complete(&req->done); 7072 complete(&req->done);
6779 } 7073 }
6780 __set_current_state(TASK_RUNNING); 7074 __set_current_state(TASK_RUNNING);
6781 return 0;
6782 7075
6783wait_to_die:
6784 /* Wait for kthread_stop */
6785 set_current_state(TASK_INTERRUPTIBLE);
6786 while (!kthread_should_stop()) {
6787 schedule();
6788 set_current_state(TASK_INTERRUPTIBLE);
6789 }
6790 __set_current_state(TASK_RUNNING);
6791 return 0; 7076 return 0;
6792} 7077}
6793 7078
@@ -6970,6 +7255,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6970 7255
6971 } 7256 }
6972} 7257}
7258
7259/*
7260 * remove the tasks which were accounted by rq from calc_load_tasks.
7261 */
7262static void calc_global_load_remove(struct rq *rq)
7263{
7264 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7265}
6973#endif /* CONFIG_HOTPLUG_CPU */ 7266#endif /* CONFIG_HOTPLUG_CPU */
6974 7267
6975#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7268#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7193,6 +7486,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7193 rq = task_rq_lock(p, &flags); 7486 rq = task_rq_lock(p, &flags);
7194 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7487 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7195 task_rq_unlock(rq, &flags); 7488 task_rq_unlock(rq, &flags);
7489 get_task_struct(p);
7196 cpu_rq(cpu)->migration_thread = p; 7490 cpu_rq(cpu)->migration_thread = p;
7197 break; 7491 break;
7198 7492
@@ -7204,6 +7498,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7204 /* Update our root-domain */ 7498 /* Update our root-domain */
7205 rq = cpu_rq(cpu); 7499 rq = cpu_rq(cpu);
7206 spin_lock_irqsave(&rq->lock, flags); 7500 spin_lock_irqsave(&rq->lock, flags);
7501 rq->calc_load_update = calc_load_update;
7502 rq->calc_load_active = 0;
7207 if (rq->rd) { 7503 if (rq->rd) {
7208 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7504 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7209 7505
@@ -7221,6 +7517,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7221 kthread_bind(cpu_rq(cpu)->migration_thread, 7517 kthread_bind(cpu_rq(cpu)->migration_thread,
7222 cpumask_any(cpu_online_mask)); 7518 cpumask_any(cpu_online_mask));
7223 kthread_stop(cpu_rq(cpu)->migration_thread); 7519 kthread_stop(cpu_rq(cpu)->migration_thread);
7520 put_task_struct(cpu_rq(cpu)->migration_thread);
7224 cpu_rq(cpu)->migration_thread = NULL; 7521 cpu_rq(cpu)->migration_thread = NULL;
7225 break; 7522 break;
7226 7523
@@ -7230,6 +7527,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7230 migrate_live_tasks(cpu); 7527 migrate_live_tasks(cpu);
7231 rq = cpu_rq(cpu); 7528 rq = cpu_rq(cpu);
7232 kthread_stop(rq->migration_thread); 7529 kthread_stop(rq->migration_thread);
7530 put_task_struct(rq->migration_thread);
7233 rq->migration_thread = NULL; 7531 rq->migration_thread = NULL;
7234 /* Idle task back to normal (off runqueue, low prio) */ 7532 /* Idle task back to normal (off runqueue, low prio) */
7235 spin_lock_irq(&rq->lock); 7533 spin_lock_irq(&rq->lock);
@@ -7243,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7243 cpuset_unlock(); 7541 cpuset_unlock();
7244 migrate_nr_uninterruptible(rq); 7542 migrate_nr_uninterruptible(rq);
7245 BUG_ON(rq->nr_running != 0); 7543 BUG_ON(rq->nr_running != 0);
7246 7544 calc_global_load_remove(rq);
7247 /* 7545 /*
7248 * No need to migrate the tasks: it was best-effort if 7546 * No need to migrate the tasks: it was best-effort if
7249 * they didn't take sched_hotcpu_mutex. Just wake up 7547 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7577,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7279 return NOTIFY_OK; 7577 return NOTIFY_OK;
7280} 7578}
7281 7579
7282/* Register at highest priority so that task migration (migrate_all_tasks) 7580/*
7283 * happens before everything else. 7581 * Register at high priority so that task migration (migrate_all_tasks)
7582 * happens before everything else. This has to be lower priority than
7583 * the notifier in the perf_counter subsystem, though.
7284 */ 7584 */
7285static struct notifier_block __cpuinitdata migration_notifier = { 7585static struct notifier_block __cpuinitdata migration_notifier = {
7286 .notifier_call = migration_call, 7586 .notifier_call = migration_call,
@@ -7523,26 +7823,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7523 free_rootdomain(old_rd); 7823 free_rootdomain(old_rd);
7524} 7824}
7525 7825
7526static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7826static int init_rootdomain(struct root_domain *rd, bool bootmem)
7527{ 7827{
7828 gfp_t gfp = GFP_KERNEL;
7829
7528 memset(rd, 0, sizeof(*rd)); 7830 memset(rd, 0, sizeof(*rd));
7529 7831
7530 if (bootmem) { 7832 if (bootmem)
7531 alloc_bootmem_cpumask_var(&def_root_domain.span); 7833 gfp = GFP_NOWAIT;
7532 alloc_bootmem_cpumask_var(&def_root_domain.online);
7533 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
7534 cpupri_init(&rd->cpupri, true);
7535 return 0;
7536 }
7537 7834
7538 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 7835 if (!alloc_cpumask_var(&rd->span, gfp))
7539 goto out; 7836 goto out;
7540 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 7837 if (!alloc_cpumask_var(&rd->online, gfp))
7541 goto free_span; 7838 goto free_span;
7542 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 7839 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7543 goto free_online; 7840 goto free_online;
7544 7841
7545 if (cpupri_init(&rd->cpupri, false) != 0) 7842 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7546 goto free_rto_mask; 7843 goto free_rto_mask;
7547 return 0; 7844 return 0;
7548 7845
@@ -7753,8 +8050,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7753 8050
7754/* 8051/*
7755 * The cpus mask in sched_group and sched_domain hangs off the end. 8052 * The cpus mask in sched_group and sched_domain hangs off the end.
7756 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space 8053 *
7757 * for nr_cpu_ids < CONFIG_NR_CPUS. 8054 * ( See the the comments in include/linux/sched.h:struct sched_group
8055 * and struct sched_domain. )
7758 */ 8056 */
7759struct static_sched_group { 8057struct static_sched_group {
7760 struct sched_group sg; 8058 struct sched_group sg;
@@ -7875,7 +8173,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7875 struct sched_domain *sd; 8173 struct sched_domain *sd;
7876 8174
7877 sd = &per_cpu(phys_domains, j).sd; 8175 sd = &per_cpu(phys_domains, j).sd;
7878 if (j != cpumask_first(sched_group_cpus(sd->groups))) { 8176 if (j != group_first_cpu(sd->groups)) {
7879 /* 8177 /*
7880 * Only add "power" once for each 8178 * Only add "power" once for each
7881 * physical package. 8179 * physical package.
@@ -7953,7 +8251,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7953 8251
7954 WARN_ON(!sd || !sd->groups); 8252 WARN_ON(!sd || !sd->groups);
7955 8253
7956 if (cpu != cpumask_first(sched_group_cpus(sd->groups))) 8254 if (cpu != group_first_cpu(sd->groups))
7957 return; 8255 return;
7958 8256
7959 child = sd->child; 8257 child = sd->child;
@@ -8731,6 +9029,8 @@ void __init sched_init_smp(void)
8731} 9029}
8732#endif /* CONFIG_SMP */ 9030#endif /* CONFIG_SMP */
8733 9031
9032const_debug unsigned int sysctl_timer_migration = 1;
9033
8734int in_sched_functions(unsigned long addr) 9034int in_sched_functions(unsigned long addr)
8735{ 9035{
8736 return in_lock_functions(addr) || 9036 return in_lock_functions(addr) ||
@@ -8865,7 +9165,7 @@ void __init sched_init(void)
8865 * we use alloc_bootmem(). 9165 * we use alloc_bootmem().
8866 */ 9166 */
8867 if (alloc_size) { 9167 if (alloc_size) {
8868 ptr = (unsigned long)alloc_bootmem(alloc_size); 9168 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8869 9169
8870#ifdef CONFIG_FAIR_GROUP_SCHED 9170#ifdef CONFIG_FAIR_GROUP_SCHED
8871 init_task_group.se = (struct sched_entity **)ptr; 9171 init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9238,8 @@ void __init sched_init(void)
8938 rq = cpu_rq(i); 9238 rq = cpu_rq(i);
8939 spin_lock_init(&rq->lock); 9239 spin_lock_init(&rq->lock);
8940 rq->nr_running = 0; 9240 rq->nr_running = 0;
9241 rq->calc_load_active = 0;
9242 rq->calc_load_update = jiffies + LOAD_FREQ;
8941 init_cfs_rq(&rq->cfs, rq); 9243 init_cfs_rq(&rq->cfs, rq);
8942 init_rt_rq(&rq->rt, rq); 9244 init_rt_rq(&rq->rt, rq);
8943#ifdef CONFIG_FAIR_GROUP_SCHED 9245#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9260,7 @@ void __init sched_init(void)
8958 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9260 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8959 * then A0's share of the cpu resource is: 9261 * then A0's share of the cpu resource is:
8960 * 9262 *
8961 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9263 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8962 * 9264 *
8963 * We achieve this by letting init_task_group's tasks sit 9265 * We achieve this by letting init_task_group's tasks sit
8964 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9266 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9347,26 @@ void __init sched_init(void)
9045 * when this runqueue becomes "idle". 9347 * when this runqueue becomes "idle".
9046 */ 9348 */
9047 init_idle(current, smp_processor_id()); 9349 init_idle(current, smp_processor_id());
9350
9351 calc_load_update = jiffies + LOAD_FREQ;
9352
9048 /* 9353 /*
9049 * During early bootup we pretend to be a normal task: 9354 * During early bootup we pretend to be a normal task:
9050 */ 9355 */
9051 current->sched_class = &fair_sched_class; 9356 current->sched_class = &fair_sched_class;
9052 9357
9053 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9358 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9054 alloc_bootmem_cpumask_var(&nohz_cpu_mask); 9359 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9055#ifdef CONFIG_SMP 9360#ifdef CONFIG_SMP
9056#ifdef CONFIG_NO_HZ 9361#ifdef CONFIG_NO_HZ
9057 alloc_bootmem_cpumask_var(&nohz.cpu_mask); 9362 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9363 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9058#endif 9364#endif
9059 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9365 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9060#endif /* SMP */ 9366#endif /* SMP */
9061 9367
9368 perf_counter_init();
9369
9062 scheduler_running = 1; 9370 scheduler_running = 1;
9063} 9371}
9064 9372
@@ -9800,6 +10108,13 @@ static int sched_rt_global_constraints(void)
9800 if (sysctl_sched_rt_period <= 0) 10108 if (sysctl_sched_rt_period <= 0)
9801 return -EINVAL; 10109 return -EINVAL;
9802 10110
10111 /*
 10112 * There are always some RT tasks in the root group
 10113 * -- migration, kstopmachine etc.
10114 */
10115 if (sysctl_sched_rt_runtime == 0)
10116 return -EBUSY;
10117
9803 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10118 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
9804 for_each_possible_cpu(i) { 10119 for_each_possible_cpu(i) {
9805 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10120 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 819f17ac796e..e1d16c9a7680 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -38,7 +38,8 @@
38 */ 38 */
39unsigned long long __attribute__((weak)) sched_clock(void) 39unsigned long long __attribute__((weak)) sched_clock(void)
40{ 40{
41 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ);
42} 43}
43 44
44static __read_mostly int sched_clock_running; 45static __read_mostly int sched_clock_running;
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89574cd..e6c251790dde 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -152,10 +152,14 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
152 * 152 *
153 * Returns: -ENOMEM if memory fails. 153 * Returns: -ENOMEM if memory fails.
154 */ 154 */
155int __init_refok cpupri_init(struct cpupri *cp, bool bootmem) 155int cpupri_init(struct cpupri *cp, bool bootmem)
156{ 156{
157 gfp_t gfp = GFP_KERNEL;
157 int i; 158 int i;
158 159
160 if (bootmem)
161 gfp = GFP_NOWAIT;
162
159 memset(cp, 0, sizeof(*cp)); 163 memset(cp, 0, sizeof(*cp));
160 164
161 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 165 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
163 167
164 spin_lock_init(&vec->lock); 168 spin_lock_init(&vec->lock);
165 vec->count = 0; 169 vec->count = 0;
166 if (bootmem) 170 if (!zalloc_cpumask_var(&vec->mask, gfp))
167 alloc_bootmem_cpumask_var(&vec->mask);
168 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
169 goto cleanup; 171 goto cleanup;
170 } 172 }
171 173
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 467ca72f1657..70c7e0b79946 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -162,7 +162,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162{ 162{
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = &per_cpu(runqueues, cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
@@ -191,7 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = per_cpu(runqueues, 0).cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
@@ -248,7 +248,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
248 248
249static void print_cpu(struct seq_file *m, int cpu) 249static void print_cpu(struct seq_file *m, int cpu)
250{ 250{
251 struct rq *rq = &per_cpu(runqueues, cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253#ifdef CONFIG_X86 253#ifdef CONFIG_X86
254 { 254 {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..ba7fd6e9556f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -430,12 +430,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
430 430
431 for_each_sched_entity(se) { 431 for_each_sched_entity(se) {
432 struct load_weight *load; 432 struct load_weight *load;
433 struct load_weight lw;
433 434
434 cfs_rq = cfs_rq_of(se); 435 cfs_rq = cfs_rq_of(se);
435 load = &cfs_rq->load; 436 load = &cfs_rq->load;
436 437
437 if (unlikely(!se->on_rq)) { 438 if (unlikely(!se->on_rq)) {
438 struct load_weight lw = cfs_rq->load; 439 lw = cfs_rq->load;
439 440
440 update_load_add(&lw, se->load.weight); 441 update_load_add(&lw, se->load.weight);
441 load = &lw; 442 load = &lw;
@@ -1487,17 +1488,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1487 1488
1488 find_matching_se(&se, &pse); 1489 find_matching_se(&se, &pse);
1489 1490
1490 while (se) { 1491 BUG_ON(!pse);
1491 BUG_ON(!pse);
1492 1492
1493 if (wakeup_preempt_entity(se, pse) == 1) { 1493 if (wakeup_preempt_entity(se, pse) == 1)
1494 resched_task(curr); 1494 resched_task(curr);
1495 break;
1496 }
1497
1498 se = parent_entity(se);
1499 pse = parent_entity(pse);
1500 }
1501} 1495}
1502 1496
1503static struct task_struct *pick_next_task_fair(struct rq *rq) 1497static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
22static struct task_struct *pick_next_task_idle(struct rq *rq) 22static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 23{
24 schedstat_inc(rq, sched_goidle); 24 schedstat_inc(rq, sched_goidle);
25 25 /* adjust the active tasks as we might go into a long sleep */
26 calc_load_account_active(rq);
26 return rq->idle; 27 return rq->idle;
27} 28}
28 29
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f2c66f8f9712..9bf0d2a73045 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1591,7 +1591,7 @@ static inline void init_sched_rt_class(void)
1591 unsigned int i; 1591 unsigned int i;
1592 1592
1593 for_each_possible_cpu(i) 1593 for_each_possible_cpu(i)
1594 alloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1594 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1595 GFP_KERNEL, cpu_to_node(i)); 1595 GFP_KERNEL, cpu_to_node(i));
1596} 1596}
1597#endif /* CONFIG_SMP */ 1597#endif /* CONFIG_SMP */
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4c..ccf1ceedaebe 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
27#include <linux/freezer.h> 27#include <linux/freezer.h>
28#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
29#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h> 30#include <trace/events/sched.h>
31 31
32#include <asm/param.h> 32#include <asm/param.h>
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -41,8 +41,6 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
46static void __user *sig_handler(struct task_struct *t, int sig) 44static void __user *sig_handler(struct task_struct *t, int sig)
47{ 45{
48 return t->sighand->action[sig - 1].sa.sa_handler; 46 return t->sighand->action[sig - 1].sa.sa_handler;
@@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
249/* 247/*
250 * Flush all pending signals for a task. 248 * Flush all pending signals for a task.
251 */ 249 */
250void __flush_signals(struct task_struct *t)
251{
252 clear_tsk_thread_flag(t, TIF_SIGPENDING);
253 flush_sigqueue(&t->pending);
254 flush_sigqueue(&t->signal->shared_pending);
255}
256
252void flush_signals(struct task_struct *t) 257void flush_signals(struct task_struct *t)
253{ 258{
254 unsigned long flags; 259 unsigned long flags;
255 260
256 spin_lock_irqsave(&t->sighand->siglock, flags); 261 spin_lock_irqsave(&t->sighand->siglock, flags);
257 clear_tsk_thread_flag(t, TIF_SIGPENDING); 262 __flush_signals(t);
258 flush_sigqueue(&t->pending);
259 flush_sigqueue(&t->signal->shared_pending);
260 spin_unlock_irqrestore(&t->sighand->siglock, flags); 263 spin_unlock_irqrestore(&t->sighand->siglock, flags);
261} 264}
262 265
@@ -829,6 +832,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
829{ 832{
830 struct sigpending *pending; 833 struct sigpending *pending;
831 struct sigqueue *q; 834 struct sigqueue *q;
835 int override_rlimit;
832 836
833 trace_sched_signal_send(sig, t); 837 trace_sched_signal_send(sig, t);
834 838
@@ -860,9 +864,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
860 make sure at least one signal gets delivered and don't 864 make sure at least one signal gets delivered and don't
861 pass on the info struct. */ 865 pass on the info struct. */
862 866
863 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 867 if (sig < SIGRTMIN)
864 (is_si_special(info) || 868 override_rlimit = (is_si_special(info) || info->si_code >= 0);
865 info->si_code >= 0))); 869 else
870 override_rlimit = 0;
871
872 q = __sigqueue_alloc(t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
873 override_rlimit);
866 if (q) { 874 if (q) {
867 list_add_tail(&q->list, &pending->list); 875 list_add_tail(&q->list, &pending->list);
868 switch ((unsigned long) info) { 876 switch ((unsigned long) info) {
@@ -1402,7 +1410,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1402 /* do_notify_parent_cldstop should have been called instead. */ 1410 /* do_notify_parent_cldstop should have been called instead. */
1403 BUG_ON(task_is_stopped_or_traced(tsk)); 1411 BUG_ON(task_is_stopped_or_traced(tsk));
1404 1412
1405 BUG_ON(!tsk->ptrace && 1413 BUG_ON(!task_ptrace(tsk) &&
1406 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1414 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1407 1415
1408 info.si_signo = sig; 1416 info.si_signo = sig;
@@ -1441,7 +1449,7 @@ int do_notify_parent(struct task_struct *tsk, int sig)
1441 1449
1442 psig = tsk->parent->sighand; 1450 psig = tsk->parent->sighand;
1443 spin_lock_irqsave(&psig->siglock, flags); 1451 spin_lock_irqsave(&psig->siglock, flags);
1444 if (!tsk->ptrace && sig == SIGCHLD && 1452 if (!task_ptrace(tsk) && sig == SIGCHLD &&
1445 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1453 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1446 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1454 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1447 /* 1455 /*
@@ -1478,7 +1486,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1478 struct task_struct *parent; 1486 struct task_struct *parent;
1479 struct sighand_struct *sighand; 1487 struct sighand_struct *sighand;
1480 1488
1481 if (tsk->ptrace & PT_PTRACED) 1489 if (task_ptrace(tsk))
1482 parent = tsk->parent; 1490 parent = tsk->parent;
1483 else { 1491 else {
1484 tsk = tsk->group_leader; 1492 tsk = tsk->group_leader;
@@ -1491,7 +1499,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1491 * see comment in do_notify_parent() abot the following 3 lines 1499 * see comment in do_notify_parent() abot the following 3 lines
1492 */ 1500 */
1493 rcu_read_lock(); 1501 rcu_read_lock();
1494 info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); 1502 info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
1495 info.si_uid = __task_cred(tsk)->uid; 1503 info.si_uid = __task_cred(tsk)->uid;
1496 rcu_read_unlock(); 1504 rcu_read_unlock();
1497 1505
@@ -1527,7 +1535,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1527 1535
1528static inline int may_ptrace_stop(void) 1536static inline int may_ptrace_stop(void)
1529{ 1537{
1530 if (!likely(current->ptrace & PT_PTRACED)) 1538 if (!likely(task_ptrace(current)))
1531 return 0; 1539 return 0;
1532 /* 1540 /*
1533 * Are we in the middle of do_coredump? 1541 * Are we in the middle of do_coredump?
@@ -1745,7 +1753,7 @@ static int do_signal_stop(int signr)
1745static int ptrace_signal(int signr, siginfo_t *info, 1753static int ptrace_signal(int signr, siginfo_t *info,
1746 struct pt_regs *regs, void *cookie) 1754 struct pt_regs *regs, void *cookie)
1747{ 1755{
1748 if (!(current->ptrace & PT_PTRACED)) 1756 if (!task_ptrace(current))
1749 return signr; 1757 return signr;
1750 1758
1751 ptrace_signal_deliver(regs, cookie); 1759 ptrace_signal_deliver(regs, cookie);
@@ -2278,24 +2286,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2278 return kill_something_info(sig, &info, pid); 2286 return kill_something_info(sig, &info, pid);
2279} 2287}
2280 2288
2281static int do_tkill(pid_t tgid, pid_t pid, int sig) 2289static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2282{ 2291{
2283 int error;
2284 struct siginfo info;
2285 struct task_struct *p; 2292 struct task_struct *p;
2286 unsigned long flags; 2293 unsigned long flags;
2287 2294 int error = -ESRCH;
2288 error = -ESRCH;
2289 info.si_signo = sig;
2290 info.si_errno = 0;
2291 info.si_code = SI_TKILL;
2292 info.si_pid = task_tgid_vnr(current);
2293 info.si_uid = current_uid();
2294 2295
2295 rcu_read_lock(); 2296 rcu_read_lock();
2296 p = find_task_by_vpid(pid); 2297 p = find_task_by_vpid(pid);
2297 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2298 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2298 error = check_kill_permission(sig, &info, p); 2299 error = check_kill_permission(sig, info, p);
2299 /* 2300 /*
2300 * The null signal is a permissions and process existence 2301 * The null signal is a permissions and process existence
2301 * probe. No signal is actually delivered. 2302 * probe. No signal is actually delivered.
@@ -2305,7 +2306,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2305 * signal is private anyway. 2306 * signal is private anyway.
2306 */ 2307 */
2307 if (!error && sig && lock_task_sighand(p, &flags)) { 2308 if (!error && sig && lock_task_sighand(p, &flags)) {
2308 error = specific_send_sig_info(sig, &info, p); 2309 error = specific_send_sig_info(sig, info, p);
2309 unlock_task_sighand(p, &flags); 2310 unlock_task_sighand(p, &flags);
2310 } 2311 }
2311 } 2312 }
@@ -2314,6 +2315,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2314 return error; 2315 return error;
2315} 2316}
2316 2317
2318static int do_tkill(pid_t tgid, pid_t pid, int sig)
2319{
2320 struct siginfo info;
2321
2322 info.si_signo = sig;
2323 info.si_errno = 0;
2324 info.si_code = SI_TKILL;
2325 info.si_pid = task_tgid_vnr(current);
2326 info.si_uid = current_uid();
2327
2328 return do_send_specific(tgid, pid, sig, &info);
2329}
2330
2317/** 2331/**
2318 * sys_tgkill - send signal to one specific thread 2332 * sys_tgkill - send signal to one specific thread
2319 * @tgid: the thread group ID of the thread 2333 * @tgid: the thread group ID of the thread
@@ -2363,6 +2377,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2363 return kill_proc_info(sig, &info, pid); 2377 return kill_proc_info(sig, &info, pid);
2364} 2378}
2365 2379
2380long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2381{
2382 /* This is only valid for single tasks */
2383 if (pid <= 0 || tgid <= 0)
2384 return -EINVAL;
2385
2386 /* Not even root can pretend to send signals from the kernel.
2387 Nor can they impersonate a kill(), which adds source info. */
2388 if (info->si_code >= 0)
2389 return -EPERM;
2390 info->si_signo = sig;
2391
2392 return do_send_specific(tgid, pid, sig, info);
2393}
2394
2395SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2396 siginfo_t __user *, uinfo)
2397{
2398 siginfo_t info;
2399
2400 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2401 return -EFAULT;
2402
2403 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2404}
2405
2366int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2406int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2367{ 2407{
2368 struct task_struct *t = current; 2408 struct task_struct *t = current;
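
The do_send_specific()/do_rt_tgsigqueueinfo() split above is what backs the new rt_tgsigqueueinfo() syscall. As a rough userspace sketch (not part of the patch; it assumes the target arch defines __NR_rt_tgsigqueueinfo and uses raw syscall(), since libc had no wrapper yet), note that si_code has to be negative or the kernel returns -EPERM, exactly as the hunk above checks:

#define _GNU_SOURCE
#include <signal.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Hypothetical helper: queue a value-carrying signal to thread 'tid' in
 * thread group 'tgid', the pair of IDs do_send_specific() matches on. */
static int my_tgsigqueue(pid_t tgid, pid_t tid, int sig, int value)
{
        siginfo_t info;

        memset(&info, 0, sizeof(info));
        info.si_signo = sig;
        info.si_code = SI_QUEUE;        /* must be < 0, see do_rt_tgsigqueueinfo() */
        info.si_pid = getpid();
        info.si_uid = getuid();
        info.si_value.sival_int = value;

        return syscall(__NR_rt_tgsigqueueinfo, tgid, tid, sig, &info);
}
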
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f43..09d7519557d3 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -319,6 +319,15 @@ cant_get_ref:
319EXPORT_SYMBOL(slow_work_enqueue); 319EXPORT_SYMBOL(slow_work_enqueue);
320 320
321/* 321/*
322 * Schedule a cull of the thread pool at some time in the near future
323 */
324static void slow_work_schedule_cull(void)
325{
326 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328}
329
330/*
322 * Worker thread culling algorithm 331 * Worker thread culling algorithm
323 */ 332 */
324static bool slow_work_cull_thread(void) 333static bool slow_work_cull_thread(void)
@@ -335,8 +344,7 @@ static bool slow_work_cull_thread(void)
335 list_empty(&vslow_work_queue) && 344 list_empty(&vslow_work_queue) &&
336 atomic_read(&slow_work_thread_count) > 345 atomic_read(&slow_work_thread_count) >
337 slow_work_min_threads) { 346 slow_work_min_threads) {
338 mod_timer(&slow_work_cull_timer, 347 slow_work_schedule_cull();
339 jiffies + SLOW_WORK_CULL_TIMEOUT);
340 do_cull = true; 348 do_cull = true;
341 } 349 }
342 } 350 }
@@ -372,8 +380,8 @@ static int slow_work_thread(void *_data)
372 vsmax *= atomic_read(&slow_work_thread_count); 380 vsmax *= atomic_read(&slow_work_thread_count);
373 vsmax /= 100; 381 vsmax /= 100;
374 382
375 prepare_to_wait(&slow_work_thread_wq, &wait, 383 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
376 TASK_INTERRUPTIBLE); 384 TASK_INTERRUPTIBLE);
377 if (!freezing(current) && 385 if (!freezing(current) &&
378 !slow_work_threads_should_exit && 386 !slow_work_threads_should_exit &&
379 !slow_work_available(vsmax) && 387 !slow_work_available(vsmax) &&
@@ -393,8 +401,7 @@ static int slow_work_thread(void *_data)
393 list_empty(&vslow_work_queue) && 401 list_empty(&vslow_work_queue) &&
394 atomic_read(&slow_work_thread_count) > 402 atomic_read(&slow_work_thread_count) >
395 slow_work_min_threads) 403 slow_work_min_threads)
396 mod_timer(&slow_work_cull_timer, 404 slow_work_schedule_cull();
397 jiffies + SLOW_WORK_CULL_TIMEOUT);
398 continue; 405 continue;
399 } 406 }
400 407
@@ -458,7 +465,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
458 if (atomic_dec_and_test(&slow_work_thread_count)) 465 if (atomic_dec_and_test(&slow_work_thread_count))
459 BUG(); /* we're running on a slow work thread... */ 466 BUG(); /* we're running on a slow work thread... */
460 mod_timer(&slow_work_oom_timer, 467 mod_timer(&slow_work_oom_timer,
461 jiffies + SLOW_WORK_OOM_TIMEOUT); 468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
462 } else { 469 } else {
463 /* ratelimit the starting of new threads */ 470 /* ratelimit the starting of new threads */
464 mod_timer(&slow_work_oom_timer, jiffies + 1); 471 mod_timer(&slow_work_oom_timer, jiffies + 1);
@@ -502,8 +509,7 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
502 if (n < 0 && !slow_work_may_not_start_new_thread) 509 if (n < 0 && !slow_work_may_not_start_new_thread)
503 slow_work_enqueue(&slow_work_new_thread); 510 slow_work_enqueue(&slow_work_new_thread);
504 else if (n > 0) 511 else if (n > 0)
505 mod_timer(&slow_work_cull_timer, 512 slow_work_schedule_cull();
506 jiffies + SLOW_WORK_CULL_TIMEOUT);
507 } 513 }
508 mutex_unlock(&slow_work_user_lock); 514 mutex_unlock(&slow_work_user_lock);
509 } 515 }
@@ -529,8 +535,7 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
529 atomic_read(&slow_work_thread_count); 535 atomic_read(&slow_work_thread_count);
530 536
531 if (n < 0) 537 if (n < 0)
532 mod_timer(&slow_work_cull_timer, 538 slow_work_schedule_cull();
533 jiffies + SLOW_WORK_CULL_TIMEOUT);
534 } 539 }
535 mutex_unlock(&slow_work_user_lock); 540 mutex_unlock(&slow_work_user_lock);
536 } 541 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 858baac568ee..ad63d8501207 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
52 switch (action) { 52 switch (action) {
53 case CPU_UP_PREPARE: 53 case CPU_UP_PREPARE:
54 case CPU_UP_PREPARE_FROZEN: 54 case CPU_UP_PREPARE_FROZEN:
55 if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 55 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
56 cpu_to_node(cpu))) 56 cpu_to_node(cpu)))
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
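
For reference, the zalloc_cpumask_var_node() conversions here and in the sched_rt.c hunk above fold the old allocate-then-clear pattern into a single call. A minimal sketch of the idiom, with illustrative names:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* Illustrative only: allocate a node-local cpumask that starts out empty. */
static int my_alloc_empty_mask(cpumask_var_t *mask, int node)
{
        if (!zalloc_cpumask_var_node(mask, GFP_KERNEL, node))
                return -ENOMEM;
        /* the mask is already zeroed, so no explicit cpumask_clear() */
        return 0;
}
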
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511..3a94905fa5d2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,9 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/smp.h> 25#include <linux/smp.h>
26#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h> 27
28#define CREATE_TRACE_POINTS
29#include <trace/events/irq.h>
28 30
29#include <asm/irq.h> 31#include <asm/irq.h>
30/* 32/*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
186 */ 188 */
187#define MAX_SOFTIRQ_RESTART 10 189#define MAX_SOFTIRQ_RESTART 10
188 190
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
192asmlinkage void __do_softirq(void) 191asmlinkage void __do_softirq(void)
193{ 192{
194 struct softirq_action *h; 193 struct softirq_action *h;
@@ -214,6 +213,7 @@ restart:
214 do { 213 do {
215 if (pending & 1) { 214 if (pending & 1) {
216 int prev_count = preempt_count(); 215 int prev_count = preempt_count();
216 kstat_incr_softirqs_this_cpu(h - softirq_vec);
217 217
218 trace_softirq_entry(h, softirq_vec); 218 trace_softirq_entry(h, softirq_vec);
219 h->action(h); 219 h->action(h);
@@ -383,6 +383,17 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
383 383
384EXPORT_SYMBOL(__tasklet_hi_schedule); 384EXPORT_SYMBOL(__tasklet_hi_schedule);
385 385
386void __tasklet_hi_schedule_first(struct tasklet_struct *t)
387{
388 BUG_ON(!irqs_disabled());
389
390 t->next = __get_cpu_var(tasklet_hi_vec).head;
391 __get_cpu_var(tasklet_hi_vec).head = t;
392 __raise_softirq_irqoff(HI_SOFTIRQ);
393}
394
395EXPORT_SYMBOL(__tasklet_hi_schedule_first);
396
386static void tasklet_action(struct softirq_action *a) 397static void tasklet_action(struct softirq_action *a)
387{ 398{
388 struct tasklet_struct *list; 399 struct tasklet_struct *list;
@@ -828,7 +839,7 @@ int __init __weak arch_early_irq_init(void)
828 return 0; 839 return 0;
829} 840}
830 841
831int __weak arch_init_chip_data(struct irq_desc *desc, int cpu) 842int __weak arch_init_chip_data(struct irq_desc *desc, int node)
832{ 843{
833 return 0; 844 return 0;
834} 845}
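
__tasklet_hi_schedule_first() above puts a tasklet at the head of this CPU's HI vector but, unlike tasklet_hi_schedule(), it neither disables interrupts nor sets TASKLET_STATE_SCHED itself. A hedged caller-side sketch (names illustrative, not from the patch):

#include <linux/interrupt.h>

/* Illustrative only: queue a tasklet ahead of anything already pending.
 * The BUG_ON in the patch requires interrupts to be off around the call. */
static void my_push_to_front(struct tasklet_struct *t)
{
        unsigned long flags;

        local_irq_save(flags);
        if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
                __tasklet_hi_schedule_first(t);
        local_irq_restore(flags);
}
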
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..b3f1097c76fa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1112,289 +1113,6 @@ out:
1112 return err; 1113 return err;
1113} 1114}
1114 1115
1115/*
1116 * Supplementary group IDs
1117 */
1118
1119/* init to 2 - one for init_task, one to ensure it is never freed */
1120struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
1121
1122struct group_info *groups_alloc(int gidsetsize)
1123{
1124 struct group_info *group_info;
1125 int nblocks;
1126 int i;
1127
1128 nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
1129 /* Make sure we always allocate at least one indirect block pointer */
1130 nblocks = nblocks ? : 1;
1131 group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
1132 if (!group_info)
1133 return NULL;
1134 group_info->ngroups = gidsetsize;
1135 group_info->nblocks = nblocks;
1136 atomic_set(&group_info->usage, 1);
1137
1138 if (gidsetsize <= NGROUPS_SMALL)
1139 group_info->blocks[0] = group_info->small_block;
1140 else {
1141 for (i = 0; i < nblocks; i++) {
1142 gid_t *b;
1143 b = (void *)__get_free_page(GFP_USER);
1144 if (!b)
1145 goto out_undo_partial_alloc;
1146 group_info->blocks[i] = b;
1147 }
1148 }
1149 return group_info;
1150
1151out_undo_partial_alloc:
1152 while (--i >= 0) {
1153 free_page((unsigned long)group_info->blocks[i]);
1154 }
1155 kfree(group_info);
1156 return NULL;
1157}
1158
1159EXPORT_SYMBOL(groups_alloc);
1160
1161void groups_free(struct group_info *group_info)
1162{
1163 if (group_info->blocks[0] != group_info->small_block) {
1164 int i;
1165 for (i = 0; i < group_info->nblocks; i++)
1166 free_page((unsigned long)group_info->blocks[i]);
1167 }
1168 kfree(group_info);
1169}
1170
1171EXPORT_SYMBOL(groups_free);
1172
1173/* export the group_info to a user-space array */
1174static int groups_to_user(gid_t __user *grouplist,
1175 const struct group_info *group_info)
1176{
1177 int i;
1178 unsigned int count = group_info->ngroups;
1179
1180 for (i = 0; i < group_info->nblocks; i++) {
1181 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1182 unsigned int len = cp_count * sizeof(*grouplist);
1183
1184 if (copy_to_user(grouplist, group_info->blocks[i], len))
1185 return -EFAULT;
1186
1187 grouplist += NGROUPS_PER_BLOCK;
1188 count -= cp_count;
1189 }
1190 return 0;
1191}
1192
1193/* fill a group_info from a user-space array - it must be allocated already */
1194static int groups_from_user(struct group_info *group_info,
1195 gid_t __user *grouplist)
1196{
1197 int i;
1198 unsigned int count = group_info->ngroups;
1199
1200 for (i = 0; i < group_info->nblocks; i++) {
1201 unsigned int cp_count = min(NGROUPS_PER_BLOCK, count);
1202 unsigned int len = cp_count * sizeof(*grouplist);
1203
1204 if (copy_from_user(group_info->blocks[i], grouplist, len))
1205 return -EFAULT;
1206
1207 grouplist += NGROUPS_PER_BLOCK;
1208 count -= cp_count;
1209 }
1210 return 0;
1211}
1212
1213/* a simple Shell sort */
1214static void groups_sort(struct group_info *group_info)
1215{
1216 int base, max, stride;
1217 int gidsetsize = group_info->ngroups;
1218
1219 for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
1220 ; /* nothing */
1221 stride /= 3;
1222
1223 while (stride) {
1224 max = gidsetsize - stride;
1225 for (base = 0; base < max; base++) {
1226 int left = base;
1227 int right = left + stride;
1228 gid_t tmp = GROUP_AT(group_info, right);
1229
1230 while (left >= 0 && GROUP_AT(group_info, left) > tmp) {
1231 GROUP_AT(group_info, right) =
1232 GROUP_AT(group_info, left);
1233 right = left;
1234 left -= stride;
1235 }
1236 GROUP_AT(group_info, right) = tmp;
1237 }
1238 stride /= 3;
1239 }
1240}
1241
1242/* a simple bsearch */
1243int groups_search(const struct group_info *group_info, gid_t grp)
1244{
1245 unsigned int left, right;
1246
1247 if (!group_info)
1248 return 0;
1249
1250 left = 0;
1251 right = group_info->ngroups;
1252 while (left < right) {
1253 unsigned int mid = (left+right)/2;
1254 int cmp = grp - GROUP_AT(group_info, mid);
1255 if (cmp > 0)
1256 left = mid + 1;
1257 else if (cmp < 0)
1258 right = mid;
1259 else
1260 return 1;
1261 }
1262 return 0;
1263}
1264
1265/**
1266 * set_groups - Change a group subscription in a set of credentials
1267 * @new: The newly prepared set of credentials to alter
1268 * @group_info: The group list to install
1269 *
1270 * Validate a group subscription and, if valid, insert it into a set
1271 * of credentials.
1272 */
1273int set_groups(struct cred *new, struct group_info *group_info)
1274{
1275 int retval;
1276
1277 retval = security_task_setgroups(group_info);
1278 if (retval)
1279 return retval;
1280
1281 put_group_info(new->group_info);
1282 groups_sort(group_info);
1283 get_group_info(group_info);
1284 new->group_info = group_info;
1285 return 0;
1286}
1287
1288EXPORT_SYMBOL(set_groups);
1289
1290/**
1291 * set_current_groups - Change current's group subscription
1292 * @group_info: The group list to impose
1293 *
1294 * Validate a group subscription and, if valid, impose it upon current's task
1295 * security record.
1296 */
1297int set_current_groups(struct group_info *group_info)
1298{
1299 struct cred *new;
1300 int ret;
1301
1302 new = prepare_creds();
1303 if (!new)
1304 return -ENOMEM;
1305
1306 ret = set_groups(new, group_info);
1307 if (ret < 0) {
1308 abort_creds(new);
1309 return ret;
1310 }
1311
1312 return commit_creds(new);
1313}
1314
1315EXPORT_SYMBOL(set_current_groups);
1316
1317SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
1318{
1319 const struct cred *cred = current_cred();
1320 int i;
1321
1322 if (gidsetsize < 0)
1323 return -EINVAL;
1324
1325 /* no need to grab task_lock here; it cannot change */
1326 i = cred->group_info->ngroups;
1327 if (gidsetsize) {
1328 if (i > gidsetsize) {
1329 i = -EINVAL;
1330 goto out;
1331 }
1332 if (groups_to_user(grouplist, cred->group_info)) {
1333 i = -EFAULT;
1334 goto out;
1335 }
1336 }
1337out:
1338 return i;
1339}
1340
1341/*
1342 * SMP: Our groups are copy-on-write. We can set them safely
1343 * without another task interfering.
1344 */
1345
1346SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
1347{
1348 struct group_info *group_info;
1349 int retval;
1350
1351 if (!capable(CAP_SETGID))
1352 return -EPERM;
1353 if ((unsigned)gidsetsize > NGROUPS_MAX)
1354 return -EINVAL;
1355
1356 group_info = groups_alloc(gidsetsize);
1357 if (!group_info)
1358 return -ENOMEM;
1359 retval = groups_from_user(group_info, grouplist);
1360 if (retval) {
1361 put_group_info(group_info);
1362 return retval;
1363 }
1364
1365 retval = set_current_groups(group_info);
1366 put_group_info(group_info);
1367
1368 return retval;
1369}
1370
1371/*
1372 * Check whether we're fsgid/egid or in the supplemental group..
1373 */
1374int in_group_p(gid_t grp)
1375{
1376 const struct cred *cred = current_cred();
1377 int retval = 1;
1378
1379 if (grp != cred->fsgid)
1380 retval = groups_search(cred->group_info, grp);
1381 return retval;
1382}
1383
1384EXPORT_SYMBOL(in_group_p);
1385
1386int in_egroup_p(gid_t grp)
1387{
1388 const struct cred *cred = current_cred();
1389 int retval = 1;
1390
1391 if (grp != cred->egid)
1392 retval = groups_search(cred->group_info, grp);
1393 return retval;
1394}
1395
1396EXPORT_SYMBOL(in_egroup_p);
1397
1398DECLARE_RWSEM(uts_sem); 1116DECLARE_RWSEM(uts_sem);
1399 1117
1400SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1118SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
@@ -1793,6 +1511,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1511 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1512 error = SET_TSC_CTL(arg2);
1795 break; 1513 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE:
1515 error = perf_counter_task_disable();
1516 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE:
1518 error = perf_counter_task_enable();
1519 break;
1796 case PR_GET_TIMERSLACK: 1520 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1521 error = current->timer_slack_ns;
1798 break; 1522 break;
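
The two new prctl() cases give a task a cheap self-service switch for its own counters. A hypothetical userspace sketch, assuming the PR_TASK_PERF_COUNTERS_* values from this series are visible via <linux/prctl.h>:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>

/* Illustrative only: count just the interesting region of a workload. */
static void my_measure_region(void (*workload)(void))
{
        if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
                perror("enable counters");

        workload();

        if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
                perror("disable counters");
}
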
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ea78fa101ad6..62e4ff9968b5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -27,6 +27,7 @@
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h> 29#include <linux/utsname.h>
30#include <linux/kmemcheck.h>
30#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
31#include <linux/fs.h> 32#include <linux/fs.h>
32#include <linux/init.h> 33#include <linux/init.h>
@@ -49,6 +50,7 @@
49#include <linux/reboot.h> 50#include <linux/reboot.h>
50#include <linux/ftrace.h> 51#include <linux/ftrace.h>
51#include <linux/slow-work.h> 52#include <linux/slow-work.h>
53#include <linux/perf_counter.h>
52 54
53#include <asm/uaccess.h> 55#include <asm/uaccess.h>
54#include <asm/processor.h> 56#include <asm/processor.h>
@@ -101,7 +103,6 @@ static int __maybe_unused one = 1;
101static int __maybe_unused two = 2; 103static int __maybe_unused two = 2;
102static unsigned long one_ul = 1; 104static unsigned long one_ul = 1;
103static int one_hundred = 100; 105static int one_hundred = 100;
104static int one_thousand = 1000;
105 106
106/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ 107/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
107static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; 108static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -115,6 +116,7 @@ static int ngroups_max = NGROUPS_MAX;
115 116
116#ifdef CONFIG_MODULES 117#ifdef CONFIG_MODULES
117extern char modprobe_path[]; 118extern char modprobe_path[];
119extern int modules_disabled;
118#endif 120#endif
119#ifdef CONFIG_CHR_DEV_SG 121#ifdef CONFIG_CHR_DEV_SG
120extern int sg_big_buff; 122extern int sg_big_buff;
@@ -327,6 +329,14 @@ static struct ctl_table kern_table[] = {
327 .mode = 0644, 329 .mode = 0644,
328 .proc_handler = &proc_dointvec, 330 .proc_handler = &proc_dointvec,
329 }, 331 },
332 {
333 .ctl_name = CTL_UNNUMBERED,
334 .procname = "timer_migration",
335 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = &proc_dointvec,
339 },
330#endif 340#endif
331 { 341 {
332 .ctl_name = CTL_UNNUMBERED, 342 .ctl_name = CTL_UNNUMBERED,
@@ -535,6 +545,17 @@ static struct ctl_table kern_table[] = {
535 .proc_handler = &proc_dostring, 545 .proc_handler = &proc_dostring,
536 .strategy = &sysctl_string, 546 .strategy = &sysctl_string,
537 }, 547 },
548 {
549 .ctl_name = CTL_UNNUMBERED,
550 .procname = "modules_disabled",
551 .data = &modules_disabled,
552 .maxlen = sizeof(int),
553 .mode = 0644,
554 /* only handle a transition from default "0" to "1" */
555 .proc_handler = &proc_dointvec_minmax,
556 .extra1 = &one,
557 .extra2 = &one,
558 },
538#endif 559#endif
539#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 560#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
540 { 561 {
@@ -732,6 +753,14 @@ static struct ctl_table kern_table[] = {
732 }, 753 },
733 { 754 {
734 .ctl_name = CTL_UNNUMBERED, 755 .ctl_name = CTL_UNNUMBERED,
756 .procname = "bootloader_version",
757 .data = &bootloader_version,
758 .maxlen = sizeof (int),
759 .mode = 0444,
760 .proc_handler = &proc_dointvec,
761 },
762 {
763 .ctl_name = CTL_UNNUMBERED,
735 .procname = "kstack_depth_to_print", 764 .procname = "kstack_depth_to_print",
736 .data = &kstack_depth_to_print, 765 .data = &kstack_depth_to_print,
737 .maxlen = sizeof(int), 766 .maxlen = sizeof(int),
@@ -913,6 +942,43 @@ static struct ctl_table kern_table[] = {
913 .child = slow_work_sysctls, 942 .child = slow_work_sysctls,
914 }, 943 },
915#endif 944#endif
945#ifdef CONFIG_PERF_COUNTERS
946 {
947 .ctl_name = CTL_UNNUMBERED,
948 .procname = "perf_counter_paranoid",
949 .data = &sysctl_perf_counter_paranoid,
950 .maxlen = sizeof(sysctl_perf_counter_paranoid),
951 .mode = 0644,
952 .proc_handler = &proc_dointvec,
953 },
954 {
955 .ctl_name = CTL_UNNUMBERED,
956 .procname = "perf_counter_mlock_kb",
957 .data = &sysctl_perf_counter_mlock,
958 .maxlen = sizeof(sysctl_perf_counter_mlock),
959 .mode = 0644,
960 .proc_handler = &proc_dointvec,
961 },
962 {
963 .ctl_name = CTL_UNNUMBERED,
964 .procname = "perf_counter_max_sample_rate",
965 .data = &sysctl_perf_counter_sample_rate,
966 .maxlen = sizeof(sysctl_perf_counter_sample_rate),
967 .mode = 0644,
968 .proc_handler = &proc_dointvec,
969 },
970#endif
971#ifdef CONFIG_KMEMCHECK
972 {
973 .ctl_name = CTL_UNNUMBERED,
974 .procname = "kmemcheck",
975 .data = &kmemcheck_enabled,
976 .maxlen = sizeof(int),
977 .mode = 0644,
978 .proc_handler = &proc_dointvec,
979 },
980#endif
981
916/* 982/*
917 * NOTE: do not add new entries to this table unless you have read 983 * NOTE: do not add new entries to this table unless you have read
918 * Documentation/sysctl/ctl_unnumbered.txt 984 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1034,28 +1100,6 @@ static struct ctl_table vm_table[] = {
1034 .proc_handler = &proc_dointvec, 1100 .proc_handler = &proc_dointvec,
1035 }, 1101 },
1036 { 1102 {
1037 .ctl_name = CTL_UNNUMBERED,
1038 .procname = "nr_pdflush_threads_min",
1039 .data = &nr_pdflush_threads_min,
1040 .maxlen = sizeof nr_pdflush_threads_min,
1041 .mode = 0644 /* read-write */,
1042 .proc_handler = &proc_dointvec_minmax,
1043 .strategy = &sysctl_intvec,
1044 .extra1 = &one,
1045 .extra2 = &nr_pdflush_threads_max,
1046 },
1047 {
1048 .ctl_name = CTL_UNNUMBERED,
1049 .procname = "nr_pdflush_threads_max",
1050 .data = &nr_pdflush_threads_max,
1051 .maxlen = sizeof nr_pdflush_threads_max,
1052 .mode = 0644 /* read-write */,
1053 .proc_handler = &proc_dointvec_minmax,
1054 .strategy = &sysctl_intvec,
1055 .extra1 = &nr_pdflush_threads_min,
1056 .extra2 = &one_thousand,
1057 },
1058 {
1059 .ctl_name = VM_SWAPPINESS, 1103 .ctl_name = VM_SWAPPINESS,
1060 .procname = "swappiness", 1104 .procname = "swappiness",
1061 .data = &vm_swappiness, 1105 .data = &vm_swappiness,
@@ -1248,7 +1292,6 @@ static struct ctl_table vm_table[] = {
1248 .strategy = &sysctl_jiffies, 1292 .strategy = &sysctl_jiffies,
1249 }, 1293 },
1250#endif 1294#endif
1251#ifdef CONFIG_SECURITY
1252 { 1295 {
1253 .ctl_name = CTL_UNNUMBERED, 1296 .ctl_name = CTL_UNNUMBERED,
1254 .procname = "mmap_min_addr", 1297 .procname = "mmap_min_addr",
@@ -1257,7 +1300,6 @@ static struct ctl_table vm_table[] = {
1257 .mode = 0644, 1300 .mode = 0644,
1258 .proc_handler = &proc_doulongvec_minmax, 1301 .proc_handler = &proc_doulongvec_minmax,
1259 }, 1302 },
1260#endif
1261#ifdef CONFIG_NUMA 1303#ifdef CONFIG_NUMA
1262 { 1304 {
1263 .ctl_name = CTL_UNNUMBERED, 1305 .ctl_name = CTL_UNNUMBERED,
@@ -1295,7 +1337,6 @@ static struct ctl_table vm_table[] = {
1295 .extra2 = &one, 1337 .extra2 = &one,
1296 }, 1338 },
1297#endif 1339#endif
1298#ifdef CONFIG_UNEVICTABLE_LRU
1299 { 1340 {
1300 .ctl_name = CTL_UNNUMBERED, 1341 .ctl_name = CTL_UNNUMBERED,
1301 .procname = "scan_unevictable_pages", 1342 .procname = "scan_unevictable_pages",
@@ -1304,7 +1345,6 @@ static struct ctl_table vm_table[] = {
1304 .mode = 0644, 1345 .mode = 0644,
1305 .proc_handler = &scan_unevictable_handler, 1346 .proc_handler = &scan_unevictable_handler,
1306 }, 1347 },
1307#endif
1308/* 1348/*
1309 * NOTE: do not add new entries to this table unless you have read 1349 * NOTE: do not add new entries to this table unless you have read
1310 * Documentation/sysctl/ctl_unnumbered.txt 1350 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2243,7 +2283,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2243 void *data) 2283 void *data)
2244{ 2284{
2245#define TMPBUFLEN 21 2285#define TMPBUFLEN 21
2246 int *i, vleft, first=1, neg, val; 2286 int *i, vleft, first = 1, neg;
2247 unsigned long lval; 2287 unsigned long lval;
2248 size_t left, len; 2288 size_t left, len;
2249 2289
@@ -2296,8 +2336,6 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2296 len = p-buf; 2336 len = p-buf;
2297 if ((len < left) && *p && !isspace(*p)) 2337 if ((len < left) && *p && !isspace(*p))
2298 break; 2338 break;
2299 if (neg)
2300 val = -val;
2301 s += len; 2339 s += len;
2302 left -= len; 2340 left -= len;
2303 2341
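
The modules_disabled entry above relies on a small trick worth spelling out: with extra1 and extra2 both pointing at the constant 1, proc_dointvec_minmax() rejects every write except "1", so the flag can only move from its default 0 to 1 and never back. A sketch of the same latch pattern with made-up names:

#include <linux/sysctl.h>

static int my_feature_locked;
static int my_one = 1;

/* Illustrative only: a write-once sysctl in the ctl_table style used above. */
static struct ctl_table my_kern_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "my_feature_locked",
                .data           = &my_feature_locked,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_minmax,
                .extra1         = &my_one,      /* min == max == 1 */
                .extra2         = &my_one,
        },
        { .ctl_name = 0 }
};
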
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d13be216a790..1ad6dd461119 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,6 +18,7 @@
18#include <linux/notifier.h> 18#include <linux/notifier.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h>
21 22
22/* The registered clock event devices */ 23/* The registered clock event devices */
23static LIST_HEAD(clockevent_devices); 24static LIST_HEAD(clockevent_devices);
@@ -54,6 +55,7 @@ unsigned long clockevent_delta2ns(unsigned long latch,
54 55
55 return (unsigned long) clc; 56 return (unsigned long) clc;
56} 57}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns);
57 59
58/** 60/**
59 * clockevents_set_mode - set the operating mode of a clock event device 61 * clockevents_set_mode - set the operating mode of a clock event device
@@ -187,6 +189,7 @@ void clockevents_register_device(struct clock_event_device *dev)
187 189
188 spin_unlock(&clockevents_lock); 190 spin_unlock(&clockevents_lock);
189} 191}
192EXPORT_SYMBOL_GPL(clockevents_register_device);
190 193
191/* 194/*
192 * Noop handler when we shut down an event device 195 * Noop handler when we shut down an event device
@@ -251,4 +254,15 @@ void clockevents_notify(unsigned long reason, void *arg)
251 spin_unlock(&clockevents_lock); 254 spin_unlock(&clockevents_lock);
252} 255}
253EXPORT_SYMBOL_GPL(clockevents_notify); 256EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
254#endif 268#endif
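
clockevents_get_next_event() simply reports the programmed expiry of a CPU's tick device. A hedged sketch of the sort of query a caller can build on it (helper name illustrative; the prototype is exported elsewhere in this series):

#include <linux/ktime.h>
#include <linux/hrtimer.h>

/* Illustrative only: nanoseconds until the given CPU's next clock event. */
static s64 my_ns_to_next_event(int cpu)
{
        ktime_t next = clockevents_get_next_event(cpu);

        return ktime_to_ns(ktime_sub(next, ktime_get()));
}
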
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e0..592bf584d1d2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
402 unsigned long flags; 402 unsigned long flags;
403 int ret; 403 int ret;
404 404
405 /* save mult_orig on registration */
406 c->mult_orig = c->mult;
407
408 spin_lock_irqsave(&clocksource_lock, flags); 405 spin_lock_irqsave(&clocksource_lock, flags);
409 ret = clocksource_enqueue(c); 406 ret = clocksource_enqueue(c);
410 if (!ret) 407 if (!ret)
@@ -512,6 +509,18 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
512 } 509 }
513 } 510 }
514 511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
515 /* Reselect, when the override name has changed */ 524 /* Reselect, when the override name has changed */
516 if (ovr != clocksource_override) { 525 if (ovr != clocksource_override) {
517 clocksource_override = ovr; 526 clocksource_override = ovr;
@@ -540,7 +549,13 @@ sysfs_show_available_clocksources(struct sys_device *dev,
540 549
541 spin_lock_irq(&clocksource_lock); 550 spin_lock_irq(&clocksource_lock);
542 list_for_each_entry(src, &clocksource_list, list) { 551 list_for_each_entry(src, &clocksource_list, list) {
543 count += snprintf(buf + count, 552 /*
553 * Don't show non-HRES clocksource if the tick code is
554 * in one shot mode (highres=on or nohz=on)
555 */
556 if (!tick_oneshot_mode_active() ||
557 (src->flags & CLOCK_SOURCE_VALID_FOR_HRES))
558 count += snprintf(buf + count,
544 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
545 "%s ", src->name); 560 "%s ", src->name);
546 } 561 }
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 118a3b3b3f9a..877dbedc3118 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -27,7 +27,7 @@
27 * timer stops in C3 state. 27 * timer stops in C3 state.
28 */ 28 */
29 29
30struct tick_device tick_broadcast_device; 30static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8de678e767..a96c0e2b89cf 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -128,6 +128,23 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
128 return 0; 128 return 0;
129} 129}
130 130
131/**
132 * tick_oneshot_mode_active - check whether the system is in oneshot mode
133 *
134 * Returns 1 when either nohz or highres are enabled; otherwise 0.
135 */
136int tick_oneshot_mode_active(void)
137{
138 unsigned long flags;
139 int ret;
140
141 local_irq_save(flags);
142 ret = __get_cpu_var(tick_cpu_device).mode == TICKDEV_MODE_ONESHOT;
143 local_irq_restore(flags);
144
145 return ret;
146}
147
131#ifdef CONFIG_HIGH_RES_TIMERS 148#ifdef CONFIG_HIGH_RES_TIMERS
132/** 149/**
133 * tick_init_highres - switch to high resolution mode 150 * tick_init_highres - switch to high resolution mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d3f1ef4d5cbe..e0f59a21c061 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -222,6 +222,15 @@ void tick_nohz_stop_sched_tick(int inidle)
222 222
223 cpu = smp_processor_id(); 223 cpu = smp_processor_id();
224 ts = &per_cpu(tick_cpu_sched, cpu); 224 ts = &per_cpu(tick_cpu_sched, cpu);
225
226 /*
227 * Call to tick_nohz_start_idle stops the last_update_time from being
228 * updated. Thus, it must not be called in the event we are called from
229 * irq_exit() with the prior state different than idle.
230 */
231 if (!inidle && !ts->inidle)
232 goto end;
233
225 now = tick_nohz_start_idle(ts); 234 now = tick_nohz_start_idle(ts);
226 235
227 /* 236 /*
@@ -239,9 +248,6 @@ void tick_nohz_stop_sched_tick(int inidle)
239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
240 goto end; 249 goto end;
241 250
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1; 251 ts->inidle = 1;
246 252
247 if (need_resched()) 253 if (need_resched())
@@ -349,7 +355,7 @@ void tick_nohz_stop_sched_tick(int inidle)
349 355
350 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 356 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
351 hrtimer_start(&ts->sched_timer, expires, 357 hrtimer_start(&ts->sched_timer, expires,
352 HRTIMER_MODE_ABS); 358 HRTIMER_MODE_ABS_PINNED);
353 /* Check, if the timer was already in the past */ 359 /* Check, if the timer was already in the past */
354 if (hrtimer_active(&ts->sched_timer)) 360 if (hrtimer_active(&ts->sched_timer))
355 goto out; 361 goto out;
@@ -395,7 +401,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
395 401
396 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 402 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
397 hrtimer_start_expires(&ts->sched_timer, 403 hrtimer_start_expires(&ts->sched_timer,
398 HRTIMER_MODE_ABS); 404 HRTIMER_MODE_ABS_PINNED);
399 /* Check, if the timer was already in the past */ 405 /* Check, if the timer was already in the past */
400 if (hrtimer_active(&ts->sched_timer)) 406 if (hrtimer_active(&ts->sched_timer))
401 break; 407 break;
@@ -698,7 +704,8 @@ void tick_setup_sched_timer(void)
698 704
699 for (;;) { 705 for (;;) {
700 hrtimer_forward(&ts->sched_timer, now, tick_period); 706 hrtimer_forward(&ts->sched_timer, now, tick_period);
701 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS); 707 hrtimer_start_expires(&ts->sched_timer,
708 HRTIMER_MODE_ABS_PINNED);
702 /* Check, if the timer was already in the past */ 709 /* Check, if the timer was already in the past */
703 if (hrtimer_active(&ts->sched_timer)) 710 if (hrtimer_active(&ts->sched_timer))
704 break; 711 break;
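
All three tick-sched conversions above switch to HRTIMER_MODE_ABS_PINNED so the per-CPU sched timer cannot be moved by the new timer-migration logic. A hedged sketch of the general idiom (names illustrative):

#include <linux/hrtimer.h>
#include <linux/ktime.h>

/* Illustrative only: arm an hrtimer that must stay on the CPU arming it. */
static void my_arm_pinned(struct hrtimer *timer, u64 delta_ns)
{
        hrtimer_start(timer, ktime_add_ns(ktime_get(), delta_ns),
                      HRTIMER_MODE_ABS_PINNED);
}
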
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..e8c77d9c633a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
22 22
23/* 23/*
24 * This read-write spinlock protects us from races in SMP while 24 * This read-write spinlock protects us from races in SMP while
25 * playing with xtime and avenrun. 25 * playing with xtime.
26 */ 26 */
27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); 27__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
28 28
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
77 clock->cycle_last = cycle_now; 77 clock->cycle_last = cycle_now;
78 78
79 nsec = cyc2ns(clock, cycle_delta); 79 nsec = cyc2ns(clock, cycle_delta);
80
81 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset();
83
80 timespec_add_ns(&xtime, nsec); 84 timespec_add_ns(&xtime, nsec);
81 85
82 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
111 /* convert to nanoseconds: */ 115 /* convert to nanoseconds: */
112 nsecs = cyc2ns(clock, cycle_delta); 116 nsecs = cyc2ns(clock, cycle_delta);
113 117
118 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset();
120
114 } while (read_seqretry(&xtime_lock, seq)); 121 } while (read_seqretry(&xtime_lock, seq));
115 122
116 timespec_add_ns(ts, nsecs); 123 timespec_add_ns(ts, nsecs);
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..54d3912f8cad 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,8 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
41#include <linux/sched.h>
40 42
41#include <asm/uaccess.h> 43#include <asm/uaccess.h>
42#include <asm/unistd.h> 44#include <asm/unistd.h>
@@ -604,13 +606,12 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
604} 606}
605 607
606static inline int 608static inline int
607__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) 609__mod_timer(struct timer_list *timer, unsigned long expires,
610 bool pending_only, int pinned)
608{ 611{
609 struct tvec_base *base, *new_base; 612 struct tvec_base *base, *new_base;
610 unsigned long flags; 613 unsigned long flags;
611 int ret; 614 int ret = 0 , cpu;
612
613 ret = 0;
614 615
615 timer_stats_timer_set_start_info(timer); 616 timer_stats_timer_set_start_info(timer);
616 BUG_ON(!timer->function); 617 BUG_ON(!timer->function);
@@ -629,6 +630,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
629 630
630 new_base = __get_cpu_var(tvec_bases); 631 new_base = __get_cpu_var(tvec_bases);
631 632
633 cpu = smp_processor_id();
634
635#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
636 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
637 int preferred_cpu = get_nohz_load_balancer();
638
639 if (preferred_cpu >= 0)
640 cpu = preferred_cpu;
641 }
642#endif
643 new_base = per_cpu(tvec_bases, cpu);
644
632 if (base != new_base) { 645 if (base != new_base) {
633 /* 646 /*
634 * We are trying to schedule the timer on the local CPU. 647 * We are trying to schedule the timer on the local CPU.
@@ -668,7 +681,7 @@ out_unlock:
668 */ 681 */
669int mod_timer_pending(struct timer_list *timer, unsigned long expires) 682int mod_timer_pending(struct timer_list *timer, unsigned long expires)
670{ 683{
671 return __mod_timer(timer, expires, true); 684 return __mod_timer(timer, expires, true, TIMER_NOT_PINNED);
672} 685}
673EXPORT_SYMBOL(mod_timer_pending); 686EXPORT_SYMBOL(mod_timer_pending);
674 687
@@ -702,11 +715,33 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
702 if (timer->expires == expires && timer_pending(timer)) 715 if (timer->expires == expires && timer_pending(timer))
703 return 1; 716 return 1;
704 717
705 return __mod_timer(timer, expires, false); 718 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
706} 719}
707EXPORT_SYMBOL(mod_timer); 720EXPORT_SYMBOL(mod_timer);
708 721
709/** 722/**
723 * mod_timer_pinned - modify a timer's timeout
724 * @timer: the timer to be modified
725 * @expires: new timeout in jiffies
726 *
727 * mod_timer_pinned() is a way to update the expire field of an
728 * active timer (if the timer is inactive it will be activated)
729 * and not allow the timer to be migrated to a different CPU.
730 *
731 * mod_timer_pinned(timer, expires) is equivalent to:
732 *
733 * del_timer(timer); timer->expires = expires; add_timer(timer);
734 */
735int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
736{
737 if (timer->expires == expires && timer_pending(timer))
738 return 1;
739
740 return __mod_timer(timer, expires, false, TIMER_PINNED);
741}
742EXPORT_SYMBOL(mod_timer_pinned);
743
744/**
710 * add_timer - start a timer 745 * add_timer - start a timer
711 * @timer: the timer to be added 746 * @timer: the timer to be added
712 * 747 *
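
Taken together with the migration block added to __mod_timer() above, mod_timer_pinned() is the opt-out for timers that have to fire on the CPU that armed them. A hedged usage sketch with illustrative names:

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_percpu_watchdog;

/* Illustrative only: re-arm a per-CPU watchdog (assumed to have been set up
 * with setup_timer() at init) so the NOHZ migration path
 * (get_sysctl_timer_migration()/get_nohz_load_balancer()) never moves it. */
static void my_watchdog_rearm(void)
{
        mod_timer_pinned(&my_percpu_watchdog, jiffies + HZ);
}
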
@@ -756,6 +791,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
756 wake_up_idle_cpu(cpu); 791 wake_up_idle_cpu(cpu);
757 spin_unlock_irqrestore(&base->lock, flags); 792 spin_unlock_irqrestore(&base->lock, flags);
758} 793}
794EXPORT_SYMBOL_GPL(add_timer_on);
759 795
760/** 796/**
761 * del_timer - deactive a timer. 797 * del_timer - deactive a timer.
@@ -1015,6 +1051,9 @@ cascade:
1015 index = slot = timer_jiffies & TVN_MASK; 1051 index = slot = timer_jiffies & TVN_MASK;
1016 do { 1052 do {
1017 list_for_each_entry(nte, varp->vec + slot, entry) { 1053 list_for_each_entry(nte, varp->vec + slot, entry) {
1054 if (tbase_get_deferrable(nte->base))
1055 continue;
1056
1018 found = 1; 1057 found = 1;
1019 if (time_before(nte->expires, expires)) 1058 if (time_before(nte->expires, expires))
1020 expires = nte->expires; 1059 expires = nte->expires;
@@ -1123,53 +1162,14 @@ void update_process_times(int user_tick)
1123} 1162}
1124 1163
1125/* 1164/*
1126 * Nr of active tasks - counted in fixed-point numbers
1127 */
1128static unsigned long count_active_tasks(void)
1129{
1130 return nr_active() * FIXED_1;
1131}
1132
1133/*
1134 * Hmm.. Changed this, as the GNU make sources (load.c) seems to
1135 * imply that avenrun[] is the standard name for this kind of thing.
1136 * Nothing else seems to be standardized: the fractional size etc
1137 * all seem to differ on different machines.
1138 *
1139 * Requires xtime_lock to access.
1140 */
1141unsigned long avenrun[3];
1142
1143EXPORT_SYMBOL(avenrun);
1144
1145/*
1146 * calc_load - given tick count, update the avenrun load estimates.
1147 * This is called while holding a write_lock on xtime_lock.
1148 */
1149static inline void calc_load(unsigned long ticks)
1150{
1151 unsigned long active_tasks; /* fixed-point */
1152 static int count = LOAD_FREQ;
1153
1154 count -= ticks;
1155 if (unlikely(count < 0)) {
1156 active_tasks = count_active_tasks();
1157 do {
1158 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
1159 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
1160 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
1161 count += LOAD_FREQ;
1162 } while (count < 0);
1163 }
1164}
1165
1166/*
1167 * This function runs timers and the timer-tq in bottom half context. 1165 * This function runs timers and the timer-tq in bottom half context.
1168 */ 1166 */
1169static void run_timer_softirq(struct softirq_action *h) 1167static void run_timer_softirq(struct softirq_action *h)
1170{ 1168{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1169 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1170
1171 perf_counter_do_pending();
1172
1173 hrtimer_run_pending(); 1173 hrtimer_run_pending();
1174 1174
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1175 if (time_after_eq(jiffies, base->timer_jiffies))
@@ -1187,16 +1187,6 @@ void run_local_timers(void)
1187} 1187}
1188 1188
1189/* 1189/*
1190 * Called by the timer interrupt. xtime_lock must already be taken
1191 * by the timer IRQ!
1192 */
1193static inline void update_times(unsigned long ticks)
1194{
1195 update_wall_time();
1196 calc_load(ticks);
1197}
1198
1199/*
1200 * The 64-bit jiffies value is not atomic - you MUST NOT read it 1190 * The 64-bit jiffies value is not atomic - you MUST NOT read it
1201 * without sampling the sequence number in xtime_lock. 1191 * without sampling the sequence number in xtime_lock.
1202 * jiffies is defined in the linker script... 1192 * jiffies is defined in the linker script...
@@ -1205,7 +1195,8 @@ static inline void update_times(unsigned long ticks)
1205void do_timer(unsigned long ticks) 1195void do_timer(unsigned long ticks)
1206{ 1196{
1207 jiffies_64 += ticks; 1197 jiffies_64 += ticks;
1208 update_times(ticks); 1198 update_wall_time();
1199 calc_global_load();
1209} 1200}
1210 1201
1211#ifdef __ARCH_WANT_SYS_ALARM 1202#ifdef __ARCH_WANT_SYS_ALARM
@@ -1353,7 +1344,7 @@ signed long __sched schedule_timeout(signed long timeout)
1353 expire = timeout + jiffies; 1344 expire = timeout + jiffies;
1354 1345
1355 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); 1346 setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
1356 __mod_timer(&timer, expire, false); 1347 __mod_timer(&timer, expire, false, TIMER_NOT_PINNED);
1357 schedule(); 1348 schedule();
1358 del_singleshot_timer_sync(&timer); 1349 del_singleshot_timer_sync(&timer);
1359 1350
@@ -1406,37 +1397,17 @@ int do_sysinfo(struct sysinfo *info)
1406{ 1397{
1407 unsigned long mem_total, sav_total; 1398 unsigned long mem_total, sav_total;
1408 unsigned int mem_unit, bitcount; 1399 unsigned int mem_unit, bitcount;
1409 unsigned long seq; 1400 struct timespec tp;
1410 1401
1411 memset(info, 0, sizeof(struct sysinfo)); 1402 memset(info, 0, sizeof(struct sysinfo));
1412 1403
1413 do { 1404 ktime_get_ts(&tp);
1414 struct timespec tp; 1405 monotonic_to_bootbased(&tp);
1415 seq = read_seqbegin(&xtime_lock); 1406 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1416
1417 /*
1418 * This is annoying. The below is the same thing
1419 * posix_get_clock_monotonic() does, but it wants to
1420 * take the lock which we want to cover the loads stuff
1421 * too.
1422 */
1423
1424 getnstimeofday(&tp);
1425 tp.tv_sec += wall_to_monotonic.tv_sec;
1426 tp.tv_nsec += wall_to_monotonic.tv_nsec;
1427 monotonic_to_bootbased(&tp);
1428 if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1429 tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1430 tp.tv_sec++;
1431 }
1432 info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1433 1407
1434 info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); 1408 get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
1435 info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1436 info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1437 1409
1438 info->procs = nr_threads; 1410 info->procs = nr_threads;
1439 } while (read_seqretry(&xtime_lock, seq));
1440 1411
1441 si_meminfo(info); 1412 si_meminfo(info);
1442 si_swapinfo(info); 1413 si_swapinfo(info);
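
With avenrun[] and calc_load() gone from kernel/timer.c, do_sysinfo() now reads the load averages through get_avenrun(loads, offset, shift), which are maintained by calc_global_load() on the scheduler side. A hedged sketch of consuming them the same way (printing code illustrative, assuming the get_avenrun() prototype from this series in <linux/sched.h>):

#include <linux/kernel.h>
#include <linux/sched.h>

/* Illustrative only: log the 1-minute load average as X.YY. */
static void my_report_load(void)
{
        unsigned long loads[3];

        get_avenrun(loads, 0, 0);       /* raw FSHIFT fixed-point, no offset */
        printk(KERN_INFO "load1: %lu.%02lu\n",
               loads[0] >> FSHIFT,
               ((loads[0] & (FIXED_1 - 1)) * 100) >> FSHIFT);
}
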
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 417d1985e299..1551f47e7669 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -18,6 +18,13 @@ config HAVE_FUNCTION_TRACER
18config HAVE_FUNCTION_GRAPH_TRACER 18config HAVE_FUNCTION_GRAPH_TRACER
19 bool 19 bool
20 20
21config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool
23 help
24 An arch may pass in a unique value (frame pointer) to both the
25 entering and exiting of a function. On exit, the value is compared
26 and if it does not match, then it will panic the kernel.
27
21config HAVE_FUNCTION_TRACE_MCOUNT_TEST 28config HAVE_FUNCTION_TRACE_MCOUNT_TEST
22 bool 29 bool
23 help 30 help
@@ -48,6 +55,21 @@ config FTRACE_NMI_ENTER
48 depends on HAVE_FTRACE_NMI_ENTER 55 depends on HAVE_FTRACE_NMI_ENTER
49 default y 56 default y
50 57
58config EVENT_TRACING
59 select CONTEXT_SWITCH_TRACER
60 bool
61
62config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool
65
66# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the
69# options do not appear when something else selects it. We need the two options
70# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
71# hiding of the automatic options.
72
51config TRACING 73config TRACING
52 bool 74 bool
53 select DEBUG_FS 75 select DEBUG_FS
@@ -56,6 +78,11 @@ config TRACING
56 select TRACEPOINTS 78 select TRACEPOINTS
57 select NOP_TRACER 79 select NOP_TRACER
58 select BINARY_PRINTF 80 select BINARY_PRINTF
81 select EVENT_TRACING
82
83config GENERIC_TRACER
84 bool
85 select TRACING
59 86
60# 87#
61# Minimum requirements an architecture has to meet for us to 88# Minimum requirements an architecture has to meet for us to
@@ -73,14 +100,20 @@ config TRACING_SUPPORT
73 100
74if TRACING_SUPPORT 101if TRACING_SUPPORT
75 102
76menu "Tracers" 103menuconfig FTRACE
104 bool "Tracers"
105 default y if DEBUG_KERNEL
106 help
107 Enable the kernel tracing infrastructure.
108
109if FTRACE
77 110
78config FUNCTION_TRACER 111config FUNCTION_TRACER
79 bool "Kernel Function Tracer" 112 bool "Kernel Function Tracer"
80 depends on HAVE_FUNCTION_TRACER 113 depends on HAVE_FUNCTION_TRACER
81 select FRAME_POINTER 114 select FRAME_POINTER
82 select KALLSYMS 115 select KALLSYMS
83 select TRACING 116 select GENERIC_TRACER
84 select CONTEXT_SWITCH_TRACER 117 select CONTEXT_SWITCH_TRACER
85 help 118 help
86 Enable the kernel to trace every kernel function. This is done 119 Enable the kernel to trace every kernel function. This is done
@@ -95,6 +128,7 @@ config FUNCTION_GRAPH_TRACER
95 bool "Kernel Function Graph Tracer" 128 bool "Kernel Function Graph Tracer"
96 depends on HAVE_FUNCTION_GRAPH_TRACER 129 depends on HAVE_FUNCTION_GRAPH_TRACER
97 depends on FUNCTION_TRACER 130 depends on FUNCTION_TRACER
131 depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE
98 default y 132 default y
99 help 133 help
100 Enable the kernel to trace a function at both its return 134 Enable the kernel to trace a function at both its return
@@ -104,13 +138,14 @@ config FUNCTION_GRAPH_TRACER
104 the return value. This is done by setting the current return 138 the return value. This is done by setting the current return
105 address on the current task structure into a stack of calls. 139 address on the current task structure into a stack of calls.
106 140
141
107config IRQSOFF_TRACER 142config IRQSOFF_TRACER
108 bool "Interrupts-off Latency Tracer" 143 bool "Interrupts-off Latency Tracer"
109 default n 144 default n
110 depends on TRACE_IRQFLAGS_SUPPORT 145 depends on TRACE_IRQFLAGS_SUPPORT
111 depends on GENERIC_TIME 146 depends on GENERIC_TIME
112 select TRACE_IRQFLAGS 147 select TRACE_IRQFLAGS
113 select TRACING 148 select GENERIC_TRACER
114 select TRACER_MAX_TRACE 149 select TRACER_MAX_TRACE
115 help 150 help
116 This option measures the time spent in irqs-off critical 151 This option measures the time spent in irqs-off critical
@@ -120,7 +155,7 @@ config IRQSOFF_TRACER
120 disabled by default and can be runtime (re-)started 155 disabled by default and can be runtime (re-)started
121 via: 156 via:
122 157
123 echo 0 > /debugfs/tracing/tracing_max_latency 158 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
124 159
125 (Note that kernel size and overhead increases with this option 160 (Note that kernel size and overhead increases with this option
126 enabled. This option and the preempt-off timing option can be 161 enabled. This option and the preempt-off timing option can be
@@ -131,7 +166,7 @@ config PREEMPT_TRACER
131 default n 166 default n
132 depends on GENERIC_TIME 167 depends on GENERIC_TIME
133 depends on PREEMPT 168 depends on PREEMPT
134 select TRACING 169 select GENERIC_TRACER
135 select TRACER_MAX_TRACE 170 select TRACER_MAX_TRACE
136 help 171 help
137 This option measures the time spent in preemption off critical 172 This option measures the time spent in preemption off critical
@@ -141,7 +176,7 @@ config PREEMPT_TRACER
141 disabled by default and can be runtime (re-)started 176 disabled by default and can be runtime (re-)started
142 via: 177 via:
143 178
144 echo 0 > /debugfs/tracing/tracing_max_latency 179 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
145 180
146 (Note that kernel size and overhead increases with this option 181 (Note that kernel size and overhead increases with this option
147 enabled. This option and the irqs-off timing option can be 182 enabled. This option and the irqs-off timing option can be
@@ -150,7 +185,7 @@ config PREEMPT_TRACER
150config SYSPROF_TRACER 185config SYSPROF_TRACER
151 bool "Sysprof Tracer" 186 bool "Sysprof Tracer"
152 depends on X86 187 depends on X86
153 select TRACING 188 select GENERIC_TRACER
154 select CONTEXT_SWITCH_TRACER 189 select CONTEXT_SWITCH_TRACER
155 help 190 help
156 This tracer provides the trace needed by the 'Sysprof' userspace 191 This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,40 +193,33 @@ config SYSPROF_TRACER
158 193
159config SCHED_TRACER 194config SCHED_TRACER
160 bool "Scheduling Latency Tracer" 195 bool "Scheduling Latency Tracer"
161 select TRACING 196 select GENERIC_TRACER
162 select CONTEXT_SWITCH_TRACER 197 select CONTEXT_SWITCH_TRACER
163 select TRACER_MAX_TRACE 198 select TRACER_MAX_TRACE
164 help 199 help
165 This tracer tracks the latency of the highest priority task 200 This tracer tracks the latency of the highest priority task
166 to be scheduled in, starting from the point it has woken up. 201 to be scheduled in, starting from the point it has woken up.
167 202
168config CONTEXT_SWITCH_TRACER 203config ENABLE_DEFAULT_TRACERS
169 bool "Trace process context switches" 204 bool "Trace process context switches and events"
170 select TRACING 205 depends on !GENERIC_TRACER
171 select MARKERS
172 help
173 This tracer gets called from the context switch and records
174 all switching of tasks.
175
176config EVENT_TRACER
177 bool "Trace various events in the kernel"
178 select TRACING 206 select TRACING
179 help 207 help
180 This tracer hooks to various trace points in the kernel 208 This tracer hooks to various trace points in the kernel
181 allowing the user to pick and choose which trace point they 209 allowing the user to pick and choose which trace point they
182 want to trace. 210 want to trace. It also includes the sched_switch tracer plugin.
183 211
184config FTRACE_SYSCALLS 212config FTRACE_SYSCALLS
185 bool "Trace syscalls" 213 bool "Trace syscalls"
186 depends on HAVE_FTRACE_SYSCALLS 214 depends on HAVE_FTRACE_SYSCALLS
187 select TRACING 215 select GENERIC_TRACER
188 select KALLSYMS 216 select KALLSYMS
189 help 217 help
190 Basic tracer to catch the syscall entry and exit events. 218 Basic tracer to catch the syscall entry and exit events.
191 219
192config BOOT_TRACER 220config BOOT_TRACER
193 bool "Trace boot initcalls" 221 bool "Trace boot initcalls"
194 select TRACING 222 select GENERIC_TRACER
195 select CONTEXT_SWITCH_TRACER 223 select CONTEXT_SWITCH_TRACER
196 help 224 help
197 This tracer helps developers to optimize boot times: it records 225 This tracer helps developers to optimize boot times: it records
@@ -207,34 +235,61 @@ config BOOT_TRACER
207 to enable this on bootup. 235 to enable this on bootup.
208 236
209config TRACE_BRANCH_PROFILING 237config TRACE_BRANCH_PROFILING
238 bool
239 select GENERIC_TRACER
240
241choice
242 prompt "Branch Profiling"
243 default BRANCH_PROFILE_NONE
244 help
245 Branch profiling is a software profiler. It will add hooks
246 into the C conditionals to test which path a branch takes.
247
248 The likely/unlikely profiler only looks at the conditions that
249 are annotated with a likely or unlikely macro.
250
251 The "all branch" profiler will profile every if statement in the
252 kernel. This profiler will also enable the likely/unlikely
253 profiler as well.
254
255 Either of the above profilers adds a bit of overhead to the system.
256 If unsure, choose "No branch profiling".
257
258config BRANCH_PROFILE_NONE
259 bool "No branch profiling"
260 help
261 No branch profiling. Branch profiling adds a bit of overhead.
262 Only enable it if you want to analyse the branching behavior.
263 Otherwise keep it disabled.
264
265config PROFILE_ANNOTATED_BRANCHES
210 bool "Trace likely/unlikely profiler" 266 bool "Trace likely/unlikely profiler"
211 select TRACING 267 select TRACE_BRANCH_PROFILING
212 help 268 help
213 This tracer profiles all the likely and unlikely macros 269 This tracer profiles all the likely and unlikely macros
214 in the kernel. It will display the results in: 270 in the kernel. It will display the results in:
215 271
216 /debugfs/tracing/profile_annotated_branch 272 /sys/kernel/debug/tracing/profile_annotated_branch
217 273
218 Note: this will add a significant overhead; only turn this 274 Note: this will add a significant overhead; only turn this
219 on if you need to profile the system's use of these macros. 275 on if you need to profile the system's use of these macros.
220 276
221 Say N if unsure.
222
223config PROFILE_ALL_BRANCHES 277config PROFILE_ALL_BRANCHES
224 bool "Profile all if conditionals" 278 bool "Profile all if conditionals"
225 depends on TRACE_BRANCH_PROFILING 279 select TRACE_BRANCH_PROFILING
226 help 280 help
227 This tracer profiles all branch conditions. Every if () 281 This tracer profiles all branch conditions. Every if ()
228 taken in the kernel is recorded whether it hit or missed. 282 taken in the kernel is recorded whether it hit or missed.
229 The results will be displayed in: 283 The results will be displayed in:
230 284
231 /debugfs/tracing/profile_branch 285 /sys/kernel/debug/tracing/profile_branch
286
287 This option also enables the likely/unlikely profiler.
232 288
233 This configuration, when enabled, will impose a great overhead 289 This configuration, when enabled, will impose a great overhead
234 on the system. This should only be enabled when the system 290 on the system. This should only be enabled when the system
235 is to be analyzed. 291 is to be analyzed.
236 292endchoice
237 Say N if unsure.
238 293
239config TRACING_BRANCHES 294config TRACING_BRANCHES
240 bool 295 bool
@@ -261,7 +316,7 @@ config BRANCH_TRACER
261config POWER_TRACER 316config POWER_TRACER
262 bool "Trace power consumption behavior" 317 bool "Trace power consumption behavior"
263 depends on X86 318 depends on X86
264 select TRACING 319 select GENERIC_TRACER
265 help 320 help
266 This tracer helps developers to analyze and optimize the kernels 321 This tracer helps developers to analyze and optimize the kernels
267 power management decisions, specifically the C-state and P-state 322 power management decisions, specifically the C-state and P-state
@@ -276,7 +331,7 @@ config STACK_TRACER
276 select KALLSYMS 331 select KALLSYMS
277 help 332 help
278 This special tracer records the maximum stack footprint of the 333 This special tracer records the maximum stack footprint of the
279 kernel and displays it in debugfs/tracing/stack_trace. 334 kernel and displays it in /sys/kernel/debug/tracing/stack_trace.
280 335
281 This tracer works by hooking into every function call that the 336 This tracer works by hooking into every function call that the
282 kernel executes, and keeping a maximum stack depth value and 337 kernel executes, and keeping a maximum stack depth value and
@@ -295,14 +350,14 @@ config STACK_TRACER
295config HW_BRANCH_TRACER 350config HW_BRANCH_TRACER
296 depends on HAVE_HW_BRANCH_TRACER 351 depends on HAVE_HW_BRANCH_TRACER
297 bool "Trace hw branches" 352 bool "Trace hw branches"
298 select TRACING 353 select GENERIC_TRACER
299 help 354 help
300 This tracer records all branches on the system in a circular 355 This tracer records all branches on the system in a circular
301 buffer giving access to the last N branches for each cpu. 356 buffer giving access to the last N branches for each cpu.
302 357
303config KMEMTRACE 358config KMEMTRACE
304 bool "Trace SLAB allocations" 359 bool "Trace SLAB allocations"
305 select TRACING 360 select GENERIC_TRACER
306 help 361 help
307 kmemtrace provides tracing for slab allocator functions, such as 362 kmemtrace provides tracing for slab allocator functions, such as
308 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected 363 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
@@ -322,7 +377,7 @@ config KMEMTRACE
322 377
323config WORKQUEUE_TRACER 378config WORKQUEUE_TRACER
324 bool "Trace workqueues" 379 bool "Trace workqueues"
325 select TRACING 380 select GENERIC_TRACER
326 help 381 help
327 The workqueue tracer provides some statistical information 382 The workqueue tracer provides some statistical information
328 about each cpu workqueue thread such as the number of the 383 about each cpu workqueue thread such as the number of the
@@ -338,7 +393,7 @@ config BLK_DEV_IO_TRACE
338 select RELAY 393 select RELAY
339 select DEBUG_FS 394 select DEBUG_FS
340 select TRACEPOINTS 395 select TRACEPOINTS
341 select TRACING 396 select GENERIC_TRACER
342 select STACKTRACE 397 select STACKTRACE
343 help 398 help
344 Say Y here if you want to be able to trace the block layer actions 399 Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +430,20 @@ config DYNAMIC_FTRACE
375 were made. If so, it runs stop_machine (stops all CPUS) 430 were made. If so, it runs stop_machine (stops all CPUS)
376 and modifies the code to jump over the call to ftrace. 431 and modifies the code to jump over the call to ftrace.
377 432
433config FUNCTION_PROFILER
434 bool "Kernel function profiler"
435 depends on FUNCTION_TRACER
436 default n
437 help
438 This option enables the kernel function profiler. A file is created
439 in debugfs called function_profile_enabled, which defaults to zero.
440 When a 1 is echoed into this file, profiling begins, and when a
441 zero is entered, profiling stops. A file in the trace_stats
442 directory called functions shows the list of functions that
443 have been hit and their counters.
444
445 If in doubt, say N
446
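As a minimal illustration of the workflow described in the help text above (not part of the patch): the user-space sketch below toggles the profiler. It assumes debugfs is mounted at /sys/kernel/debug, the path used elsewhere in this series, and a kernel built with CONFIG_FUNCTION_PROFILER=y; the per-CPU results then show up in the stat files named function<N> that ftrace_profile_debugfs() registers later in this patch.

/* Hypothetical user-space helper, not part of the patch:
 * enable the function profiler, let it collect for a while,
 * then disable it again. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int write_flag(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, 1) != 1) {
		if (fd >= 0)
			close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *ctl = "/sys/kernel/debug/tracing/function_profile_enabled";

	if (write_flag(ctl, "1")) {	/* start profiling */
		perror(ctl);
		return 1;
	}
	sleep(10);			/* let the counters accumulate */
	if (write_flag(ctl, "0")) {	/* stop profiling */
		perror(ctl);
		return 1;
	}
	return 0;
}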
378config FTRACE_MCOUNT_RECORD 447config FTRACE_MCOUNT_RECORD
379 def_bool y 448 def_bool y
380 depends on DYNAMIC_FTRACE 449 depends on DYNAMIC_FTRACE
@@ -385,7 +454,7 @@ config FTRACE_SELFTEST
385 454
386config FTRACE_STARTUP_TEST 455config FTRACE_STARTUP_TEST
387 bool "Perform a startup test on ftrace" 456 bool "Perform a startup test on ftrace"
388 depends on TRACING 457 depends on GENERIC_TRACER
389 select FTRACE_SELFTEST 458 select FTRACE_SELFTEST
390 help 459 help
391 This option performs a series of startup tests on ftrace. On bootup 460 This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +465,7 @@ config FTRACE_STARTUP_TEST
396config MMIOTRACE 465config MMIOTRACE
397 bool "Memory mapped IO tracing" 466 bool "Memory mapped IO tracing"
398 depends on HAVE_MMIOTRACE_SUPPORT && PCI 467 depends on HAVE_MMIOTRACE_SUPPORT && PCI
399 select TRACING 468 select GENERIC_TRACER
400 help 469 help
401 Mmiotrace traces Memory Mapped I/O access and is meant for 470 Mmiotrace traces Memory Mapped I/O access and is meant for
402 debugging and reverse engineering. It is called from the ioremap 471 debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +485,23 @@ config MMIOTRACE_TEST
416 485
417 Say N, unless you absolutely know what you are doing. 486 Say N, unless you absolutely know what you are doing.
418 487
419endmenu 488config RING_BUFFER_BENCHMARK
489 tristate "Ring buffer benchmark stress tester"
490 depends on RING_BUFFER
491 help
492 This option creates a test to stress the ring buffer and benchmark it.
493 It creates its own ring buffer such that it will not interfere with
494 any other users of the ring buffer (such as ftrace). It then creates
495 a producer and consumer that will run for 10 seconds and sleep for
496 10 seconds. Each interval it will print out the number of events
497 it recorded and give a rough estimate of how long each iteration took.
498
499 It does not disable interrupts or raise its priority, so it may be
500 affected by processes that are running.
501
502 If unsure, say N
503
504endif # FTRACE
420 505
421endif # TRACING_SUPPORT 506endif # TRACING_SUPPORT
422 507
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec1..844164dca90a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18#
19# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example:
21#
22obj-y += trace_clock.o
23
18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 24obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 25obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
26obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
20 27
21obj-$(CONFIG_TRACING) += trace.o 28obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o 29obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o 30obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o 31obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
39obj-$(CONFIG_POWER_TRACER) += trace_power.o 45obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o 49ifeq ($(CONFIG_BLOCK),y)
44obj-$(CONFIG_EVENT_TRACER) += events.o 50obj-$(CONFIG_EVENT_TRACING) += blktrace.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o 51endif
52obj-$(CONFIG_EVENT_TRACING) += trace_events.o
53obj-$(CONFIG_EVENT_TRACING) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o 56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
49 57
50libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0ba..39af8af6fc30 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27
28#include <trace/events/block.h>
29
28#include "trace_output.h" 30#include "trace_output.h"
29 31
32#ifdef CONFIG_BLK_DEV_IO_TRACE
33
30static unsigned int blktrace_seq __read_mostly = 1; 34static unsigned int blktrace_seq __read_mostly = 1;
31 35
32static struct trace_array *blk_tr; 36static struct trace_array *blk_tr;
@@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
147{ 151{
148 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) 152 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 return 1; 153 return 1;
150 if (sector < bt->start_lba || sector > bt->end_lba) 154 if (sector && (sector < bt->start_lba || sector > bt->end_lba))
151 return 1; 155 return 1;
152 if (bt->pid && pid != bt->pid) 156 if (bt->pid && pid != bt->pid)
153 return 1; 157 return 1;
@@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
192 what |= MASK_TC_BIT(rw, DISCARD); 196 what |= MASK_TC_BIT(rw, DISCARD);
193 197
194 pid = tsk->pid; 198 pid = tsk->pid;
195 if (unlikely(act_log_check(bt, what, sector, pid))) 199 if (act_log_check(bt, what, sector, pid))
196 return; 200 return;
197 cpu = raw_smp_processor_id(); 201 cpu = raw_smp_processor_id();
198 202
@@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt)
262{ 266{
263 debugfs_remove(bt->msg_file); 267 debugfs_remove(bt->msg_file);
264 debugfs_remove(bt->dropped_file); 268 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
265 relay_close(bt->rchan); 270 relay_close(bt->rchan);
266 free_percpu(bt->sequence); 271 free_percpu(bt->sequence);
267 free_percpu(bt->msg_data); 272 free_percpu(bt->msg_data);
@@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
403 .remove_buf_file = blk_remove_buf_file_callback, 408 .remove_buf_file = blk_remove_buf_file_callback,
404}; 409};
405 410
411static void blk_trace_setup_lba(struct blk_trace *bt,
412 struct block_device *bdev)
413{
414 struct hd_struct *part = NULL;
415
416 if (bdev)
417 part = bdev->bd_part;
418
419 if (part) {
420 bt->start_lba = part->start_sect;
421 bt->end_lba = part->start_sect + part->nr_sects;
422 } else {
423 bt->start_lba = 0;
424 bt->end_lba = -1ULL;
425 }
426}
427
406/* 428/*
407 * Setup everything required to start tracing 429 * Setup everything required to start tracing
408 */ 430 */
409int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 431int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 struct blk_user_trace_setup *buts) 432 struct block_device *bdev,
433 struct blk_user_trace_setup *buts)
411{ 434{
412 struct blk_trace *old_bt, *bt = NULL; 435 struct blk_trace *old_bt, *bt = NULL;
413 struct dentry *dir = NULL; 436 struct dentry *dir = NULL;
@@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
480 if (!bt->act_mask) 503 if (!bt->act_mask)
481 bt->act_mask = (u16) -1; 504 bt->act_mask = (u16) -1;
482 505
483 bt->start_lba = buts->start_lba; 506 blk_trace_setup_lba(bt, bdev);
484 bt->end_lba = buts->end_lba; 507
485 if (!bt->end_lba) 508 /* overwrite with user settings */
486 bt->end_lba = -1ULL; 509 if (buts->start_lba)
510 bt->start_lba = buts->start_lba;
511 if (buts->end_lba)
512 bt->end_lba = buts->end_lba;
487 513
488 bt->pid = buts->pid; 514 bt->pid = buts->pid;
489 bt->trace_state = Blktrace_setup; 515 bt->trace_state = Blktrace_setup;
@@ -505,6 +531,7 @@ err:
505} 531}
506 532
507int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 533int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
534 struct block_device *bdev,
508 char __user *arg) 535 char __user *arg)
509{ 536{
510 struct blk_user_trace_setup buts; 537 struct blk_user_trace_setup buts;
@@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
514 if (ret) 541 if (ret)
515 return -EFAULT; 542 return -EFAULT;
516 543
517 ret = do_blk_trace_setup(q, name, dev, &buts); 544 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
518 if (ret) 545 if (ret)
519 return ret; 546 return ret;
520 547
@@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
582 switch (cmd) { 609 switch (cmd) {
583 case BLKTRACESETUP: 610 case BLKTRACESETUP:
584 bdevname(bdev, b); 611 bdevname(bdev, b);
585 ret = blk_trace_setup(q, b, bdev->bd_dev, arg); 612 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
586 break; 613 break;
587 case BLKTRACESTART: 614 case BLKTRACESTART:
588 start = 1; 615 start = 1;
@@ -642,12 +669,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
642 669
643 if (blk_pc_request(rq)) { 670 if (blk_pc_request(rq)) {
644 what |= BLK_TC_ACT(BLK_TC_PC); 671 what |= BLK_TC_ACT(BLK_TC_PC);
645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 672 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
646 rq->cmd_len, rq->cmd); 673 what, rq->errors, rq->cmd_len, rq->cmd);
647 } else { 674 } else {
648 what |= BLK_TC_ACT(BLK_TC_FS); 675 what |= BLK_TC_ACT(BLK_TC_FS);
649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 676 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
650 rw, what, rq->errors, 0, NULL); 677 what, rq->errors, 0, NULL);
651 } 678 }
652} 679}
653 680
@@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
809 * @bio: the source bio 836 * @bio: the source bio
810 * @dev: target device 837 * @dev: target device
811 * @from: source sector 838 * @from: source sector
812 * @to: target sector
813 * 839 *
814 * Description: 840 * Description:
815 * Device mapper or raid targets sometimes need to split a bio because 841 * Device mapper or raid targets sometimes need to split a bio because
@@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
817 * 843 *
818 **/ 844 **/
819static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 845static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 dev_t dev, sector_t from, sector_t to) 846 dev_t dev, sector_t from)
821{ 847{
822 struct blk_trace *bt = q->blk_trace; 848 struct blk_trace *bt = q->blk_trace;
823 struct blk_io_trace_remap r; 849 struct blk_io_trace_remap r;
@@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
825 if (likely(!bt)) 851 if (likely(!bt))
826 return; 852 return;
827 853
828 r.device = cpu_to_be32(dev); 854 r.device_from = cpu_to_be32(dev);
829 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); 855 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
830 r.sector = cpu_to_be64(to); 856 r.sector_from = cpu_to_be64(from);
831 857
832 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, 858 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
833 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 859 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
860 sizeof(r), &r);
834} 861}
835 862
836/** 863/**
@@ -854,11 +881,11 @@ void blk_add_driver_data(struct request_queue *q,
854 return; 881 return;
855 882
856 if (blk_pc_request(rq)) 883 if (blk_pc_request(rq))
857 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, 884 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
858 rq->errors, len, data); 885 BLK_TA_DRV_DATA, rq->errors, len, data);
859 else 886 else
860 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 887 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
861 0, BLK_TA_DRV_DATA, rq->errors, len, data); 888 BLK_TA_DRV_DATA, rq->errors, len, data);
862} 889}
863EXPORT_SYMBOL_GPL(blk_add_driver_data); 890EXPORT_SYMBOL_GPL(blk_add_driver_data);
864 891
@@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
971 return te_blk_io_trace(ent) + 1; 998 return te_blk_io_trace(ent) + 1;
972} 999}
973 1000
1001static inline u32 t_action(const struct trace_entry *ent)
1002{
1003 return te_blk_io_trace(ent)->action;
1004}
1005
1006static inline u32 t_bytes(const struct trace_entry *ent)
1007{
1008 return te_blk_io_trace(ent)->bytes;
1009}
1010
974static inline u32 t_sec(const struct trace_entry *ent) 1011static inline u32 t_sec(const struct trace_entry *ent)
975{ 1012{
976 return te_blk_io_trace(ent)->bytes >> 9; 1013 return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
996 struct blk_io_trace_remap *r) 1033 struct blk_io_trace_remap *r)
997{ 1034{
998 const struct blk_io_trace_remap *__r = pdu_start(ent); 1035 const struct blk_io_trace_remap *__r = pdu_start(ent);
999 __u64 sector = __r->sector; 1036 __u64 sector_from = __r->sector_from;
1000 1037
1001 r->device = be32_to_cpu(__r->device);
1002 r->device_from = be32_to_cpu(__r->device_from); 1038 r->device_from = be32_to_cpu(__r->device_from);
1003 r->sector = be64_to_cpu(sector); 1039 r->device_to = be32_to_cpu(__r->device_to);
1040 r->sector_from = be64_to_cpu(sector_from);
1004} 1041}
1005 1042
1006typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1043typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
1031 MAJOR(t->device), MINOR(t->device), act, rwbs); 1068 MAJOR(t->device), MINOR(t->device), act, rwbs);
1032} 1069}
1033 1070
1071static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1072{
1073 const unsigned char *pdu_buf;
1074 int pdu_len;
1075 int i, end, ret;
1076
1077 pdu_buf = pdu_start(ent);
1078 pdu_len = te_blk_io_trace(ent)->pdu_len;
1079
1080 if (!pdu_len)
1081 return 1;
1082
1083 /* find the last zero that needs to be printed */
1084 for (end = pdu_len - 1; end >= 0; end--)
1085 if (pdu_buf[end])
1086 break;
1087 end++;
1088
1089 if (!trace_seq_putc(s, '('))
1090 return 0;
1091
1092 for (i = 0; i < pdu_len; i++) {
1093
1094 ret = trace_seq_printf(s, "%s%02x",
1095 i == 0 ? "" : " ", pdu_buf[i]);
1096 if (!ret)
1097 return ret;
1098
1099 /*
1100 * stop when the rest is just zeroes and indicate so
1101 * with a ".." appended
1102 */
1103 if (i == end && end != pdu_len - 1)
1104 return trace_seq_puts(s, " ..) ");
1105 }
1106
1107 return trace_seq_puts(s, ") ");
1108}
1109
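A worked example of the trimming above, with hypothetical PDU bytes: for an 8-byte PDU of 3a 00 01 00 00 00 00 00, the backwards scan stops at the last non-zero byte (index 2), end becomes 3, and the output is "(3a 00 01 00 ..) ", so exactly one of the trailing zeroes is printed before the "..". A PDU whose final byte is non-zero is printed in full and closed with ") ".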
1034static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1110static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035{ 1111{
1036 char cmd[TASK_COMM_LEN]; 1112 char cmd[TASK_COMM_LEN];
1037 1113
1038 trace_find_cmdline(ent->pid, cmd); 1114 trace_find_cmdline(ent->pid, cmd);
1039 1115
1040 if (t_sec(ent)) 1116 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1041 return trace_seq_printf(s, "%llu + %u [%s]\n", 1117 int ret;
1042 t_sector(ent), t_sec(ent), cmd); 1118
1043 return trace_seq_printf(s, "[%s]\n", cmd); 1119 ret = trace_seq_printf(s, "%u ", t_bytes(ent));
1120 if (!ret)
1121 return 0;
1122 ret = blk_log_dump_pdu(s, ent);
1123 if (!ret)
1124 return 0;
1125 return trace_seq_printf(s, "[%s]\n", cmd);
1126 } else {
1127 if (t_sec(ent))
1128 return trace_seq_printf(s, "%llu + %u [%s]\n",
1129 t_sector(ent), t_sec(ent), cmd);
1130 return trace_seq_printf(s, "[%s]\n", cmd);
1131 }
1044} 1132}
1045 1133
1046static int blk_log_with_error(struct trace_seq *s, 1134static int blk_log_with_error(struct trace_seq *s,
1047 const struct trace_entry *ent) 1135 const struct trace_entry *ent)
1048{ 1136{
1049 if (t_sec(ent)) 1137 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1050 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), 1138 int ret;
1051 t_sec(ent), t_error(ent)); 1139
1052 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); 1140 ret = blk_log_dump_pdu(s, ent);
1141 if (ret)
1142 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1143 return 0;
1144 } else {
1145 if (t_sec(ent))
1146 return trace_seq_printf(s, "%llu + %u [%d]\n",
1147 t_sector(ent),
1148 t_sec(ent), t_error(ent));
1149 return trace_seq_printf(s, "%llu [%d]\n",
1150 t_sector(ent), t_error(ent));
1151 }
1053} 1152}
1054 1153
1055static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1154static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056{ 1155{
1057 struct blk_io_trace_remap r = { .device = 0, }; 1156 struct blk_io_trace_remap r = { .device_from = 0, };
1058 1157
1059 get_pdu_remap(ent, &r); 1158 get_pdu_remap(ent, &r);
1060 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1159 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 t_sector(ent), 1160 t_sector(ent), t_sec(ent),
1062 t_sec(ent), MAJOR(r.device), MINOR(r.device), 1161 MAJOR(r.device_from), MINOR(r.device_from),
1063 (unsigned long long)r.sector); 1162 (unsigned long long)r.sector_from);
1064} 1163}
1065 1164
1066static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1165static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m)
1117static void blk_tracer_start(struct trace_array *tr) 1216static void blk_tracer_start(struct trace_array *tr)
1118{ 1217{
1119 blk_tracer_enabled = true; 1218 blk_tracer_enabled = true;
1120 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121} 1219}
1122 1220
1123static int blk_tracer_init(struct trace_array *tr) 1221static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr)
1130static void blk_tracer_stop(struct trace_array *tr) 1228static void blk_tracer_stop(struct trace_array *tr)
1131{ 1229{
1132 blk_tracer_enabled = false; 1230 blk_tracer_enabled = false;
1133 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134} 1231}
1135 1232
1136static void blk_tracer_reset(struct trace_array *tr) 1233static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
1182 } 1279 }
1183 1280
1184 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1281 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 ret = trace_seq_printf(s, "Bad pc action %x\n", what); 1282 ret = trace_seq_printf(s, "Unknown action %x\n", what);
1186 else { 1283 else {
1187 ret = log_action(iter, what2act[what].act[long_act]); 1284 ret = log_action(iter, what2act[what].act[long_act]);
1188 if (ret) 1285 if (ret)
@@ -1195,9 +1292,6 @@ out:
1195static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1292static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 int flags) 1293 int flags)
1197{ 1294{
1198 if (!trace_print_context(iter))
1199 return TRACE_TYPE_PARTIAL_LINE;
1200
1201 return print_one_line(iter, false); 1295 return print_one_line(iter, false);
1202} 1296}
1203 1297
@@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1232 return print_one_line(iter, true); 1326 return print_one_line(iter, true);
1233} 1327}
1234 1328
1329static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
1330{
1331 /* don't output context-info for blk_classic output */
1332 if (bit == TRACE_BLK_OPT_CLASSIC) {
1333 if (set)
1334 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1335 else
1336 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1337 }
1338 return 0;
1339}
1340
1235static struct tracer blk_tracer __read_mostly = { 1341static struct tracer blk_tracer __read_mostly = {
1236 .name = "blk", 1342 .name = "blk",
1237 .init = blk_tracer_init, 1343 .init = blk_tracer_init,
@@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = {
1241 .print_header = blk_tracer_print_header, 1347 .print_header = blk_tracer_print_header,
1242 .print_line = blk_tracer_print_line, 1348 .print_line = blk_tracer_print_line,
1243 .flags = &blk_tracer_flags, 1349 .flags = &blk_tracer_flags,
1350 .set_flag = blk_tracer_set_flag,
1244}; 1351};
1245 1352
1246static struct trace_event trace_blk_event = { 1353static struct trace_event trace_blk_event = {
@@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
1285/* 1392/*
1286 * Setup everything required to start tracing 1393 * Setup everything required to start tracing
1287 */ 1394 */
1288static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) 1395static int blk_trace_setup_queue(struct request_queue *q,
1396 struct block_device *bdev)
1289{ 1397{
1290 struct blk_trace *old_bt, *bt = NULL; 1398 struct blk_trace *old_bt, *bt = NULL;
1291 int ret = -ENOMEM; 1399 int ret = -ENOMEM;
@@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1298 if (!bt->msg_data) 1406 if (!bt->msg_data)
1299 goto free_bt; 1407 goto free_bt;
1300 1408
1301 bt->dev = dev; 1409 bt->dev = bdev->bd_dev;
1302 bt->act_mask = (u16)-1; 1410 bt->act_mask = (u16)-1;
1303 bt->end_lba = -1ULL; 1411
1412 blk_trace_setup_lba(bt, bdev);
1304 1413
1305 old_bt = xchg(&q->blk_trace, bt); 1414 old_bt = xchg(&q->blk_trace, bt);
1306 if (old_bt != NULL) { 1415 if (old_bt != NULL) {
@@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1517 1626
1518 if (attr == &dev_attr_enable) { 1627 if (attr == &dev_attr_enable) {
1519 if (value) 1628 if (value)
1520 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1629 ret = blk_trace_setup_queue(q, bdev);
1521 else 1630 else
1522 ret = blk_trace_remove_queue(q); 1631 ret = blk_trace_remove_queue(q);
1523 goto out_unlock_bdev; 1632 goto out_unlock_bdev;
@@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1525 1634
1526 ret = 0; 1635 ret = 0;
1527 if (q->blk_trace == NULL) 1636 if (q->blk_trace == NULL)
1528 ret = blk_trace_setup_queue(q, bdev->bd_dev); 1637 ret = blk_trace_setup_queue(q, bdev);
1529 1638
1530 if (ret == 0) { 1639 if (ret == 0) {
1531 if (attr == &dev_attr_act_mask) 1640 if (attr == &dev_attr_act_mask)
@@ -1548,3 +1657,77 @@ out:
1548 return ret ? ret : count; 1657 return ret ? ret : count;
1549} 1658}
1550 1659
1660int blk_trace_init_sysfs(struct device *dev)
1661{
1662 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1663}
1664
1665#endif /* CONFIG_BLK_DEV_IO_TRACE */
1666
1667#ifdef CONFIG_EVENT_TRACING
1668
1669void blk_dump_cmd(char *buf, struct request *rq)
1670{
1671 int i, end;
1672 int len = rq->cmd_len;
1673 unsigned char *cmd = rq->cmd;
1674
1675 if (!blk_pc_request(rq)) {
1676 buf[0] = '\0';
1677 return;
1678 }
1679
1680 for (end = len - 1; end >= 0; end--)
1681 if (cmd[end])
1682 break;
1683 end++;
1684
1685 for (i = 0; i < len; i++) {
1686 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
1687 if (i == end && end != len - 1) {
1688 sprintf(buf, " ..");
1689 break;
1690 }
1691 }
1692}
1693
1694void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1695{
1696 int i = 0;
1697
1698 if (rw & WRITE)
1699 rwbs[i++] = 'W';
1700 else if (rw & 1 << BIO_RW_DISCARD)
1701 rwbs[i++] = 'D';
1702 else if (bytes)
1703 rwbs[i++] = 'R';
1704 else
1705 rwbs[i++] = 'N';
1706
1707 if (rw & 1 << BIO_RW_AHEAD)
1708 rwbs[i++] = 'A';
1709 if (rw & 1 << BIO_RW_BARRIER)
1710 rwbs[i++] = 'B';
1711 if (rw & 1 << BIO_RW_SYNCIO)
1712 rwbs[i++] = 'S';
1713 if (rw & 1 << BIO_RW_META)
1714 rwbs[i++] = 'M';
1715
1716 rwbs[i] = '\0';
1717}
1718
1719void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
1720{
1721 int rw = rq->cmd_flags & 0x03;
1722 int bytes;
1723
1724 if (blk_discard_rq(rq))
1725 rw |= (1 << BIO_RW_DISCARD);
1726
1727 bytes = blk_rq_bytes(rq);
1728
1729 blk_fill_rwbs(rwbs, rw, bytes);
1730}
1731
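A couple of worked examples of the flag-to-string mapping above (assuming WRITE == 1 and the usual BIO_RW_* bit positions): blk_fill_rwbs(rwbs, WRITE | 1 << BIO_RW_SYNCIO, 4096) produces "WS"; a plain 4 KiB read (rw == 0, bytes == 4096) produces "R"; and a zero-byte barrier (rw == 1 << BIO_RW_BARRIER, bytes == 0) produces "NB". blk_fill_rwbs_rq() simply feeds it rq->cmd_flags masked with 0x03, plus the discard bit for discard requests.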
1732#endif /* CONFIG_EVENT_TRACING */
1733
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 246f2aa6dc46..000000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,14 +0,0 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c3..3718d55fb4c3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,11 +29,13 @@
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h> 30#include <linux/hash.h>
31 31
32#include <trace/sched.h> 32#include <trace/events/sched.h>
33 33
34#include <asm/ftrace.h> 34#include <asm/ftrace.h>
35#include <asm/setup.h>
35 36
36#include "trace.h" 37#include "trace_output.h"
38#include "trace_stat.h"
37 39
38#define FTRACE_WARN_ON(cond) \ 40#define FTRACE_WARN_ON(cond) \
39 do { \ 41 do { \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
68 70
69static struct ftrace_ops ftrace_list_end __read_mostly = 71static struct ftrace_ops ftrace_list_end __read_mostly =
70{ 72{
71 .func = ftrace_stub, 73 .func = ftrace_stub,
72}; 74};
73 75
74static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 76static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void)
240#endif 242#endif
241} 243}
242 244
245#ifdef CONFIG_FUNCTION_PROFILER
246struct ftrace_profile {
247 struct hlist_node node;
248 unsigned long ip;
249 unsigned long counter;
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251 unsigned long long time;
252#endif
253};
254
255struct ftrace_profile_page {
256 struct ftrace_profile_page *next;
257 unsigned long index;
258 struct ftrace_profile records[];
259};
260
261struct ftrace_profile_stat {
262 atomic_t disabled;
263 struct hlist_head *hash;
264 struct ftrace_profile_page *pages;
265 struct ftrace_profile_page *start;
266 struct tracer_stat stat;
267};
268
269#define PROFILE_RECORDS_SIZE \
270 (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
271
272#define PROFILES_PER_PAGE \
273 (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
274
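For a sense of scale (assuming 4 KiB pages on a 64-bit build with CONFIG_FUNCTION_GRAPH_TRACER=y): struct ftrace_profile is then roughly 40 bytes (two hlist pointers, ip, counter, time) and the page header about 16 bytes, so PROFILES_PER_PAGE comes out to about (4096 - 16) / 40, i.e. roughly 100 records per page, and the 20000-function fallback used below needs on the order of 200 extra pages per CPU from ftrace_profile_pages_init().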
275static int ftrace_profile_bits __read_mostly;
276static int ftrace_profile_enabled __read_mostly;
277
278/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
279static DEFINE_MUTEX(ftrace_profile_lock);
280
281static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
282
283#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
284
285static void *
286function_stat_next(void *v, int idx)
287{
288 struct ftrace_profile *rec = v;
289 struct ftrace_profile_page *pg;
290
291 pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
292
293 again:
294 rec++;
295 if ((void *)rec >= (void *)&pg->records[pg->index]) {
296 pg = pg->next;
297 if (!pg)
298 return NULL;
299 rec = &pg->records[0];
300 if (!rec->counter)
301 goto again;
302 }
303
304 return rec;
305}
306
307static void *function_stat_start(struct tracer_stat *trace)
308{
309 struct ftrace_profile_stat *stat =
310 container_of(trace, struct ftrace_profile_stat, stat);
311
312 if (!stat || !stat->start)
313 return NULL;
314
315 return function_stat_next(&stat->start->records[0], 0);
316}
317
318#ifdef CONFIG_FUNCTION_GRAPH_TRACER
319/* function graph compares on total time */
320static int function_stat_cmp(void *p1, void *p2)
321{
322 struct ftrace_profile *a = p1;
323 struct ftrace_profile *b = p2;
324
325 if (a->time < b->time)
326 return -1;
327 if (a->time > b->time)
328 return 1;
329 else
330 return 0;
331}
332#else
333/* without function graph, compare against hit counts */
334static int function_stat_cmp(void *p1, void *p2)
335{
336 struct ftrace_profile *a = p1;
337 struct ftrace_profile *b = p2;
338
339 if (a->counter < b->counter)
340 return -1;
341 if (a->counter > b->counter)
342 return 1;
343 else
344 return 0;
345}
346#endif
347
348static int function_stat_headers(struct seq_file *m)
349{
350#ifdef CONFIG_FUNCTION_GRAPH_TRACER
351 seq_printf(m, " Function "
352 "Hit Time Avg\n"
353 " -------- "
354 "--- ---- ---\n");
355#else
356 seq_printf(m, " Function Hit\n"
357 " -------- ---\n");
358#endif
359 return 0;
360}
361
362static int function_stat_show(struct seq_file *m, void *v)
363{
364 struct ftrace_profile *rec = v;
365 char str[KSYM_SYMBOL_LEN];
366#ifdef CONFIG_FUNCTION_GRAPH_TRACER
367 static DEFINE_MUTEX(mutex);
368 static struct trace_seq s;
369 unsigned long long avg;
370#endif
371
372 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
373 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
374
375#ifdef CONFIG_FUNCTION_GRAPH_TRACER
376 seq_printf(m, " ");
377 avg = rec->time;
378 do_div(avg, rec->counter);
379
380 mutex_lock(&mutex);
381 trace_seq_init(&s);
382 trace_print_graph_duration(rec->time, &s);
383 trace_seq_puts(&s, " ");
384 trace_print_graph_duration(avg, &s);
385 trace_print_seq(m, &s);
386 mutex_unlock(&mutex);
387#endif
388 seq_putc(m, '\n');
389
390 return 0;
391}
392
393static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
394{
395 struct ftrace_profile_page *pg;
396
397 pg = stat->pages = stat->start;
398
399 while (pg) {
400 memset(pg->records, 0, PROFILE_RECORDS_SIZE);
401 pg->index = 0;
402 pg = pg->next;
403 }
404
405 memset(stat->hash, 0,
406 FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
407}
408
409int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
410{
411 struct ftrace_profile_page *pg;
412 int functions;
413 int pages;
414 int i;
415
416 /* If we already allocated, do nothing */
417 if (stat->pages)
418 return 0;
419
420 stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
421 if (!stat->pages)
422 return -ENOMEM;
423
424#ifdef CONFIG_DYNAMIC_FTRACE
425 functions = ftrace_update_tot_cnt;
426#else
427 /*
428 * We do not know the number of functions that exist because
429 * dynamic tracing is what counts them. From past experience
430 * there are around 20K functions. That should be more than enough.
431 * It is highly unlikely we will execute every function in
432 * the kernel.
433 */
434 functions = 20000;
435#endif
436
437 pg = stat->start = stat->pages;
438
439 pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
440
441 for (i = 0; i < pages; i++) {
442 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
443 if (!pg->next)
444 goto out_free;
445 pg = pg->next;
446 }
447
448 return 0;
449
450 out_free:
451 pg = stat->start;
452 while (pg) {
453 unsigned long tmp = (unsigned long)pg;
454
455 pg = pg->next;
456 free_page(tmp);
457 }
458
459 free_page((unsigned long)stat->pages);
460 stat->pages = NULL;
461 stat->start = NULL;
462
463 return -ENOMEM;
464}
465
466static int ftrace_profile_init_cpu(int cpu)
467{
468 struct ftrace_profile_stat *stat;
469 int size;
470
471 stat = &per_cpu(ftrace_profile_stats, cpu);
472
473 if (stat->hash) {
474 /* If the profile is already created, simply reset it */
475 ftrace_profile_reset(stat);
476 return 0;
477 }
478
479 /*
480 * We are profiling all functions, but usually only a few thousand
481 * functions are hit. We'll make a hash of 1024 items.
482 */
483 size = FTRACE_PROFILE_HASH_SIZE;
484
485 stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
486
487 if (!stat->hash)
488 return -ENOMEM;
489
490 if (!ftrace_profile_bits) {
491 size--;
492
493 for (; size; size >>= 1)
494 ftrace_profile_bits++;
495 }
496
497 /* Preallocate the function profiling pages */
498 if (ftrace_profile_pages_init(stat) < 0) {
499 kfree(stat->hash);
500 stat->hash = NULL;
501 return -ENOMEM;
502 }
503
504 return 0;
505}
506
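The shift loop in ftrace_profile_init_cpu() above just computes log2 of the hash size once: with FTRACE_PROFILE_HASH_SIZE == 1024, size-- leaves 1023 and ten right shifts clear it, so ftrace_profile_bits ends up as 10 and hash_long(ip, 10) in ftrace_find_profiled_func() below selects one of the 1024 buckets.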
507static int ftrace_profile_init(void)
508{
509 int cpu;
510 int ret = 0;
511
512 for_each_online_cpu(cpu) {
513 ret = ftrace_profile_init_cpu(cpu);
514 if (ret)
515 break;
516 }
517
518 return ret;
519}
520
521/* interrupts must be disabled */
522static struct ftrace_profile *
523ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
524{
525 struct ftrace_profile *rec;
526 struct hlist_head *hhd;
527 struct hlist_node *n;
528 unsigned long key;
529
530 key = hash_long(ip, ftrace_profile_bits);
531 hhd = &stat->hash[key];
532
533 if (hlist_empty(hhd))
534 return NULL;
535
536 hlist_for_each_entry_rcu(rec, n, hhd, node) {
537 if (rec->ip == ip)
538 return rec;
539 }
540
541 return NULL;
542}
543
544static void ftrace_add_profile(struct ftrace_profile_stat *stat,
545 struct ftrace_profile *rec)
546{
547 unsigned long key;
548
549 key = hash_long(rec->ip, ftrace_profile_bits);
550 hlist_add_head_rcu(&rec->node, &stat->hash[key]);
551}
552
553/*
554 * The memory is already allocated; this simply finds a new record to use.
555 */
556static struct ftrace_profile *
557ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
558{
559 struct ftrace_profile *rec = NULL;
560
561 /* prevent recursion (from NMIs) */
562 if (atomic_inc_return(&stat->disabled) != 1)
563 goto out;
564
565 /*
566 * Try to find the function again since an NMI
567 * could have added it
568 */
569 rec = ftrace_find_profiled_func(stat, ip);
570 if (rec)
571 goto out;
572
573 if (stat->pages->index == PROFILES_PER_PAGE) {
574 if (!stat->pages->next)
575 goto out;
576 stat->pages = stat->pages->next;
577 }
578
579 rec = &stat->pages->records[stat->pages->index++];
580 rec->ip = ip;
581 ftrace_add_profile(stat, rec);
582
583 out:
584 atomic_dec(&stat->disabled);
585
586 return rec;
587}
588
589static void
590function_profile_call(unsigned long ip, unsigned long parent_ip)
591{
592 struct ftrace_profile_stat *stat;
593 struct ftrace_profile *rec;
594 unsigned long flags;
595
596 if (!ftrace_profile_enabled)
597 return;
598
599 local_irq_save(flags);
600
601 stat = &__get_cpu_var(ftrace_profile_stats);
602 if (!stat->hash || !ftrace_profile_enabled)
603 goto out;
604
605 rec = ftrace_find_profiled_func(stat, ip);
606 if (!rec) {
607 rec = ftrace_profile_alloc(stat, ip);
608 if (!rec)
609 goto out;
610 }
611
612 rec->counter++;
613 out:
614 local_irq_restore(flags);
615}
616
617#ifdef CONFIG_FUNCTION_GRAPH_TRACER
618static int profile_graph_entry(struct ftrace_graph_ent *trace)
619{
620 function_profile_call(trace->func, 0);
621 return 1;
622}
623
624static void profile_graph_return(struct ftrace_graph_ret *trace)
625{
626 struct ftrace_profile_stat *stat;
627 unsigned long long calltime;
628 struct ftrace_profile *rec;
629 unsigned long flags;
630
631 local_irq_save(flags);
632 stat = &__get_cpu_var(ftrace_profile_stats);
633 if (!stat->hash || !ftrace_profile_enabled)
634 goto out;
635
636 calltime = trace->rettime - trace->calltime;
637
638 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
639 int index;
640
641 index = trace->depth;
642
643 /* Append this call time to the parent time to subtract */
644 if (index)
645 current->ret_stack[index - 1].subtime += calltime;
646
647 if (current->ret_stack[index].subtime < calltime)
648 calltime -= current->ret_stack[index].subtime;
649 else
650 calltime = 0;
651 }
652
653 rec = ftrace_find_profiled_func(stat, trace->func);
654 if (rec)
655 rec->time += calltime;
656
657 out:
658 local_irq_restore(flags);
659}
660
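To illustrate the subtime bookkeeping above with made-up numbers: if foo() runs for 100 us in total and its only child bar() takes 60 us, then bar()'s return adds 60 us to foo()'s ret_stack entry as subtime, and when foo() returns its calltime becomes 100 - 60 = 40 us. With the graph-time flag cleared, each function is therefore charged only for time spent in its own body.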
661static int register_ftrace_profiler(void)
662{
663 return register_ftrace_graph(&profile_graph_return,
664 &profile_graph_entry);
665}
666
667static void unregister_ftrace_profiler(void)
668{
669 unregister_ftrace_graph();
670}
671#else
672static struct ftrace_ops ftrace_profile_ops __read_mostly =
673{
674 .func = function_profile_call,
675};
676
677static int register_ftrace_profiler(void)
678{
679 return register_ftrace_function(&ftrace_profile_ops);
680}
681
682static void unregister_ftrace_profiler(void)
683{
684 unregister_ftrace_function(&ftrace_profile_ops);
685}
686#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
687
688static ssize_t
689ftrace_profile_write(struct file *filp, const char __user *ubuf,
690 size_t cnt, loff_t *ppos)
691{
692 unsigned long val;
693 char buf[64]; /* big enough to hold a number */
694 int ret;
695
696 if (cnt >= sizeof(buf))
697 return -EINVAL;
698
699 if (copy_from_user(&buf, ubuf, cnt))
700 return -EFAULT;
701
702 buf[cnt] = 0;
703
704 ret = strict_strtoul(buf, 10, &val);
705 if (ret < 0)
706 return ret;
707
708 val = !!val;
709
710 mutex_lock(&ftrace_profile_lock);
711 if (ftrace_profile_enabled ^ val) {
712 if (val) {
713 ret = ftrace_profile_init();
714 if (ret < 0) {
715 cnt = ret;
716 goto out;
717 }
718
719 ret = register_ftrace_profiler();
720 if (ret < 0) {
721 cnt = ret;
722 goto out;
723 }
724 ftrace_profile_enabled = 1;
725 } else {
726 ftrace_profile_enabled = 0;
727 /*
728 * unregister_ftrace_profiler calls stop_machine
729 * so this acts like a synchronize_sched.
730 */
731 unregister_ftrace_profiler();
732 }
733 }
734 out:
735 mutex_unlock(&ftrace_profile_lock);
736
737 filp->f_pos += cnt;
738
739 return cnt;
740}
741
742static ssize_t
743ftrace_profile_read(struct file *filp, char __user *ubuf,
744 size_t cnt, loff_t *ppos)
745{
746 char buf[64]; /* big enough to hold a number */
747 int r;
748
749 r = sprintf(buf, "%u\n", ftrace_profile_enabled);
750 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
751}
752
753static const struct file_operations ftrace_profile_fops = {
754 .open = tracing_open_generic,
755 .read = ftrace_profile_read,
756 .write = ftrace_profile_write,
757};
758
759/* used to initialize the real stat files */
760static struct tracer_stat function_stats __initdata = {
761 .name = "functions",
762 .stat_start = function_stat_start,
763 .stat_next = function_stat_next,
764 .stat_cmp = function_stat_cmp,
765 .stat_headers = function_stat_headers,
766 .stat_show = function_stat_show
767};
768
769static void ftrace_profile_debugfs(struct dentry *d_tracer)
770{
771 struct ftrace_profile_stat *stat;
772 struct dentry *entry;
773 char *name;
774 int ret;
775 int cpu;
776
777 for_each_possible_cpu(cpu) {
778 stat = &per_cpu(ftrace_profile_stats, cpu);
779
780 /* allocate enough for function name + cpu number */
781 name = kmalloc(32, GFP_KERNEL);
782 if (!name) {
783 /*
784 * The files created are permanent; if something happens
785 * we still do not free the memory.
786 */
787 kfree(stat);
788 WARN(1,
789 "Could not allocate stat file for cpu %d\n",
790 cpu);
791 return;
792 }
793 stat->stat = function_stats;
794 snprintf(name, 32, "function%d", cpu);
795 stat->stat.name = name;
796 ret = register_stat_tracer(&stat->stat);
797 if (ret) {
798 WARN(1,
799 "Could not register function stat for cpu %d\n",
800 cpu);
801 kfree(name);
802 return;
803 }
804 }
805
806 entry = debugfs_create_file("function_profile_enabled", 0644,
807 d_tracer, NULL, &ftrace_profile_fops);
808 if (!entry)
809 pr_warning("Could not create debugfs "
810 "'function_profile_enabled' entry\n");
811}
812
813#else /* CONFIG_FUNCTION_PROFILER */
814static void ftrace_profile_debugfs(struct dentry *d_tracer)
815{
816}
817#endif /* CONFIG_FUNCTION_PROFILER */
818
243/* set when tracing only a pid */ 819/* set when tracing only a pid */
244struct pid *ftrace_pid_trace; 820struct pid *ftrace_pid_trace;
245static struct pid * const ftrace_swapper_pid = &init_struct_pid; 821static struct pid * const ftrace_swapper_pid = &init_struct_pid;
@@ -261,7 +837,6 @@ struct ftrace_func_probe {
261 struct rcu_head rcu; 837 struct rcu_head rcu;
262}; 838};
263 839
264
265enum { 840enum {
266 FTRACE_ENABLE_CALLS = (1 << 0), 841 FTRACE_ENABLE_CALLS = (1 << 0),
267 FTRACE_DISABLE_CALLS = (1 << 1), 842 FTRACE_DISABLE_CALLS = (1 << 1),
@@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
346 rec->flags |= FTRACE_FL_FREE; 921 rec->flags |= FTRACE_FL_FREE;
347} 922}
348 923
349void ftrace_release(void *start, unsigned long size)
350{
351 struct dyn_ftrace *rec;
352 struct ftrace_page *pg;
353 unsigned long s = (unsigned long)start;
354 unsigned long e = s + size;
355
356 if (ftrace_disabled || !start)
357 return;
358
359 mutex_lock(&ftrace_lock);
360 do_for_each_ftrace_rec(pg, rec) {
361 if ((rec->ip >= s) && (rec->ip < e)) {
362 /*
363 * rec->ip is changed in ftrace_free_rec()
364 * It should not be between s and e if the record was freed.
365 */
366 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
367 ftrace_free_rec(rec);
368 }
369 } while_for_each_ftrace_rec();
370 mutex_unlock(&ftrace_lock);
371}
372
373static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 924static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
374{ 925{
375 struct dyn_ftrace *rec; 926 struct dyn_ftrace *rec;
@@ -673,6 +1224,13 @@ static void ftrace_shutdown(int command)
673 return; 1224 return;
674 1225
675 ftrace_start_up--; 1226 ftrace_start_up--;
1227 /*
1228 * Just warn in case of unbalance, no need to kill ftrace, it's not
1229 * critical but the ftrace_call callers may be never nopped again after
1230 * further ftrace uses.
1231 */
1232 WARN_ON_ONCE(ftrace_start_up < 0);
1233
676 if (!ftrace_start_up) 1234 if (!ftrace_start_up)
677 command |= FTRACE_DISABLE_CALLS; 1235 command |= FTRACE_DISABLE_CALLS;
678 1236
@@ -1408,7 +1966,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1408 1966
1409static struct ftrace_ops trace_probe_ops __read_mostly = 1967static struct ftrace_ops trace_probe_ops __read_mostly =
1410{ 1968{
1411 .func = function_trace_probe_call, 1969 .func = function_trace_probe_call,
1412}; 1970};
1413 1971
1414static int ftrace_probe_registered; 1972static int ftrace_probe_registered;
@@ -1823,6 +2381,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
1823 ftrace_set_regex(buf, len, reset, 0); 2381 ftrace_set_regex(buf, len, reset, 0);
1824} 2382}
1825 2383
2384/*
2385 * command line interface to allow users to set filters on boot up.
2386 */
2387#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2388static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2389static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2390
2391static int __init set_ftrace_notrace(char *str)
2392{
2393 strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
2394 return 1;
2395}
2396__setup("ftrace_notrace=", set_ftrace_notrace);
2397
2398static int __init set_ftrace_filter(char *str)
2399{
2400 strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
2401 return 1;
2402}
2403__setup("ftrace_filter=", set_ftrace_filter);
2404
2405static void __init set_ftrace_early_filter(char *buf, int enable)
2406{
2407 char *func;
2408
2409 while (buf) {
2410 func = strsep(&buf, ",");
2411 ftrace_set_regex(func, strlen(func), 0, enable);
2412 }
2413}
2414
2415static void __init set_ftrace_early_filters(void)
2416{
2417 if (ftrace_filter_buf[0])
2418 set_ftrace_early_filter(ftrace_filter_buf, 1);
2419 if (ftrace_notrace_buf[0])
2420 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2421}
2422
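As a usage illustration (function names picked arbitrarily): booting with ftrace_filter=schedule,vfs_read on the kernel command line makes set_ftrace_early_filters(), called from ftrace_init() later in this patch, split the string on commas and pass each name to ftrace_set_regex(); ftrace_notrace= populates the exclusion list the same way.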
1826static int 2423static int
1827ftrace_regex_release(struct inode *inode, struct file *file, int enable) 2424ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1828{ 2425{
@@ -2128,38 +2725,23 @@ static const struct file_operations ftrace_graph_fops = {
2128 2725
2129static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) 2726static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2130{ 2727{
2131 struct dentry *entry;
2132 2728
2133 entry = debugfs_create_file("available_filter_functions", 0444, 2729 trace_create_file("available_filter_functions", 0444,
2134 d_tracer, NULL, &ftrace_avail_fops); 2730 d_tracer, NULL, &ftrace_avail_fops);
2135 if (!entry)
2136 pr_warning("Could not create debugfs "
2137 "'available_filter_functions' entry\n");
2138 2731
2139 entry = debugfs_create_file("failures", 0444, 2732 trace_create_file("failures", 0444,
2140 d_tracer, NULL, &ftrace_failures_fops); 2733 d_tracer, NULL, &ftrace_failures_fops);
2141 if (!entry)
2142 pr_warning("Could not create debugfs 'failures' entry\n");
2143 2734
2144 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, 2735 trace_create_file("set_ftrace_filter", 0644, d_tracer,
2145 NULL, &ftrace_filter_fops); 2736 NULL, &ftrace_filter_fops);
2146 if (!entry)
2147 pr_warning("Could not create debugfs "
2148 "'set_ftrace_filter' entry\n");
2149 2737
2150 entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, 2738 trace_create_file("set_ftrace_notrace", 0644, d_tracer,
2151 NULL, &ftrace_notrace_fops); 2739 NULL, &ftrace_notrace_fops);
2152 if (!entry)
2153 pr_warning("Could not create debugfs "
2154 "'set_ftrace_notrace' entry\n");
2155 2740
2156#ifdef CONFIG_FUNCTION_GRAPH_TRACER 2741#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2157 entry = debugfs_create_file("set_graph_function", 0444, d_tracer, 2742 trace_create_file("set_graph_function", 0444, d_tracer,
2158 NULL, 2743 NULL,
2159 &ftrace_graph_fops); 2744 &ftrace_graph_fops);
2160 if (!entry)
2161 pr_warning("Could not create debugfs "
2162 "'set_graph_function' entry\n");
2163#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2745#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2164 2746
2165 return 0; 2747 return 0;
@@ -2197,14 +2779,72 @@ static int ftrace_convert_nops(struct module *mod,
2197 return 0; 2779 return 0;
2198} 2780}
2199 2781
2200void ftrace_init_module(struct module *mod, 2782#ifdef CONFIG_MODULES
2201 unsigned long *start, unsigned long *end) 2783void ftrace_release(void *start, void *end)
2784{
2785 struct dyn_ftrace *rec;
2786 struct ftrace_page *pg;
2787 unsigned long s = (unsigned long)start;
2788 unsigned long e = (unsigned long)end;
2789
2790 if (ftrace_disabled || !start || start == end)
2791 return;
2792
2793 mutex_lock(&ftrace_lock);
2794 do_for_each_ftrace_rec(pg, rec) {
2795 if ((rec->ip >= s) && (rec->ip < e)) {
2796 /*
2797 * rec->ip is changed in ftrace_free_rec()
2798 * It should not be between s and e if the record was freed.
2799 */
2800 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
2801 ftrace_free_rec(rec);
2802 }
2803 } while_for_each_ftrace_rec();
2804 mutex_unlock(&ftrace_lock);
2805}
2806
2807static void ftrace_init_module(struct module *mod,
2808 unsigned long *start, unsigned long *end)
2202{ 2809{
2203 if (ftrace_disabled || start == end) 2810 if (ftrace_disabled || start == end)
2204 return; 2811 return;
2205 ftrace_convert_nops(mod, start, end); 2812 ftrace_convert_nops(mod, start, end);
2206} 2813}
2207 2814
2815static int ftrace_module_notify(struct notifier_block *self,
2816 unsigned long val, void *data)
2817{
2818 struct module *mod = data;
2819
2820 switch (val) {
2821 case MODULE_STATE_COMING:
2822 ftrace_init_module(mod, mod->ftrace_callsites,
2823 mod->ftrace_callsites +
2824 mod->num_ftrace_callsites);
2825 break;
2826 case MODULE_STATE_GOING:
2827 ftrace_release(mod->ftrace_callsites,
2828 mod->ftrace_callsites +
2829 mod->num_ftrace_callsites);
2830 break;
2831 }
2832
2833 return 0;
2834}
2835#else
2836static int ftrace_module_notify(struct notifier_block *self,
2837 unsigned long val, void *data)
2838{
2839 return 0;
2840}
2841#endif /* CONFIG_MODULES */
2842
2843struct notifier_block ftrace_module_nb = {
2844 .notifier_call = ftrace_module_notify,
2845 .priority = 0,
2846};
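The ftrace_module_nb block above is registered later in ftrace_init(); for context, the module loader is assumed to fire the chain roughly as sketched below (simplified; the exact call sites live in kernel/module.c):

/* on load, before the module's init function runs: */
blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod);
	/* -> ftrace_module_notify() converts mod->ftrace_callsites to nops */

/* on unload: */
blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod);
	/* -> ftrace_module_notify() frees the matching dyn_ftrace records */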
2847
2208extern unsigned long __start_mcount_loc[]; 2848extern unsigned long __start_mcount_loc[];
2209extern unsigned long __stop_mcount_loc[]; 2849extern unsigned long __stop_mcount_loc[];
2210 2850
@@ -2236,6 +2876,12 @@ void __init ftrace_init(void)
2236 __start_mcount_loc, 2876 __start_mcount_loc,
2237 __stop_mcount_loc); 2877 __stop_mcount_loc);
2238 2878
2879 ret = register_module_notifier(&ftrace_module_nb);
2880 if (ret)
2881 pr_warning("Failed to register trace ftrace module notifier\n");
2882
2883 set_ftrace_early_filters();
2884
2239 return; 2885 return;
2240 failed: 2886 failed:
2241 ftrace_disabled = 1; 2887 ftrace_disabled = 1;
@@ -2417,7 +3063,6 @@ static const struct file_operations ftrace_pid_fops = {
2417static __init int ftrace_init_debugfs(void) 3063static __init int ftrace_init_debugfs(void)
2418{ 3064{
2419 struct dentry *d_tracer; 3065 struct dentry *d_tracer;
2420 struct dentry *entry;
2421 3066
2422 d_tracer = tracing_init_dentry(); 3067 d_tracer = tracing_init_dentry();
2423 if (!d_tracer) 3068 if (!d_tracer)
@@ -2425,11 +3070,11 @@ static __init int ftrace_init_debugfs(void)
2425 3070
2426 ftrace_init_dyn_debugfs(d_tracer); 3071 ftrace_init_dyn_debugfs(d_tracer);
2427 3072
2428 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, 3073 trace_create_file("set_ftrace_pid", 0644, d_tracer,
2429 NULL, &ftrace_pid_fops); 3074 NULL, &ftrace_pid_fops);
2430 if (!entry) 3075
2431 pr_warning("Could not create debugfs " 3076 ftrace_profile_debugfs(d_tracer);
2432 "'set_ftrace_pid' entry\n"); 3077
2433 return 0; 3078 return 0;
2434} 3079}
2435fs_initcall(ftrace_init_debugfs); 3080fs_initcall(ftrace_init_debugfs);
@@ -2538,7 +3183,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
2538 3183
2539#ifdef CONFIG_FUNCTION_GRAPH_TRACER 3184#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2540 3185
2541static atomic_t ftrace_graph_active; 3186static int ftrace_graph_active;
2542static struct notifier_block ftrace_suspend_notifier; 3187static struct notifier_block ftrace_suspend_notifier;
2543 3188
2544int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 3189int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3225,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
2580 } 3225 }
2581 3226
2582 if (t->ret_stack == NULL) { 3227 if (t->ret_stack == NULL) {
2583 t->curr_ret_stack = -1;
2584 /* Make sure IRQs see the -1 first: */
2585 barrier();
2586 t->ret_stack = ret_stack_list[start++];
2587 atomic_set(&t->tracing_graph_pause, 0); 3228 atomic_set(&t->tracing_graph_pause, 0);
2588 atomic_set(&t->trace_overrun, 0); 3229 atomic_set(&t->trace_overrun, 0);
3230 t->curr_ret_stack = -1;
3231 /* Make sure the tasks see the -1 first: */
3232 smp_wmb();
3233 t->ret_stack = ret_stack_list[start++];
2589 } 3234 }
2590 } while_each_thread(g, t); 3235 } while_each_thread(g, t);
2591 3236
@@ -2643,8 +3288,10 @@ static int start_graph_tracing(void)
2643 return -ENOMEM; 3288 return -ENOMEM;
2644 3289
2645 /* The cpu_boot init_task->ret_stack will never be freed */ 3290 /* The cpu_boot init_task->ret_stack will never be freed */
2646 for_each_online_cpu(cpu) 3291 for_each_online_cpu(cpu) {
2647 ftrace_graph_init_task(idle_task(cpu)); 3292 if (!idle_task(cpu)->ret_stack)
3293 ftrace_graph_init_task(idle_task(cpu));
3294 }
2648 3295
2649 do { 3296 do {
2650 ret = alloc_retstack_tasklist(ret_stack_list); 3297 ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3337,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2690 mutex_lock(&ftrace_lock); 3337 mutex_lock(&ftrace_lock);
2691 3338
2692 /* we currently allow only one tracer registered at a time */ 3339 /* we currently allow only one tracer registered at a time */
2693 if (atomic_read(&ftrace_graph_active)) { 3340 if (ftrace_graph_active) {
2694 ret = -EBUSY; 3341 ret = -EBUSY;
2695 goto out; 3342 goto out;
2696 } 3343 }
@@ -2698,10 +3345,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2698 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 3345 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2699 register_pm_notifier(&ftrace_suspend_notifier); 3346 register_pm_notifier(&ftrace_suspend_notifier);
2700 3347
2701 atomic_inc(&ftrace_graph_active); 3348 ftrace_graph_active++;
2702 ret = start_graph_tracing(); 3349 ret = start_graph_tracing();
2703 if (ret) { 3350 if (ret) {
2704 atomic_dec(&ftrace_graph_active); 3351 ftrace_graph_active--;
2705 goto out; 3352 goto out;
2706 } 3353 }
2707 3354
@@ -2719,10 +3366,10 @@ void unregister_ftrace_graph(void)
2719{ 3366{
2720 mutex_lock(&ftrace_lock); 3367 mutex_lock(&ftrace_lock);
2721 3368
2722 if (!unlikely(atomic_read(&ftrace_graph_active))) 3369 if (unlikely(!ftrace_graph_active))
2723 goto out; 3370 goto out;
2724 3371
2725 atomic_dec(&ftrace_graph_active); 3372 ftrace_graph_active--;
2726 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); 3373 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
2727 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3374 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2728 ftrace_graph_entry = ftrace_graph_entry_stub; 3375 ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3383,25 @@ void unregister_ftrace_graph(void)
2736/* Allocate a return stack for newly created task */ 3383/* Allocate a return stack for newly created task */
2737void ftrace_graph_init_task(struct task_struct *t) 3384void ftrace_graph_init_task(struct task_struct *t)
2738{ 3385{
2739 if (atomic_read(&ftrace_graph_active)) { 3386 /* Make sure we do not use the parent ret_stack */
2740 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH 3387 t->ret_stack = NULL;
3388
3389 if (ftrace_graph_active) {
3390 struct ftrace_ret_stack *ret_stack;
3391
3392 ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2741 * sizeof(struct ftrace_ret_stack), 3393 * sizeof(struct ftrace_ret_stack),
2742 GFP_KERNEL); 3394 GFP_KERNEL);
2743 if (!t->ret_stack) 3395 if (!ret_stack)
2744 return; 3396 return;
2745 t->curr_ret_stack = -1; 3397 t->curr_ret_stack = -1;
2746 atomic_set(&t->tracing_graph_pause, 0); 3398 atomic_set(&t->tracing_graph_pause, 0);
2747 atomic_set(&t->trace_overrun, 0); 3399 atomic_set(&t->trace_overrun, 0);
2748 t->ftrace_timestamp = 0; 3400 t->ftrace_timestamp = 0;
2749 } else 3401 /* make curr_ret_stack visible before we add the ret_stack */
2750 t->ret_stack = NULL; 3402 smp_wmb();
3403 t->ret_stack = ret_stack;
3404 }
2751} 3405}
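The reordering above matters because another context may inspect the task's ret_stack concurrently: the smp_wmb() guarantees that whoever observes the published pointer also observes the initialized fields. Illustration only, for a hypothetical reader of task t (not the actual ftrace code):

struct ftrace_ret_stack *stack = t->ret_stack;	/* load the published pointer */

if (stack) {
	smp_rmb();	/* pairs with the smp_wmb() in the writer above */
	/* curr_ret_stack is guaranteed to be -1 (or a later value) here */
	WARN_ON(t->curr_ret_stack < -1);
}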
2752 3406
2753void ftrace_graph_exit_task(struct task_struct *t) 3407void ftrace_graph_exit_task(struct task_struct *t)
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e37..1edaa9516e81 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
12#include <linux/dcache.h> 12#include <linux/dcache.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14 14
15#include <trace/kmemtrace.h> 15#include <linux/kmemtrace.h>
16 16
17#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h" 18#include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
42 gfp_t gfp_flags, 42 gfp_t gfp_flags,
43 int node) 43 int node)
44{ 44{
45 struct ftrace_event_call *call = &event_kmem_alloc;
45 struct trace_array *tr = kmemtrace_array; 46 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry; 47 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event; 48 struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
62 entry->gfp_flags = gfp_flags; 63 entry->gfp_flags = gfp_flags;
63 entry->node = node; 64 entry->node = node;
64 65
65 ring_buffer_unlock_commit(tr->buffer, event); 66 if (!filter_check_discard(call, entry, tr->buffer, event))
67 ring_buffer_unlock_commit(tr->buffer, event);
66 68
67 trace_wake_up(); 69 trace_wake_up();
68} 70}
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site, 73 unsigned long call_site,
72 const void *ptr) 74 const void *ptr)
73{ 75{
76 struct ftrace_event_call *call = &event_kmem_free;
74 struct trace_array *tr = kmemtrace_array; 77 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry; 78 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event; 79 struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
86 entry->call_site = call_site; 89 entry->call_site = call_site;
87 entry->ptr = ptr; 90 entry->ptr = ptr;
88 91
89 ring_buffer_unlock_commit(tr->buffer, event); 92 if (!filter_check_discard(call, entry, tr->buffer, event))
93 ring_buffer_unlock_commit(tr->buffer, event);
90 94
91 trace_wake_up(); 95 trace_wake_up();
92} 96}
@@ -182,7 +186,7 @@ static int kmem_trace_init(struct trace_array *tr)
182 int cpu; 186 int cpu;
183 kmemtrace_array = tr; 187 kmemtrace_array = tr;
184 188
185 for_each_cpu_mask(cpu, cpu_possible_map) 189 for_each_cpu(cpu, cpu_possible_mask)
186 tracing_reset(tr, cpu); 190 tracing_reset(tr, cpu);
187 191
188 kmemtrace_start_probes(); 192 kmemtrace_start_probes();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c844..04dac2638258 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
10#include <linux/debugfs.h> 10#include <linux/debugfs.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <linux/kmemcheck.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/percpu.h> 15#include <linux/percpu.h>
15#include <linux/mutex.h> 16#include <linux/mutex.h>
@@ -22,6 +23,28 @@
22#include "trace.h" 23#include "trace.h"
23 24
24/* 25/*
26 * The ring buffer header is special. We must keep it up to date manually.
27 */
28int ring_buffer_print_entry_header(struct trace_seq *s)
29{
30 int ret;
31
32 ret = trace_seq_printf(s, "# compressed entry header\n");
33 ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
34 ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
35 ret = trace_seq_printf(s, "\tarray : 32 bits\n");
36 ret = trace_seq_printf(s, "\n");
37 ret = trace_seq_printf(s, "\tpadding : type == %d\n",
38 RINGBUF_TYPE_PADDING);
39 ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
40 RINGBUF_TYPE_TIME_EXTEND);
41 ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
42 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
43
44 return ret;
45}
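The compressed header printed above corresponds to an event layout along these lines (assumed shape, reconstructed from the sizes shown; the authoritative definition lives in include/linux/ring_buffer.h):

struct ring_buffer_event {
	u32	type_len:5,	/* 0: length in array[0]; 1..TYPE_LEN_MAX: length/4;
				 * larger values: padding / time_extend / time_stamp */
		time_delta:27;	/* delta from the previous event's time stamp */
	u32	array[];	/* optional length word followed by the payload */
};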
46
47/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is 48 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is 49 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read 50 * associated with the CPU it is currently executing on. A reader may read
@@ -182,7 +205,11 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
182 205
183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
184#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
185#define RB_MAX_SMALL_DATA 28 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
210
211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
186 213
187enum { 214enum {
188 RB_LEN_TIME_EXTEND = 8, 215 RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +218,28 @@ enum {
191 218
192static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
193{ 220{
194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; 221 return event->type_len == RINGBUF_TYPE_PADDING
222 && event->time_delta == 0;
195} 223}
196 224
197static inline int rb_discarded_event(struct ring_buffer_event *event) 225static inline int rb_discarded_event(struct ring_buffer_event *event)
198{ 226{
199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta; 227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
200} 228}
201 229
202static void rb_event_set_padding(struct ring_buffer_event *event) 230static void rb_event_set_padding(struct ring_buffer_event *event)
203{ 231{
204 event->type = RINGBUF_TYPE_PADDING; 232 event->type_len = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0; 233 event->time_delta = 0;
206} 234}
207 235
208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
210 * @buffer: the ring buffer
211 * @event: the event to discard
212 *
213 * Sometimes a event that is in the ring buffer needs to be ignored.
214 * This function lets the user discard an event in the ring buffer
215 * and then that event will not be read later.
216 *
217 * Note, it is up to the user to be careful with this, and protect
218 * against races. If the user discards an event that has been consumed
219 * it is possible that it could corrupt the ring buffer.
220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
222{
223 event->type = RINGBUF_TYPE_PADDING;
224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
227}
228
229static unsigned 236static unsigned
230rb_event_data_length(struct ring_buffer_event *event) 237rb_event_data_length(struct ring_buffer_event *event)
231{ 238{
232 unsigned length; 239 unsigned length;
233 240
234 if (event->len) 241 if (event->type_len)
235 length = event->len * RB_ALIGNMENT; 242 length = event->type_len * RB_ALIGNMENT;
236 else 243 else
237 length = event->array[0]; 244 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE; 245 return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +249,12 @@ rb_event_data_length(struct ring_buffer_event *event)
242static unsigned 249static unsigned
243rb_event_length(struct ring_buffer_event *event) 250rb_event_length(struct ring_buffer_event *event)
244{ 251{
245 switch (event->type) { 252 switch (event->type_len) {
246 case RINGBUF_TYPE_PADDING: 253 case RINGBUF_TYPE_PADDING:
247 if (rb_null_event(event)) 254 if (rb_null_event(event))
248 /* undefined */ 255 /* undefined */
249 return -1; 256 return -1;
250 return rb_event_data_length(event); 257 return event->array[0] + RB_EVNT_HDR_SIZE;
251 258
252 case RINGBUF_TYPE_TIME_EXTEND: 259 case RINGBUF_TYPE_TIME_EXTEND:
253 return RB_LEN_TIME_EXTEND; 260 return RB_LEN_TIME_EXTEND;
@@ -271,7 +278,7 @@ rb_event_length(struct ring_buffer_event *event)
271unsigned ring_buffer_event_length(struct ring_buffer_event *event) 278unsigned ring_buffer_event_length(struct ring_buffer_event *event)
272{ 279{
273 unsigned length = rb_event_length(event); 280 unsigned length = rb_event_length(event);
274 if (event->type != RINGBUF_TYPE_DATA) 281 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
275 return length; 282 return length;
276 length -= RB_EVNT_HDR_SIZE; 283 length -= RB_EVNT_HDR_SIZE;
277 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) 284 if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +291,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
284static void * 291static void *
285rb_event_data(struct ring_buffer_event *event) 292rb_event_data(struct ring_buffer_event *event)
286{ 293{
287 BUG_ON(event->type != RINGBUF_TYPE_DATA); 294 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
288 /* If length is in len field, then array[0] has the data */ 295 /* If length is in len field, then array[0] has the data */
289 if (event->len) 296 if (event->type_len)
290 return (void *)&event->array[0]; 297 return (void *)&event->array[0];
291 /* Otherwise length is in array[0] and array[1] has the data */ 298 /* Otherwise length is in array[0] and array[1] has the data */
292 return (void *)&event->array[1]; 299 return (void *)&event->array[1];
@@ -316,9 +323,10 @@ struct buffer_data_page {
316}; 323};
317 324
318struct buffer_page { 325struct buffer_page {
326 struct list_head list; /* list of buffer pages */
319 local_t write; /* index for next write */ 327 local_t write; /* index for next write */
320 unsigned read; /* index for next read */ 328 unsigned read; /* index for next read */
321 struct list_head list; /* list of free pages */ 329 local_t entries; /* entries on this page */
322 struct buffer_data_page *page; /* Actual data page */ 330 struct buffer_data_page *page; /* Actual data page */
323}; 331};
324 332
@@ -361,6 +369,34 @@ static inline int test_time_stamp(u64 delta)
361 369
362#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) 370#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
363 371
372/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
373#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
374
375/* Max number of timestamps that can fit on a page */
376#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
377
378int ring_buffer_print_page_header(struct trace_seq *s)
379{
380 struct buffer_data_page field;
381 int ret;
382
383 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
384 "offset:0;\tsize:%u;\n",
385 (unsigned int)sizeof(field.time_stamp));
386
387 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
388 "offset:%u;\tsize:%u;\n",
389 (unsigned int)offsetof(typeof(field), commit),
390 (unsigned int)sizeof(field.commit));
391
392 ret = trace_seq_printf(s, "\tfield: char data;\t"
393 "offset:%u;\tsize:%u;\n",
394 (unsigned int)offsetof(typeof(field), data),
395 (unsigned int)BUF_PAGE_SIZE);
396
397 return ret;
398}
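The offsets printed above describe the per-page header; its assumed shape (defined earlier in this file, reproduced here only as a reference sketch):

struct buffer_data_page {
	u64		time_stamp;	/* offset 0: page time stamp */
	local_t		commit;		/* index of the last committed byte */
	unsigned char	data[];		/* BUF_PAGE_SIZE bytes of event data */
};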
399
364/* 400/*
365 * head_page == tail_page && head == tail then buffer is empty. 401 * head_page == tail_page && head == tail then buffer is empty.
366 */ 402 */
@@ -375,8 +411,13 @@ struct ring_buffer_per_cpu {
375 struct buffer_page *tail_page; /* write to tail */ 411 struct buffer_page *tail_page; /* write to tail */
376 struct buffer_page *commit_page; /* committed pages */ 412 struct buffer_page *commit_page; /* committed pages */
377 struct buffer_page *reader_page; 413 struct buffer_page *reader_page;
414 unsigned long nmi_dropped;
415 unsigned long commit_overrun;
378 unsigned long overrun; 416 unsigned long overrun;
379 unsigned long entries; 417 unsigned long read;
418 local_t entries;
419 local_t committing;
420 local_t commits;
380 u64 write_stamp; 421 u64 write_stamp;
381 u64 read_stamp; 422 u64 read_stamp;
382 atomic_t record_disabled; 423 atomic_t record_disabled;
@@ -389,6 +430,8 @@ struct ring_buffer {
389 atomic_t record_disabled; 430 atomic_t record_disabled;
390 cpumask_var_t cpumask; 431 cpumask_var_t cpumask;
391 432
433 struct lock_class_key *reader_lock_key;
434
392 struct mutex mutex; 435 struct mutex mutex;
393 436
394 struct ring_buffer_per_cpu **buffers; 437 struct ring_buffer_per_cpu **buffers;
@@ -420,13 +463,18 @@ struct ring_buffer_iter {
420/* Up this if you want to test the TIME_EXTENTS and normalization */ 463/* Up this if you want to test the TIME_EXTENTS and normalization */
421#define DEBUG_SHIFT 0 464#define DEBUG_SHIFT 0
422 465
466static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
467{
468 /* shift to debug/test normalization and TIME_EXTENTS */
469 return buffer->clock() << DEBUG_SHIFT;
470}
471
423u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) 472u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
424{ 473{
425 u64 time; 474 u64 time;
426 475
427 preempt_disable_notrace(); 476 preempt_disable_notrace();
428 /* shift to debug/test normalization and TIME_EXTENTS */ 477 time = rb_time_stamp(buffer, cpu);
429 time = buffer->clock() << DEBUG_SHIFT;
430 preempt_enable_no_resched_notrace(); 478 preempt_enable_no_resched_notrace();
431 479
432 return time; 480 return time;
@@ -523,6 +571,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
523 cpu_buffer->cpu = cpu; 571 cpu_buffer->cpu = cpu;
524 cpu_buffer->buffer = buffer; 572 cpu_buffer->buffer = buffer;
525 spin_lock_init(&cpu_buffer->reader_lock); 573 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
526 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
527 INIT_LIST_HEAD(&cpu_buffer->pages); 576 INIT_LIST_HEAD(&cpu_buffer->pages);
528 577
@@ -572,12 +621,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
572 kfree(cpu_buffer); 621 kfree(cpu_buffer);
573} 622}
574 623
575/*
576 * Causes compile errors if the struct buffer_page gets bigger
577 * than the struct page.
578 */
579extern int ring_buffer_page_too_big(void);
580
581#ifdef CONFIG_HOTPLUG_CPU 624#ifdef CONFIG_HOTPLUG_CPU
582static int rb_cpu_notify(struct notifier_block *self, 625static int rb_cpu_notify(struct notifier_block *self,
583 unsigned long action, void *hcpu); 626 unsigned long action, void *hcpu);
@@ -593,17 +636,13 @@ static int rb_cpu_notify(struct notifier_block *self,
593 * when the buffer wraps. If this flag is not set, the buffer will 636 * when the buffer wraps. If this flag is not set, the buffer will
594 * drop data when the tail hits the head. 637 * drop data when the tail hits the head.
595 */ 638 */
596struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) 639struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
640 struct lock_class_key *key)
597{ 641{
598 struct ring_buffer *buffer; 642 struct ring_buffer *buffer;
599 int bsize; 643 int bsize;
600 int cpu; 644 int cpu;
601 645
602 /* Paranoid! Optimizes out when all is well */
603 if (sizeof(struct buffer_page) > sizeof(struct page))
604 ring_buffer_page_too_big();
605
606
607 /* keep it in its own cache line */ 646 /* keep it in its own cache line */
608 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 647 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
609 GFP_KERNEL); 648 GFP_KERNEL);
@@ -616,10 +655,11 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
616 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 655 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
617 buffer->flags = flags; 656 buffer->flags = flags;
618 buffer->clock = trace_clock_local; 657 buffer->clock = trace_clock_local;
658 buffer->reader_lock_key = key;
619 659
620 /* need at least two pages */ 660 /* need at least two pages */
621 if (buffer->pages == 1) 661 if (buffer->pages < 2)
622 buffer->pages++; 662 buffer->pages = 2;
623 663
624 /* 664 /*
625 * In case of non-hotplug cpu, if the ring-buffer is allocated 665 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -673,7 +713,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
673 kfree(buffer); 713 kfree(buffer);
674 return NULL; 714 return NULL;
675} 715}
676EXPORT_SYMBOL_GPL(ring_buffer_alloc); 716EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
677 717
678/** 718/**
679 * ring_buffer_free - free a ring buffer. 719 * ring_buffer_free - free a ring buffer.
@@ -947,31 +987,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
947 return rb_page_commit(cpu_buffer->head_page); 987 return rb_page_commit(cpu_buffer->head_page);
948} 988}
949 989
950/*
951 * When the tail hits the head and the buffer is in overwrite mode,
952 * the head jumps to the next page and all content on the previous
953 * page is discarded. But before doing so, we update the overrun
954 * variable of the buffer.
955 */
956static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
957{
958 struct ring_buffer_event *event;
959 unsigned long head;
960
961 for (head = 0; head < rb_head_size(cpu_buffer);
962 head += rb_event_length(event)) {
963
964 event = __rb_page_index(cpu_buffer->head_page, head);
965 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
966 return;
967 /* Only count data entries */
968 if (event->type != RINGBUF_TYPE_DATA)
969 continue;
970 cpu_buffer->overrun++;
971 cpu_buffer->entries--;
972 }
973}
974
975static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 990static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
976 struct buffer_page **bpage) 991 struct buffer_page **bpage)
977{ 992{
@@ -988,12 +1003,12 @@ rb_event_index(struct ring_buffer_event *event)
988{ 1003{
989 unsigned long addr = (unsigned long)event; 1004 unsigned long addr = (unsigned long)event;
990 1005
991 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1006 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
992} 1007}
993 1008
994static int 1009static inline int
995rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1010rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
996 struct ring_buffer_event *event) 1011 struct ring_buffer_event *event)
997{ 1012{
998 unsigned long addr = (unsigned long)event; 1013 unsigned long addr = (unsigned long)event;
999 unsigned long index; 1014 unsigned long index;
@@ -1006,31 +1021,6 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1006} 1021}
1007 1022
1008static void 1023static void
1009rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1010 struct ring_buffer_event *event)
1011{
1012 unsigned long addr = (unsigned long)event;
1013 unsigned long index;
1014
1015 index = rb_event_index(event);
1016 addr &= PAGE_MASK;
1017
1018 while (cpu_buffer->commit_page->page != (void *)addr) {
1019 if (RB_WARN_ON(cpu_buffer,
1020 cpu_buffer->commit_page == cpu_buffer->tail_page))
1021 return;
1022 cpu_buffer->commit_page->page->commit =
1023 cpu_buffer->commit_page->write;
1024 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1025 cpu_buffer->write_stamp =
1026 cpu_buffer->commit_page->page->time_stamp;
1027 }
1028
1029 /* Now set the commit to the event's index */
1030 local_set(&cpu_buffer->commit_page->page->commit, index);
1031}
1032
1033static void
1034rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1024rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1035{ 1025{
1036 /* 1026 /*
@@ -1110,28 +1100,21 @@ static void
1110rb_update_event(struct ring_buffer_event *event, 1100rb_update_event(struct ring_buffer_event *event,
1111 unsigned type, unsigned length) 1101 unsigned type, unsigned length)
1112{ 1102{
1113 event->type = type; 1103 event->type_len = type;
1114 1104
1115 switch (type) { 1105 switch (type) {
1116 1106
1117 case RINGBUF_TYPE_PADDING: 1107 case RINGBUF_TYPE_PADDING:
1118 break;
1119
1120 case RINGBUF_TYPE_TIME_EXTEND: 1108 case RINGBUF_TYPE_TIME_EXTEND:
1121 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
1122 break;
1123
1124 case RINGBUF_TYPE_TIME_STAMP: 1109 case RINGBUF_TYPE_TIME_STAMP:
1125 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
1126 break; 1110 break;
1127 1111
1128 case RINGBUF_TYPE_DATA: 1112 case 0:
1129 length -= RB_EVNT_HDR_SIZE; 1113 length -= RB_EVNT_HDR_SIZE;
1130 if (length > RB_MAX_SMALL_DATA) { 1114 if (length > RB_MAX_SMALL_DATA)
1131 event->len = 0;
1132 event->array[0] = length; 1115 event->array[0] = length;
1133 } else 1116 else
1134 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1117 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1135 break; 1118 break;
1136 default: 1119 default:
1137 BUG(); 1120 BUG();
@@ -1155,158 +1138,241 @@ static unsigned rb_calculate_event_length(unsigned length)
1155 return length; 1138 return length;
1156} 1139}
1157 1140
1158static struct ring_buffer_event * 1141static inline void
1159__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1142rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1160 unsigned type, unsigned long length, u64 *ts) 1143 struct buffer_page *tail_page,
1144 unsigned long tail, unsigned long length)
1161{ 1145{
1162 struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
1163 unsigned long tail, write;
1164 struct ring_buffer *buffer = cpu_buffer->buffer;
1165 struct ring_buffer_event *event; 1146 struct ring_buffer_event *event;
1166 unsigned long flags;
1167 bool lock_taken = false;
1168 1147
1169 commit_page = cpu_buffer->commit_page; 1148 /*
1170 /* we just need to protect against interrupts */ 1149 * Only the event that crossed the page boundary
1171 barrier(); 1150 * must fill the old tail_page with padding.
1172 tail_page = cpu_buffer->tail_page; 1151 */
1173 write = local_add_return(length, &tail_page->write); 1152 if (tail >= BUF_PAGE_SIZE) {
1174 tail = write - length; 1153 local_sub(length, &tail_page->write);
1154 return;
1155 }
1175 1156
1176 /* See if we shot pass the end of this buffer page */ 1157 event = __rb_page_index(tail_page, tail);
1177 if (write > BUF_PAGE_SIZE) { 1158 kmemcheck_annotate_bitfield(event, bitfield);
1178 struct buffer_page *next_page = tail_page;
1179 1159
1180 local_irq_save(flags); 1160 /*
1181 /* 1161 * If this event is bigger than the minimum size, then
1182 * Since the write to the buffer is still not 1162 * we need to be careful that we don't subtract the
1183 * fully lockless, we must be careful with NMIs. 1163 * write counter enough to allow another writer to slip
1184 * The locks in the writers are taken when a write 1164 * in on this page.
1185 * crosses to a new page. The locks protect against 1165 * We put in a discarded commit instead, to make sure
1186 * races with the readers (this will soon be fixed 1166 * that this space is not used again.
1187 * with a lockless solution). 1167 *
1188 * 1168 * If we are less than the minimum size, we don't need to
1189 * Because we can not protect against NMIs, and we 1169 * worry about it.
1190 * want to keep traces reentrant, we need to manage 1170 */
1191 * what happens when we are in an NMI. 1171 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1192 * 1172 /* No room for any events */
1193 * NMIs can happen after we take the lock.
1194 * If we are in an NMI, only take the lock
1195 * if it is not already taken. Otherwise
1196 * simply fail.
1197 */
1198 if (unlikely(in_nmi())) {
1199 if (!__raw_spin_trylock(&cpu_buffer->lock))
1200 goto out_reset;
1201 } else
1202 __raw_spin_lock(&cpu_buffer->lock);
1203 1173
1204 lock_taken = true; 1174 /* Mark the rest of the page with padding */
1175 rb_event_set_padding(event);
1205 1176
1206 rb_inc_page(cpu_buffer, &next_page); 1177 /* Set the write back to the previous setting */
1178 local_sub(length, &tail_page->write);
1179 return;
1180 }
1207 1181
1208 head_page = cpu_buffer->head_page; 1182 /* Put in a discarded event */
1209 reader_page = cpu_buffer->reader_page; 1183 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1184 event->type_len = RINGBUF_TYPE_PADDING;
1185 /* time delta must be non zero */
1186 event->time_delta = 1;
1187 /* Account for this as an entry */
1188 local_inc(&tail_page->entries);
1189 local_inc(&cpu_buffer->entries);
1210 1190
1211 /* we grabbed the lock before incrementing */ 1191 /* Set write to end of buffer */
1212 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1192 length = (tail + length) - BUF_PAGE_SIZE;
1213 goto out_reset; 1193 local_sub(length, &tail_page->write);
1194}
1214 1195
1215 /* 1196static struct ring_buffer_event *
1216 * If for some reason, we had an interrupt storm that made 1197rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1217 * it all the way around the buffer, bail, and warn 1198 unsigned long length, unsigned long tail,
1218 * about it. 1199 struct buffer_page *commit_page,
1219 */ 1200 struct buffer_page *tail_page, u64 *ts)
1220 if (unlikely(next_page == commit_page)) { 1201{
1221 WARN_ON_ONCE(1); 1202 struct buffer_page *next_page, *head_page, *reader_page;
1203 struct ring_buffer *buffer = cpu_buffer->buffer;
1204 bool lock_taken = false;
1205 unsigned long flags;
1206
1207 next_page = tail_page;
1208
1209 local_irq_save(flags);
1210 /*
1211 * Since the write to the buffer is still not
1212 * fully lockless, we must be careful with NMIs.
1213 * The locks in the writers are taken when a write
1214 * crosses to a new page. The locks protect against
1215 * races with the readers (this will soon be fixed
1216 * with a lockless solution).
1217 *
1218 * Because we can not protect against NMIs, and we
1219 * want to keep traces reentrant, we need to manage
1220 * what happens when we are in an NMI.
1221 *
1222 * NMIs can happen after we take the lock.
1223 * If we are in an NMI, only take the lock
1224 * if it is not already taken. Otherwise
1225 * simply fail.
1226 */
1227 if (unlikely(in_nmi())) {
1228 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1229 cpu_buffer->nmi_dropped++;
1222 goto out_reset; 1230 goto out_reset;
1223 } 1231 }
1232 } else
1233 __raw_spin_lock(&cpu_buffer->lock);
1224 1234
1225 if (next_page == head_page) { 1235 lock_taken = true;
1226 if (!(buffer->flags & RB_FL_OVERWRITE))
1227 goto out_reset;
1228 1236
1229 /* tail_page has not moved yet? */ 1237 rb_inc_page(cpu_buffer, &next_page);
1230 if (tail_page == cpu_buffer->tail_page) {
1231 /* count overflows */
1232 rb_update_overflow(cpu_buffer);
1233 1238
1234 rb_inc_page(cpu_buffer, &head_page); 1239 head_page = cpu_buffer->head_page;
1235 cpu_buffer->head_page = head_page; 1240 reader_page = cpu_buffer->reader_page;
1236 cpu_buffer->head_page->read = 0;
1237 }
1238 }
1239 1241
1240 /* 1242 /* we grabbed the lock before incrementing */
1241 * If the tail page is still the same as what we think 1243 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1242 * it is, then it is up to us to update the tail 1244 goto out_reset;
1243 * pointer. 1245
1244 */ 1246 /*
1247 * If for some reason, we had an interrupt storm that made
1248 * it all the way around the buffer, bail, and warn
1249 * about it.
1250 */
1251 if (unlikely(next_page == commit_page)) {
1252 cpu_buffer->commit_overrun++;
1253 goto out_reset;
1254 }
1255
1256 if (next_page == head_page) {
1257 if (!(buffer->flags & RB_FL_OVERWRITE))
1258 goto out_reset;
1259
1260 /* tail_page has not moved yet? */
1245 if (tail_page == cpu_buffer->tail_page) { 1261 if (tail_page == cpu_buffer->tail_page) {
1246 local_set(&next_page->write, 0); 1262 /* count overflows */
1247 local_set(&next_page->page->commit, 0); 1263 cpu_buffer->overrun +=
1248 cpu_buffer->tail_page = next_page; 1264 local_read(&head_page->entries);
1249 1265
1250 /* reread the time stamp */ 1266 rb_inc_page(cpu_buffer, &head_page);
1251 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); 1267 cpu_buffer->head_page = head_page;
1252 cpu_buffer->tail_page->page->time_stamp = *ts; 1268 cpu_buffer->head_page->read = 0;
1253 } 1269 }
1270 }
1254 1271
1255 /* 1272 /*
1256 * The actual tail page has moved forward. 1273 * If the tail page is still the same as what we think
1257 */ 1274 * it is, then it is up to us to update the tail
1258 if (tail < BUF_PAGE_SIZE) { 1275 * pointer.
1259 /* Mark the rest of the page with padding */ 1276 */
1260 event = __rb_page_index(tail_page, tail); 1277 if (tail_page == cpu_buffer->tail_page) {
1261 rb_event_set_padding(event); 1278 local_set(&next_page->write, 0);
1262 } 1279 local_set(&next_page->entries, 0);
1280 local_set(&next_page->page->commit, 0);
1281 cpu_buffer->tail_page = next_page;
1282
1283 /* reread the time stamp */
1284 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1285 cpu_buffer->tail_page->page->time_stamp = *ts;
1286 }
1263 1287
1264 if (tail <= BUF_PAGE_SIZE) 1288 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1265 /* Set the write back to the previous setting */
1266 local_set(&tail_page->write, tail);
1267 1289
1268 /* 1290 __raw_spin_unlock(&cpu_buffer->lock);
1269 * If this was a commit entry that failed, 1291 local_irq_restore(flags);
1270 * increment that too 1292
1271 */ 1293 /* fail and let the caller try again */
1272 if (tail_page == cpu_buffer->commit_page && 1294 return ERR_PTR(-EAGAIN);
1273 tail == rb_commit_index(cpu_buffer)) {
1274 rb_set_commit_to_write(cpu_buffer);
1275 }
1276 1295
1296 out_reset:
1297 /* reset write */
1298 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299
1300 if (likely(lock_taken))
1277 __raw_spin_unlock(&cpu_buffer->lock); 1301 __raw_spin_unlock(&cpu_buffer->lock);
1278 local_irq_restore(flags); 1302 local_irq_restore(flags);
1303 return NULL;
1304}
1279 1305
1280 /* fail and let the caller try again */ 1306static struct ring_buffer_event *
1281 return ERR_PTR(-EAGAIN); 1307__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1282 } 1308 unsigned type, unsigned long length, u64 *ts)
1309{
1310 struct buffer_page *tail_page, *commit_page;
1311 struct ring_buffer_event *event;
1312 unsigned long tail, write;
1283 1313
1284 /* We reserved something on the buffer */ 1314 commit_page = cpu_buffer->commit_page;
1315 /* we just need to protect against interrupts */
1316 barrier();
1317 tail_page = cpu_buffer->tail_page;
1318 write = local_add_return(length, &tail_page->write);
1319 tail = write - length;
1285 1320
1286 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) 1321 /* See if we shot past the end of this buffer page */
1287 return NULL; 1322 if (write > BUF_PAGE_SIZE)
1323 return rb_move_tail(cpu_buffer, length, tail,
1324 commit_page, tail_page, ts);
1325
1326 /* We reserved something on the buffer */
1288 1327
1289 event = __rb_page_index(tail_page, tail); 1328 event = __rb_page_index(tail_page, tail);
1329 kmemcheck_annotate_bitfield(event, bitfield);
1290 rb_update_event(event, type, length); 1330 rb_update_event(event, type, length);
1291 1331
1332 /* The passed in type is zero for DATA */
1333 if (likely(!type))
1334 local_inc(&tail_page->entries);
1335
1292 /* 1336 /*
1293 * If this is a commit and the tail is zero, then update 1337 * If this is the first commit on the page, then update
1294 * this page's time stamp. 1338 * its timestamp.
1295 */ 1339 */
1296 if (!tail && rb_is_commit(cpu_buffer, event)) 1340 if (!tail)
1297 cpu_buffer->commit_page->page->time_stamp = *ts; 1341 tail_page->page->time_stamp = *ts;
1298 1342
1299 return event; 1343 return event;
1344}
1300 1345
1301 out_reset: 1346static inline int
1302 /* reset write */ 1347rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1303 if (tail <= BUF_PAGE_SIZE) 1348 struct ring_buffer_event *event)
1304 local_set(&tail_page->write, tail); 1349{
1350 unsigned long new_index, old_index;
1351 struct buffer_page *bpage;
1352 unsigned long index;
1353 unsigned long addr;
1305 1354
1306 if (likely(lock_taken)) 1355 new_index = rb_event_index(event);
1307 __raw_spin_unlock(&cpu_buffer->lock); 1356 old_index = new_index + rb_event_length(event);
1308 local_irq_restore(flags); 1357 addr = (unsigned long)event;
1309 return NULL; 1358 addr &= PAGE_MASK;
1359
1360 bpage = cpu_buffer->tail_page;
1361
1362 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1363 /*
1364 * This is on the tail page. It is possible that
1365 * a write could come in and move the tail page
1366 * and write to the next page. That is fine
1367 * because we just shorten what is on this page.
1368 */
1369 index = local_cmpxchg(&bpage->write, old_index, new_index);
1370 if (index == old_index)
1371 return 1;
1372 }
1373
1374 /* could not discard */
1375 return 0;
1310} 1376}
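A worked example of the index arithmetic in rb_try_to_discard(), with hypothetical numbers:

/*
 * Event of total length 24 reserved at page offset 96:
 *   new_index = rb_event_index(event)               =  96
 *   old_index = new_index + rb_event_length(event)  = 120  (== current write)
 * local_cmpxchg(&bpage->write, 120, 96) wins only if no later write has
 * advanced the index; on success the 24 bytes are handed back, otherwise
 * the event stays in place and the caller falls back to zeroing it or
 * leaving it marked as padding.
 */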
1311 1377
1312static int 1378static int
@@ -1341,26 +1407,33 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1341 return -EAGAIN; 1407 return -EAGAIN;
1342 1408
1343 /* Only a commited time event can update the write stamp */ 1409 /* Only a commited time event can update the write stamp */
1344 if (rb_is_commit(cpu_buffer, event)) { 1410 if (rb_event_is_commit(cpu_buffer, event)) {
1345 /* 1411 /*
1346 * If this is the first on the page, then we need to 1412 * If this is the first on the page, then it was
1347 * update the page itself, and just put in a zero. 1413 * updated with the page itself. Try to discard it
1414 * and if we can't, just make it zero.
1348 */ 1415 */
1349 if (rb_event_index(event)) { 1416 if (rb_event_index(event)) {
1350 event->time_delta = *delta & TS_MASK; 1417 event->time_delta = *delta & TS_MASK;
1351 event->array[0] = *delta >> TS_SHIFT; 1418 event->array[0] = *delta >> TS_SHIFT;
1352 } else { 1419 } else {
1353 cpu_buffer->commit_page->page->time_stamp = *ts; 1420 /* try to discard, since we do not need this */
1354 event->time_delta = 0; 1421 if (!rb_try_to_discard(cpu_buffer, event)) {
1355 event->array[0] = 0; 1422 /* nope, just zero it */
1423 event->time_delta = 0;
1424 event->array[0] = 0;
1425 }
1356 } 1426 }
1357 cpu_buffer->write_stamp = *ts; 1427 cpu_buffer->write_stamp = *ts;
1358 /* let the caller know this was the commit */ 1428 /* let the caller know this was the commit */
1359 ret = 1; 1429 ret = 1;
1360 } else { 1430 } else {
1361 /* Darn, this is just wasted space */ 1431 /* Try to discard the event */
1362 event->time_delta = 0; 1432 if (!rb_try_to_discard(cpu_buffer, event)) {
1363 event->array[0] = 0; 1433 /* Darn, this is just wasted space */
1434 event->time_delta = 0;
1435 event->array[0] = 0;
1436 }
1364 ret = 0; 1437 ret = 0;
1365 } 1438 }
1366 1439
@@ -1369,15 +1442,56 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1369 return ret; 1442 return ret;
1370} 1443}
1371 1444
1445static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
1446{
1447 local_inc(&cpu_buffer->committing);
1448 local_inc(&cpu_buffer->commits);
1449}
1450
1451static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1452{
1453 unsigned long commits;
1454
1455 if (RB_WARN_ON(cpu_buffer,
1456 !local_read(&cpu_buffer->committing)))
1457 return;
1458
1459 again:
1460 commits = local_read(&cpu_buffer->commits);
1461 /* synchronize with interrupts */
1462 barrier();
1463 if (local_read(&cpu_buffer->committing) == 1)
1464 rb_set_commit_to_write(cpu_buffer);
1465
1466 local_dec(&cpu_buffer->committing);
1467
1468 /* synchronize with interrupts */
1469 barrier();
1470
1471 /*
1472 * Need to account for interrupts coming in between the
1473 * updating of the commit page and the clearing of the
1474 * committing counter.
1475 */
1476 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
1477 !local_read(&cpu_buffer->committing)) {
1478 local_inc(&cpu_buffer->committing);
1479 goto again;
1480 }
1481}
1482
1372static struct ring_buffer_event * 1483static struct ring_buffer_event *
1373rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 1484rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1374 unsigned type, unsigned long length) 1485 unsigned long length)
1375{ 1486{
1376 struct ring_buffer_event *event; 1487 struct ring_buffer_event *event;
1377 u64 ts, delta; 1488 u64 ts, delta = 0;
1378 int commit = 0; 1489 int commit = 0;
1379 int nr_loops = 0; 1490 int nr_loops = 0;
1380 1491
1492 rb_start_commit(cpu_buffer);
1493
1494 length = rb_calculate_event_length(length);
1381 again: 1495 again:
1382 /* 1496 /*
1383 * We allow for interrupts to reenter here and do a trace. 1497 * We allow for interrupts to reenter here and do a trace.
@@ -1389,9 +1503,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1389 * Bail! 1503 * Bail!
1390 */ 1504 */
1391 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1505 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1392 return NULL; 1506 goto out_fail;
1393 1507
1394 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 1508 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1395 1509
1396 /* 1510 /*
1397 * Only the first commit can update the timestamp. 1511 * Only the first commit can update the timestamp.
@@ -1401,61 +1515,82 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1401 * also be made. But only the entry that did the actual 1515 * also be made. But only the entry that did the actual
1402 * commit will be something other than zero. 1516 * commit will be something other than zero.
1403 */ 1517 */
1404 if (cpu_buffer->tail_page == cpu_buffer->commit_page && 1518 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
1405 rb_page_write(cpu_buffer->tail_page) == 1519 rb_page_write(cpu_buffer->tail_page) ==
1406 rb_commit_index(cpu_buffer)) { 1520 rb_commit_index(cpu_buffer))) {
1521 u64 diff;
1407 1522
1408 delta = ts - cpu_buffer->write_stamp; 1523 diff = ts - cpu_buffer->write_stamp;
1409 1524
1410 /* make sure this delta is calculated here */ 1525 /* make sure this diff is calculated here */
1411 barrier(); 1526 barrier();
1412 1527
1413 /* Did the write stamp get updated already? */ 1528 /* Did the write stamp get updated already? */
1414 if (unlikely(ts < cpu_buffer->write_stamp)) 1529 if (unlikely(ts < cpu_buffer->write_stamp))
1415 delta = 0; 1530 goto get_event;
1416 1531
1417 if (test_time_stamp(delta)) { 1532 delta = diff;
1533 if (unlikely(test_time_stamp(delta))) {
1418 1534
1419 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1535 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1420
1421 if (commit == -EBUSY) 1536 if (commit == -EBUSY)
1422 return NULL; 1537 goto out_fail;
1423 1538
1424 if (commit == -EAGAIN) 1539 if (commit == -EAGAIN)
1425 goto again; 1540 goto again;
1426 1541
1427 RB_WARN_ON(cpu_buffer, commit < 0); 1542 RB_WARN_ON(cpu_buffer, commit < 0);
1428 } 1543 }
1429 } else 1544 }
1430 /* Non commits have zero deltas */
1431 delta = 0;
1432 1545
1433 event = __rb_reserve_next(cpu_buffer, type, length, &ts); 1546 get_event:
1434 if (PTR_ERR(event) == -EAGAIN) 1547 event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
1548 if (unlikely(PTR_ERR(event) == -EAGAIN))
1435 goto again; 1549 goto again;
1436 1550
1437 if (!event) { 1551 if (!event)
1438 if (unlikely(commit)) 1552 goto out_fail;
1439 /*
1440 * Ouch! We needed a timestamp and it was commited. But
1441 * we didn't get our event reserved.
1442 */
1443 rb_set_commit_to_write(cpu_buffer);
1444 return NULL;
1445 }
1446 1553
1447 /* 1554 if (!rb_event_is_commit(cpu_buffer, event))
1448 * If the timestamp was commited, make the commit our entry
1449 * now so that we will update it when needed.
1450 */
1451 if (commit)
1452 rb_set_commit_event(cpu_buffer, event);
1453 else if (!rb_is_commit(cpu_buffer, event))
1454 delta = 0; 1555 delta = 0;
1455 1556
1456 event->time_delta = delta; 1557 event->time_delta = delta;
1457 1558
1458 return event; 1559 return event;
1560
1561 out_fail:
1562 rb_end_commit(cpu_buffer);
1563 return NULL;
1564}
1565
1566#define TRACE_RECURSIVE_DEPTH 16
1567
1568static int trace_recursive_lock(void)
1569{
1570 current->trace_recursion++;
1571
1572 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
1573 return 0;
1574
1575 /* Disable all tracing before we do anything else */
1576 tracing_off_permanent();
1577
1578 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
1579 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
1580 current->trace_recursion,
1581 hardirq_count() >> HARDIRQ_SHIFT,
1582 softirq_count() >> SOFTIRQ_SHIFT,
1583 in_nmi());
1584
1585 WARN_ON_ONCE(1);
1586 return -1;
1587}
1588
1589static void trace_recursive_unlock(void)
1590{
1591 WARN_ON_ONCE(!current->trace_recursion);
1592
1593 current->trace_recursion--;
1459} 1594}
1460 1595
1461static DEFINE_PER_CPU(int, rb_need_resched); 1596static DEFINE_PER_CPU(int, rb_need_resched);
@@ -1491,6 +1626,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1491 /* If we are tracing schedule, we don't want to recurse */ 1626 /* If we are tracing schedule, we don't want to recurse */
1492 resched = ftrace_preempt_disable(); 1627 resched = ftrace_preempt_disable();
1493 1628
1629 if (trace_recursive_lock())
1630 goto out_nocheck;
1631
1494 cpu = raw_smp_processor_id(); 1632 cpu = raw_smp_processor_id();
1495 1633
1496 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1634 if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1639,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1501 if (atomic_read(&cpu_buffer->record_disabled)) 1639 if (atomic_read(&cpu_buffer->record_disabled))
1502 goto out; 1640 goto out;
1503 1641
1504 length = rb_calculate_event_length(length); 1642 if (length > BUF_MAX_DATA_SIZE)
1505 if (length > BUF_PAGE_SIZE)
1506 goto out; 1643 goto out;
1507 1644
1508 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); 1645 event = rb_reserve_next_event(cpu_buffer, length);
1509 if (!event) 1646 if (!event)
1510 goto out; 1647 goto out;
1511 1648
@@ -1520,6 +1657,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1520 return event; 1657 return event;
1521 1658
1522 out: 1659 out:
1660 trace_recursive_unlock();
1661
1662 out_nocheck:
1523 ftrace_preempt_enable(resched); 1663 ftrace_preempt_enable(resched);
1524 return NULL; 1664 return NULL;
1525} 1665}
@@ -1528,15 +1668,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1528static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1668static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1529 struct ring_buffer_event *event) 1669 struct ring_buffer_event *event)
1530{ 1670{
1531 cpu_buffer->entries++; 1671 local_inc(&cpu_buffer->entries);
1532
1533 /* Only process further if we own the commit */
1534 if (!rb_is_commit(cpu_buffer, event))
1535 return;
1536 1672
1537 cpu_buffer->write_stamp += event->time_delta; 1673 /*
1674 * The first event in the commit queue updates the
1675 * time stamp.
1676 */
1677 if (rb_event_is_commit(cpu_buffer, event))
1678 cpu_buffer->write_stamp += event->time_delta;
1538 1679
1539 rb_set_commit_to_write(cpu_buffer); 1680 rb_end_commit(cpu_buffer);
1540} 1681}
1541 1682
1542/** 1683/**
@@ -1558,6 +1699,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1558 1699
1559 rb_commit(cpu_buffer, event); 1700 rb_commit(cpu_buffer, event);
1560 1701
1702 trace_recursive_unlock();
1703
1561 /* 1704 /*
1562 * Only the last preempt count needs to restore preemption. 1705 * Only the last preempt count needs to restore preemption.
1563 */ 1706 */
@@ -1570,6 +1713,93 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1570} 1713}
1571EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); 1714EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
1572 1715
1716static inline void rb_event_discard(struct ring_buffer_event *event)
1717{
1718 /* array[0] holds the actual length for the discarded event */
1719 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
1720 event->type_len = RINGBUF_TYPE_PADDING;
1721 /* time delta must be non zero */
1722 if (!event->time_delta)
1723 event->time_delta = 1;
1724}
1725
1726/**
1727 * ring_buffer_event_discard - discard any event in the ring buffer
1728 * @event: the event to discard
1729 *
1730 * Sometimes an event that is in the ring buffer needs to be ignored.
1731 * This function lets the user discard an event in the ring buffer
1732 * so that it will not be read later.
1733 *
1734 * Note, it is up to the user to be careful with this, and protect
1735 * against races. If the user discards an event that has been consumed
1736 * it is possible that it could corrupt the ring buffer.
1737 */
1738void ring_buffer_event_discard(struct ring_buffer_event *event)
1739{
1740 rb_event_discard(event);
1741}
1742EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1743
1744/**
1745 * ring_buffer_commit_discard - discard an event that has not been committed
1746 * @buffer: the ring buffer
1747 * @event: non committed event to discard
1748 *
1749 * This is similar to ring_buffer_event_discard but must only be
1750 * performed on an event that has not been committed yet. The difference
1751 * is that this will also try to free the event from the ring buffer
1752 * if another event has not been added behind it.
1753 *
1754 * If another event has been added behind it, it will set the event
1755 * up as discarded, and perform the commit.
1756 *
1757 * If this function is called, do not call ring_buffer_unlock_commit on
1758 * the event.
1759 */
1760void ring_buffer_discard_commit(struct ring_buffer *buffer,
1761 struct ring_buffer_event *event)
1762{
1763 struct ring_buffer_per_cpu *cpu_buffer;
1764 int cpu;
1765
1766 /* The event is discarded regardless */
1767 rb_event_discard(event);
1768
1769 cpu = smp_processor_id();
1770 cpu_buffer = buffer->buffers[cpu];
1771
1772 /*
1773 * This must only be called if the event has not been
1774 * committed yet. Thus we can assume that preemption
1775 * is still disabled.
1776 */
1777 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1778
1779 if (!rb_try_to_discard(cpu_buffer, event))
1780 goto out;
1781
1782 /*
1783 * The commit is still visible by the reader, so we
1784 * must increment entries.
1785 */
1786 local_inc(&cpu_buffer->entries);
1787 out:
1788 rb_end_commit(cpu_buffer);
1789
1790 trace_recursive_unlock();
1791
1792 /*
1793 * Only the last preempt count needs to restore preemption.
1794 */
1795 if (preempt_count() == 1)
1796 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1797 else
1798 preempt_enable_no_resched_notrace();
1799
1800}
1801EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
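A sketch of the intended call pattern for the new API (hypothetical caller; the "value < 0" test merely stands in for whatever condition makes the record unwanted):

static void example_write(struct ring_buffer *buffer, int value)
{
	struct ring_buffer_event *event;
	int *body;

	event = ring_buffer_lock_reserve(buffer, sizeof(*body));
	if (!event)
		return;

	body = ring_buffer_event_data(event);
	*body = value;

	if (value < 0)
		/* do NOT call ring_buffer_unlock_commit() after this */
		ring_buffer_discard_commit(buffer, event);
	else
		ring_buffer_unlock_commit(buffer, event);
}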
1802
1573/** 1803/**
1574 * ring_buffer_write - write data to the buffer without reserving 1804 * ring_buffer_write - write data to the buffer without reserving
1575 * @buffer: The ring buffer to write to. 1805 * @buffer: The ring buffer to write to.
@@ -1589,7 +1819,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
1589{ 1819{
1590 struct ring_buffer_per_cpu *cpu_buffer; 1820 struct ring_buffer_per_cpu *cpu_buffer;
1591 struct ring_buffer_event *event; 1821 struct ring_buffer_event *event;
1592 unsigned long event_length;
1593 void *body; 1822 void *body;
1594 int ret = -EBUSY; 1823 int ret = -EBUSY;
1595 int cpu, resched; 1824 int cpu, resched;
@@ -1612,9 +1841,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
1612 if (atomic_read(&cpu_buffer->record_disabled)) 1841 if (atomic_read(&cpu_buffer->record_disabled))
1613 goto out; 1842 goto out;
1614 1843
1615 event_length = rb_calculate_event_length(length); 1844 if (length > BUF_MAX_DATA_SIZE)
1616 event = rb_reserve_next_event(cpu_buffer, 1845 goto out;
1617 RINGBUF_TYPE_DATA, event_length); 1846
1847 event = rb_reserve_next_event(cpu_buffer, length);
1618 if (!event) 1848 if (!event)
1619 goto out; 1849 goto out;
1620 1850
@@ -1728,7 +1958,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1728 return 0; 1958 return 0;
1729 1959
1730 cpu_buffer = buffer->buffers[cpu]; 1960 cpu_buffer = buffer->buffers[cpu];
1731 ret = cpu_buffer->entries; 1961 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
1962 - cpu_buffer->read;
1732 1963
1733 return ret; 1964 return ret;
1734} 1965}
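Since the per-cpu entry count is now derived rather than maintained directly, a quick example with hypothetical counters:

/*
 * entries (written) = 1000, overrun (lost to overwrite) = 100, read = 250
 * => readable events on this CPU: 1000 - 100 - 250 = 650
 */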
@@ -1755,6 +1986,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1755EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1986EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1756 1987
1757/** 1988/**
1989 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
1990 * @buffer: The ring buffer
1991 * @cpu: The per CPU buffer to get the number of overruns from
1992 */
1993unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
1994{
1995 struct ring_buffer_per_cpu *cpu_buffer;
1996 unsigned long ret;
1997
1998 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1999 return 0;
2000
2001 cpu_buffer = buffer->buffers[cpu];
2002 ret = cpu_buffer->nmi_dropped;
2003
2004 return ret;
2005}
2006EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2007
2008/**
2009 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2010 * @buffer: The ring buffer
2011 * @cpu: The per CPU buffer to get the number of overruns from
2012 */
2013unsigned long
2014ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2015{
2016 struct ring_buffer_per_cpu *cpu_buffer;
2017 unsigned long ret;
2018
2019 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2020 return 0;
2021
2022 cpu_buffer = buffer->buffers[cpu];
2023 ret = cpu_buffer->commit_overrun;
2024
2025 return ret;
2026}
2027EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
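A possible consumer of the two new statistics exported here, alongside the existing overrun counter (hypothetical helper, for illustration only):

static void example_dump_rb_stats(struct ring_buffer *buffer, int cpu)
{
	pr_info("cpu%d: nmi_dropped=%lu commit_overrun=%lu overrun=%lu\n",
		cpu,
		ring_buffer_nmi_dropped_cpu(buffer, cpu),
		ring_buffer_commit_overrun_cpu(buffer, cpu),
		ring_buffer_overrun_cpu(buffer, cpu));
}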
2028
2029/**
1758 * ring_buffer_entries - get the number of entries in a buffer 2030 * ring_buffer_entries - get the number of entries in a buffer
1759 * @buffer: The ring buffer 2031 * @buffer: The ring buffer
1760 * 2032 *
@@ -1770,7 +2042,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1770 /* if you care about this being correct, lock the buffer */ 2042 /* if you care about this being correct, lock the buffer */
1771 for_each_buffer_cpu(buffer, cpu) { 2043 for_each_buffer_cpu(buffer, cpu) {
1772 cpu_buffer = buffer->buffers[cpu]; 2044 cpu_buffer = buffer->buffers[cpu];
1773 entries += cpu_buffer->entries; 2045 entries += (local_read(&cpu_buffer->entries) -
2046 cpu_buffer->overrun) - cpu_buffer->read;
1774 } 2047 }
1775 2048
1776 return entries; 2049 return entries;
@@ -1862,7 +2135,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1862{ 2135{
1863 u64 delta; 2136 u64 delta;
1864 2137
1865 switch (event->type) { 2138 switch (event->type_len) {
1866 case RINGBUF_TYPE_PADDING: 2139 case RINGBUF_TYPE_PADDING:
1867 return; 2140 return;
1868 2141
@@ -1893,7 +2166,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1893{ 2166{
1894 u64 delta; 2167 u64 delta;
1895 2168
1896 switch (event->type) { 2169 switch (event->type_len) {
1897 case RINGBUF_TYPE_PADDING: 2170 case RINGBUF_TYPE_PADDING:
1898 return; 2171 return;
1899 2172
@@ -1966,6 +2239,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1966 cpu_buffer->reader_page->list.prev = reader->list.prev; 2239 cpu_buffer->reader_page->list.prev = reader->list.prev;
1967 2240
1968 local_set(&cpu_buffer->reader_page->write, 0); 2241 local_set(&cpu_buffer->reader_page->write, 0);
2242 local_set(&cpu_buffer->reader_page->entries, 0);
1969 local_set(&cpu_buffer->reader_page->page->commit, 0); 2243 local_set(&cpu_buffer->reader_page->page->commit, 0);
1970 2244
1971 /* Make the reader page now replace the head */ 2245 /* Make the reader page now replace the head */
@@ -2008,8 +2282,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2008 2282
2009 event = rb_reader_event(cpu_buffer); 2283 event = rb_reader_event(cpu_buffer);
2010 2284
2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) 2285 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
2012 cpu_buffer->entries--; 2286 || rb_discarded_event(event))
2287 cpu_buffer->read++;
2013 2288
2014 rb_update_read_stamp(cpu_buffer, event); 2289 rb_update_read_stamp(cpu_buffer, event);
2015 2290
@@ -2031,8 +2306,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2031 * Check if we are at the end of the buffer. 2306 * Check if we are at the end of the buffer.
2032 */ 2307 */
2033 if (iter->head >= rb_page_size(iter->head_page)) { 2308 if (iter->head >= rb_page_size(iter->head_page)) {
2034 if (RB_WARN_ON(buffer, 2309 /* discarded commits can make the page empty */
2035 iter->head_page == cpu_buffer->commit_page)) 2310 if (iter->head_page == cpu_buffer->commit_page)
2036 return; 2311 return;
2037 rb_inc_iter(iter); 2312 rb_inc_iter(iter);
2038 return; 2313 return;
@@ -2075,12 +2350,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2075 /* 2350 /*
2076 * We repeat when a timestamp is encountered. It is possible 2351 * We repeat when a timestamp is encountered. It is possible
2077 * to get multiple timestamps from an interrupt entering just 2352 * to get multiple timestamps from an interrupt entering just
2078 * as one timestamp is about to be written. The max times 2353 * as one timestamp is about to be written, or from discarded
2079 * that this can happen is the number of nested interrupts we 2354 * commits. The most that we can have is the number on a single page.
2080 * can have. Nesting 10 deep of interrupts is clearly
2081 * an anomaly.
2082 */ 2355 */
2083 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2356 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2084 return NULL; 2357 return NULL;
2085 2358
2086 reader = rb_get_reader_page(cpu_buffer); 2359 reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2362,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2089 2362
2090 event = rb_reader_event(cpu_buffer); 2363 event = rb_reader_event(cpu_buffer);
2091 2364
2092 switch (event->type) { 2365 switch (event->type_len) {
2093 case RINGBUF_TYPE_PADDING: 2366 case RINGBUF_TYPE_PADDING:
2094 if (rb_null_event(event)) 2367 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1); 2368 RB_WARN_ON(cpu_buffer, 1);
@@ -2146,14 +2419,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2146 2419
2147 again: 2420 again:
2148 /* 2421 /*
2149 * We repeat when a timestamp is encountered. It is possible 2422 * We repeat when a timestamp is encountered.
2150 * to get multiple timestamps from an interrupt entering just 2423 * We can get multiple timestamps by nested interrupts or also
2151 * as one timestamp is about to be written. The max times 2424 * if filtering is on (discarding commits). Since discarding
2152 * that this can happen is the number of nested interrupts we 2425 * commits can be frequent we can get a lot of timestamps.
2153 * can have. Nesting 10 deep of interrupts is clearly 2426 * But we limit them by not adding timestamps if they begin
2154 * an anomaly. 2427 * at the start of a page.
2155 */ 2428 */
2156 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) 2429 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
2157 return NULL; 2430 return NULL;
2158 2431
2159 if (rb_per_cpu_empty(cpu_buffer)) 2432 if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2434,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2161 2434
2162 event = rb_iter_head_event(iter); 2435 event = rb_iter_head_event(iter);
2163 2436
2164 switch (event->type) { 2437 switch (event->type_len) {
2165 case RINGBUF_TYPE_PADDING: 2438 case RINGBUF_TYPE_PADDING:
2166 if (rb_null_event(event)) { 2439 if (rb_null_event(event)) {
2167 rb_inc_iter(iter); 2440 rb_inc_iter(iter);
@@ -2196,6 +2469,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2196} 2469}
2197EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 2470EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2198 2471
2472static inline int rb_ok_to_lock(void)
2473{
2474 /*
2475 * If an NMI die dumps out the content of the ring buffer
2476 * do not grab locks. We also permanently disable the ring
2477 * buffer too. A one time deal is all you get from reading
2478 * the ring buffer from an NMI.
2479 */
2480 if (likely(!in_nmi() && !oops_in_progress))
2481 return 1;
2482
2483 tracing_off_permanent();
2484 return 0;
2485}
2486
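The read-side hunks below all follow the same shape, condensed here into one illustrative helper (not part of the patch): interrupts are always disabled, but reader_lock is only taken when rb_ok_to_lock() says the context is safe, i.e. not an NMI dump or an oops in progress.

static struct ring_buffer_event *
example_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
{
	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
	struct ring_buffer_event *event;
	unsigned long flags;
	int dolock = rb_ok_to_lock();

	local_irq_save(flags);
	if (dolock)
		spin_lock(&cpu_buffer->reader_lock);
	event = rb_buffer_peek(buffer, cpu, ts);
	if (dolock)
		spin_unlock(&cpu_buffer->reader_lock);
	local_irq_restore(flags);

	return event;
}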
2199/** 2487/**
2200 * ring_buffer_peek - peek at the next event to be read 2488 * ring_buffer_peek - peek at the next event to be read
2201 * @buffer: The ring buffer to read 2489 * @buffer: The ring buffer to read
@@ -2211,16 +2499,22 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2211 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2499 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2212 struct ring_buffer_event *event; 2500 struct ring_buffer_event *event;
2213 unsigned long flags; 2501 unsigned long flags;
2502 int dolock;
2214 2503
2215 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2504 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2216 return NULL; 2505 return NULL;
2217 2506
2507 dolock = rb_ok_to_lock();
2218 again: 2508 again:
2219 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2509 local_irq_save(flags);
2510 if (dolock)
2511 spin_lock(&cpu_buffer->reader_lock);
2220 event = rb_buffer_peek(buffer, cpu, ts); 2512 event = rb_buffer_peek(buffer, cpu, ts);
2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2513 if (dolock)
2514 spin_unlock(&cpu_buffer->reader_lock);
2515 local_irq_restore(flags);
2222 2516
2223 if (event && event->type == RINGBUF_TYPE_PADDING) { 2517 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2224 cpu_relax(); 2518 cpu_relax();
2225 goto again; 2519 goto again;
2226 } 2520 }
@@ -2248,7 +2542,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2248 event = rb_iter_peek(iter, ts); 2542 event = rb_iter_peek(iter, ts);
2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2543 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2250 2544
2251 if (event && event->type == RINGBUF_TYPE_PADDING) { 2545 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2252 cpu_relax(); 2546 cpu_relax();
2253 goto again; 2547 goto again;
2254 } 2548 }
@@ -2270,6 +2564,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2270 struct ring_buffer_per_cpu *cpu_buffer; 2564 struct ring_buffer_per_cpu *cpu_buffer;
2271 struct ring_buffer_event *event = NULL; 2565 struct ring_buffer_event *event = NULL;
2272 unsigned long flags; 2566 unsigned long flags;
2567 int dolock;
2568
2569 dolock = rb_ok_to_lock();
2273 2570
2274 again: 2571 again:
2275 /* might be called in atomic */ 2572 /* might be called in atomic */
@@ -2279,7 +2576,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2279 goto out; 2576 goto out;
2280 2577
2281 cpu_buffer = buffer->buffers[cpu]; 2578 cpu_buffer = buffer->buffers[cpu];
2282 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2579 local_irq_save(flags);
2580 if (dolock)
2581 spin_lock(&cpu_buffer->reader_lock);
2283 2582
2284 event = rb_buffer_peek(buffer, cpu, ts); 2583 event = rb_buffer_peek(buffer, cpu, ts);
2285 if (!event) 2584 if (!event)
@@ -2288,12 +2587,14 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2288 rb_advance_reader(cpu_buffer); 2587 rb_advance_reader(cpu_buffer);
2289 2588
2290 out_unlock: 2589 out_unlock:
2291 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2590 if (dolock)
2591 spin_unlock(&cpu_buffer->reader_lock);
2592 local_irq_restore(flags);
2292 2593
2293 out: 2594 out:
2294 preempt_enable(); 2595 preempt_enable();
2295 2596
2296 if (event && event->type == RINGBUF_TYPE_PADDING) { 2597 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2297 cpu_relax(); 2598 cpu_relax();
2298 goto again; 2599 goto again;
2299 } 2600 }
@@ -2386,7 +2687,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2386 out: 2687 out:
2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2688 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2388 2689
2389 if (event && event->type == RINGBUF_TYPE_PADDING) { 2690 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2390 cpu_relax(); 2691 cpu_relax();
2391 goto again; 2692 goto again;
2392 } 2693 }
@@ -2411,6 +2712,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2411 cpu_buffer->head_page 2712 cpu_buffer->head_page
2412 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2713 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
2413 local_set(&cpu_buffer->head_page->write, 0); 2714 local_set(&cpu_buffer->head_page->write, 0);
2715 local_set(&cpu_buffer->head_page->entries, 0);
2414 local_set(&cpu_buffer->head_page->page->commit, 0); 2716 local_set(&cpu_buffer->head_page->page->commit, 0);
2415 2717
2416 cpu_buffer->head_page->read = 0; 2718 cpu_buffer->head_page->read = 0;
@@ -2420,11 +2722,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2420 2722
2421 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2723 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
2422 local_set(&cpu_buffer->reader_page->write, 0); 2724 local_set(&cpu_buffer->reader_page->write, 0);
2725 local_set(&cpu_buffer->reader_page->entries, 0);
2423 local_set(&cpu_buffer->reader_page->page->commit, 0); 2726 local_set(&cpu_buffer->reader_page->page->commit, 0);
2424 cpu_buffer->reader_page->read = 0; 2727 cpu_buffer->reader_page->read = 0;
2425 2728
2729 cpu_buffer->nmi_dropped = 0;
2730 cpu_buffer->commit_overrun = 0;
2426 cpu_buffer->overrun = 0; 2731 cpu_buffer->overrun = 0;
2427 cpu_buffer->entries = 0; 2732 cpu_buffer->read = 0;
2733 local_set(&cpu_buffer->entries, 0);
2734 local_set(&cpu_buffer->committing, 0);
2735 local_set(&cpu_buffer->commits, 0);
2428 2736
2429 cpu_buffer->write_stamp = 0; 2737 cpu_buffer->write_stamp = 0;
2430 cpu_buffer->read_stamp = 0; 2738 cpu_buffer->read_stamp = 0;
@@ -2443,6 +2751,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2443 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2751 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2444 return; 2752 return;
2445 2753
2754 atomic_inc(&cpu_buffer->record_disabled);
2755
2446 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2756 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2447 2757
2448 __raw_spin_lock(&cpu_buffer->lock); 2758 __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2762,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2452 __raw_spin_unlock(&cpu_buffer->lock); 2762 __raw_spin_unlock(&cpu_buffer->lock);
2453 2763
2454 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2764 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2765
2766 atomic_dec(&cpu_buffer->record_disabled);
2455} 2767}
2456EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); 2768EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
2457 2769
@@ -2475,12 +2787,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2475int ring_buffer_empty(struct ring_buffer *buffer) 2787int ring_buffer_empty(struct ring_buffer *buffer)
2476{ 2788{
2477 struct ring_buffer_per_cpu *cpu_buffer; 2789 struct ring_buffer_per_cpu *cpu_buffer;
2790 unsigned long flags;
2791 int dolock;
2478 int cpu; 2792 int cpu;
2793 int ret;
2794
2795 dolock = rb_ok_to_lock();
2479 2796
2480 /* yes this is racy, but if you don't like the race, lock the buffer */ 2797 /* yes this is racy, but if you don't like the race, lock the buffer */
2481 for_each_buffer_cpu(buffer, cpu) { 2798 for_each_buffer_cpu(buffer, cpu) {
2482 cpu_buffer = buffer->buffers[cpu]; 2799 cpu_buffer = buffer->buffers[cpu];
2483 if (!rb_per_cpu_empty(cpu_buffer)) 2800 local_irq_save(flags);
2801 if (dolock)
2802 spin_lock(&cpu_buffer->reader_lock);
2803 ret = rb_per_cpu_empty(cpu_buffer);
2804 if (dolock)
2805 spin_unlock(&cpu_buffer->reader_lock);
2806 local_irq_restore(flags);
2807
2808 if (!ret)
2484 return 0; 2809 return 0;
2485 } 2810 }
2486 2811
@@ -2496,14 +2821,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2496int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2821int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2497{ 2822{
2498 struct ring_buffer_per_cpu *cpu_buffer; 2823 struct ring_buffer_per_cpu *cpu_buffer;
2824 unsigned long flags;
2825 int dolock;
2499 int ret; 2826 int ret;
2500 2827
2501 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2828 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2502 return 1; 2829 return 1;
2503 2830
2831 dolock = rb_ok_to_lock();
2832
2504 cpu_buffer = buffer->buffers[cpu]; 2833 cpu_buffer = buffer->buffers[cpu];
2834 local_irq_save(flags);
2835 if (dolock)
2836 spin_lock(&cpu_buffer->reader_lock);
2505 ret = rb_per_cpu_empty(cpu_buffer); 2837 ret = rb_per_cpu_empty(cpu_buffer);
2506 2838 if (dolock)
2839 spin_unlock(&cpu_buffer->reader_lock);
2840 local_irq_restore(flags);
2507 2841
2508 return ret; 2842 return ret;
2509} 2843}
@@ -2578,28 +2912,6 @@ out:
2578} 2912}
2579EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2913EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2580 2914
2581static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2582 struct buffer_data_page *bpage,
2583 unsigned int offset)
2584{
2585 struct ring_buffer_event *event;
2586 unsigned long head;
2587
2588 __raw_spin_lock(&cpu_buffer->lock);
2589 for (head = offset; head < local_read(&bpage->commit);
2590 head += rb_event_length(event)) {
2591
2592 event = __rb_data_page_index(bpage, head);
2593 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2594 return;
2595 /* Only count data entries */
2596 if (event->type != RINGBUF_TYPE_DATA)
2597 continue;
2598 cpu_buffer->entries--;
2599 }
2600 __raw_spin_unlock(&cpu_buffer->lock);
2601}
2602
2603/** 2915/**
2604 * ring_buffer_alloc_read_page - allocate a page to read from buffer 2916 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2605 * @buffer: the buffer to allocate for. 2917 * @buffer: the buffer to allocate for.
@@ -2630,6 +2942,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2630 2942
2631 return bpage; 2943 return bpage;
2632} 2944}
2945EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
2633 2946
2634/** 2947/**
2635 * ring_buffer_free_read_page - free an allocated read page 2948 * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2955,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2642{ 2955{
2643 free_page((unsigned long)data); 2956 free_page((unsigned long)data);
2644} 2957}
2958EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
2645 2959
2646/** 2960/**
2647 * ring_buffer_read_page - extract a page from the ring buffer 2961 * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3082,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2768 /* we copied everything to the beginning */ 3082 /* we copied everything to the beginning */
2769 read = 0; 3083 read = 0;
2770 } else { 3084 } else {
3085 /* update the entry counter */
3086 cpu_buffer->read += local_read(&reader->entries);
3087
2771 /* swap the pages */ 3088 /* swap the pages */
2772 rb_init_page(bpage); 3089 rb_init_page(bpage);
2773 bpage = reader->page; 3090 bpage = reader->page;
2774 reader->page = *data_page; 3091 reader->page = *data_page;
2775 local_set(&reader->write, 0); 3092 local_set(&reader->write, 0);
3093 local_set(&reader->entries, 0);
2776 reader->read = 0; 3094 reader->read = 0;
2777 *data_page = bpage; 3095 *data_page = bpage;
2778
2779 /* update the entry counter */
2780 rb_remove_entries(cpu_buffer, bpage, read);
2781 } 3096 }
2782 ret = read; 3097 ret = read;
2783 3098
@@ -2787,6 +3102,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
2787 out: 3102 out:
2788 return ret; 3103 return ret;
2789} 3104}
3105EXPORT_SYMBOL_GPL(ring_buffer_read_page);
2790 3106
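The three exports above make the zero-copy page interface usable from modules (the benchmark added later in this series is one consumer). A trimmed sketch of a caller, with error handling and event parsing elided; the last argument simply mirrors what the in-tree callers pass:

static void example_drain_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page = ring_buffer_alloc_read_page(buffer);
	int ret;

	if (!page)
		return;

	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1);
	if (ret >= 0) {
		/* 'page' now holds a page worth of events to parse */
	}

	ring_buffer_free_read_page(buffer, page);
}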
2791static ssize_t 3107static ssize_t
2792rb_simple_read(struct file *filp, char __user *ubuf, 3108rb_simple_read(struct file *filp, char __user *ubuf,
@@ -2845,14 +3161,11 @@ static const struct file_operations rb_simple_fops = {
2845static __init int rb_init_debugfs(void) 3161static __init int rb_init_debugfs(void)
2846{ 3162{
2847 struct dentry *d_tracer; 3163 struct dentry *d_tracer;
2848 struct dentry *entry;
2849 3164
2850 d_tracer = tracing_init_dentry(); 3165 d_tracer = tracing_init_dentry();
2851 3166
2852 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 3167 trace_create_file("tracing_on", 0644, d_tracer,
2853 &ring_buffer_flags, &rb_simple_fops); 3168 &ring_buffer_flags, &rb_simple_fops);
2854 if (!entry)
2855 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2856 3169
2857 return 0; 3170 return 0;
2858} 3171}
@@ -2870,7 +3183,7 @@ static int rb_cpu_notify(struct notifier_block *self,
2870 switch (action) { 3183 switch (action) {
2871 case CPU_UP_PREPARE: 3184 case CPU_UP_PREPARE:
2872 case CPU_UP_PREPARE_FROZEN: 3185 case CPU_UP_PREPARE_FROZEN:
2873 if (cpu_isset(cpu, *buffer->cpumask)) 3186 if (cpumask_test_cpu(cpu, buffer->cpumask))
2874 return NOTIFY_OK; 3187 return NOTIFY_OK;
2875 3188
2876 buffer->buffers[cpu] = 3189 buffer->buffers[cpu] =
@@ -2881,7 +3194,7 @@ static int rb_cpu_notify(struct notifier_block *self,
2881 return NOTIFY_OK; 3194 return NOTIFY_OK;
2882 } 3195 }
2883 smp_wmb(); 3196 smp_wmb();
2884 cpu_set(cpu, *buffer->cpumask); 3197 cpumask_set_cpu(cpu, buffer->cpumask);
2885 break; 3198 break;
2886 case CPU_DOWN_PREPARE: 3199 case CPU_DOWN_PREPARE:
2887 case CPU_DOWN_PREPARE_FROZEN: 3200 case CPU_DOWN_PREPARE_FROZEN:
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 000000000000..573d3cc762c3
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,419 @@
1/*
2 * ring buffer tester and benchmark
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/completion.h>
8#include <linux/kthread.h>
9#include <linux/module.h>
10#include <linux/time.h>
11
12struct rb_page {
13 u64 ts;
14 local_t commit;
15 char data[4080];
16};
17
18/* run time and sleep time in seconds */
19#define RUN_TIME 10
20#define SLEEP_TIME 10
21
22/* number of events for writer to wake up the reader */
23static int wakeup_interval = 100;
24
25static int reader_finish;
26static struct completion read_start;
27static struct completion read_done;
28
29static struct ring_buffer *buffer;
30static struct task_struct *producer;
31static struct task_struct *consumer;
32static unsigned long read;
33
34static int disable_reader;
35module_param(disable_reader, uint, 0644);
36MODULE_PARM_DESC(disable_reader, "only run producer");
37
38static int read_events;
39
40static int kill_test;
41
42#define KILL_TEST() \
43 do { \
44 if (!kill_test) { \
45 kill_test = 1; \
46 WARN_ON(1); \
47 } \
48 } while (0)
49
50enum event_status {
51 EVENT_FOUND,
52 EVENT_DROPPED,
53};
54
55static enum event_status read_event(int cpu)
56{
57 struct ring_buffer_event *event;
58 int *entry;
59 u64 ts;
60
61 event = ring_buffer_consume(buffer, cpu, &ts);
62 if (!event)
63 return EVENT_DROPPED;
64
65 entry = ring_buffer_event_data(event);
66 if (*entry != cpu) {
67 KILL_TEST();
68 return EVENT_DROPPED;
69 }
70
71 read++;
72 return EVENT_FOUND;
73}
74
75static enum event_status read_page(int cpu)
76{
77 struct ring_buffer_event *event;
78 struct rb_page *rpage;
79 unsigned long commit;
80 void *bpage;
81 int *entry;
82 int ret;
83 int inc;
84 int i;
85
86 bpage = ring_buffer_alloc_read_page(buffer);
87 if (!bpage)
88 return EVENT_DROPPED;
89
90 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
91 if (ret >= 0) {
92 rpage = bpage;
93 commit = local_read(&rpage->commit);
94 for (i = 0; i < commit && !kill_test; i += inc) {
95
96 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
97 KILL_TEST();
98 break;
99 }
100
101 inc = -1;
102 event = (void *)&rpage->data[i];
103 switch (event->type_len) {
104 case RINGBUF_TYPE_PADDING:
105 /* failed writes may be discarded events */
106 if (!event->time_delta)
107 KILL_TEST();
108 inc = event->array[0] + 4;
109 break;
110 case RINGBUF_TYPE_TIME_EXTEND:
111 inc = 8;
112 break;
113 case 0:
114 entry = ring_buffer_event_data(event);
115 if (*entry != cpu) {
116 KILL_TEST();
117 break;
118 }
119 read++;
120 if (!event->array[0]) {
121 KILL_TEST();
122 break;
123 }
124 inc = event->array[0] + 4;
125 break;
126 default:
127 entry = ring_buffer_event_data(event);
128 if (*entry != cpu) {
129 KILL_TEST();
130 break;
131 }
132 read++;
133 inc = ((event->type_len + 1) * 4);
134 }
135 if (kill_test)
136 break;
137
138 if (inc <= 0) {
139 KILL_TEST();
140 break;
141 }
142 }
143 }
144 ring_buffer_free_read_page(buffer, bpage);
145
146 if (ret < 0)
147 return EVENT_DROPPED;
148 return EVENT_FOUND;
149}
150
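The increments in the loop above encode the event-size rules of the new type_len field: values 1..28 give the size directly in 4-byte words, 0 means the length lives in array[0], and padding/time-extend carry their own sizes. The same rules folded into one illustrative helper (mirroring the increments read_page() uses, not part of the module):

static int example_event_size(struct ring_buffer_event *event)
{
	switch (event->type_len) {
	case RINGBUF_TYPE_PADDING:
	case 0:				/* data event with extended length */
		return event->array[0] + 4;
	case RINGBUF_TYPE_TIME_EXTEND:
		return 8;
	default:			/* type_len 1..28 encodes the size */
		return (event->type_len + 1) * 4;
	}
}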
151static void ring_buffer_consumer(void)
152{
153 /* toggle between reading pages and events */
154 read_events ^= 1;
155
156 read = 0;
157 while (!reader_finish && !kill_test) {
158 int found;
159
160 do {
161 int cpu;
162
163 found = 0;
164 for_each_online_cpu(cpu) {
165 enum event_status stat;
166
167 if (read_events)
168 stat = read_event(cpu);
169 else
170 stat = read_page(cpu);
171
172 if (kill_test)
173 break;
174 if (stat == EVENT_FOUND)
175 found = 1;
176 }
177 } while (found && !kill_test);
178
179 set_current_state(TASK_INTERRUPTIBLE);
180 if (reader_finish)
181 break;
182
183 schedule();
184 __set_current_state(TASK_RUNNING);
185 }
186 reader_finish = 0;
187 complete(&read_done);
188}
189
190static void ring_buffer_producer(void)
191{
192 struct timeval start_tv;
193 struct timeval end_tv;
194 unsigned long long time;
195 unsigned long long entries;
196 unsigned long long overruns;
197 unsigned long missed = 0;
198 unsigned long hit = 0;
199 unsigned long avg;
200 int cnt = 0;
201
202 /*
203 * Hammer the buffer for 10 secs (this may
204 * make the system stall)
205 */
206 trace_printk("Starting ring buffer hammer\n");
207 do_gettimeofday(&start_tv);
208 do {
209 struct ring_buffer_event *event;
210 int *entry;
211
212 event = ring_buffer_lock_reserve(buffer, 10);
213 if (!event) {
214 missed++;
215 } else {
216 hit++;
217 entry = ring_buffer_event_data(event);
218 *entry = smp_processor_id();
219 ring_buffer_unlock_commit(buffer, event);
220 }
221 do_gettimeofday(&end_tv);
222
223 cnt++;
224 if (consumer && !(cnt % wakeup_interval))
225 wake_up_process(consumer);
226
227#ifndef CONFIG_PREEMPT
228 /*
229 * If we are a non preempt kernel, the 10 second run will
230 * stop everything while it runs. Instead, we will call
231 * cond_resched and also add any time that was lost by a
232	 * reschedule.
233 *
234 * Do a cond resched at the same frequency we would wake up
235 * the reader.
236 */
237 if (cnt % wakeup_interval)
238 cond_resched();
239#endif
240
241 } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
242 trace_printk("End ring buffer hammer\n");
243
244 if (consumer) {
245 /* Init both completions here to avoid races */
246 init_completion(&read_start);
247 init_completion(&read_done);
248 /* the completions must be visible before the finish var */
249 smp_wmb();
250 reader_finish = 1;
251 /* finish var visible before waking up the consumer */
252 smp_wmb();
253 wake_up_process(consumer);
254 wait_for_completion(&read_done);
255 }
256
257 time = end_tv.tv_sec - start_tv.tv_sec;
258 time *= USEC_PER_SEC;
259 time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
260
261 entries = ring_buffer_entries(buffer);
262 overruns = ring_buffer_overruns(buffer);
263
264 if (kill_test)
265 trace_printk("ERROR!\n");
266 trace_printk("Time: %lld (usecs)\n", time);
267 trace_printk("Overruns: %lld\n", overruns);
268 if (disable_reader)
269 trace_printk("Read: (reader disabled)\n");
270 else
271 trace_printk("Read: %ld (by %s)\n", read,
272 read_events ? "events" : "pages");
273 trace_printk("Entries: %lld\n", entries);
274 trace_printk("Total: %lld\n", entries + overruns + read);
275 trace_printk("Missed: %ld\n", missed);
276 trace_printk("Hit: %ld\n", hit);
277
278 /* Convert time from usecs to millisecs */
279 do_div(time, USEC_PER_MSEC);
280 if (time)
281 hit /= (long)time;
282 else
283 trace_printk("TIME IS ZERO??\n");
284
285 trace_printk("Entries per millisec: %ld\n", hit);
286
287 if (hit) {
288 /* Calculate the average time in nanosecs */
289 avg = NSEC_PER_MSEC / hit;
290 trace_printk("%ld ns per entry\n", avg);
291 }
292
293 if (missed) {
294 if (time)
295 missed /= (long)time;
296
297 trace_printk("Total iterations per millisec: %ld\n",
298 hit + missed);
299
300 /* it is possible that hit + missed will overflow and be zero */
301 if (!(hit + missed)) {
302 trace_printk("hit + missed overflowed and totalled zero!\n");
303 hit--; /* make it non zero */
304 }
305
306		/* Calculate the average time in nanosecs */
307 avg = NSEC_PER_MSEC / (hit + missed);
308 trace_printk("%ld ns per entry\n", avg);
309 }
310}
311
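The reporting math above is easy to misread, so here it is with made-up numbers: a 10 s run is 10,000,000 usecs, which do_div() turns into 10,000 msecs; 5,000,000 hits then give 500 entries per millisec, and NSEC_PER_MSEC / 500 = 2000 ns per entry. A condensed sketch, assuming non-zero time and hit (which the module already checks for):

static unsigned long example_ns_per_entry(unsigned long long usecs,
					  unsigned long hit)
{
	do_div(usecs, USEC_PER_MSEC);		/* usecs -> msecs */
	hit /= (unsigned long)usecs;		/* entries per millisec */
	return NSEC_PER_MSEC / hit;		/* nanosecs per entry */
}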
312static void wait_to_die(void)
313{
314 set_current_state(TASK_INTERRUPTIBLE);
315 while (!kthread_should_stop()) {
316 schedule();
317 set_current_state(TASK_INTERRUPTIBLE);
318 }
319 __set_current_state(TASK_RUNNING);
320}
321
322static int ring_buffer_consumer_thread(void *arg)
323{
324 while (!kthread_should_stop() && !kill_test) {
325 complete(&read_start);
326
327 ring_buffer_consumer();
328
329 set_current_state(TASK_INTERRUPTIBLE);
330 if (kthread_should_stop() || kill_test)
331 break;
332
333 schedule();
334 __set_current_state(TASK_RUNNING);
335 }
336 __set_current_state(TASK_RUNNING);
337
338 if (kill_test)
339 wait_to_die();
340
341 return 0;
342}
343
344static int ring_buffer_producer_thread(void *arg)
345{
346 init_completion(&read_start);
347
348 while (!kthread_should_stop() && !kill_test) {
349 ring_buffer_reset(buffer);
350
351 if (consumer) {
352 smp_wmb();
353 wake_up_process(consumer);
354 wait_for_completion(&read_start);
355 }
356
357 ring_buffer_producer();
358
359 trace_printk("Sleeping for 10 secs\n");
360 set_current_state(TASK_INTERRUPTIBLE);
361 schedule_timeout(HZ * SLEEP_TIME);
362 __set_current_state(TASK_RUNNING);
363 }
364
365 if (kill_test)
366 wait_to_die();
367
368 return 0;
369}
370
371static int __init ring_buffer_benchmark_init(void)
372{
373 int ret;
374
375	/* make a one meg buffer in overwrite mode */
376 buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
377 if (!buffer)
378 return -ENOMEM;
379
380 if (!disable_reader) {
381 consumer = kthread_create(ring_buffer_consumer_thread,
382 NULL, "rb_consumer");
383 ret = PTR_ERR(consumer);
384 if (IS_ERR(consumer))
385 goto out_fail;
386 }
387
388 producer = kthread_run(ring_buffer_producer_thread,
389 NULL, "rb_producer");
390 ret = PTR_ERR(producer);
391
392 if (IS_ERR(producer))
393 goto out_kill;
394
395 return 0;
396
397 out_kill:
398 if (consumer)
399 kthread_stop(consumer);
400
401 out_fail:
402 ring_buffer_free(buffer);
403 return ret;
404}
405
406static void __exit ring_buffer_benchmark_exit(void)
407{
408 kthread_stop(producer);
409 if (consumer)
410 kthread_stop(consumer);
411 ring_buffer_free(buffer);
412}
413
414module_init(ring_buffer_benchmark_init);
415module_exit(ring_buffer_benchmark_exit);
416
417MODULE_AUTHOR("Steven Rostedt");
418MODULE_DESCRIPTION("ring_buffer_benchmark");
419MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a884c09006c4..076fa6f0ee48 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -171,6 +171,13 @@ static struct trace_array global_trace;
171 171
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 173
174int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
175 struct ring_buffer_event *event)
176{
177 return filter_check_discard(call, rec, global_trace.buffer, event);
178}
179EXPORT_SYMBOL_GPL(filter_current_check_discard);
180
174cycle_t ftrace_now(int cpu) 181cycle_t ftrace_now(int cpu)
175{ 182{
176 u64 ts; 183 u64 ts;
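This small wrapper is what lets every event site touched below collapse to the same two-line pattern: reserve and fill the event as before, then commit it only if the filter keeps it (a nonzero return means the check already discarded the reserved event). Condensed into one illustrative helper; the real call sites inline it:

static void example_commit_filtered(struct ftrace_event_call *call,
				    struct ring_buffer_event *event,
				    void *entry)
{
	if (!filter_current_check_discard(call, entry, event))
		ring_buffer_unlock_commit(global_trace.buffer, event);
}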
@@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
255 262
256/* trace_flags holds trace_options default values */ 263/* trace_flags holds trace_options default values */
257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 264unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; 265 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
266 TRACE_ITER_GRAPH_TIME;
259 267
260/** 268/**
261 * trace_wake_up - wake up tasks waiting for trace input 269 * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
317 "latency-format", 325 "latency-format",
318 "global-clock", 326 "global-clock",
319 "sleep-time", 327 "sleep-time",
328 "graph-time",
320 NULL 329 NULL
321}; 330};
322 331
@@ -335,7 +344,7 @@ static raw_spinlock_t ftrace_max_lock =
335/* 344/*
336 * Copy the new maximum trace into the separate maximum-trace 345 * Copy the new maximum trace into the separate maximum-trace
337 * structure. (this way the maximum trace is permanently saved, 346 * structure. (this way the maximum trace is permanently saved,
338 * for later retrieval via /debugfs/tracing/latency_trace) 347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
339 */ 348 */
340static void 349static void
341__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
402 return cnt; 411 return cnt;
403} 412}
404 413
405static void
406trace_print_seq(struct seq_file *m, struct trace_seq *s)
407{
408 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
409
410 s->buffer[len] = 0;
411 seq_puts(m, s->buffer);
412
413 trace_seq_init(s);
414}
415
416/** 414/**
417 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
418 * @tr: tracer 416 * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
641 tracing_reset(tr, cpu); 639 tracing_reset(tr, cpu);
642} 640}
643 641
642void tracing_reset_current(int cpu)
643{
644 tracing_reset(&global_trace, cpu);
645}
646
647void tracing_reset_current_online_cpus(void)
648{
649 tracing_reset_online_cpus(&global_trace);
650}
651
644#define SAVED_CMDLINES 128 652#define SAVED_CMDLINES 128
645#define NO_CMDLINE_MAP UINT_MAX 653#define NO_CMDLINE_MAP UINT_MAX
646static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 654static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
800 return; 808 return;
801 } 809 }
802 810
811 preempt_disable();
803 __raw_spin_lock(&trace_cmdline_lock); 812 __raw_spin_lock(&trace_cmdline_lock);
804 map = map_pid_to_cmdline[pid]; 813 map = map_pid_to_cmdline[pid];
805 if (map != NO_CMDLINE_MAP) 814 if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
808 strcpy(comm, "<...>"); 817 strcpy(comm, "<...>");
809 818
810 __raw_spin_unlock(&trace_cmdline_lock); 819 __raw_spin_unlock(&trace_cmdline_lock);
820 preempt_enable();
811} 821}
812 822
813void tracing_record_cmdline(struct task_struct *tsk) 823void tracing_record_cmdline(struct task_struct *tsk)
@@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
840} 850}
841 851
842struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 852struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
843 unsigned char type, 853 int type,
844 unsigned long len, 854 unsigned long len,
845 unsigned long flags, int pc) 855 unsigned long flags, int pc)
846{ 856{
@@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
883} 893}
884 894
885struct ring_buffer_event * 895struct ring_buffer_event *
886trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, 896trace_current_buffer_lock_reserve(int type, unsigned long len,
887 unsigned long flags, int pc) 897 unsigned long flags, int pc)
888{ 898{
889 return trace_buffer_lock_reserve(&global_trace, 899 return trace_buffer_lock_reserve(&global_trace,
890 type, len, flags, pc); 900 type, len, flags, pc);
891} 901}
902EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
892 903
893void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 904void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
894 unsigned long flags, int pc) 905 unsigned long flags, int pc)
895{ 906{
896 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 907 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
897} 908}
909EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
898 910
899void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 911void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
900 unsigned long flags, int pc) 912 unsigned long flags, int pc)
901{ 913{
902 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 914 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
915}
916EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
917
918void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
919{
920 ring_buffer_discard_commit(global_trace.buffer, event);
903} 921}
922EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
904 923
905void 924void
906trace_function(struct trace_array *tr, 925trace_function(struct trace_array *tr,
907 unsigned long ip, unsigned long parent_ip, unsigned long flags, 926 unsigned long ip, unsigned long parent_ip, unsigned long flags,
908 int pc) 927 int pc)
909{ 928{
929 struct ftrace_event_call *call = &event_function;
910 struct ring_buffer_event *event; 930 struct ring_buffer_event *event;
911 struct ftrace_entry *entry; 931 struct ftrace_entry *entry;
912 932
@@ -921,7 +941,9 @@ trace_function(struct trace_array *tr,
921 entry = ring_buffer_event_data(event); 941 entry = ring_buffer_event_data(event);
922 entry->ip = ip; 942 entry->ip = ip;
923 entry->parent_ip = parent_ip; 943 entry->parent_ip = parent_ip;
924 ring_buffer_unlock_commit(tr->buffer, event); 944
945 if (!filter_check_discard(call, entry, tr->buffer, event))
946 ring_buffer_unlock_commit(tr->buffer, event);
925} 947}
926 948
927#ifdef CONFIG_FUNCTION_GRAPH_TRACER 949#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr,
930 unsigned long flags, 952 unsigned long flags,
931 int pc) 953 int pc)
932{ 954{
955 struct ftrace_event_call *call = &event_funcgraph_entry;
933 struct ring_buffer_event *event; 956 struct ring_buffer_event *event;
934 struct ftrace_graph_ent_entry *entry; 957 struct ftrace_graph_ent_entry *entry;
935 958
@@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr,
942 return 0; 965 return 0;
943 entry = ring_buffer_event_data(event); 966 entry = ring_buffer_event_data(event);
944 entry->graph_ent = *trace; 967 entry->graph_ent = *trace;
945 ring_buffer_unlock_commit(global_trace.buffer, event); 968 if (!filter_current_check_discard(call, entry, event))
969 ring_buffer_unlock_commit(global_trace.buffer, event);
946 970
947 return 1; 971 return 1;
948} 972}
@@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr,
952 unsigned long flags, 976 unsigned long flags,
953 int pc) 977 int pc)
954{ 978{
979 struct ftrace_event_call *call = &event_funcgraph_exit;
955 struct ring_buffer_event *event; 980 struct ring_buffer_event *event;
956 struct ftrace_graph_ret_entry *entry; 981 struct ftrace_graph_ret_entry *entry;
957 982
@@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr,
964 return; 989 return;
965 entry = ring_buffer_event_data(event); 990 entry = ring_buffer_event_data(event);
966 entry->ret = *trace; 991 entry->ret = *trace;
967 ring_buffer_unlock_commit(global_trace.buffer, event); 992 if (!filter_current_check_discard(call, entry, event))
993 ring_buffer_unlock_commit(global_trace.buffer, event);
968} 994}
969#endif 995#endif
970 996
@@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
982 int skip, int pc) 1008 int skip, int pc)
983{ 1009{
984#ifdef CONFIG_STACKTRACE 1010#ifdef CONFIG_STACKTRACE
1011 struct ftrace_event_call *call = &event_kernel_stack;
985 struct ring_buffer_event *event; 1012 struct ring_buffer_event *event;
986 struct stack_entry *entry; 1013 struct stack_entry *entry;
987 struct stack_trace trace; 1014 struct stack_trace trace;
@@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
999 trace.entries = entry->caller; 1026 trace.entries = entry->caller;
1000 1027
1001 save_stack_trace(&trace); 1028 save_stack_trace(&trace);
1002 ring_buffer_unlock_commit(tr->buffer, event); 1029 if (!filter_check_discard(call, entry, tr->buffer, event))
1030 ring_buffer_unlock_commit(tr->buffer, event);
1003#endif 1031#endif
1004} 1032}
1005 1033
@@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1024 unsigned long flags, int pc) 1052 unsigned long flags, int pc)
1025{ 1053{
1026#ifdef CONFIG_STACKTRACE 1054#ifdef CONFIG_STACKTRACE
1055 struct ftrace_event_call *call = &event_user_stack;
1027 struct ring_buffer_event *event; 1056 struct ring_buffer_event *event;
1028 struct userstack_entry *entry; 1057 struct userstack_entry *entry;
1029 struct stack_trace trace; 1058 struct stack_trace trace;
@@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1045 trace.entries = entry->caller; 1074 trace.entries = entry->caller;
1046 1075
1047 save_stack_trace_user(&trace); 1076 save_stack_trace_user(&trace);
1048 ring_buffer_unlock_commit(tr->buffer, event); 1077 if (!filter_check_discard(call, entry, tr->buffer, event))
1078 ring_buffer_unlock_commit(tr->buffer, event);
1049#endif 1079#endif
1050} 1080}
1051 1081
@@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
1089 struct task_struct *next, 1119 struct task_struct *next,
1090 unsigned long flags, int pc) 1120 unsigned long flags, int pc)
1091{ 1121{
1122 struct ftrace_event_call *call = &event_context_switch;
1092 struct ring_buffer_event *event; 1123 struct ring_buffer_event *event;
1093 struct ctx_switch_entry *entry; 1124 struct ctx_switch_entry *entry;
1094 1125
@@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
1104 entry->next_prio = next->prio; 1135 entry->next_prio = next->prio;
1105 entry->next_state = next->state; 1136 entry->next_state = next->state;
1106 entry->next_cpu = task_cpu(next); 1137 entry->next_cpu = task_cpu(next);
1107 trace_buffer_unlock_commit(tr, event, flags, pc); 1138
1139 if (!filter_check_discard(call, entry, tr->buffer, event))
1140 trace_buffer_unlock_commit(tr, event, flags, pc);
1108} 1141}
1109 1142
1110void 1143void
@@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1113 struct task_struct *curr, 1146 struct task_struct *curr,
1114 unsigned long flags, int pc) 1147 unsigned long flags, int pc)
1115{ 1148{
1149 struct ftrace_event_call *call = &event_wakeup;
1116 struct ring_buffer_event *event; 1150 struct ring_buffer_event *event;
1117 struct ctx_switch_entry *entry; 1151 struct ctx_switch_entry *entry;
1118 1152
@@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1129 entry->next_state = wakee->state; 1163 entry->next_state = wakee->state;
1130 entry->next_cpu = task_cpu(wakee); 1164 entry->next_cpu = task_cpu(wakee);
1131 1165
1132 ring_buffer_unlock_commit(tr->buffer, event); 1166 if (!filter_check_discard(call, entry, tr->buffer, event))
1167 ring_buffer_unlock_commit(tr->buffer, event);
1133 ftrace_trace_stack(tr, flags, 6, pc); 1168 ftrace_trace_stack(tr, flags, 6, pc);
1134 ftrace_trace_userstack(tr, flags, pc); 1169 ftrace_trace_userstack(tr, flags, pc);
1135} 1170}
@@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1230 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1265 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1231 static u32 trace_buf[TRACE_BUF_SIZE]; 1266 static u32 trace_buf[TRACE_BUF_SIZE];
1232 1267
1268 struct ftrace_event_call *call = &event_bprint;
1233 struct ring_buffer_event *event; 1269 struct ring_buffer_event *event;
1234 struct trace_array *tr = &global_trace; 1270 struct trace_array *tr = &global_trace;
1235 struct trace_array_cpu *data; 1271 struct trace_array_cpu *data;
1236 struct bprint_entry *entry; 1272 struct bprint_entry *entry;
1237 unsigned long flags; 1273 unsigned long flags;
1274 int disable;
1238 int resched; 1275 int resched;
1239 int cpu, len = 0, size, pc; 1276 int cpu, len = 0, size, pc;
1240 1277
@@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1249 cpu = raw_smp_processor_id(); 1286 cpu = raw_smp_processor_id();
1250 data = tr->data[cpu]; 1287 data = tr->data[cpu];
1251 1288
1252 if (unlikely(atomic_read(&data->disabled))) 1289 disable = atomic_inc_return(&data->disabled);
1290 if (unlikely(disable != 1))
1253 goto out; 1291 goto out;
1254 1292
1255 /* Lockdep uses trace_printk for lock tracing */ 1293 /* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1269 entry->fmt = fmt; 1307 entry->fmt = fmt;
1270 1308
1271 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1309 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1272 ring_buffer_unlock_commit(tr->buffer, event); 1310 if (!filter_check_discard(call, entry, tr->buffer, event))
1311 ring_buffer_unlock_commit(tr->buffer, event);
1273 1312
1274out_unlock: 1313out_unlock:
1275 __raw_spin_unlock(&trace_buf_lock); 1314 __raw_spin_unlock(&trace_buf_lock);
1276 local_irq_restore(flags); 1315 local_irq_restore(flags);
1277 1316
1278out: 1317out:
1318 atomic_dec_return(&data->disabled);
1279 ftrace_preempt_enable(resched); 1319 ftrace_preempt_enable(resched);
1280 unpause_graph_tracing(); 1320 unpause_graph_tracing();
1281 1321
@@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1288 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1328 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1289 static char trace_buf[TRACE_BUF_SIZE]; 1329 static char trace_buf[TRACE_BUF_SIZE];
1290 1330
1331 struct ftrace_event_call *call = &event_print;
1291 struct ring_buffer_event *event; 1332 struct ring_buffer_event *event;
1292 struct trace_array *tr = &global_trace; 1333 struct trace_array *tr = &global_trace;
1293 struct trace_array_cpu *data; 1334 struct trace_array_cpu *data;
1294 int cpu, len = 0, size, pc; 1335 int cpu, len = 0, size, pc;
1295 struct print_entry *entry; 1336 struct print_entry *entry;
1296 unsigned long irq_flags; 1337 unsigned long irq_flags;
1338 int disable;
1297 1339
1298 if (tracing_disabled || tracing_selftest_running) 1340 if (tracing_disabled || tracing_selftest_running)
1299 return 0; 1341 return 0;
@@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1303 cpu = raw_smp_processor_id(); 1345 cpu = raw_smp_processor_id();
1304 data = tr->data[cpu]; 1346 data = tr->data[cpu];
1305 1347
1306 if (unlikely(atomic_read(&data->disabled))) 1348 disable = atomic_inc_return(&data->disabled);
1349 if (unlikely(disable != 1))
1307 goto out; 1350 goto out;
1308 1351
1309 pause_graph_tracing(); 1352 pause_graph_tracing();
@@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1323 1366
1324 memcpy(&entry->buf, trace_buf, len); 1367 memcpy(&entry->buf, trace_buf, len);
1325 entry->buf[len] = 0; 1368 entry->buf[len] = 0;
1326 ring_buffer_unlock_commit(tr->buffer, event); 1369 if (!filter_check_discard(call, entry, tr->buffer, event))
1370 ring_buffer_unlock_commit(tr->buffer, event);
1327 1371
1328 out_unlock: 1372 out_unlock:
1329 __raw_spin_unlock(&trace_buf_lock); 1373 __raw_spin_unlock(&trace_buf_lock);
1330 raw_local_irq_restore(irq_flags); 1374 raw_local_irq_restore(irq_flags);
1331 unpause_graph_tracing(); 1375 unpause_graph_tracing();
1332 out: 1376 out:
1377 atomic_dec_return(&data->disabled);
1333 preempt_enable_notrace(); 1378 preempt_enable_notrace();
1334 1379
1335 return len; 1380 return len;
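Both trace_vbprintk() and trace_vprintk() now use the same guard: atomic_inc_return() on the per-cpu disabled counter lets only the first, non-nested caller on a CPU proceed, and the counter is dropped again on the way out. A condensed form (illustrative; the real code keeps the decrement at the out: label):

static int example_enter_trace_printk(struct trace_array_cpu *data)
{
	if (atomic_inc_return(&data->disabled) != 1) {
		atomic_dec(&data->disabled);	/* nested or disabled: back off */
		return 0;
	}
	return 1;				/* caller must atomic_dec() when done */
}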
@@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1526 p = s_next(m, p, &l); 1571 p = s_next(m, p, &l);
1527 } 1572 }
1528 1573
1574 trace_event_read_lock();
1529 return p; 1575 return p;
1530} 1576}
1531 1577
1532static void s_stop(struct seq_file *m, void *p) 1578static void s_stop(struct seq_file *m, void *p)
1533{ 1579{
1534 atomic_dec(&trace_record_cmdline_disabled); 1580 atomic_dec(&trace_record_cmdline_disabled);
1581 trace_event_read_unlock();
1535} 1582}
1536 1583
1537static void print_lat_help_header(struct seq_file *m) 1584static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter)
1774 return 1; 1821 return 1;
1775} 1822}
1776 1823
1824/* Called with trace_event_read_lock() held. */
1777static enum print_line_t print_trace_line(struct trace_iterator *iter) 1825static enum print_line_t print_trace_line(struct trace_iterator *iter)
1778{ 1826{
1779 enum print_line_t ret; 1827 enum print_line_t ret;
@@ -2143,11 +2191,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2143 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) 2191 if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
2144 return -ENOMEM; 2192 return -ENOMEM;
2145 2193
2146 mutex_lock(&tracing_cpumask_update_lock);
2147 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); 2194 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2148 if (err) 2195 if (err)
2149 goto err_unlock; 2196 goto err_unlock;
2150 2197
2198 mutex_lock(&tracing_cpumask_update_lock);
2199
2151 local_irq_disable(); 2200 local_irq_disable();
2152 __raw_spin_lock(&ftrace_max_lock); 2201 __raw_spin_lock(&ftrace_max_lock);
2153 for_each_tracing_cpu(cpu) { 2202 for_each_tracing_cpu(cpu) {
@@ -2175,8 +2224,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2175 return count; 2224 return count;
2176 2225
2177err_unlock: 2226err_unlock:
2178 mutex_unlock(&tracing_cpumask_update_lock); 2227 free_cpumask_var(tracing_cpumask_new);
2179 free_cpumask_var(tracing_cpumask);
2180 2228
2181 return err; 2229 return err;
2182} 2230}
@@ -2366,21 +2414,20 @@ static const struct file_operations tracing_iter_fops = {
2366 2414
2367static const char readme_msg[] = 2415static const char readme_msg[] =
2368 "tracing mini-HOWTO:\n\n" 2416 "tracing mini-HOWTO:\n\n"
2369 "# mkdir /debug\n" 2417 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2370 "# mount -t debugfs nodev /debug\n\n" 2418 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2371 "# cat /debug/tracing/available_tracers\n"
2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2419 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2373 "# cat /debug/tracing/current_tracer\n" 2420 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2374 "nop\n" 2421 "nop\n"
2375 "# echo sched_switch > /debug/tracing/current_tracer\n" 2422 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n"
2376 "# cat /debug/tracing/current_tracer\n" 2423 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2377 "sched_switch\n" 2424 "sched_switch\n"
2378 "# cat /debug/tracing/trace_options\n" 2425 "# cat /sys/kernel/debug/tracing/trace_options\n"
2379 "noprint-parent nosym-offset nosym-addr noverbose\n" 2426 "noprint-parent nosym-offset nosym-addr noverbose\n"
2380 "# echo print-parent > /debug/tracing/trace_options\n" 2427 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2381 "# echo 1 > /debug/tracing/tracing_enabled\n" 2428 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
2382 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2429 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2383 "echo 0 > /debug/tracing/tracing_enabled\n" 2430 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
2384; 2431;
2385 2432
2386static ssize_t 2433static ssize_t
@@ -2397,6 +2444,56 @@ static const struct file_operations tracing_readme_fops = {
2397}; 2444};
2398 2445
2399static ssize_t 2446static ssize_t
2447tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
2448 size_t cnt, loff_t *ppos)
2449{
2450 char *buf_comm;
2451 char *file_buf;
2452 char *buf;
2453 int len = 0;
2454 int pid;
2455 int i;
2456
2457 file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
2458 if (!file_buf)
2459 return -ENOMEM;
2460
2461 buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
2462 if (!buf_comm) {
2463 kfree(file_buf);
2464 return -ENOMEM;
2465 }
2466
2467 buf = file_buf;
2468
2469 for (i = 0; i < SAVED_CMDLINES; i++) {
2470 int r;
2471
2472 pid = map_cmdline_to_pid[i];
2473 if (pid == -1 || pid == NO_CMDLINE_MAP)
2474 continue;
2475
2476 trace_find_cmdline(pid, buf_comm);
2477 r = sprintf(buf, "%d %s\n", pid, buf_comm);
2478 buf += r;
2479 len += r;
2480 }
2481
2482 len = simple_read_from_buffer(ubuf, cnt, ppos,
2483 file_buf, len);
2484
2485 kfree(file_buf);
2486 kfree(buf_comm);
2487
2488 return len;
2489}
2490
2491static const struct file_operations tracing_saved_cmdlines_fops = {
2492 .open = tracing_open_generic,
2493 .read = tracing_saved_cmdlines_read,
2494};
2495
2496static ssize_t
2400tracing_ctrl_read(struct file *filp, char __user *ubuf, 2497tracing_ctrl_read(struct file *filp, char __user *ubuf,
2401 size_t cnt, loff_t *ppos) 2498 size_t cnt, loff_t *ppos)
2402{ 2499{
@@ -2728,6 +2825,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2728 /* trace pipe does not show start of buffer */ 2825 /* trace pipe does not show start of buffer */
2729 cpumask_setall(iter->started); 2826 cpumask_setall(iter->started);
2730 2827
2828 if (trace_flags & TRACE_ITER_LATENCY_FMT)
2829 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2830
2731 iter->cpu_file = cpu_file; 2831 iter->cpu_file = cpu_file;
2732 iter->tr = &global_trace; 2832 iter->tr = &global_trace;
2733 mutex_init(&iter->mutex); 2833 mutex_init(&iter->mutex);
@@ -2915,6 +3015,7 @@ waitagain:
2915 offsetof(struct trace_iterator, seq)); 3015 offsetof(struct trace_iterator, seq));
2916 iter->pos = -1; 3016 iter->pos = -1;
2917 3017
3018 trace_event_read_lock();
2918 while (find_next_entry_inc(iter) != NULL) { 3019 while (find_next_entry_inc(iter) != NULL) {
2919 enum print_line_t ret; 3020 enum print_line_t ret;
2920 int len = iter->seq.len; 3021 int len = iter->seq.len;
@@ -2931,6 +3032,7 @@ waitagain:
2931 if (iter->seq.len >= cnt) 3032 if (iter->seq.len >= cnt)
2932 break; 3033 break;
2933 } 3034 }
3035 trace_event_read_unlock();
2934 3036
2935 /* Now copy what we have to the user */ 3037 /* Now copy what we have to the user */
2936 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 3038 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3053,6 +3155,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3053 goto out_err; 3155 goto out_err;
3054 } 3156 }
3055 3157
3158 trace_event_read_lock();
3159
3056 /* Fill as many pages as possible. */ 3160 /* Fill as many pages as possible. */
3057 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3161 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3058 pages[i] = alloc_page(GFP_KERNEL); 3162 pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3179,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3075 trace_seq_init(&iter->seq); 3179 trace_seq_init(&iter->seq);
3076 } 3180 }
3077 3181
3182 trace_event_read_unlock();
3078 mutex_unlock(&iter->mutex); 3183 mutex_unlock(&iter->mutex);
3079 3184
3080 spd.nr_pages = i; 3185 spd.nr_pages = i;
@@ -3425,7 +3530,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3425 .spd_release = buffer_spd_release, 3530 .spd_release = buffer_spd_release,
3426 }; 3531 };
3427 struct buffer_ref *ref; 3532 struct buffer_ref *ref;
3428 int size, i; 3533 int entries, size, i;
3429 size_t ret; 3534 size_t ret;
3430 3535
3431 if (*ppos & (PAGE_SIZE - 1)) { 3536 if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3545,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3440 len &= PAGE_MASK; 3545 len &= PAGE_MASK;
3441 } 3546 }
3442 3547
3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { 3548 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3549
3550 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
3444 struct page *page; 3551 struct page *page;
3445 int r; 3552 int r;
3446 3553
@@ -3457,7 +3564,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3457 } 3564 }
3458 3565
3459 r = ring_buffer_read_page(ref->buffer, &ref->page, 3566 r = ring_buffer_read_page(ref->buffer, &ref->page,
3460 len, info->cpu, 0); 3567 len, info->cpu, 1);
3461 if (r < 0) { 3568 if (r < 0) {
3462 ring_buffer_free_read_page(ref->buffer, 3569 ring_buffer_free_read_page(ref->buffer,
3463 ref->page); 3570 ref->page);
@@ -3481,6 +3588,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3481 spd.partial[i].private = (unsigned long)ref; 3588 spd.partial[i].private = (unsigned long)ref;
3482 spd.nr_pages++; 3589 spd.nr_pages++;
3483 *ppos += PAGE_SIZE; 3590 *ppos += PAGE_SIZE;
3591
3592 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3484 } 3593 }
3485 3594
3486 spd.nr_pages = i; 3595 spd.nr_pages = i;
@@ -3508,6 +3617,45 @@ static const struct file_operations tracing_buffers_fops = {
3508 .llseek = no_llseek, 3617 .llseek = no_llseek,
3509}; 3618};
3510 3619
3620static ssize_t
3621tracing_stats_read(struct file *filp, char __user *ubuf,
3622 size_t count, loff_t *ppos)
3623{
3624 unsigned long cpu = (unsigned long)filp->private_data;
3625 struct trace_array *tr = &global_trace;
3626 struct trace_seq *s;
3627 unsigned long cnt;
3628
3629 s = kmalloc(sizeof(*s), GFP_KERNEL);
3630 if (!s)
3631		return -ENOMEM;
3632
3633 trace_seq_init(s);
3634
3635 cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
3636 trace_seq_printf(s, "entries: %ld\n", cnt);
3637
3638 cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
3639 trace_seq_printf(s, "overrun: %ld\n", cnt);
3640
3641 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3642 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3643
3644 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3645 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3646
3647 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3648
3649 kfree(s);
3650
3651 return count;
3652}
3653
3654static const struct file_operations tracing_stats_fops = {
3655 .open = tracing_open_generic,
3656 .read = tracing_stats_read,
3657};
3658
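tracing_stats_read() renders the per-CPU counters (entries, overrun, commit overrun, nmi dropped) into a trace_seq and copies them out with simple_read_from_buffer(); tracing_init_debugfs_percpu() below exposes it as a per-CPU "stats" file. A small userspace sketch for reading it, assuming the usual debugfs mount point and cpu0 as the CPU of interest:

#include <stdio.h>

int main(void)
{
	/* path assumes debugfs is mounted at /sys/kernel/debug */
	const char *path = "/sys/kernel/debug/tracing/per_cpu/cpu0/stats";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* entries/overrun/commit overrun/nmi dropped */
	fclose(f);
	return 0;
}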
3511#ifdef CONFIG_DYNAMIC_FTRACE 3659#ifdef CONFIG_DYNAMIC_FTRACE
3512 3660
3513int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3661int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3745,7 @@ struct dentry *tracing_dentry_percpu(void)
3597static void tracing_init_debugfs_percpu(long cpu) 3745static void tracing_init_debugfs_percpu(long cpu)
3598{ 3746{
3599 struct dentry *d_percpu = tracing_dentry_percpu(); 3747 struct dentry *d_percpu = tracing_dentry_percpu();
3600 struct dentry *entry, *d_cpu; 3748 struct dentry *d_cpu;
3601 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3749 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3602 char cpu_dir[7]; 3750 char cpu_dir[7];
3603 3751
@@ -3612,21 +3760,18 @@ static void tracing_init_debugfs_percpu(long cpu)
3612 } 3760 }
3613 3761
3614 /* per cpu trace_pipe */ 3762 /* per cpu trace_pipe */
3615 entry = debugfs_create_file("trace_pipe", 0444, d_cpu, 3763 trace_create_file("trace_pipe", 0444, d_cpu,
3616 (void *) cpu, &tracing_pipe_fops); 3764 (void *) cpu, &tracing_pipe_fops);
3617 if (!entry)
3618 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3619 3765
3620 /* per cpu trace */ 3766 /* per cpu trace */
3621 entry = debugfs_create_file("trace", 0644, d_cpu, 3767 trace_create_file("trace", 0644, d_cpu,
3622 (void *) cpu, &tracing_fops); 3768 (void *) cpu, &tracing_fops);
3623 if (!entry)
3624 pr_warning("Could not create debugfs 'trace' entry\n");
3625 3769
3626 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, 3770 trace_create_file("trace_pipe_raw", 0444, d_cpu,
3627 (void *) cpu, &tracing_buffers_fops); 3771 (void *) cpu, &tracing_buffers_fops);
3628 if (!entry) 3772
3629 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); 3773 trace_create_file("stats", 0444, d_cpu,
3774 (void *) cpu, &tracing_stats_fops);
3630} 3775}
3631 3776
3632#ifdef CONFIG_FTRACE_SELFTEST 3777#ifdef CONFIG_FTRACE_SELFTEST
@@ -3782,6 +3927,22 @@ static const struct file_operations trace_options_core_fops = {
3782 .write = trace_options_core_write, 3927 .write = trace_options_core_write,
3783}; 3928};
3784 3929
3930struct dentry *trace_create_file(const char *name,
3931 mode_t mode,
3932 struct dentry *parent,
3933 void *data,
3934 const struct file_operations *fops)
3935{
3936 struct dentry *ret;
3937
3938 ret = debugfs_create_file(name, mode, parent, data, fops);
3939 if (!ret)
3940 pr_warning("Could not create debugfs '%s' entry\n", name);
3941
3942 return ret;
3943}
3944
3945
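trace_create_file() folds the repeated "create the file, warn if it failed" sequence into one helper, which is what lets every call site in this patch drop its entry variable and NULL check. A before/after fragment for a single call site (names taken from the hunks above, not standalone code):

/* before: each caller carried its own warning */
entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
			    (void *)cpu, &tracing_pipe_fops);
if (!entry)
	pr_warning("Could not create debugfs 'trace_pipe' entry\n");

/* after: the helper warns on failure, callers may ignore the result */
trace_create_file("trace_pipe", 0444, d_cpu,
		  (void *)cpu, &tracing_pipe_fops);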
3785static struct dentry *trace_options_init_dentry(void) 3946static struct dentry *trace_options_init_dentry(void)
3786{ 3947{
3787 struct dentry *d_tracer; 3948 struct dentry *d_tracer;
@@ -3809,7 +3970,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
3809 struct tracer_opt *opt) 3970 struct tracer_opt *opt)
3810{ 3971{
3811 struct dentry *t_options; 3972 struct dentry *t_options;
3812 struct dentry *entry;
3813 3973
3814 t_options = trace_options_init_dentry(); 3974 t_options = trace_options_init_dentry();
3815 if (!t_options) 3975 if (!t_options)
@@ -3818,11 +3978,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
3818 topt->flags = flags; 3978 topt->flags = flags;
3819 topt->opt = opt; 3979 topt->opt = opt;
3820 3980
3821 entry = debugfs_create_file(opt->name, 0644, t_options, topt, 3981 topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
3822 &trace_options_fops); 3982 &trace_options_fops);
3823 3983
3824 topt->entry = entry;
3825
3826} 3984}
3827 3985
3828static struct trace_option_dentry * 3986static struct trace_option_dentry *
@@ -3877,123 +4035,84 @@ static struct dentry *
3877create_trace_option_core_file(const char *option, long index) 4035create_trace_option_core_file(const char *option, long index)
3878{ 4036{
3879 struct dentry *t_options; 4037 struct dentry *t_options;
3880 struct dentry *entry;
3881 4038
3882 t_options = trace_options_init_dentry(); 4039 t_options = trace_options_init_dentry();
3883 if (!t_options) 4040 if (!t_options)
3884 return NULL; 4041 return NULL;
3885 4042
3886 entry = debugfs_create_file(option, 0644, t_options, (void *)index, 4043 return trace_create_file(option, 0644, t_options, (void *)index,
3887 &trace_options_core_fops); 4044 &trace_options_core_fops);
3888
3889 return entry;
3890} 4045}
3891 4046
3892static __init void create_trace_options_dir(void) 4047static __init void create_trace_options_dir(void)
3893{ 4048{
3894 struct dentry *t_options; 4049 struct dentry *t_options;
3895 struct dentry *entry;
3896 int i; 4050 int i;
3897 4051
3898 t_options = trace_options_init_dentry(); 4052 t_options = trace_options_init_dentry();
3899 if (!t_options) 4053 if (!t_options)
3900 return; 4054 return;
3901 4055
3902 for (i = 0; trace_options[i]; i++) { 4056 for (i = 0; trace_options[i]; i++)
3903 entry = create_trace_option_core_file(trace_options[i], i); 4057 create_trace_option_core_file(trace_options[i], i);
3904 if (!entry)
3905 pr_warning("Could not create debugfs %s entry\n",
3906 trace_options[i]);
3907 }
3908} 4058}
3909 4059
3910static __init int tracer_init_debugfs(void) 4060static __init int tracer_init_debugfs(void)
3911{ 4061{
3912 struct dentry *d_tracer; 4062 struct dentry *d_tracer;
3913 struct dentry *entry;
3914 int cpu; 4063 int cpu;
3915 4064
3916 d_tracer = tracing_init_dentry(); 4065 d_tracer = tracing_init_dentry();
3917 4066
3918 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, 4067 trace_create_file("tracing_enabled", 0644, d_tracer,
3919 &global_trace, &tracing_ctrl_fops); 4068 &global_trace, &tracing_ctrl_fops);
3920 if (!entry)
3921 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
3922 4069
3923 entry = debugfs_create_file("trace_options", 0644, d_tracer, 4070 trace_create_file("trace_options", 0644, d_tracer,
3924 NULL, &tracing_iter_fops); 4071 NULL, &tracing_iter_fops);
3925 if (!entry)
3926 pr_warning("Could not create debugfs 'trace_options' entry\n");
3927 4072
3928 create_trace_options_dir(); 4073 trace_create_file("tracing_cpumask", 0644, d_tracer,
4074 NULL, &tracing_cpumask_fops);
3929 4075
3930 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 4076 trace_create_file("trace", 0644, d_tracer,
3931 NULL, &tracing_cpumask_fops); 4077 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
3932 if (!entry) 4078
3933 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 4079 trace_create_file("available_tracers", 0444, d_tracer,
3934 4080 &global_trace, &show_traces_fops);
3935 entry = debugfs_create_file("trace", 0644, d_tracer, 4081
3936 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); 4082 trace_create_file("current_tracer", 0644, d_tracer,
3937 if (!entry) 4083 &global_trace, &set_tracer_fops);
3938 pr_warning("Could not create debugfs 'trace' entry\n"); 4084
3939 4085 trace_create_file("tracing_max_latency", 0644, d_tracer,
3940 entry = debugfs_create_file("available_tracers", 0444, d_tracer, 4086 &tracing_max_latency, &tracing_max_lat_fops);
3941 &global_trace, &show_traces_fops); 4087
3942 if (!entry) 4088 trace_create_file("tracing_thresh", 0644, d_tracer,
3943 pr_warning("Could not create debugfs 'available_tracers' entry\n"); 4089 &tracing_thresh, &tracing_max_lat_fops);
3944 4090
3945 entry = debugfs_create_file("current_tracer", 0444, d_tracer, 4091 trace_create_file("README", 0444, d_tracer,
3946 &global_trace, &set_tracer_fops); 4092 NULL, &tracing_readme_fops);
3947 if (!entry) 4093
3948 pr_warning("Could not create debugfs 'current_tracer' entry\n"); 4094 trace_create_file("trace_pipe", 0444, d_tracer,
3949
3950 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
3951 &tracing_max_latency,
3952 &tracing_max_lat_fops);
3953 if (!entry)
3954 pr_warning("Could not create debugfs "
3955 "'tracing_max_latency' entry\n");
3956
3957 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
3958 &tracing_thresh, &tracing_max_lat_fops);
3959 if (!entry)
3960 pr_warning("Could not create debugfs "
3961 "'tracing_thresh' entry\n");
3962 entry = debugfs_create_file("README", 0644, d_tracer,
3963 NULL, &tracing_readme_fops);
3964 if (!entry)
3965 pr_warning("Could not create debugfs 'README' entry\n");
3966
3967 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3968 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); 4095 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3969 if (!entry) 4096
3970 pr_warning("Could not create debugfs " 4097 trace_create_file("buffer_size_kb", 0644, d_tracer,
3971 "'trace_pipe' entry\n"); 4098 &global_trace, &tracing_entries_fops);
3972 4099
3973 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, 4100 trace_create_file("trace_marker", 0220, d_tracer,
3974 &global_trace, &tracing_entries_fops); 4101 NULL, &tracing_mark_fops);
3975 if (!entry) 4102
3976 pr_warning("Could not create debugfs " 4103 trace_create_file("saved_cmdlines", 0444, d_tracer,
3977 "'buffer_size_kb' entry\n"); 4104 NULL, &tracing_saved_cmdlines_fops);
3978
3979 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
3980 NULL, &tracing_mark_fops);
3981 if (!entry)
3982 pr_warning("Could not create debugfs "
3983 "'trace_marker' entry\n");
3984 4105
3985#ifdef CONFIG_DYNAMIC_FTRACE 4106#ifdef CONFIG_DYNAMIC_FTRACE
3986 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4107 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
3987 &ftrace_update_tot_cnt, 4108 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
3988 &tracing_dyn_info_fops);
3989 if (!entry)
3990 pr_warning("Could not create debugfs "
3991 "'dyn_ftrace_total_info' entry\n");
3992#endif 4109#endif
3993#ifdef CONFIG_SYSPROF_TRACER 4110#ifdef CONFIG_SYSPROF_TRACER
3994 init_tracer_sysprof_debugfs(d_tracer); 4111 init_tracer_sysprof_debugfs(d_tracer);
3995#endif 4112#endif
3996 4113
4114 create_trace_options_dir();
4115
3997 for_each_tracing_cpu(cpu) 4116 for_each_tracing_cpu(cpu)
3998 tracing_init_debugfs_percpu(cpu); 4117 tracing_init_debugfs_percpu(cpu);
3999 4118
@@ -4064,7 +4183,8 @@ trace_printk_seq(struct trace_seq *s)
4064 4183
4065static void __ftrace_dump(bool disable_tracing) 4184static void __ftrace_dump(bool disable_tracing)
4066{ 4185{
4067 static DEFINE_SPINLOCK(ftrace_dump_lock); 4186 static raw_spinlock_t ftrace_dump_lock =
4187 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
4068 /* use static because iter can be a bit big for the stack */ 4188 /* use static because iter can be a bit big for the stack */
4069 static struct trace_iterator iter; 4189 static struct trace_iterator iter;
4070 unsigned int old_userobj; 4190 unsigned int old_userobj;
@@ -4073,7 +4193,8 @@ static void __ftrace_dump(bool disable_tracing)
4073 int cnt = 0, cpu; 4193 int cnt = 0, cpu;
4074 4194
4075 /* only one dump */ 4195 /* only one dump */
4076 spin_lock_irqsave(&ftrace_dump_lock, flags); 4196 local_irq_save(flags);
4197 __raw_spin_lock(&ftrace_dump_lock);
4077 if (dump_ran) 4198 if (dump_ran)
4078 goto out; 4199 goto out;
4079 4200
@@ -4145,7 +4266,8 @@ static void __ftrace_dump(bool disable_tracing)
4145 } 4266 }
4146 4267
4147 out: 4268 out:
4148 spin_unlock_irqrestore(&ftrace_dump_lock, flags); 4269 __raw_spin_unlock(&ftrace_dump_lock);
4270 local_irq_restore(flags);
4149} 4271}
4150 4272
4151/* By default: disable tracing after the dump */ 4273/* By default: disable tracing after the dump */
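__ftrace_dump() switches from spin_lock_irqsave() to a hand-rolled __raw_spin_lock() with explicit local_irq_save(), keeping lockdep and spinlock debugging out of the picture while the kernel is already dying (ftrace_dump() runs from oops/panic paths). The same pattern in isolation, assuming the pre-2.6.33 raw_spinlock_t/__raw_spin_lock naming used throughout this tree:

#include <linux/irqflags.h>
#include <linux/spinlock.h>

static raw_spinlock_t dump_lock =
	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

/* run dump() exactly once, without involving lockdep or irq tracing */
static void one_shot_dump(void (*dump)(void))
{
	static int dump_ran;
	unsigned long flags;

	local_irq_save(flags);
	__raw_spin_lock(&dump_lock);

	if (!dump_ran) {
		dump_ran = 1;
		dump();
	}

	__raw_spin_unlock(&dump_lock);
	local_irq_restore(flags);
}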
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e685ac2b2ba1..6e735d4771f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,9 +9,12 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h> 12#include <linux/kmemtrace.h>
13#include <trace/power.h> 13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h>
17
15enum trace_type { 18enum trace_type {
16 __TRACE_FIRST_TYPE = 0, 19 __TRACE_FIRST_TYPE = 0,
17 20
@@ -42,20 +45,6 @@ enum trace_type {
42}; 45};
43 46
44/* 47/*
45 * The trace entry - the most basic unit of tracing. This is what
46 * is printed in the end as a single line in the trace output, such as:
47 *
48 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
49 */
50struct trace_entry {
51 unsigned char type;
52 unsigned char flags;
53 unsigned char preempt_count;
54 int pid;
55 int tgid;
56};
57
58/*
59 * Function trace entry - function address and parent function addres: 48 * Function trace entry - function address and parent function addres:
60 */ 49 */
61struct ftrace_entry { 50struct ftrace_entry {
@@ -263,8 +252,6 @@ struct trace_array_cpu {
263 char comm[TASK_COMM_LEN]; 252 char comm[TASK_COMM_LEN];
264}; 253};
265 254
266struct trace_iterator;
267
268/* 255/*
269 * The trace array - an array of per-CPU trace arrays. This is the 256 * The trace array - an array of per-CPU trace arrays. This is the
270 * highest level data structure that individual tracers deal with. 257 * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
339 __ftrace_bad_type(); \ 326 __ftrace_bad_type(); \
340 } while (0) 327 } while (0)
341 328
342/* Return values for print_line callback */
343enum print_line_t {
344 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
345 TRACE_TYPE_HANDLED = 1,
346 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
347 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
348};
349
350
351/* 329/*
352 * An option specific to a tracer. This is a boolean value. 330 * An option specific to a tracer. This is a boolean value.
353 * The bit is the bit index that sets its value on the 331 * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
423 struct tracer_stat *stats; 401 struct tracer_stat *stats;
424}; 402};
425 403
426struct trace_seq {
427 unsigned char buffer[PAGE_SIZE];
428 unsigned int len;
429 unsigned int readpos;
430};
431
432static inline void
433trace_seq_init(struct trace_seq *s)
434{
435 s->len = 0;
436 s->readpos = 0;
437}
438
439 404
440#define TRACE_PIPE_ALL_CPU -1 405#define TRACE_PIPE_ALL_CPU -1
441 406
442/*
443 * Trace iterator - used by printout routines who present trace
444 * results to users and which routines might sleep, etc:
445 */
446struct trace_iterator {
447 struct trace_array *tr;
448 struct tracer *trace;
449 void *private;
450 int cpu_file;
451 struct mutex mutex;
452 struct ring_buffer_iter *buffer_iter[NR_CPUS];
453
454 /* The below is zeroed out in pipe_read */
455 struct trace_seq seq;
456 struct trace_entry *ent;
457 int cpu;
458 u64 ts;
459
460 unsigned long iter_flags;
461 loff_t pos;
462 long idx;
463
464 cpumask_var_t started;
465};
466
467int tracer_init(struct tracer *t, struct trace_array *tr); 407int tracer_init(struct tracer *t, struct trace_array *tr);
468int tracing_is_enabled(void); 408int tracing_is_enabled(void);
469void trace_wake_up(void); 409void trace_wake_up(void);
470void tracing_reset(struct trace_array *tr, int cpu); 410void tracing_reset(struct trace_array *tr, int cpu);
471void tracing_reset_online_cpus(struct trace_array *tr); 411void tracing_reset_online_cpus(struct trace_array *tr);
412void tracing_reset_current(int cpu);
413void tracing_reset_current_online_cpus(void);
472int tracing_open_generic(struct inode *inode, struct file *filp); 414int tracing_open_generic(struct inode *inode, struct file *filp);
415struct dentry *trace_create_file(const char *name,
416 mode_t mode,
417 struct dentry *parent,
418 void *data,
419 const struct file_operations *fops);
420
473struct dentry *tracing_init_dentry(void); 421struct dentry *tracing_init_dentry(void);
474void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 422void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
475 423
476struct ring_buffer_event; 424struct ring_buffer_event;
477 425
478struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
479 unsigned char type, 427 int type,
480 unsigned long len, 428 unsigned long len,
481 unsigned long flags, 429 unsigned long flags,
482 int pc); 430 int pc);
@@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
484 struct ring_buffer_event *event, 432 struct ring_buffer_event *event,
485 unsigned long flags, int pc); 433 unsigned long flags, int pc);
486 434
487struct ring_buffer_event *
488trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
489 unsigned long flags, int pc);
490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
494
495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 435struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
496 struct trace_array_cpu *data); 436 struct trace_array_cpu *data);
497 437
@@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
514 struct task_struct *prev, 454 struct task_struct *prev,
515 struct task_struct *next, 455 struct task_struct *next,
516 unsigned long flags, int pc); 456 unsigned long flags, int pc);
517void tracing_record_cmdline(struct task_struct *tsk);
518 457
519void tracing_sched_wakeup_trace(struct trace_array *tr, 458void tracing_sched_wakeup_trace(struct trace_array *tr,
520 struct task_struct *wakee, 459 struct task_struct *wakee,
@@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
599 struct trace_array *tr); 538 struct trace_array *tr);
600extern int trace_selftest_startup_branch(struct tracer *trace, 539extern int trace_selftest_startup_branch(struct tracer *trace,
601 struct trace_array *tr); 540 struct trace_array *tr);
541extern int trace_selftest_startup_hw_branches(struct tracer *trace,
542 struct trace_array *tr);
602#endif /* CONFIG_FTRACE_STARTUP_TEST */ 543#endif /* CONFIG_FTRACE_STARTUP_TEST */
603 544
604extern void *head_page(struct trace_array_cpu *data); 545extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +554,8 @@ extern unsigned long trace_flags;
613/* Standard output formatting function used for function return traces */ 554/* Standard output formatting function used for function return traces */
614#ifdef CONFIG_FUNCTION_GRAPH_TRACER 555#ifdef CONFIG_FUNCTION_GRAPH_TRACER
615extern enum print_line_t print_graph_function(struct trace_iterator *iter); 556extern enum print_line_t print_graph_function(struct trace_iterator *iter);
557extern enum print_line_t
558trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
616 559
617#ifdef CONFIG_DYNAMIC_FTRACE 560#ifdef CONFIG_DYNAMIC_FTRACE
618/* TODO: make this variable */ 561/* TODO: make this variable */
@@ -644,7 +587,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
644 return 1; 587 return 1;
645} 588}
646#endif /* CONFIG_DYNAMIC_FTRACE */ 589#endif /* CONFIG_DYNAMIC_FTRACE */
647
648#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 590#else /* CONFIG_FUNCTION_GRAPH_TRACER */
649static inline enum print_line_t 591static inline enum print_line_t
650print_graph_function(struct trace_iterator *iter) 592print_graph_function(struct trace_iterator *iter)
@@ -692,6 +634,7 @@ enum trace_iterator_flags {
692 TRACE_ITER_LATENCY_FMT = 0x40000, 634 TRACE_ITER_LATENCY_FMT = 0x40000,
693 TRACE_ITER_GLOBAL_CLK = 0x80000, 635 TRACE_ITER_GLOBAL_CLK = 0x80000,
694 TRACE_ITER_SLEEP_TIME = 0x100000, 636 TRACE_ITER_SLEEP_TIME = 0x100000,
637 TRACE_ITER_GRAPH_TIME = 0x200000,
695}; 638};
696 639
697/* 640/*
@@ -790,103 +733,113 @@ struct ftrace_event_field {
790 char *type; 733 char *type;
791 int offset; 734 int offset;
792 int size; 735 int size;
736 int is_signed;
793}; 737};
794 738
795struct ftrace_event_call { 739struct event_filter {
796 char *name; 740 int n_preds;
797 char *system;
798 struct dentry *dir;
799 int enabled;
800 int (*regfunc)(void);
801 void (*unregfunc)(void);
802 int id;
803 int (*raw_init)(void);
804 int (*show_format)(struct trace_seq *s);
805 int (*define_fields)(void);
806 struct list_head fields;
807 struct filter_pred **preds; 741 struct filter_pred **preds;
808 742 char *filter_string;
809#ifdef CONFIG_EVENT_PROFILE
810 atomic_t profile_count;
811 int (*profile_enable)(struct ftrace_event_call *);
812 void (*profile_disable)(struct ftrace_event_call *);
813#endif
814}; 743};
815 744
816struct event_subsystem { 745struct event_subsystem {
817 struct list_head list; 746 struct list_head list;
818 const char *name; 747 const char *name;
819 struct dentry *entry; 748 struct dentry *entry;
820 struct filter_pred **preds; 749 void *filter;
821}; 750};
822 751
823#define events_for_each(event) \
824 for (event = __start_ftrace_events; \
825 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
826 event++)
827
828#define MAX_FILTER_PRED 8
829
830struct filter_pred; 752struct filter_pred;
831 753
832typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); 754typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
755 int val1, int val2);
833 756
834struct filter_pred { 757struct filter_pred {
835 filter_pred_fn_t fn; 758 filter_pred_fn_t fn;
836 u64 val; 759 u64 val;
837 char *str_val; 760 char str_val[MAX_FILTER_STR_VAL];
838 int str_len; 761 int str_len;
839 char *field_name; 762 char *field_name;
840 int offset; 763 int offset;
841 int not; 764 int not;
842 int or; 765 int op;
843 int compound; 766 int pop_n;
844 int clear;
845}; 767};
846 768
847int trace_define_field(struct ftrace_event_call *call, char *type, 769extern void print_event_filter(struct ftrace_event_call *call,
848 char *name, int offset, int size);
849extern void filter_free_pred(struct filter_pred *pred);
850extern void filter_print_preds(struct filter_pred **preds,
851 struct trace_seq *s); 770 struct trace_seq *s);
852extern int filter_parse(char **pbuf, struct filter_pred *pred); 771extern int apply_event_filter(struct ftrace_event_call *call,
853extern int filter_add_pred(struct ftrace_event_call *call, 772 char *filter_string);
854 struct filter_pred *pred); 773extern int apply_subsystem_event_filter(struct event_subsystem *system,
855extern void filter_free_preds(struct ftrace_event_call *call); 774 char *filter_string);
856extern int filter_match_preds(struct ftrace_event_call *call, void *rec); 775extern void print_subsystem_event_filter(struct event_subsystem *system,
857extern void filter_free_subsystem_preds(struct event_subsystem *system); 776 struct trace_seq *s);
858extern int filter_add_subsystem_pred(struct event_subsystem *system, 777
859 struct filter_pred *pred); 778static inline int
860 779filter_check_discard(struct ftrace_event_call *call, void *rec,
861void event_trace_printk(unsigned long ip, const char *fmt, ...); 780 struct ring_buffer *buffer,
862extern struct ftrace_event_call __start_ftrace_events[]; 781 struct ring_buffer_event *event)
863extern struct ftrace_event_call __stop_ftrace_events[]; 782{
864 783 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
865#define for_each_event(event) \ 784 ring_buffer_discard_commit(buffer, event);
866 for (event = __start_ftrace_events; \ 785 return 1;
867 (unsigned long)event < (unsigned long)__stop_ftrace_events; \ 786 }
868 event++) 787
788 return 0;
789}
790
791#define DEFINE_COMPARISON_PRED(type) \
792static int filter_pred_##type(struct filter_pred *pred, void *event, \
793 int val1, int val2) \
794{ \
795 type *addr = (type *)(event + pred->offset); \
796 type val = (type)pred->val; \
797 int match = 0; \
798 \
799 switch (pred->op) { \
800 case OP_LT: \
801 match = (*addr < val); \
802 break; \
803 case OP_LE: \
804 match = (*addr <= val); \
805 break; \
806 case OP_GT: \
807 match = (*addr > val); \
808 break; \
809 case OP_GE: \
810 match = (*addr >= val); \
811 break; \
812 default: \
813 break; \
814 } \
815 \
816 return match; \
817}
818
819#define DEFINE_EQUALITY_PRED(size) \
820static int filter_pred_##size(struct filter_pred *pred, void *event, \
821 int val1, int val2) \
822{ \
823 u##size *addr = (u##size *)(event + pred->offset); \
824 u##size val = (u##size)pred->val; \
825 int match; \
826 \
827 match = (val == *addr) ^ pred->not; \
828 \
829 return match; \
830}
831
832extern struct mutex event_mutex;
833extern struct list_head ftrace_events;
869 834
870extern const char *__start___trace_bprintk_fmt[]; 835extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 836extern const char *__stop___trace_bprintk_fmt[];
872 837
873/* 838#undef TRACE_EVENT_FORMAT
874 * The double __builtin_constant_p is because gcc will give us an error 839#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
875 * if we try to allocate the static variable to fmt if it is not a 840 extern struct ftrace_event_call event_##call;
876 * constant. Even with the outer if statement optimizing out. 841#undef TRACE_EVENT_FORMAT_NOFILTER
877 */ 842#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
878#define event_trace_printk(ip, fmt, args...) \ 843#include "trace_event_types.h"
879do { \
880 __trace_printk_check_format(fmt, ##args); \
881 tracing_record_cmdline(current); \
882 if (__builtin_constant_p(fmt)) { \
883 static const char *trace_printk_fmt \
884 __attribute__((section("__trace_printk_fmt"))) = \
885 __builtin_constant_p(fmt) ? fmt : NULL; \
886 \
887 __trace_bprintk(ip, trace_printk_fmt, ##args); \
888 } else \
889 __trace_printk(ip, fmt, ##args); \
890} while (0)
891 844
892#endif /* _LINUX_KERNEL_TRACE_H */ 845#endif /* _LINUX_KERNEL_TRACE_H */
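DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED() generate one filter_pred_fn_t per field type, replacing the old fixed array of eight string-parsed predicates per event. Roughly what DEFINE_COMPARISON_PRED(s32) expands to, using the names from the macro above:

static int filter_pred_s32(struct filter_pred *pred, void *event,
			   int val1, int val2)
{
	s32 *addr = (s32 *)(event + pred->offset);
	s32 val = (s32)pred->val;
	int match = 0;

	switch (pred->op) {
	case OP_LT:
		match = (*addr < val);
		break;
	case OP_LE:
		match = (*addr <= val);
		break;
	case OP_GT:
		match = (*addr > val);
		break;
	case OP_GE:
		match = (*addr >= val);
		break;
	default:
		break;
	}

	return match;
}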
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c3642..a29ef23ffb47 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/time.h>
12 13
13#include "trace.h" 14#include "trace.h"
14#include "trace_output.h" 15#include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
67 trace_assign_type(field, entry); 68 trace_assign_type(field, entry);
68 call = &field->boot_call; 69 call = &field->boot_call;
69 ts = iter->ts; 70 ts = iter->ts;
70 nsec_rem = do_div(ts, 1000000000); 71 nsec_rem = do_div(ts, NSEC_PER_SEC);
71 72
72 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 73 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
73 (unsigned long)ts, nsec_rem, call->func, call->caller); 74 (unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
92 trace_assign_type(field, entry); 93 trace_assign_type(field, entry);
93 init_ret = &field->boot_ret; 94 init_ret = &field->boot_ret;
94 ts = iter->ts; 95 ts = iter->ts;
95 nsec_rem = do_div(ts, 1000000000); 96 nsec_rem = do_div(ts, NSEC_PER_SEC);
96 97
97 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 98 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
98 "returned %d after %llu msecs\n", 99 "returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 8333715e4066..7a7a9fd249a9 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
30static void 30static void
31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) 31probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
32{ 32{
33 struct ftrace_event_call *call = &event_branch;
33 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
34 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
35 struct trace_branch *entry; 36 struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
73 entry->line = f->line; 74 entry->line = f->line;
74 entry->correct = val == expect; 75 entry->correct = val == expect;
75 76
76 ring_buffer_unlock_commit(tr->buffer, event); 77 if (!filter_check_discard(call, entry, tr->buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event);
77 79
78 out: 80 out:
79 atomic_dec(&tr->data[cpu]->disabled); 81 atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
271 return 0; 273 return 0;
272} 274}
273 275
274static void *annotated_branch_stat_start(void) 276static void *annotated_branch_stat_start(struct tracer_stat *trace)
275{ 277{
276 return __start_annotated_branch_profile; 278 return __start_annotated_branch_profile;
277} 279}
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
346 return 0; 348 return 0;
347} 349}
348 350
349static void *all_branch_stat_start(void) 351static void *all_branch_stat_start(struct tracer_stat *trace)
350{ 352{
351 return __start_branch_profile; 353 return __start_branch_profile;
352} 354}
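probe_likely_condition() now follows the pattern every tracer grows in this series: reserve an event, fill it in, and only commit it if filter_check_discard() (the inline added to trace.h above) has not already dropped it via ring_buffer_discard_commit(). A sketch of that commit-or-discard step as a helper, assuming the trace.h declarations above are in scope:

/* commit a just-filled event unless the per-event filter rejected it */
static inline void commit_unless_filtered(struct ftrace_event_call *call,
					  void *rec,
					  struct ring_buffer *buffer,
					  struct ring_buffer_event *event)
{
	if (!filter_check_discard(call, rec, buffer, event))
		ring_buffer_unlock_commit(buffer, event);
}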
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba9970776..5b5895afecfe 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,22 +10,30 @@
10int ftrace_profile_enable(int event_id) 10int ftrace_profile_enable(int event_id)
11{ 11{
12 struct ftrace_event_call *event; 12 struct ftrace_event_call *event;
13 int ret = -EINVAL;
13 14
14 for_each_event(event) { 15 mutex_lock(&event_mutex);
15 if (event->id == event_id) 16 list_for_each_entry(event, &ftrace_events, list) {
16 return event->profile_enable(event); 17 if (event->id == event_id) {
18 ret = event->profile_enable(event);
19 break;
20 }
17 } 21 }
22 mutex_unlock(&event_mutex);
18 23
19 return -EINVAL; 24 return ret;
20} 25}
21 26
22void ftrace_profile_disable(int event_id) 27void ftrace_profile_disable(int event_id)
23{ 28{
24 struct ftrace_event_call *event; 29 struct ftrace_event_call *event;
25 30
26 for_each_event(event) { 31 mutex_lock(&event_mutex);
27 if (event->id == event_id) 32 list_for_each_entry(event, &ftrace_events, list) {
28 return event->profile_disable(event); 33 if (event->id == event_id) {
34 event->profile_disable(event);
35 break;
36 }
29 } 37 }
38 mutex_unlock(&event_mutex);
30} 39}
31
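ftrace_profile_enable()/ftrace_profile_disable() now walk the ftrace_events list under event_mutex instead of indexing the linker-generated section, so module events are found too and cannot disappear mid-walk. The lookup pattern on its own, as a sketch that assumes the event_mutex/ftrace_events declarations from trace.h (for_event_id is illustrative, not a function in this patch):

#include <linux/list.h>
#include <linux/mutex.h>

/* run cb() on the event whose id matches, holding event_mutex throughout */
static int for_event_id(int event_id, int (*cb)(struct ftrace_event_call *))
{
	struct ftrace_event_call *event;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(event, &ftrace_events, list) {
		if (event->id == event_id) {
			ret = cb(event);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}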
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd7..5e32e375134d 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") 57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58); 58);
59 59
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, 60TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT( 61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1) 62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2) 63 TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, 122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT( 123 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line) 124 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) 125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
126 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) 126 TRACE_FUNC_SIZE+1, func)
127 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
128 TRACE_FUNC_SIZE+1, file)
127 TRACE_FIELD(char, correct, correct) 129 TRACE_FIELD(char, correct, correct)
128 ), 130 ),
129 TP_RAW_FMT("%u:%s:%s (%u)") 131 TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
139 141
140TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, 142TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
141 TRACE_STRUCT( 143 TRACE_STRUCT(
142 TRACE_FIELD(ktime_t, state_data.stamp, stamp) 144 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
143 TRACE_FIELD(ktime_t, state_data.end, end) 145 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
144 TRACE_FIELD(int, state_data.type, type) 146 TRACE_FIELD(int, state_data.type, type)
145 TRACE_FIELD(int, state_data.state, state) 147 TRACE_FIELD(int, state_data.state, state)
146 ), 148 ),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 576f4fa2af0d..aa08be69a1b6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,19 +8,25 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/workqueue.h>
12#include <linux/spinlock.h>
13#include <linux/kthread.h>
11#include <linux/debugfs.h> 14#include <linux/debugfs.h>
12#include <linux/uaccess.h> 15#include <linux/uaccess.h>
13#include <linux/module.h> 16#include <linux/module.h>
14#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h>
15 19
16#include "trace_output.h" 20#include "trace_output.h"
17 21
18#define TRACE_SYSTEM "TRACE_SYSTEM" 22#define TRACE_SYSTEM "TRACE_SYSTEM"
19 23
20static DEFINE_MUTEX(event_mutex); 24DEFINE_MUTEX(event_mutex);
25
26LIST_HEAD(ftrace_events);
21 27
22int trace_define_field(struct ftrace_event_call *call, char *type, 28int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size) 29 char *name, int offset, int size, int is_signed)
24{ 30{
25 struct ftrace_event_field *field; 31 struct ftrace_event_field *field;
26 32
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
38 44
39 field->offset = offset; 45 field->offset = offset;
40 field->size = size; 46 field->size = size;
47 field->is_signed = is_signed;
41 list_add(&field->link, &call->fields); 48 list_add(&field->link, &call->fields);
42 49
43 return 0; 50 return 0;
@@ -51,47 +58,94 @@ err:
51 58
52 return -ENOMEM; 59 return -ENOMEM;
53} 60}
61EXPORT_SYMBOL_GPL(trace_define_field);
54 62
55static void ftrace_clear_events(void) 63#ifdef CONFIG_MODULES
56{
57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
58
59 64
60 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { 65static void trace_destroy_fields(struct ftrace_event_call *call)
66{
67 struct ftrace_event_field *field, *next;
61 68
62 if (call->enabled) { 69 list_for_each_entry_safe(field, next, &call->fields, link) {
63 call->enabled = 0; 70 list_del(&field->link);
64 call->unregfunc(); 71 kfree(field->type);
65 } 72 kfree(field->name);
66 call++; 73 kfree(field);
67 } 74 }
68} 75}
69 76
77#endif /* CONFIG_MODULES */
78
70static void ftrace_event_enable_disable(struct ftrace_event_call *call, 79static void ftrace_event_enable_disable(struct ftrace_event_call *call,
71 int enable) 80 int enable)
72{ 81{
73
74 switch (enable) { 82 switch (enable) {
75 case 0: 83 case 0:
76 if (call->enabled) { 84 if (call->enabled) {
77 call->enabled = 0; 85 call->enabled = 0;
86 tracing_stop_cmdline_record();
78 call->unregfunc(); 87 call->unregfunc();
79 } 88 }
80 break; 89 break;
81 case 1: 90 case 1:
82 if (!call->enabled) { 91 if (!call->enabled) {
83 call->enabled = 1; 92 call->enabled = 1;
93 tracing_start_cmdline_record();
84 call->regfunc(); 94 call->regfunc();
85 } 95 }
86 break; 96 break;
87 } 97 }
88} 98}
89 99
100static void ftrace_clear_events(void)
101{
102 struct ftrace_event_call *call;
103
104 mutex_lock(&event_mutex);
105 list_for_each_entry(call, &ftrace_events, list) {
106 ftrace_event_enable_disable(call, 0);
107 }
108 mutex_unlock(&event_mutex);
109}
110
111/*
112 * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
113 */
114static int __ftrace_set_clr_event(const char *match, const char *sub,
115 const char *event, int set)
116{
117 struct ftrace_event_call *call;
118 int ret = -EINVAL;
119
120 mutex_lock(&event_mutex);
121 list_for_each_entry(call, &ftrace_events, list) {
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142
143 return ret;
144}
145
90static int ftrace_set_clr_event(char *buf, int set) 146static int ftrace_set_clr_event(char *buf, int set)
91{ 147{
92 struct ftrace_event_call *call = __start_ftrace_events;
93 char *event = NULL, *sub = NULL, *match; 148 char *event = NULL, *sub = NULL, *match;
94 int ret = -EINVAL;
95 149
96 /* 150 /*
97 * The buf format can be <subsystem>:<event-name> 151 * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
117 event = NULL; 171 event = NULL;
118 } 172 }
119 173
120 mutex_lock(&event_mutex); 174 return __ftrace_set_clr_event(match, sub, event, set);
121 for_each_event(call) { 175}
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142 176
143 return ret; 177/**
178 * trace_set_clr_event - enable or disable an event
179 * @system: system name to match (NULL for any system)
180 * @event: event name to match (NULL for all events, within system)
181 * @set: 1 to enable, 0 to disable
182 *
183 * This is a way for other parts of the kernel to enable or disable
184 * event recording.
185 *
186 * Returns 0 on success, -EINVAL if the parameters do not match any
187 * registered events.
188 */
189int trace_set_clr_event(const char *system, const char *event, int set)
190{
191 return __ftrace_set_clr_event(NULL, system, event, set);
144} 192}
145 193
146/* 128 should be much more than enough */ 194/* 128 should be much more than enough */
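Besides serving writes to set_event, __ftrace_set_clr_event() is exported through trace_set_clr_event() so other kernel code can flip events without going through debugfs. A hedged usage sketch (the "sched" subsystem name is only an example):

#include <linux/kernel.h>

/* example only: enable every event of one subsystem at boot */
static __init int boot_enable_sched_events(void)
{
	/* a NULL event name means "all events within the subsystem" */
	int ret = trace_set_clr_event("sched", NULL, 1);

	if (ret)
		pr_warning("no trace events matched\n");

	return ret;
}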
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
224static void * 272static void *
225t_next(struct seq_file *m, void *v, loff_t *pos) 273t_next(struct seq_file *m, void *v, loff_t *pos)
226{ 274{
227 struct ftrace_event_call *call = m->private; 275 struct list_head *list = m->private;
228 struct ftrace_event_call *next = call; 276 struct ftrace_event_call *call;
229 277
230 (*pos)++; 278 (*pos)++;
231 279
232 for (;;) { 280 for (;;) {
233 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 281 if (list == &ftrace_events)
234 return NULL; 282 return NULL;
235 283
284 call = list_entry(list, struct ftrace_event_call, list);
285
236 /* 286 /*
237 * The ftrace subsystem is for showing formats only. 287 * The ftrace subsystem is for showing formats only.
238 * They can not be enabled or disabled via the event files. 288 * They can not be enabled or disabled via the event files.
@@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
240 if (call->regfunc) 290 if (call->regfunc)
241 break; 291 break;
242 292
243 call++; 293 list = list->next;
244 next = call;
245 } 294 }
246 295
247 m->private = ++next; 296 m->private = list->next;
248 297
249 return call; 298 return call;
250} 299}
251 300
252static void *t_start(struct seq_file *m, loff_t *pos) 301static void *t_start(struct seq_file *m, loff_t *pos)
253{ 302{
303 mutex_lock(&event_mutex);
304 if (*pos == 0)
305 m->private = ftrace_events.next;
254 return t_next(m, NULL, pos); 306 return t_next(m, NULL, pos);
255} 307}
256 308
257static void * 309static void *
258s_next(struct seq_file *m, void *v, loff_t *pos) 310s_next(struct seq_file *m, void *v, loff_t *pos)
259{ 311{
260 struct ftrace_event_call *call = m->private; 312 struct list_head *list = m->private;
261 struct ftrace_event_call *next; 313 struct ftrace_event_call *call;
262 314
263 (*pos)++; 315 (*pos)++;
264 316
265 retry: 317 retry:
266 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) 318 if (list == &ftrace_events)
267 return NULL; 319 return NULL;
268 320
321 call = list_entry(list, struct ftrace_event_call, list);
322
269 if (!call->enabled) { 323 if (!call->enabled) {
270 call++; 324 list = list->next;
271 goto retry; 325 goto retry;
272 } 326 }
273 327
274 next = call; 328 m->private = list->next;
275 m->private = ++next;
276 329
277 return call; 330 return call;
278} 331}
279 332
280static void *s_start(struct seq_file *m, loff_t *pos) 333static void *s_start(struct seq_file *m, loff_t *pos)
281{ 334{
335 mutex_lock(&event_mutex);
336 if (*pos == 0)
337 m->private = ftrace_events.next;
282 return s_next(m, NULL, pos); 338 return s_next(m, NULL, pos);
283} 339}
284 340
@@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v)
295 351
296static void t_stop(struct seq_file *m, void *p) 352static void t_stop(struct seq_file *m, void *p)
297{ 353{
354 mutex_unlock(&event_mutex);
298} 355}
299 356
300static int 357static int
301ftrace_event_seq_open(struct inode *inode, struct file *file) 358ftrace_event_seq_open(struct inode *inode, struct file *file)
302{ 359{
303 int ret;
304 const struct seq_operations *seq_ops; 360 const struct seq_operations *seq_ops;
305 361
306 if ((file->f_mode & FMODE_WRITE) && 362 if ((file->f_mode & FMODE_WRITE) &&
@@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
308 ftrace_clear_events(); 364 ftrace_clear_events();
309 365
310 seq_ops = inode->i_private; 366 seq_ops = inode->i_private;
311 ret = seq_open(file, seq_ops); 367 return seq_open(file, seq_ops);
312 if (!ret) {
313 struct seq_file *m = file->private_data;
314
315 m->private = __start_ftrace_events;
316 }
317 return ret;
318} 368}
319 369
320static ssize_t 370static ssize_t
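The available_events/set_event iterators now keep a list_head cursor in m->private: t_start()/s_start() take event_mutex and reset the cursor at *pos == 0, t_next()/s_next() advance it, and t_stop() drops the lock, so the walk stays consistent while modules register or remove events. The skeleton of that pattern in generic form (my_list/my_entry are placeholders, and the skip/filter logic of the real iterators is omitted):

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

static DEFINE_MUTEX(my_mutex);		/* stands in for event_mutex */
static LIST_HEAD(my_list);		/* stands in for ftrace_events */

struct my_entry {
	struct list_head list;
	const char *name;
};

static void *my_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct list_head *list = m->private;

	(*pos)++;
	if (list == &my_list)		/* wrapped around: iteration done */
		return NULL;
	m->private = list->next;	/* remember the cursor for the next call */
	return list_entry(list, struct my_entry, list);
}

static void *my_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&my_mutex);
	if (*pos == 0)
		m->private = my_list.next;
	return my_next(m, NULL, pos);
}

static void my_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&my_mutex);
}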
@@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
374 return cnt; 424 return cnt;
375} 425}
376 426
427static ssize_t
428system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
429 loff_t *ppos)
430{
431 const char set_to_char[4] = { '?', '0', '1', 'X' };
432 const char *system = filp->private_data;
433 struct ftrace_event_call *call;
434 char buf[2];
435 int set = 0;
436 int ret;
437
438 mutex_lock(&event_mutex);
439 list_for_each_entry(call, &ftrace_events, list) {
440 if (!call->name || !call->regfunc)
441 continue;
442
443 if (system && strcmp(call->system, system) != 0)
444 continue;
445
446 /*
447 * We need to find out if all the events are set
 448 * or if all events are cleared, or if we have
449 * a mixture.
450 */
451 set |= (1 << !!call->enabled);
452
453 /*
454 * If we have a mixture, no need to look further.
455 */
456 if (set == 3)
457 break;
458 }
459 mutex_unlock(&event_mutex);
460
461 buf[0] = set_to_char[set];
462 buf[1] = '\n';
463
464 ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
465
466 return ret;
467}
468
469static ssize_t
470system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
471 loff_t *ppos)
472{
473 const char *system = filp->private_data;
474 unsigned long val;
475 char buf[64];
476 ssize_t ret;
477
478 if (cnt >= sizeof(buf))
479 return -EINVAL;
480
481 if (copy_from_user(&buf, ubuf, cnt))
482 return -EFAULT;
483
484 buf[cnt] = 0;
485
486 ret = strict_strtoul(buf, 10, &val);
487 if (ret < 0)
488 return ret;
489
490 ret = tracing_update_buffers();
491 if (ret < 0)
492 return ret;
493
494 if (val != 0 && val != 1)
495 return -EINVAL;
496
497 ret = __ftrace_set_clr_event(NULL, system, NULL, val);
498 if (ret)
499 goto out;
500
501 ret = cnt;
502
503out:
504 *ppos += cnt;
505
506 return ret;
507}
508
509extern char *__bad_type_size(void);
510
377#undef FIELD 511#undef FIELD
378#define FIELD(type, name) \ 512#define FIELD(type, name) \
513 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
379 #type, "common_" #name, offsetof(typeof(field), name), \ 514 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name) 515 sizeof(field.name)
381 516
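system_enable_read() folds the per-event enabled flags into a two-bit mask: bit 0 means "at least one event disabled", bit 1 means "at least one enabled", so a final value of 1 prints '0', 2 prints '1' and 3 prints 'X' via set_to_char[]. A tiny standalone sketch of that trick:

#include <stdio.h>

int main(void)
{
	const char set_to_char[4] = { '?', '0', '1', 'X' };
	int enabled_flags[] = { 0, 1, 1 };	/* pretend three events */
	int set = 0;
	size_t i;

	for (i = 0; i < sizeof(enabled_flags) / sizeof(enabled_flags[0]); i++) {
		set |= 1 << !!enabled_flags[i];	/* bit 0: some off, bit 1: some on */
		if (set == 3)			/* mixture found, stop early */
			break;
	}

	printf("%c\n", set_to_char[set]);	/* prints 'X' for this mixture */
	return 0;
}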
@@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)
391 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 526 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
392 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 527 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
393 "\n", 528 "\n",
394 FIELD(unsigned char, type), 529 FIELD(unsigned short, type),
395 FIELD(unsigned char, flags), 530 FIELD(unsigned char, flags),
396 FIELD(unsigned char, preempt_count), 531 FIELD(unsigned char, preempt_count),
397 FIELD(int, pid), 532 FIELD(int, pid),
@@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
481 616
482 trace_seq_init(s); 617 trace_seq_init(s);
483 618
484 filter_print_preds(call->preds, s); 619 print_event_filter(call, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 620 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486 621
487 kfree(s); 622 kfree(s);
@@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos) 629 loff_t *ppos)
495{ 630{
496 struct ftrace_event_call *call = filp->private_data; 631 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf; 632 char *buf;
498 struct filter_pred *pred;
499 int err; 633 int err;
500 634
501 if (cnt >= sizeof(buf)) 635 if (cnt >= PAGE_SIZE)
502 return -EINVAL; 636 return -EINVAL;
503 637
504 if (copy_from_user(&buf, ubuf, cnt)) 638 buf = (char *)__get_free_page(GFP_TEMPORARY);
505 return -EFAULT; 639 if (!buf)
506 buf[cnt] = '\0';
507
508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
509 if (!pred)
510 return -ENOMEM; 640 return -ENOMEM;
511 641
512 err = filter_parse(&pbuf, pred); 642 if (copy_from_user(buf, ubuf, cnt)) {
513 if (err < 0) { 643 free_page((unsigned long) buf);
514 filter_free_pred(pred); 644 return -EFAULT;
515 return err;
516 }
517
518 if (pred->clear) {
519 filter_free_preds(call);
520 filter_free_pred(pred);
521 return cnt;
522 } 645 }
646 buf[cnt] = '\0';
523 647
524 err = filter_add_pred(call, pred); 648 err = apply_event_filter(call, buf);
525 if (err < 0) { 649 free_page((unsigned long) buf);
526 filter_free_pred(pred); 650 if (err < 0)
527 return err; 651 return err;
528 }
529 652
530 *ppos += cnt; 653 *ppos += cnt;
531 654
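Filter expressions may now be up to a page long, so event_filter_write() (and the subsystem variant below) trades the old 64-byte stack buffer for __get_free_page(GFP_TEMPORARY), copies the user string, NUL-terminates it and hands it to apply_event_filter(). The copy-in half of that pattern as a sketch (copy_user_string_page is illustrative, not a helper in this patch):

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

/* duplicate a user buffer of less than one page into a freshly allocated page */
static char *copy_user_string_page(const char __user *ubuf, size_t cnt)
{
	char *buf;

	if (cnt >= PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	buf = (char *)__get_free_page(GFP_TEMPORARY);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(buf, ubuf, cnt)) {
		free_page((unsigned long)buf);
		return ERR_PTR(-EFAULT);
	}
	buf[cnt] = '\0';

	return buf;		/* caller frees with free_page() */
}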
@@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
549 672
550 trace_seq_init(s); 673 trace_seq_init(s);
551 674
552 filter_print_preds(system->preds, s); 675 print_subsystem_event_filter(system, s);
553 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); 676 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
554 677
555 kfree(s); 678 kfree(s);
@@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
562 loff_t *ppos) 685 loff_t *ppos)
563{ 686{
564 struct event_subsystem *system = filp->private_data; 687 struct event_subsystem *system = filp->private_data;
565 char buf[64], *pbuf = buf; 688 char *buf;
566 struct filter_pred *pred;
567 int err; 689 int err;
568 690
569 if (cnt >= sizeof(buf)) 691 if (cnt >= PAGE_SIZE)
570 return -EINVAL; 692 return -EINVAL;
571 693
572 if (copy_from_user(&buf, ubuf, cnt)) 694 buf = (char *)__get_free_page(GFP_TEMPORARY);
573 return -EFAULT; 695 if (!buf)
574 buf[cnt] = '\0';
575
576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
577 if (!pred)
578 return -ENOMEM; 696 return -ENOMEM;
579 697
580 err = filter_parse(&pbuf, pred); 698 if (copy_from_user(buf, ubuf, cnt)) {
581 if (err < 0) { 699 free_page((unsigned long) buf);
582 filter_free_pred(pred); 700 return -EFAULT;
583 return err;
584 }
585
586 if (pred->clear) {
587 filter_free_subsystem_preds(system);
588 filter_free_pred(pred);
589 return cnt;
590 } 701 }
702 buf[cnt] = '\0';
591 703
592 err = filter_add_subsystem_pred(system, pred); 704 err = apply_subsystem_event_filter(system, buf);
593 if (err < 0) { 705 free_page((unsigned long) buf);
594 filter_free_subsystem_preds(system); 706 if (err < 0)
595 filter_free_pred(pred);
596 return err; 707 return err;
597 }
598 708
599 *ppos += cnt; 709 *ppos += cnt;
600 710
601 return cnt; 711 return cnt;
602} 712}
603 713
714static ssize_t
715show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
716{
717 int (*func)(struct trace_seq *s) = filp->private_data;
718 struct trace_seq *s;
719 int r;
720
721 if (*ppos)
722 return 0;
723
724 s = kmalloc(sizeof(*s), GFP_KERNEL);
725 if (!s)
726 return -ENOMEM;
727
728 trace_seq_init(s);
729
730 func(s);
731 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
732
733 kfree(s);
734
735 return r;
736}
737
604static const struct seq_operations show_event_seq_ops = { 738static const struct seq_operations show_event_seq_ops = {
605 .start = t_start, 739 .start = t_start,
606 .next = t_next, 740 .next = t_next,
@@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
658 .write = subsystem_filter_write, 792 .write = subsystem_filter_write,
659}; 793};
660 794
795static const struct file_operations ftrace_system_enable_fops = {
796 .open = tracing_open_generic,
797 .read = system_enable_read,
798 .write = system_enable_write,
799};
800
801static const struct file_operations ftrace_show_header_fops = {
802 .open = tracing_open_generic,
803 .read = show_header,
804};
805
661static struct dentry *event_trace_events_dir(void) 806static struct dentry *event_trace_events_dir(void)
662{ 807{
663 static struct dentry *d_tracer; 808 static struct dentry *d_tracer;
@@ -684,6 +829,7 @@ static struct dentry *
684event_subsystem_dir(const char *name, struct dentry *d_events) 829event_subsystem_dir(const char *name, struct dentry *d_events)
685{ 830{
686 struct event_subsystem *system; 831 struct event_subsystem *system;
832 struct dentry *entry;
687 833
688 /* First see if we did not already create this dir */ 834 /* First see if we did not already create this dir */
689 list_for_each_entry(system, &event_subsystems, list) { 835 list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
707 return d_events; 853 return d_events;
708 } 854 }
709 855
710 system->name = name; 856 system->name = kstrdup(name, GFP_KERNEL);
857 if (!system->name) {
858 debugfs_remove(system->entry);
859 kfree(system);
860 return d_events;
861 }
862
711 list_add(&system->list, &event_subsystems); 863 list_add(&system->list, &event_subsystems);
712 864
713 system->preds = NULL; 865 system->filter = NULL;
866
867 system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
868 if (!system->filter) {
869 pr_warning("Could not allocate filter for subsystem "
870 "'%s'\n", name);
871 return system->entry;
872 }
873
874 entry = debugfs_create_file("filter", 0644, system->entry, system,
875 &ftrace_subsystem_filter_fops);
876 if (!entry) {
877 kfree(system->filter);
878 system->filter = NULL;
879 pr_warning("Could not create debugfs "
880 "'%s/filter' entry\n", name);
881 }
882
883 entry = trace_create_file("enable", 0644, system->entry,
884 (void *)system->name,
885 &ftrace_system_enable_fops);
714 886
715 return system->entry; 887 return system->entry;
716} 888}
717 889
718static int 890static int
719event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) 891event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
892 const struct file_operations *id,
893 const struct file_operations *enable,
894 const struct file_operations *filter,
895 const struct file_operations *format)
720{ 896{
721 struct dentry *entry; 897 struct dentry *entry;
722 int ret; 898 int ret;
@@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
725 * If the trace point header did not define TRACE_SYSTEM 901 * If the trace point header did not define TRACE_SYSTEM
726 * then the system would be called "TRACE_SYSTEM". 902 * then the system would be called "TRACE_SYSTEM".
727 */ 903 */
728 if (strcmp(call->system, "TRACE_SYSTEM") != 0) 904 if (strcmp(call->system, TRACE_SYSTEM) != 0)
729 d_events = event_subsystem_dir(call->system, d_events); 905 d_events = event_subsystem_dir(call->system, d_events);
730 906
731 if (call->raw_init) { 907 if (call->raw_init) {
@@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
744 return -1; 920 return -1;
745 } 921 }
746 922
747 if (call->regfunc) { 923 if (call->regfunc)
748 entry = debugfs_create_file("enable", 0644, call->dir, call, 924 entry = trace_create_file("enable", 0644, call->dir, call,
749 &ftrace_enable_fops); 925 enable);
750 if (!entry)
751 pr_warning("Could not create debugfs "
752 "'%s/enable' entry\n", call->name);
753 }
754 926
755 if (call->id) { 927 if (call->id)
756 entry = debugfs_create_file("id", 0444, call->dir, call, 928 entry = trace_create_file("id", 0444, call->dir, call,
757 &ftrace_event_id_fops); 929 id);
758 if (!entry)
759 pr_warning("Could not create debugfs '%s/id' entry\n",
760 call->name);
761 }
762 930
763 if (call->define_fields) { 931 if (call->define_fields) {
764 ret = call->define_fields(); 932 ret = call->define_fields();
@@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
767 " events/%s\n", call->name); 935 " events/%s\n", call->name);
768 return ret; 936 return ret;
769 } 937 }
770 entry = debugfs_create_file("filter", 0644, call->dir, call, 938 entry = trace_create_file("filter", 0644, call->dir, call,
771 &ftrace_event_filter_fops); 939 filter);
772 if (!entry)
773 pr_warning("Could not create debugfs "
774 "'%s/filter' entry\n", call->name);
775 } 940 }
776 941
777 /* A trace may not want to export its format */ 942 /* A trace may not want to export its format */
778 if (!call->show_format) 943 if (!call->show_format)
779 return 0; 944 return 0;
780 945
781 entry = debugfs_create_file("format", 0444, call->dir, call, 946 entry = trace_create_file("format", 0444, call->dir, call,
782 &ftrace_event_format_fops); 947 format);
783 if (!entry) 948
784 pr_warning("Could not create debugfs " 949 return 0;
785 "'%s/format' entry\n", call->name); 950}
951
952#define for_each_event(event, start, end) \
953 for (event = start; \
954 (unsigned long)event < (unsigned long)end; \
955 event++)
956
957#ifdef CONFIG_MODULES
958
959static LIST_HEAD(ftrace_module_file_list);
960
961/*
962 * Modules must own their file_operations to keep up with
963 * reference counting.
964 */
965struct ftrace_module_file_ops {
966 struct list_head list;
967 struct module *mod;
968 struct file_operations id;
969 struct file_operations enable;
970 struct file_operations format;
971 struct file_operations filter;
972};
973
974static struct ftrace_module_file_ops *
975trace_create_file_ops(struct module *mod)
976{
977 struct ftrace_module_file_ops *file_ops;
978
979 /*
980 * This is a bit of a PITA. To allow for correct reference
981 * counting, modules must "own" their file_operations.
982 * To do this, we allocate the file operations that will be
983 * used in the event directory.
984 */
985
986 file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
987 if (!file_ops)
988 return NULL;
989
990 file_ops->mod = mod;
991
992 file_ops->id = ftrace_event_id_fops;
993 file_ops->id.owner = mod;
994
995 file_ops->enable = ftrace_enable_fops;
996 file_ops->enable.owner = mod;
997
998 file_ops->filter = ftrace_event_filter_fops;
999 file_ops->filter.owner = mod;
1000
1001 file_ops->format = ftrace_event_format_fops;
1002 file_ops->format.owner = mod;
1003
1004 list_add(&file_ops->list, &ftrace_module_file_list);
1005
1006 return file_ops;
1007}
1008
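trace_create_file_ops() clones the shared file_operations once per module and points .owner at that module: the VFS takes a module reference through fops->owner when a file is opened, so an event file left open cannot outlive the module that provides it. The essential move, reduced to a sketch:

#include <linux/fs.h>
#include <linux/module.h>

/*
 * Sketch: give a module its own copy of a shared file_operations so an
 * open file pins the module (fops_get() does try_module_get(fops->owner)).
 */
static void bind_fops_to_module(struct file_operations *dst,
				const struct file_operations *shared,
				struct module *mod)
{
	*dst = *shared;		/* copy the shared callbacks */
	dst->owner = mod;	/* ...but make this copy owned by "mod" */
}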
1009static void trace_module_add_events(struct module *mod)
1010{
1011 struct ftrace_module_file_ops *file_ops = NULL;
1012 struct ftrace_event_call *call, *start, *end;
1013 struct dentry *d_events;
1014
1015 start = mod->trace_events;
1016 end = mod->trace_events + mod->num_trace_events;
1017
1018 if (start == end)
1019 return;
1020
1021 d_events = event_trace_events_dir();
1022 if (!d_events)
1023 return;
1024
1025 for_each_event(call, start, end) {
1026 /* The linker may leave blanks */
1027 if (!call->name)
1028 continue;
1029
1030 /*
1031 * This module has events, create file ops for this module
1032 * if not already done.
1033 */
1034 if (!file_ops) {
1035 file_ops = trace_create_file_ops(mod);
1036 if (!file_ops)
1037 return;
1038 }
1039 call->mod = mod;
1040 list_add(&call->list, &ftrace_events);
1041 event_create_dir(call, d_events,
1042 &file_ops->id, &file_ops->enable,
1043 &file_ops->filter, &file_ops->format);
1044 }
1045}
1046
1047static void trace_module_remove_events(struct module *mod)
1048{
1049 struct ftrace_module_file_ops *file_ops;
1050 struct ftrace_event_call *call, *p;
1051 bool found = false;
1052
1053 down_write(&trace_event_mutex);
1054 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1055 if (call->mod == mod) {
1056 found = true;
1057 ftrace_event_enable_disable(call, 0);
1058 if (call->event)
1059 __unregister_ftrace_event(call->event);
1060 debugfs_remove_recursive(call->dir);
1061 list_del(&call->list);
1062 trace_destroy_fields(call);
1063 destroy_preds(call);
1064 }
1065 }
1066
1067 /* Now free the file_operations */
1068 list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
1069 if (file_ops->mod == mod)
1070 break;
1071 }
1072 if (&file_ops->list != &ftrace_module_file_list) {
1073 list_del(&file_ops->list);
1074 kfree(file_ops);
1075 }
1076
1077 /*
1078 * It is safest to reset the ring buffer if the module being unloaded
1079 * registered any events.
1080 */
1081 if (found)
1082 tracing_reset_current_online_cpus();
1083 up_write(&trace_event_mutex);
1084}
1085
1086static int trace_module_notify(struct notifier_block *self,
1087 unsigned long val, void *data)
1088{
1089 struct module *mod = data;
1090
1091 mutex_lock(&event_mutex);
1092 switch (val) {
1093 case MODULE_STATE_COMING:
1094 trace_module_add_events(mod);
1095 break;
1096 case MODULE_STATE_GOING:
1097 trace_module_remove_events(mod);
1098 break;
1099 }
1100 mutex_unlock(&event_mutex);
786 1101
787 return 0; 1102 return 0;
788} 1103}
1104#else
1105static int trace_module_notify(struct notifier_block *self,
1106 unsigned long val, void *data)
1107{
1108 return 0;
1109}
1110#endif /* CONFIG_MODULES */
1111
1112struct notifier_block trace_module_nb = {
1113 .notifier_call = trace_module_notify,
1114 .priority = 0,
1115};
1116
1117extern struct ftrace_event_call __start_ftrace_events[];
1118extern struct ftrace_event_call __stop_ftrace_events[];
789 1119
790static __init int event_trace_init(void) 1120static __init int event_trace_init(void)
791{ 1121{
792 struct ftrace_event_call *call = __start_ftrace_events; 1122 struct ftrace_event_call *call;
793 struct dentry *d_tracer; 1123 struct dentry *d_tracer;
794 struct dentry *entry; 1124 struct dentry *entry;
795 struct dentry *d_events; 1125 struct dentry *d_events;
1126 int ret;
796 1127
797 d_tracer = tracing_init_dentry(); 1128 d_tracer = tracing_init_dentry();
798 if (!d_tracer) 1129 if (!d_tracer)
@@ -816,13 +1147,243 @@ static __init int event_trace_init(void)
816 if (!d_events) 1147 if (!d_events)
817 return 0; 1148 return 0;
818 1149
819 for_each_event(call) { 1150 /* ring buffer internal formats */
1151 trace_create_file("header_page", 0444, d_events,
1152 ring_buffer_print_page_header,
1153 &ftrace_show_header_fops);
1154
1155 trace_create_file("header_event", 0444, d_events,
1156 ring_buffer_print_entry_header,
1157 &ftrace_show_header_fops);
1158
1159 trace_create_file("enable", 0644, d_events,
1160 NULL, &ftrace_system_enable_fops);
1161
1162 for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
820 /* The linker may leave blanks */ 1163 /* The linker may leave blanks */
821 if (!call->name) 1164 if (!call->name)
822 continue; 1165 continue;
823 event_create_dir(call, d_events); 1166 list_add(&call->list, &ftrace_events);
1167 event_create_dir(call, d_events, &ftrace_event_id_fops,
1168 &ftrace_enable_fops, &ftrace_event_filter_fops,
1169 &ftrace_event_format_fops);
824 } 1170 }
825 1171
1172 ret = register_module_notifier(&trace_module_nb);
1173 if (ret)
1174 pr_warning("Failed to register trace events module notifier\n");
1175
826 return 0; 1176 return 0;
827} 1177}
828fs_initcall(event_trace_init); 1178fs_initcall(event_trace_init);
1179
1180#ifdef CONFIG_FTRACE_STARTUP_TEST
1181
1182static DEFINE_SPINLOCK(test_spinlock);
1183static DEFINE_SPINLOCK(test_spinlock_irq);
1184static DEFINE_MUTEX(test_mutex);
1185
1186static __init void test_work(struct work_struct *dummy)
1187{
1188 spin_lock(&test_spinlock);
1189 spin_lock_irq(&test_spinlock_irq);
1190 udelay(1);
1191 spin_unlock_irq(&test_spinlock_irq);
1192 spin_unlock(&test_spinlock);
1193
1194 mutex_lock(&test_mutex);
1195 msleep(1);
1196 mutex_unlock(&test_mutex);
1197}
1198
1199static __init int event_test_thread(void *unused)
1200{
1201 void *test_malloc;
1202
1203 test_malloc = kmalloc(1234, GFP_KERNEL);
1204 if (!test_malloc)
1205 pr_info("failed to kmalloc\n");
1206
1207 schedule_on_each_cpu(test_work);
1208
1209 kfree(test_malloc);
1210
1211 set_current_state(TASK_INTERRUPTIBLE);
1212 while (!kthread_should_stop())
1213 schedule();
1214
1215 return 0;
1216}
1217
1218/*
1219 * Do various things that may trigger events.
1220 */
1221static __init void event_test_stuff(void)
1222{
1223 struct task_struct *test_thread;
1224
1225 test_thread = kthread_run(event_test_thread, NULL, "test-events");
1226 msleep(1);
1227 kthread_stop(test_thread);
1228}
1229
1230/*
1231 * For every trace event defined, test each tracepoint individually,
1232 * then by subsystem group, and finally all tracepoints together.
1233 */
1234static __init void event_trace_self_tests(void)
1235{
1236 struct ftrace_event_call *call;
1237 struct event_subsystem *system;
1238 int ret;
1239
1240 pr_info("Running tests on trace events:\n");
1241
1242 list_for_each_entry(call, &ftrace_events, list) {
1243
1244 /* Only test those that have a regfunc */
1245 if (!call->regfunc)
1246 continue;
1247
1248 pr_info("Testing event %s: ", call->name);
1249
1250 /*
1251 * If an event is already enabled, someone is using
1252 * it and the self test should not be on.
1253 */
1254 if (call->enabled) {
1255 pr_warning("Enabled event during self test!\n");
1256 WARN_ON_ONCE(1);
1257 continue;
1258 }
1259
1260 ftrace_event_enable_disable(call, 1);
1261 event_test_stuff();
1262 ftrace_event_enable_disable(call, 0);
1263
1264 pr_cont("OK\n");
1265 }
1266
1267 /* Now test at the sub system level */
1268
1269 pr_info("Running tests on trace event systems:\n");
1270
1271 list_for_each_entry(system, &event_subsystems, list) {
1272
1273 /* the ftrace system is special, skip it */
1274 if (strcmp(system->name, "ftrace") == 0)
1275 continue;
1276
1277 pr_info("Testing event system %s: ", system->name);
1278
1279 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
1280 if (WARN_ON_ONCE(ret)) {
1281 pr_warning("error enabling system %s\n",
1282 system->name);
1283 continue;
1284 }
1285
1286 event_test_stuff();
1287
1288 ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
1289 if (WARN_ON_ONCE(ret))
1290 pr_warning("error disabling system %s\n",
1291 system->name);
1292
1293 pr_cont("OK\n");
1294 }
1295
1296 /* Test with all events enabled */
1297
1298 pr_info("Running tests on all trace events:\n");
1299 pr_info("Testing all events: ");
1300
1301 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
1302 if (WARN_ON_ONCE(ret)) {
1303 pr_warning("error enabling all events\n");
1304 return;
1305 }
1306
1307 event_test_stuff();
1308
1309 /* reset sysname */
1310 ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
1311 if (WARN_ON_ONCE(ret)) {
1312 pr_warning("error disabling all events\n");
1313 return;
1314 }
1315
1316 pr_cont("OK\n");
1317}
1318
1319#ifdef CONFIG_FUNCTION_TRACER
1320
1321static DEFINE_PER_CPU(atomic_t, test_event_disable);
1322
1323static void
1324function_test_events_call(unsigned long ip, unsigned long parent_ip)
1325{
1326 struct ring_buffer_event *event;
1327 struct ftrace_entry *entry;
1328 unsigned long flags;
1329 long disabled;
1330 int resched;
1331 int cpu;
1332 int pc;
1333
1334 pc = preempt_count();
1335 resched = ftrace_preempt_disable();
1336 cpu = raw_smp_processor_id();
1337 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
1338
1339 if (disabled != 1)
1340 goto out;
1341
1342 local_save_flags(flags);
1343
1344 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
1345 flags, pc);
1346 if (!event)
1347 goto out;
1348 entry = ring_buffer_event_data(event);
1349 entry->ip = ip;
1350 entry->parent_ip = parent_ip;
1351
1352 trace_nowake_buffer_unlock_commit(event, flags, pc);
1353
1354 out:
1355 atomic_dec(&per_cpu(test_event_disable, cpu));
1356 ftrace_preempt_enable(resched);
1357}
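
function_test_events_call() protects itself with a per-CPU counter so that anything it calls which itself hits the function tracer cannot recurse. A stand-alone sketch of that guard, reduced to a single counter for brevity (names are illustrative):

#include <stdio.h>

static int depth;

static void handler(int level)
{
        if (++depth != 1)
                goto out;               /* nested entry: bail out, avoid recursion */

        printf("handling level %d\n", level);
        if (level == 0)
                handler(1);             /* simulated re-entry: dropped by the check above */
out:
        depth--;
}

int main(void)
{
        handler(0);                     /* prints exactly one line */
        return 0;
}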
1358
1359static struct ftrace_ops trace_ops __initdata =
1360{
1361 .func = function_test_events_call,
1362};
1363
1364static __init void event_trace_self_test_with_function(void)
1365{
1366 register_ftrace_function(&trace_ops);
1367 pr_info("Running tests again, along with the function tracer\n");
1368 event_trace_self_tests();
1369 unregister_ftrace_function(&trace_ops);
1370}
1371#else
1372static __init void event_trace_self_test_with_function(void)
1373{
1374}
1375#endif
1376
1377static __init int event_trace_self_tests_init(void)
1378{
1379
1380 event_trace_self_tests();
1381
1382 event_trace_self_test_with_function();
1383
1384 return 0;
1385}
1386
1387late_initcall(event_trace_self_tests_init);
1388
1389#endif
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e03cbf1e38f3..936c621bbf46 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,119 +22,295 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/mutex.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
28 29
29static int filter_pred_64(struct filter_pred *pred, void *event) 30enum filter_op_ids
30{ 31{
31 u64 *addr = (u64 *)(event + pred->offset); 32 OP_OR,
32 u64 val = (u64)pred->val; 33 OP_AND,
33 int match; 34 OP_NE,
35 OP_EQ,
36 OP_LT,
37 OP_LE,
38 OP_GT,
39 OP_GE,
40 OP_NONE,
41 OP_OPEN_PAREN,
42};
43
44struct filter_op {
45 int id;
46 char *string;
47 int precedence;
48};
49
50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 },
54 { OP_EQ, "==", 4 },
55 { OP_LT, "<", 5 },
56 { OP_LE, "<=", 5 },
57 { OP_GT, ">", 5 },
58 { OP_GE, ">=", 5 },
59 { OP_NONE, "OP_NONE", 0 },
60 { OP_OPEN_PAREN, "(", 0 },
61};
62
63enum {
64 FILT_ERR_NONE,
65 FILT_ERR_INVALID_OP,
66 FILT_ERR_UNBALANCED_PAREN,
67 FILT_ERR_TOO_MANY_OPERANDS,
68 FILT_ERR_OPERAND_TOO_LONG,
69 FILT_ERR_FIELD_NOT_FOUND,
70 FILT_ERR_ILLEGAL_FIELD_OP,
71 FILT_ERR_ILLEGAL_INTVAL,
72 FILT_ERR_BAD_SUBSYS_FILTER,
73 FILT_ERR_TOO_MANY_PREDS,
74 FILT_ERR_MISSING_FIELD,
75 FILT_ERR_INVALID_FILTER,
76};
77
78static char *err_text[] = {
79 "No error",
80 "Invalid operator",
81 "Unbalanced parens",
82 "Too many operands",
83 "Operand too long",
84 "Field not found",
85 "Illegal operation for field type",
86 "Illegal integer value",
87 "Couldn't find or set field in one of a subsystem's events",
88 "Too many terms in predicate expression",
89 "Missing field name and/or value",
90 "Meaningless filter expression",
91};
92
93struct opstack_op {
94 int op;
95 struct list_head list;
96};
97
98struct postfix_elt {
99 int op;
100 char *operand;
101 struct list_head list;
102};
103
104struct filter_parse_state {
105 struct filter_op *ops;
106 struct list_head opstack;
107 struct list_head postfix;
108 int lasterr;
109 int lasterr_pos;
110
111 struct {
112 char *string;
113 unsigned int cnt;
114 unsigned int tail;
115 } infix;
116
117 struct {
118 char string[MAX_FILTER_STR_VAL];
119 int pos;
120 unsigned int tail;
121 } operand;
122};
123
124DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32);
127DEFINE_COMPARISON_PRED(u32);
128DEFINE_COMPARISON_PRED(s16);
129DEFINE_COMPARISON_PRED(u16);
130DEFINE_COMPARISON_PRED(s8);
131DEFINE_COMPARISON_PRED(u8);
132
133DEFINE_EQUALITY_PRED(64);
134DEFINE_EQUALITY_PRED(32);
135DEFINE_EQUALITY_PRED(16);
136DEFINE_EQUALITY_PRED(8);
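
DEFINE_COMPARISON_PRED() and DEFINE_EQUALITY_PRED() are defined elsewhere in the tracing headers (not in this hunk); they stamp out one predicate function per integer width and signedness. A stand-alone sketch of the idea behind such a macro, with made-up demo_* names:

#include <stdint.h>
#include <stdio.h>

enum demo_op { DEMO_LT, DEMO_LE, DEMO_GT, DEMO_GE };

#define DEFINE_DEMO_CMP(type)                                      \
static int demo_cmp_##type(const void *field, type val, int op)    \
{                                                                  \
        type v = *(const type *)field;                             \
        switch (op) {                                              \
        case DEMO_LT: return v < val;                              \
        case DEMO_LE: return v <= val;                             \
        case DEMO_GT: return v > val;                              \
        case DEMO_GE: return v >= val;                             \
        }                                                          \
        return 0;                                                  \
}

DEFINE_DEMO_CMP(int64_t)
DEFINE_DEMO_CMP(uint64_t)

int main(void)
{
        int64_t x = -5;
        printf("%d\n", demo_cmp_int64_t(&x, 0, DEMO_LT)); /* prints 1 */
        return 0;
}

select_comparison_fn() further down then simply picks the generated function that matches the field's size and is_signed flag.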
137
138static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
139 void *event __attribute((unused)),
140 int val1, int val2)
141{
142 return val1 && val2;
143}
144
145static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
146 void *event __attribute((unused)),
147 int val1, int val2)
148{
149 return val1 || val2;
150}
151
152/* Filter predicate for fixed sized arrays of characters */
153static int filter_pred_string(struct filter_pred *pred, void *event,
154 int val1, int val2)
155{
156 char *addr = (char *)(event + pred->offset);
157 int cmp, match;
34 158
35 match = (val == *addr) ^ pred->not; 159 cmp = strncmp(addr, pred->str_val, pred->str_len);
160
161 match = (!cmp) ^ pred->not;
36 162
37 return match; 163 return match;
38} 164}
39 165
40static int filter_pred_32(struct filter_pred *pred, void *event) 166/*
167 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end
169 * of the entry.
 170 * Each of these strings also has a field in the entry which
 171 * contains its offset from the beginning of the entry.
 172 * To find a string, we first read this field, then add its
 173 * value to the address of the entry, which gives us
 174 * the address of the string.
175 */
176static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2)
41{ 178{
42 u32 *addr = (u32 *)(event + pred->offset); 179 unsigned short str_loc = *(unsigned short *)(event + pred->offset);
43 u32 val = (u32)pred->val; 180 char *addr = (char *)(event + str_loc);
44 int match; 181 int cmp, match;
45 182
46 match = (val == *addr) ^ pred->not; 183 cmp = strncmp(addr, pred->str_val, pred->str_len);
184
185 match = (!cmp) ^ pred->not;
47 186
48 return match; 187 return match;
49} 188}
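
The comment above describes the __data_loc layout: the record holds a small integer field whose value is the offset of the string data from the start of the record. A stand-alone sketch of that layout with a made-up record:

#include <stdio.h>
#include <string.h>

int main(void)
{
        unsigned char rec[64];
        unsigned short loc = 8;                 /* string payload starts 8 bytes in */
        unsigned short str_loc;

        memcpy(rec, &loc, sizeof(loc));         /* the offset field, here at offset 0 */
        strcpy((char *)rec + loc, "ext4");      /* the string payload itself */

        memcpy(&str_loc, rec, sizeof(str_loc)); /* step 1: read the offset field */
        printf("%s\n", (char *)rec + str_loc);  /* step 2: add it to the record base */
        return 0;
}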
50 189
51static int filter_pred_16(struct filter_pred *pred, void *event) 190static int filter_pred_none(struct filter_pred *pred, void *event,
191 int val1, int val2)
192{
193 return 0;
194}
195
196/* return 1 if event matches, 0 otherwise (discard) */
197int filter_match_preds(struct ftrace_event_call *call, void *rec)
52{ 198{
53 u16 *addr = (u16 *)(event + pred->offset); 199 struct event_filter *filter = call->filter;
54 u16 val = (u16)pred->val; 200 int match, top = 0, val1 = 0, val2 = 0;
55 int match; 201 int stack[MAX_FILTER_PRED];
202 struct filter_pred *pred;
203 int i;
56 204
57 match = (val == *addr) ^ pred->not; 205 for (i = 0; i < filter->n_preds; i++) {
206 pred = filter->preds[i];
207 if (!pred->pop_n) {
208 match = pred->fn(pred, rec, val1, val2);
209 stack[top++] = match;
210 continue;
211 }
212 if (pred->pop_n > top) {
213 WARN_ON_ONCE(1);
214 return 0;
215 }
216 val1 = stack[--top];
217 val2 = stack[--top];
218 match = pred->fn(pred, rec, val1, val2);
219 stack[top++] = match;
220 }
58 221
59 return match; 222 return stack[--top];
60} 223}
224EXPORT_SYMBOL_GPL(filter_match_preds);
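
filter_match_preds() evaluates the predicates in postfix order: leaf predicates push a 0/1 result, while the logical predicates (pop_n == 2) pop two results and push their combination; the final top of stack is the verdict. A stand-alone sketch of that loop on a tiny made-up expression:

#include <stdio.h>

enum { LEAF, AND, OR };
struct pred { int kind; int leaf_val; };

static int eval(const struct pred *preds, int n)
{
        int stack[16], top = 0, i, a, b;

        for (i = 0; i < n; i++) {
                if (preds[i].kind == LEAF) {
                        stack[top++] = preds[i].leaf_val;   /* leaf: push its result */
                        continue;
                }
                b = stack[--top];                           /* logical op: pop two */
                a = stack[--top];
                stack[top++] = (preds[i].kind == AND) ? (a && b) : (a || b);
        }
        return stack[--top];
}

int main(void)
{
        /* (1 && 0) || 1 in postfix: 1 0 && 1 || */
        struct pred p[] = { {LEAF, 1}, {LEAF, 0}, {AND, 0}, {LEAF, 1}, {OR, 0} };
        printf("%d\n", eval(p, 5));   /* prints 1 */
        return 0;
}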
61 225
62static int filter_pred_8(struct filter_pred *pred, void *event) 226static void parse_error(struct filter_parse_state *ps, int err, int pos)
63{ 227{
64 u8 *addr = (u8 *)(event + pred->offset); 228 ps->lasterr = err;
65 u8 val = (u8)pred->val; 229 ps->lasterr_pos = pos;
66 int match; 230}
67 231
68 match = (val == *addr) ^ pred->not; 232static void remove_filter_string(struct event_filter *filter)
233{
234 kfree(filter->filter_string);
235 filter->filter_string = NULL;
236}
69 237
70 return match; 238static int replace_filter_string(struct event_filter *filter,
239 char *filter_string)
240{
241 kfree(filter->filter_string);
242 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
243 if (!filter->filter_string)
244 return -ENOMEM;
245
246 return 0;
71} 247}
72 248
73static int filter_pred_string(struct filter_pred *pred, void *event) 249static int append_filter_string(struct event_filter *filter,
250 char *string)
74{ 251{
75 char *addr = (char *)(event + pred->offset); 252 int newlen;
76 int cmp, match; 253 char *new_filter_string;
77 254
78 cmp = strncmp(addr, pred->str_val, pred->str_len); 255 BUG_ON(!filter->filter_string);
256 newlen = strlen(filter->filter_string) + strlen(string) + 1;
257 new_filter_string = kmalloc(newlen, GFP_KERNEL);
258 if (!new_filter_string)
259 return -ENOMEM;
79 260
80 match = (!cmp) ^ pred->not; 261 strcpy(new_filter_string, filter->filter_string);
262 strcat(new_filter_string, string);
263 kfree(filter->filter_string);
264 filter->filter_string = new_filter_string;
81 265
82 return match; 266 return 0;
83} 267}
84 268
85/* return 1 if event matches, 0 otherwise (discard) */ 269static void append_filter_err(struct filter_parse_state *ps,
86int filter_match_preds(struct ftrace_event_call *call, void *rec) 270 struct event_filter *filter)
87{ 271{
88 int i, matched, and_failed = 0; 272 int pos = ps->lasterr_pos;
89 struct filter_pred *pred; 273 char *buf, *pbuf;
90 274
91 for (i = 0; i < MAX_FILTER_PRED; i++) { 275 buf = (char *)__get_free_page(GFP_TEMPORARY);
92 if (call->preds[i]) { 276 if (!buf)
93 pred = call->preds[i]; 277 return;
94 if (and_failed && !pred->or)
95 continue;
96 matched = pred->fn(pred, rec);
97 if (!matched && !pred->or) {
98 and_failed = 1;
99 continue;
100 } else if (matched && pred->or)
101 return 1;
102 } else
103 break;
104 }
105 278
106 if (and_failed) 279 append_filter_string(filter, "\n");
107 return 0; 280 memset(buf, ' ', PAGE_SIZE);
281 if (pos > PAGE_SIZE - 128)
282 pos = 0;
283 buf[pos] = '^';
284 pbuf = &buf[pos] + 1;
108 285
109 return 1; 286 sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
287 append_filter_string(filter, buf);
288 free_page((unsigned long) buf);
110} 289}
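
append_filter_err() reports parse errors by appending the filter string with a caret under the offending position, followed by the error text. A stand-alone sketch of the same formatting (buffer size and message are illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *filter = "pid === 1";     /* bogus operator at position 4 */
        int pos = 4;
        char buf[64];

        memset(buf, ' ', sizeof(buf));        /* pad with spaces up to the caret */
        buf[pos] = '^';
        sprintf(&buf[pos] + 1, "\nparse_error: %s", "Invalid operator");

        printf("%s\n%s\n", filter, buf);
        return 0;
}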
111 290
112void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) 291void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
113{ 292{
114 char *field_name; 293 struct event_filter *filter = call->filter;
115 struct filter_pred *pred;
116 int i;
117 294
118 if (!preds) { 295 mutex_lock(&event_mutex);
296 if (filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else
119 trace_seq_printf(s, "none\n"); 299 trace_seq_printf(s, "none\n");
120 return; 300 mutex_unlock(&event_mutex);
121 } 301}
122 302
123 for (i = 0; i < MAX_FILTER_PRED; i++) { 303void print_subsystem_event_filter(struct event_subsystem *system,
124 if (preds[i]) { 304 struct trace_seq *s)
125 pred = preds[i]; 305{
126 field_name = pred->field_name; 306 struct event_filter *filter = system->filter;
127 if (i) 307
128 trace_seq_printf(s, pred->or ? "|| " : "&& "); 308 mutex_lock(&event_mutex);
129 trace_seq_printf(s, "%s ", field_name); 309 if (filter->filter_string)
130 trace_seq_printf(s, pred->not ? "!= " : "== "); 310 trace_seq_printf(s, "%s\n", filter->filter_string);
131 if (pred->str_val) 311 else
132 trace_seq_printf(s, "%s\n", pred->str_val); 312 trace_seq_printf(s, "none\n");
133 else 313 mutex_unlock(&event_mutex);
134 trace_seq_printf(s, "%llu\n", pred->val);
135 } else
136 break;
137 }
138} 314}
139 315
140static struct ftrace_event_field * 316static struct ftrace_event_field *
@@ -150,284 +326,829 @@ find_event_field(struct ftrace_event_call *call, char *name)
150 return NULL; 326 return NULL;
151} 327}
152 328
153void filter_free_pred(struct filter_pred *pred) 329static void filter_free_pred(struct filter_pred *pred)
154{ 330{
155 if (!pred) 331 if (!pred)
156 return; 332 return;
157 333
158 kfree(pred->field_name); 334 kfree(pred->field_name);
159 kfree(pred->str_val);
160 kfree(pred); 335 kfree(pred);
161} 336}
162 337
163void filter_free_preds(struct ftrace_event_call *call) 338static void filter_clear_pred(struct filter_pred *pred)
164{ 339{
165 int i; 340 kfree(pred->field_name);
341 pred->field_name = NULL;
342 pred->str_len = 0;
343}
166 344
167 if (call->preds) { 345static int filter_set_pred(struct filter_pred *dest,
168 for (i = 0; i < MAX_FILTER_PRED; i++) 346 struct filter_pred *src,
169 filter_free_pred(call->preds[i]); 347 filter_pred_fn_t fn)
170 kfree(call->preds); 348{
171 call->preds = NULL; 349 *dest = *src;
350 if (src->field_name) {
351 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
352 if (!dest->field_name)
353 return -ENOMEM;
172 } 354 }
355 dest->fn = fn;
356
357 return 0;
173} 358}
174 359
175void filter_free_subsystem_preds(struct event_subsystem *system) 360static void filter_disable_preds(struct ftrace_event_call *call)
176{ 361{
177 struct ftrace_event_call *call = __start_ftrace_events; 362 struct event_filter *filter = call->filter;
178 int i; 363 int i;
179 364
180 if (system->preds) { 365 call->filter_active = 0;
181 for (i = 0; i < MAX_FILTER_PRED; i++) 366 filter->n_preds = 0;
182 filter_free_pred(system->preds[i]);
183 kfree(system->preds);
184 system->preds = NULL;
185 }
186 367
187 events_for_each(call) { 368 for (i = 0; i < MAX_FILTER_PRED; i++)
188 if (!call->name || !call->regfunc) 369 filter->preds[i]->fn = filter_pred_none;
189 continue; 370}
371
372void destroy_preds(struct ftrace_event_call *call)
373{
374 struct event_filter *filter = call->filter;
375 int i;
190 376
191 if (!strcmp(call->system, system->name)) 377 for (i = 0; i < MAX_FILTER_PRED; i++) {
192 filter_free_preds(call); 378 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]);
193 } 380 }
381 kfree(filter->preds);
382 kfree(filter->filter_string);
383 kfree(filter);
384 call->filter = NULL;
194} 385}
195 386
196static int __filter_add_pred(struct ftrace_event_call *call, 387int init_preds(struct ftrace_event_call *call)
197 struct filter_pred *pred)
198{ 388{
389 struct event_filter *filter;
390 struct filter_pred *pred;
199 int i; 391 int i;
200 392
201 if (call->preds && !pred->compound) 393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
202 filter_free_preds(call); 394 if (!call->filter)
395 return -ENOMEM;
203 396
204 if (!call->preds) { 397 call->filter_active = 0;
205 call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), 398 filter->n_preds = 0;
206 GFP_KERNEL); 399
207 if (!call->preds) 400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
208 return -ENOMEM; 401 if (!filter->preds)
209 } 402 goto oom;
210 403
211 for (i = 0; i < MAX_FILTER_PRED; i++) { 404 for (i = 0; i < MAX_FILTER_PRED; i++) {
212 if (!call->preds[i]) { 405 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
213 call->preds[i] = pred; 406 if (!pred)
214 return 0; 407 goto oom;
408 pred->fn = filter_pred_none;
409 filter->preds[i] = pred;
410 }
411
412 return 0;
413
414oom:
415 destroy_preds(call);
416
417 return -ENOMEM;
418}
419EXPORT_SYMBOL_GPL(init_preds);
420
421static void filter_free_subsystem_preds(struct event_subsystem *system)
422{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call;
425 int i;
426
427 if (filter->n_preds) {
428 for (i = 0; i < filter->n_preds; i++)
429 filter_free_pred(filter->preds[i]);
430 kfree(filter->preds);
431 filter->preds = NULL;
432 filter->n_preds = 0;
433 }
434
435 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields)
437 continue;
438
439 if (!strcmp(call->system, system->name)) {
440 filter_disable_preds(call);
441 remove_filter_string(call->filter);
215 } 442 }
216 } 443 }
444}
445
446static int filter_add_pred_fn(struct filter_parse_state *ps,
447 struct ftrace_event_call *call,
448 struct filter_pred *pred,
449 filter_pred_fn_t fn)
450{
451 struct event_filter *filter = call->filter;
452 int idx, err;
453
454 if (filter->n_preds == MAX_FILTER_PRED) {
455 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
456 return -ENOSPC;
457 }
458
459 idx = filter->n_preds;
460 filter_clear_pred(filter->preds[idx]);
461 err = filter_set_pred(filter->preds[idx], pred, fn);
462 if (err)
463 return err;
464
465 filter->n_preds++;
466 call->filter_active = 1;
217 467
218 return -ENOSPC; 468 return 0;
219} 469}
220 470
471enum {
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
221static int is_string_field(const char *type) 476static int is_string_field(const char *type)
222{ 477{
478 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING;
480
223 if (strchr(type, '[') && strstr(type, "char")) 481 if (strchr(type, '[') && strstr(type, "char"))
224 return 1; 482 return FILTER_STATIC_STRING;
225 483
226 return 0; 484 return 0;
227} 485}
228 486
229int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) 487static int is_legal_op(struct ftrace_event_field *field, int op)
230{ 488{
231 struct ftrace_event_field *field; 489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
232 490 return 0;
233 field = find_event_field(call, pred->field_name);
234 if (!field)
235 return -EINVAL;
236 491
237 pred->offset = field->offset; 492 return 1;
493}
238 494
239 if (is_string_field(field->type)) { 495static filter_pred_fn_t select_comparison_fn(int op, int field_size,
240 if (!pred->str_val) 496 int field_is_signed)
241 return -EINVAL; 497{
242 pred->fn = filter_pred_string; 498 filter_pred_fn_t fn = NULL;
243 pred->str_len = field->size;
244 return __filter_add_pred(call, pred);
245 } else {
246 if (pred->str_val)
247 return -EINVAL;
248 }
249 499
250 switch (field->size) { 500 switch (field_size) {
251 case 8: 501 case 8:
252 pred->fn = filter_pred_64; 502 if (op == OP_EQ || op == OP_NE)
503 fn = filter_pred_64;
504 else if (field_is_signed)
505 fn = filter_pred_s64;
506 else
507 fn = filter_pred_u64;
253 break; 508 break;
254 case 4: 509 case 4:
255 pred->fn = filter_pred_32; 510 if (op == OP_EQ || op == OP_NE)
511 fn = filter_pred_32;
512 else if (field_is_signed)
513 fn = filter_pred_s32;
514 else
515 fn = filter_pred_u32;
256 break; 516 break;
257 case 2: 517 case 2:
258 pred->fn = filter_pred_16; 518 if (op == OP_EQ || op == OP_NE)
519 fn = filter_pred_16;
520 else if (field_is_signed)
521 fn = filter_pred_s16;
522 else
523 fn = filter_pred_u16;
259 break; 524 break;
260 case 1: 525 case 1:
261 pred->fn = filter_pred_8; 526 if (op == OP_EQ || op == OP_NE)
527 fn = filter_pred_8;
528 else if (field_is_signed)
529 fn = filter_pred_s8;
530 else
531 fn = filter_pred_u8;
262 break; 532 break;
263 default:
264 return -EINVAL;
265 } 533 }
266 534
267 return __filter_add_pred(call, pred); 535 return fn;
268} 536}
269 537
270static struct filter_pred *copy_pred(struct filter_pred *pred) 538static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call,
540 struct filter_pred *pred)
271{ 541{
272 struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); 542 struct ftrace_event_field *field;
273 if (!new_pred) 543 filter_pred_fn_t fn;
274 return NULL; 544 unsigned long long val;
545 int string_type;
546 int ret;
547
548 pred->fn = filter_pred_none;
549
550 if (pred->op == OP_AND) {
551 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and);
553 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or);
556 }
557
558 field = find_event_field(call, pred->field_name);
559 if (!field) {
560 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
561 return -EINVAL;
562 }
275 563
276 memcpy(new_pred, pred, sizeof(*pred)); 564 pred->offset = field->offset;
277 565
278 if (pred->field_name) { 566 if (!is_legal_op(field, pred->op)) {
279 new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 567 parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
280 if (!new_pred->field_name) { 568 return -EINVAL;
281 kfree(new_pred);
282 return NULL;
283 }
284 } 569 }
285 570
286 if (pred->str_val) { 571 string_type = is_string_field(field->type);
287 new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); 572 if (string_type) {
288 if (!new_pred->str_val) { 573 if (string_type == FILTER_STATIC_STRING)
289 filter_free_pred(new_pred); 574 fn = filter_pred_string;
290 return NULL; 575 else
576 fn = filter_pred_strloc;
577 pred->str_len = field->size;
578 if (pred->op == OP_NE)
579 pred->not = 1;
580 return filter_add_pred_fn(ps, call, pred, fn);
581 } else {
582 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val);
584 else
585 ret = strict_strtoull(pred->str_val, 0, &val);
586 if (ret) {
587 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
588 return -EINVAL;
291 } 589 }
590 pred->val = val;
591 }
592
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed);
594 if (!fn) {
595 parse_error(ps, FILT_ERR_INVALID_OP, 0);
596 return -EINVAL;
292 } 597 }
293 598
294 return new_pred; 599 if (pred->op == OP_NE)
600 pred->not = 1;
601
602 return filter_add_pred_fn(ps, call, pred, fn);
295} 603}
296 604
297int filter_add_subsystem_pred(struct event_subsystem *system, 605static int filter_add_subsystem_pred(struct filter_parse_state *ps,
298 struct filter_pred *pred) 606 struct event_subsystem *system,
607 struct filter_pred *pred,
608 char *filter_string)
299{ 609{
300 struct ftrace_event_call *call = __start_ftrace_events; 610 struct event_filter *filter = system->filter;
301 struct filter_pred *event_pred; 611 struct ftrace_event_call *call;
302 int i; 612 int err = 0;
303 613
304 if (system->preds && !pred->compound) 614 if (!filter->preds) {
305 filter_free_subsystem_preds(system); 615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
306
307 if (!system->preds) {
308 system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
309 GFP_KERNEL); 616 GFP_KERNEL);
310 if (!system->preds) 617
618 if (!filter->preds)
311 return -ENOMEM; 619 return -ENOMEM;
312 } 620 }
313 621
314 for (i = 0; i < MAX_FILTER_PRED; i++) { 622 if (filter->n_preds == MAX_FILTER_PRED) {
315 if (!system->preds[i]) { 623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
316 system->preds[i] = pred; 624 return -ENOSPC;
317 break;
318 }
319 } 625 }
320 626
321 if (i == MAX_FILTER_PRED) 627 filter->preds[filter->n_preds] = pred;
322 return -ENOSPC; 628 filter->n_preds++;
323 629
324 events_for_each(call) { 630 list_for_each_entry(call, &ftrace_events, list) {
325 int err;
326 631
327 if (!call->name || !call->regfunc) 632 if (!call->define_fields)
328 continue; 633 continue;
329 634
330 if (strcmp(call->system, system->name)) 635 if (strcmp(call->system, system->name))
331 continue; 636 continue;
332 637
333 if (!find_event_field(call, pred->field_name)) 638 err = filter_add_pred(ps, call, pred);
334 continue; 639 if (err) {
640 filter_free_subsystem_preds(system);
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
642 goto out;
643 }
644 replace_filter_string(call->filter, filter_string);
645 }
646out:
647 return err;
648}
335 649
336 event_pred = copy_pred(pred); 650static void parse_init(struct filter_parse_state *ps,
337 if (!event_pred) 651 struct filter_op *ops,
338 goto oom; 652 char *infix_string)
653{
654 memset(ps, '\0', sizeof(*ps));
339 655
340 err = filter_add_pred(call, event_pred); 656 ps->infix.string = infix_string;
341 if (err) 657 ps->infix.cnt = strlen(infix_string);
342 filter_free_pred(event_pred); 658 ps->ops = ops;
343 if (err == -ENOMEM) 659
344 goto oom; 660 INIT_LIST_HEAD(&ps->opstack);
661 INIT_LIST_HEAD(&ps->postfix);
662}
663
664static char infix_next(struct filter_parse_state *ps)
665{
666 ps->infix.cnt--;
667
668 return ps->infix.string[ps->infix.tail++];
669}
670
671static char infix_peek(struct filter_parse_state *ps)
672{
673 if (ps->infix.tail == strlen(ps->infix.string))
674 return 0;
675
676 return ps->infix.string[ps->infix.tail];
677}
678
679static void infix_advance(struct filter_parse_state *ps)
680{
681 ps->infix.cnt--;
682 ps->infix.tail++;
683}
684
685static inline int is_precedence_lower(struct filter_parse_state *ps,
686 int a, int b)
687{
688 return ps->ops[a].precedence < ps->ops[b].precedence;
689}
690
691static inline int is_op_char(struct filter_parse_state *ps, char c)
692{
693 int i;
694
695 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
696 if (ps->ops[i].string[0] == c)
697 return 1;
345 } 698 }
346 699
347 return 0; 700 return 0;
701}
348 702
349oom: 703static int infix_get_op(struct filter_parse_state *ps, char firstc)
350 system->preds[i] = NULL; 704{
351 return -ENOMEM; 705 char nextc = infix_peek(ps);
706 char opstr[3];
707 int i;
708
709 opstr[0] = firstc;
710 opstr[1] = nextc;
711 opstr[2] = '\0';
712
713 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
714 if (!strcmp(opstr, ps->ops[i].string)) {
715 infix_advance(ps);
716 return ps->ops[i].id;
717 }
718 }
719
720 opstr[1] = '\0';
721
722 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
723 if (!strcmp(opstr, ps->ops[i].string))
724 return ps->ops[i].id;
725 }
726
727 return OP_NONE;
352} 728}
353 729
354int filter_parse(char **pbuf, struct filter_pred *pred) 730static inline void clear_operand_string(struct filter_parse_state *ps)
355{ 731{
356 char *tmp, *tok, *val_str = NULL; 732 memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
357 int tok_n = 0; 733 ps->operand.tail = 0;
734}
358 735
359 /* field ==/!= number, or/and field ==/!= number, number */ 736static inline int append_operand_char(struct filter_parse_state *ps, char c)
360 while ((tok = strsep(pbuf, " \n"))) { 737{
361 if (tok_n == 0) { 738 if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
362 if (!strcmp(tok, "0")) { 739 return -EINVAL;
363 pred->clear = 1; 740
364 return 0; 741 ps->operand.string[ps->operand.tail++] = c;
365 } else if (!strcmp(tok, "&&")) { 742
366 pred->or = 0; 743 return 0;
367 pred->compound = 1; 744}
368 } else if (!strcmp(tok, "||")) { 745
369 pred->or = 1; 746static int filter_opstack_push(struct filter_parse_state *ps, int op)
370 pred->compound = 1; 747{
371 } else 748 struct opstack_op *opstack_op;
372 pred->field_name = tok; 749
373 tok_n = 1; 750 opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
751 if (!opstack_op)
752 return -ENOMEM;
753
754 opstack_op->op = op;
755 list_add(&opstack_op->list, &ps->opstack);
756
757 return 0;
758}
759
760static int filter_opstack_empty(struct filter_parse_state *ps)
761{
762 return list_empty(&ps->opstack);
763}
764
765static int filter_opstack_top(struct filter_parse_state *ps)
766{
767 struct opstack_op *opstack_op;
768
769 if (filter_opstack_empty(ps))
770 return OP_NONE;
771
772 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
773
774 return opstack_op->op;
775}
776
777static int filter_opstack_pop(struct filter_parse_state *ps)
778{
779 struct opstack_op *opstack_op;
780 int op;
781
782 if (filter_opstack_empty(ps))
783 return OP_NONE;
784
785 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
786 op = opstack_op->op;
787 list_del(&opstack_op->list);
788
789 kfree(opstack_op);
790
791 return op;
792}
793
794static void filter_opstack_clear(struct filter_parse_state *ps)
795{
796 while (!filter_opstack_empty(ps))
797 filter_opstack_pop(ps);
798}
799
800static char *curr_operand(struct filter_parse_state *ps)
801{
802 return ps->operand.string;
803}
804
805static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
806{
807 struct postfix_elt *elt;
808
809 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
810 if (!elt)
811 return -ENOMEM;
812
813 elt->op = OP_NONE;
814 elt->operand = kstrdup(operand, GFP_KERNEL);
815 if (!elt->operand) {
816 kfree(elt);
817 return -ENOMEM;
818 }
819
820 list_add_tail(&elt->list, &ps->postfix);
821
822 return 0;
823}
824
825static int postfix_append_op(struct filter_parse_state *ps, int op)
826{
827 struct postfix_elt *elt;
828
829 elt = kmalloc(sizeof(*elt), GFP_KERNEL);
830 if (!elt)
831 return -ENOMEM;
832
833 elt->op = op;
834 elt->operand = NULL;
835
836 list_add_tail(&elt->list, &ps->postfix);
837
838 return 0;
839}
840
841static void postfix_clear(struct filter_parse_state *ps)
842{
843 struct postfix_elt *elt;
844
845 while (!list_empty(&ps->postfix)) {
846 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
847 kfree(elt->operand);
848 list_del(&elt->list);
849 }
850}
851
852static int filter_parse(struct filter_parse_state *ps)
853{
854 int in_string = 0;
855 int op, top_op;
856 char ch;
857
858 while ((ch = infix_next(ps))) {
859 if (ch == '"') {
860 in_string ^= 1;
374 continue; 861 continue;
375 } 862 }
376 if (tok_n == 1) { 863
377 if (!pred->field_name) 864 if (in_string)
378 pred->field_name = tok; 865 goto parse_operand;
379 else if (!strcmp(tok, "!=")) 866
380 pred->not = 1; 867 if (isspace(ch))
381 else if (!strcmp(tok, "==")) 868 continue;
382 pred->not = 0; 869
383 else { 870 if (is_op_char(ps, ch)) {
384 pred->field_name = NULL; 871 op = infix_get_op(ps, ch);
872 if (op == OP_NONE) {
873 parse_error(ps, FILT_ERR_INVALID_OP, 0);
385 return -EINVAL; 874 return -EINVAL;
386 } 875 }
387 tok_n = 2; 876
877 if (strlen(curr_operand(ps))) {
878 postfix_append_operand(ps, curr_operand(ps));
879 clear_operand_string(ps);
880 }
881
882 while (!filter_opstack_empty(ps)) {
883 top_op = filter_opstack_top(ps);
884 if (!is_precedence_lower(ps, top_op, op)) {
885 top_op = filter_opstack_pop(ps);
886 postfix_append_op(ps, top_op);
887 continue;
888 }
889 break;
890 }
891
892 filter_opstack_push(ps, op);
388 continue; 893 continue;
389 } 894 }
390 if (tok_n == 2) { 895
391 if (pred->compound) { 896 if (ch == '(') {
392 if (!strcmp(tok, "!=")) 897 filter_opstack_push(ps, OP_OPEN_PAREN);
393 pred->not = 1; 898 continue;
394 else if (!strcmp(tok, "==")) 899 }
395 pred->not = 0; 900
396 else { 901 if (ch == ')') {
397 pred->field_name = NULL; 902 if (strlen(curr_operand(ps))) {
398 return -EINVAL; 903 postfix_append_operand(ps, curr_operand(ps));
399 } 904 clear_operand_string(ps);
400 } else { 905 }
401 val_str = tok; 906
402 break; /* done */ 907 top_op = filter_opstack_pop(ps);
908 while (top_op != OP_NONE) {
909 if (top_op == OP_OPEN_PAREN)
910 break;
911 postfix_append_op(ps, top_op);
912 top_op = filter_opstack_pop(ps);
913 }
914 if (top_op == OP_NONE) {
915 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
916 return -EINVAL;
403 } 917 }
404 tok_n = 3;
405 continue; 918 continue;
406 } 919 }
407 if (tok_n == 3) { 920parse_operand:
408 val_str = tok; 921 if (append_operand_char(ps, ch)) {
409 break; /* done */ 922 parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
923 return -EINVAL;
410 } 924 }
411 } 925 }
412 926
413 if (!val_str) { 927 if (strlen(curr_operand(ps)))
414 pred->field_name = NULL; 928 postfix_append_operand(ps, curr_operand(ps));
415 return -EINVAL; 929
930 while (!filter_opstack_empty(ps)) {
931 top_op = filter_opstack_pop(ps);
932 if (top_op == OP_NONE)
933 break;
934 if (top_op == OP_OPEN_PAREN) {
935 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
936 return -EINVAL;
937 }
938 postfix_append_op(ps, top_op);
416 } 939 }
417 940
418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 941 return 0;
419 if (!pred->field_name) 942}
420 return -ENOMEM;
421 943
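
filter_parse() converts the user's infix filter string to postfix with the classic shunting-yard algorithm: operands are emitted directly, operators are pushed on an operator stack after popping anything of higher or equal precedence, and parentheses bracket the popping. A stand-alone sketch on a pre-tokenized expression (tokens and field names are made up):

#include <stdio.h>
#include <string.h>

static int prec(const char *op)
{
        if (!strcmp(op, "||")) return 1;
        if (!strcmp(op, "&&")) return 2;
        if (!strcmp(op, "==") || !strcmp(op, "!=")) return 4;
        return 0;                       /* "(" never pops anything */
}

static int is_op(const char *t)
{
        return prec(t) || !strcmp(t, "(") || !strcmp(t, ")");
}

int main(void)
{
        const char *in[] = { "(", "pid", "==", "1", "||", "pid", "==", "2",
                             ")", "&&", "comm", "!=", "bash" };
        const char *stack[16];
        int top = 0, i;

        for (i = 0; i < (int)(sizeof(in) / sizeof(in[0])); i++) {
                if (!is_op(in[i])) {            /* operand: emit directly */
                        printf("%s ", in[i]);
                } else if (!strcmp(in[i], "(")) {
                        stack[top++] = in[i];
                } else if (!strcmp(in[i], ")")) {
                        while (top && strcmp(stack[top - 1], "("))
                                printf("%s ", stack[--top]);
                        top--;                  /* drop the "(" */
                } else {
                        /* pop operators of higher-or-equal precedence first */
                        while (top && prec(stack[top - 1]) >= prec(in[i]))
                                printf("%s ", stack[--top]);
                        stack[top++] = in[i];
                }
        }
        while (top)
                printf("%s ", stack[--top]);
        printf("\n");   /* pid 1 == pid 2 == || comm bash != && */
        return 0;
}

replace_preds() below then walks the postfix list in exactly this order to build the predicate array that filter_match_preds() evaluates.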
422 pred->val = simple_strtoull(val_str, &tmp, 0); 944static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
423 if (tmp == val_str) { 945{
424 pred->str_val = kstrdup(val_str, GFP_KERNEL); 946 struct filter_pred *pred;
425 if (!pred->str_val) 947
426 return -ENOMEM; 948 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
427 } else if (*tmp != '\0') 949 if (!pred)
950 return NULL;
951
952 pred->field_name = kstrdup(operand1, GFP_KERNEL);
953 if (!pred->field_name) {
954 kfree(pred);
955 return NULL;
956 }
957
958 strcpy(pred->str_val, operand2);
959 pred->str_len = strlen(operand2);
960
961 pred->op = op;
962
963 return pred;
964}
965
966static struct filter_pred *create_logical_pred(int op)
967{
968 struct filter_pred *pred;
969
970 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
971 if (!pred)
972 return NULL;
973
974 pred->op = op;
975
976 return pred;
977}
978
979static int check_preds(struct filter_parse_state *ps)
980{
981 int n_normal_preds = 0, n_logical_preds = 0;
982 struct postfix_elt *elt;
983
984 list_for_each_entry(elt, &ps->postfix, list) {
985 if (elt->op == OP_NONE)
986 continue;
987
988 if (elt->op == OP_AND || elt->op == OP_OR) {
989 n_logical_preds++;
990 continue;
991 }
992 n_normal_preds++;
993 }
994
995 if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
996 parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
428 return -EINVAL; 997 return -EINVAL;
998 }
429 999
430 return 0; 1000 return 0;
431} 1001}
432 1002
1003static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps,
1006 char *filter_string)
1007{
1008 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred;
1010 struct postfix_elt *elt;
1011 int err;
1012
1013 err = check_preds(ps);
1014 if (err)
1015 return err;
1016
1017 list_for_each_entry(elt, &ps->postfix, list) {
1018 if (elt->op == OP_NONE) {
1019 if (!operand1)
1020 operand1 = elt->operand;
1021 else if (!operand2)
1022 operand2 = elt->operand;
1023 else {
1024 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1025 return -EINVAL;
1026 }
1027 continue;
1028 }
1029
1030 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op);
1032 if (call) {
1033 err = filter_add_pred(ps, call, pred);
1034 filter_free_pred(pred);
1035 } else
1036 err = filter_add_subsystem_pred(ps, system,
1037 pred, filter_string);
1038 if (err)
1039 return err;
1040
1041 operand1 = operand2 = NULL;
1042 continue;
1043 }
1044
1045 if (!operand1 || !operand2) {
1046 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1047 return -EINVAL;
1048 }
1049
1050 pred = create_pred(elt->op, operand1, operand2);
1051 if (call) {
1052 err = filter_add_pred(ps, call, pred);
1053 filter_free_pred(pred);
1054 } else
1055 err = filter_add_subsystem_pred(ps, system, pred,
1056 filter_string);
1057 if (err)
1058 return err;
1059
1060 operand1 = operand2 = NULL;
1061 }
1062
1063 return 0;
1064}
1065
1066int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1067{
1068 int err;
1069
1070 struct filter_parse_state *ps;
1071
1072 mutex_lock(&event_mutex);
1073
1074 if (!strcmp(strstrip(filter_string), "0")) {
1075 filter_disable_preds(call);
1076 remove_filter_string(call->filter);
1077 mutex_unlock(&event_mutex);
1078 return 0;
1079 }
1080
1081 err = -ENOMEM;
1082 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1083 if (!ps)
1084 goto out_unlock;
1085
1086 filter_disable_preds(call);
1087 replace_filter_string(call->filter, filter_string);
1088
1089 parse_init(ps, filter_ops, filter_string);
1090 err = filter_parse(ps);
1091 if (err) {
1092 append_filter_err(ps, call->filter);
1093 goto out;
1094 }
1095
1096 err = replace_preds(NULL, call, ps, filter_string);
1097 if (err)
1098 append_filter_err(ps, call->filter);
1099
1100out:
1101 filter_opstack_clear(ps);
1102 postfix_clear(ps);
1103 kfree(ps);
1104out_unlock:
1105 mutex_unlock(&event_mutex);
1106
1107 return err;
1108}
1109
1110int apply_subsystem_event_filter(struct event_subsystem *system,
1111 char *filter_string)
1112{
1113 int err;
1114
1115 struct filter_parse_state *ps;
1116
1117 mutex_lock(&event_mutex);
1118
1119 if (!strcmp(strstrip(filter_string), "0")) {
1120 filter_free_subsystem_preds(system);
1121 remove_filter_string(system->filter);
1122 mutex_unlock(&event_mutex);
1123 return 0;
1124 }
1125
1126 err = -ENOMEM;
1127 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1128 if (!ps)
1129 goto out_unlock;
1130
1131 filter_free_subsystem_preds(system);
1132 replace_filter_string(system->filter, filter_string);
1133
1134 parse_init(ps, filter_ops, filter_string);
1135 err = filter_parse(ps);
1136 if (err) {
1137 append_filter_err(ps, system->filter);
1138 goto out;
1139 }
1140
1141 err = replace_preds(system, NULL, ps, filter_string);
1142 if (err)
1143 append_filter_err(ps, system->filter);
1144
1145out:
1146 filter_opstack_clear(ps);
1147 postfix_clear(ps);
1148 kfree(ps);
1149out_unlock:
1150 mutex_unlock(&event_mutex);
1151
1152 return err;
1153}
433 1154
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 38985f9b379c..000000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
22#undef __array
23#define __array(type, item, len) type item[len];
24
25#undef __field
26#define __field(type, item) type item;
27
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index d363c6672c6c..000000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
35#undef __entry
36#define __entry field
37
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
107#undef __entry
108#define __entry REC
109
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
132
133#undef __field
134#define __field(type, item) \
135 ret = trace_define_field(event_call, #type, #item, \
136 offsetof(typeof(field), item), \
137 sizeof(field.item)); \
138 if (ret) \
139 return ret;
140
141#undef __array
142#define __array(type, item, len) \
143 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
144 offsetof(typeof(field), item), \
145 sizeof(field.item)); \
146 if (ret) \
147 return ret;
148
149#define __common_field(type, item) \
150 ret = trace_define_field(event_call, #type, "common_" #item, \
151 offsetof(typeof(field.ent), item), \
152 sizeof(field.ent.item)); \
153 if (ret) \
154 return ret;
155
156#undef TRACE_EVENT
157#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
158int \
159ftrace_define_fields_##call(void) \
160{ \
161 struct ftrace_raw_##call field; \
162 struct ftrace_event_call *event_call = &event_##call; \
163 int ret; \
164 \
165 __common_field(unsigned char, type); \
166 __common_field(unsigned char, flags); \
167 __common_field(unsigned char, preempt_count); \
168 __common_field(int, pid); \
169 __common_field(int, tgid); \
170 \
171 tstruct; \
172 \
173 return ret; \
174}
175
176#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644
index 9d2fa78cecca..000000000000
--- a/kernel/trace/trace_events_stage_3.h
+++ /dev/null
@@ -1,281 +0,0 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (!ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_current_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (!ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_reg_event_<call>,
103 * .unregfunc = ftrace_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
112#ifdef CONFIG_EVENT_PROFILE
113#define _TRACE_PROFILE(call, proto, args) \
114static void ftrace_profile_##call(proto) \
115{ \
116 extern void perf_tpcounter_event(int); \
117 perf_tpcounter_event(event_##call.id); \
118} \
119 \
120static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
121{ \
122 int ret = 0; \
123 \
124 if (!atomic_inc_return(&call->profile_count)) \
125 ret = register_trace_##call(ftrace_profile_##call); \
126 \
127 return ret; \
128} \
129 \
130static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
131{ \
132 if (atomic_add_negative(-1, &call->profile_count)) \
133 unregister_trace_##call(ftrace_profile_##call); \
134}
135
136#define _TRACE_PROFILE_INIT(call) \
137 .profile_count = ATOMIC_INIT(-1), \
138 .profile_enable = ftrace_profile_enable_##call, \
139 .profile_disable = ftrace_profile_disable_##call,
140
141#else
142#define _TRACE_PROFILE(call, proto, args)
143#define _TRACE_PROFILE_INIT(call)
144#endif
145
146#define _TRACE_FORMAT(call, proto, args, fmt) \
147static void ftrace_event_##call(proto) \
148{ \
149 event_trace_printk(_RET_IP_, #call ": " fmt); \
150} \
151 \
152static int ftrace_reg_event_##call(void) \
153{ \
154 int ret; \
155 \
156 ret = register_trace_##call(ftrace_event_##call); \
157 if (ret) \
158 pr_info("event trace: Could not activate trace point " \
159 "probe to " #call "\n"); \
160 return ret; \
161} \
162 \
163static void ftrace_unreg_event_##call(void) \
164{ \
165 unregister_trace_##call(ftrace_event_##call); \
166} \
167 \
168static struct ftrace_event_call event_##call; \
169 \
170static int ftrace_init_event_##call(void) \
171{ \
172 int id; \
173 \
174 id = register_ftrace_event(NULL); \
175 if (!id) \
176 return -ENODEV; \
177 event_##call.id = id; \
178 return 0; \
179}
180
181#undef TRACE_FORMAT
182#define TRACE_FORMAT(call, proto, args, fmt) \
183_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
184_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
185static struct ftrace_event_call __used \
186__attribute__((__aligned__(4))) \
187__attribute__((section("_ftrace_events"))) event_##call = { \
188 .name = #call, \
189 .system = __stringify(TRACE_SYSTEM), \
190 .raw_init = ftrace_init_event_##call, \
191 .regfunc = ftrace_reg_event_##call, \
192 .unregfunc = ftrace_unreg_event_##call, \
193 _TRACE_PROFILE_INIT(call) \
194}
195
196#undef __entry
197#define __entry entry
198
199#undef TRACE_EVENT
200#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
201_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
202 \
203static struct ftrace_event_call event_##call; \
204 \
205static void ftrace_raw_event_##call(proto) \
206{ \
207 struct ftrace_event_call *call = &event_##call; \
208 struct ring_buffer_event *event; \
209 struct ftrace_raw_##call *entry; \
210 unsigned long irq_flags; \
211 int pc; \
212 \
213 local_save_flags(irq_flags); \
214 pc = preempt_count(); \
215 \
216 event = trace_current_buffer_lock_reserve(event_##call.id, \
217 sizeof(struct ftrace_raw_##call), \
218 irq_flags, pc); \
219 if (!event) \
220 return; \
221 entry = ring_buffer_event_data(event); \
222 \
223 assign; \
224 \
225 if (call->preds && !filter_match_preds(call, entry)) \
226 ring_buffer_event_discard(event); \
227 \
228 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
229 \
230} \
231 \
232static int ftrace_raw_reg_event_##call(void) \
233{ \
234 int ret; \
235 \
236 ret = register_trace_##call(ftrace_raw_event_##call); \
237 if (ret) \
238 pr_info("event trace: Could not activate trace point " \
239 "probe to " #call "\n"); \
240 return ret; \
241} \
242 \
243static void ftrace_raw_unreg_event_##call(void) \
244{ \
245 unregister_trace_##call(ftrace_raw_event_##call); \
246} \
247 \
248static struct trace_event ftrace_event_type_##call = { \
249 .trace = ftrace_raw_output_##call, \
250}; \
251 \
252static int ftrace_raw_init_event_##call(void) \
253{ \
254 int id; \
255 \
256 id = register_ftrace_event(&ftrace_event_type_##call); \
257 if (!id) \
258 return -ENODEV; \
259 event_##call.id = id; \
260 INIT_LIST_HEAD(&event_##call.fields); \
261 return 0; \
262} \
263 \
264static struct ftrace_event_call __used \
265__attribute__((__aligned__(4))) \
266__attribute__((section("_ftrace_events"))) event_##call = { \
267 .name = #call, \
268 .system = __stringify(TRACE_SYSTEM), \
269 .raw_init = ftrace_raw_init_event_##call, \
270 .regfunc = ftrace_raw_reg_event_##call, \
271 .unregfunc = ftrace_raw_unreg_event_##call, \
272 .show_format = ftrace_format_##call, \
273 .define_fields = ftrace_define_fields_##call, \
274 _TRACE_PROFILE_INIT(call) \
275}
276
277#include <trace/trace_event_types.h>
278
279#undef _TRACE_PROFILE
280#undef _TRACE_PROFILE_INIT
281
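To make the macro layers above easier to follow, here is a rough hand-expansion of what the TRACE_EVENT() machinery generates for a hypothetical tracepoint foo_bar(int arg). The event name, argument and field are invented for illustration; the body simply restates the macro text above in plain C and is not meant to build on its own.

static struct ftrace_event_call event_foo_bar;

static void ftrace_raw_event_foo_bar(int arg)
{
	struct ftrace_event_call *call = &event_foo_bar;
	struct ring_buffer_event *event;
	struct ftrace_raw_foo_bar *entry;
	unsigned long irq_flags;
	int pc;

	local_save_flags(irq_flags);
	pc = preempt_count();

	/* Reserve a binary record of this event's size in the ring buffer. */
	event = trace_current_buffer_lock_reserve(event_foo_bar.id,
						  sizeof(*entry),
						  irq_flags, pc);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);

	entry->arg = arg;			/* the TP_fast_assign() body */

	/* Discard the record if it does not match the active filter. */
	if (call->preds && !filter_match_preds(call, entry))
		ring_buffer_event_discard(event);

	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);
}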
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf3..d06cf898dc86 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
19#undef TRACE_STRUCT 19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args 20#define TRACE_STRUCT(args...) args
21 21
22extern void __bad_type_size(void);
23
22#undef TRACE_FIELD 24#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \ 25#define TRACE_FIELD(type, item, assign) \
26 if (sizeof(type) != sizeof(field.item)) \
27 __bad_type_size(); \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \ 29 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \ 30 (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
30 34
31 35
32#undef TRACE_FIELD_SPECIAL 36#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \ 39 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \ 40 (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
46 if (!ret) \ 50 if (!ret) \
47 return 0; 51 return 0;
48 52
53#undef TRACE_FIELD_SIGN
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
55 TRACE_FIELD(type, item, assign)
49 56
50#undef TP_RAW_FMT 57#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args 58#define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s) \
65 return ret; \ 72 return ret; \
66} 73}
67 74
75#undef TRACE_EVENT_FORMAT_NOFILTER
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
77 tpfmt) \
78static int \
79ftrace_format_##call(struct trace_seq *s) \
80{ \
81 struct args field; \
82 int ret; \
83 \
84 tstruct; \
85 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
87 \
88 return ret; \
89}
90
68#include "trace_event_types.h" 91#include "trace_event_types.h"
69 92
70#undef TRACE_ZERO_CHAR 93#undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \
78#define TRACE_FIELD(type, item, assign)\ 101#define TRACE_FIELD(type, item, assign)\
79 entry->item = assign; 102 entry->item = assign;
80 103
104#undef TRACE_FIELD_SIGN
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
106 TRACE_FIELD(type, item, assign)
107
81#undef TP_CMD 108#undef TP_CMD
82#define TP_CMD(cmd...) cmd 109#define TP_CMD(cmd...) cmd
83 110
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s) \
85#define TRACE_ENTRY entry 112#define TRACE_ENTRY entry
86 113
87#undef TRACE_FIELD_SPECIAL 114#undef TRACE_FIELD_SPECIAL
88#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ 115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
89 cmd; 116 cmd;
90 117
91#undef TRACE_EVENT_FORMAT 118#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \
122 \
123struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \
127 .id = proto, \
128 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \
131 .define_fields = ftrace_define_fields_##call, \
132}; \
133static int ftrace_raw_init_event_##call(void) \
134{ \
135 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \
138} \
139
140#undef TRACE_EVENT_FORMAT_NOFILTER
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
93 \ 143 \
94static struct ftrace_event_call __used \ 144struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \ 145__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \ 146__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \ 147 .name = #call, \
98 .id = proto, \ 148 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \ 149 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \ 150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
101} 200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
102#include "trace_event_types.h" 206#include "trace_event_types.h"
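The third pass over trace_event_types.h above turns each TRACE_FIELD() into a trace_define_field() call, so the filter code learns every field's name, offset, size and signedness. A hand-expanded sketch for a hypothetical TRACE_EVENT_FORMAT(foo_bar, ...) with a single unsigned long ip field (names are illustrative only, mirroring the macro body above) would look roughly like this:

int ftrace_define_fields_foo_bar(void)
{
	struct ftrace_event_call *event_call = &event_foo_bar;
	struct foo_bar_entry field;		/* the event's TP_STRUCT type */
	int ret;

	/* Header fields shared by every trace entry. */
	__common_field(unsigned char, type, 0);
	__common_field(unsigned char, flags, 0);
	__common_field(unsigned char, preempt_count, 0);
	__common_field(int, pid, 1);
	__common_field(int, tgid, 1);

	/* One trace_define_field() per TRACE_FIELD() in the format. */
	ret = trace_define_field(event_call, "unsigned long", "ip",
				 offsetof(typeof(field), ip),
				 sizeof(field.ip),
				 is_signed_type(unsigned long));
	if (ret)
		return ret;

	return ret;
}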
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index c9a0b7df44ff..90f134764837 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -193,9 +193,11 @@ static void tracing_start_function_trace(void)
193static void tracing_stop_function_trace(void) 193static void tracing_stop_function_trace(void)
194{ 194{
195 ftrace_function_enabled = 0; 195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */ 196
197 unregister_ftrace_function(&trace_stack_ops); 197 if (func_flags.val & TRACE_FUNC_OPT_STACK)
198 unregister_ftrace_function(&trace_ops); 198 unregister_ftrace_function(&trace_stack_ops);
199 else
200 unregister_ftrace_function(&trace_ops);
199} 201}
200 202
201static int func_set_flag(u32 old_flags, u32 bit, int set) 203static int func_set_flag(u32 old_flags, u32 bit, int set)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a7..d2249abafb53 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,7 +57,8 @@ static struct tracer_flags tracer_flags = {
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
59int 59int
60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
61 unsigned long frame_pointer)
61{ 62{
62 unsigned long long calltime; 63 unsigned long long calltime;
63 int index; 64 int index;
@@ -65,6 +66,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
65 if (!current->ret_stack) 66 if (!current->ret_stack)
66 return -EBUSY; 67 return -EBUSY;
67 68
69 /*
70 * We must make sure the ret_stack is tested before we read
71 * anything else.
72 */
73 smp_rmb();
74
68 /* The return trace stack is full */ 75 /* The return trace stack is full */
69 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { 76 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
70 atomic_inc(&current->trace_overrun); 77 atomic_inc(&current->trace_overrun);
@@ -78,14 +85,17 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
78 current->ret_stack[index].ret = ret; 85 current->ret_stack[index].ret = ret;
79 current->ret_stack[index].func = func; 86 current->ret_stack[index].func = func;
80 current->ret_stack[index].calltime = calltime; 87 current->ret_stack[index].calltime = calltime;
88 current->ret_stack[index].subtime = 0;
89 current->ret_stack[index].fp = frame_pointer;
81 *depth = index; 90 *depth = index;
82 91
83 return 0; 92 return 0;
84} 93}
85 94
86/* Retrieve a function return address to the trace stack on thread info.*/ 95/* Retrieve a function return address to the trace stack on thread info.*/
87void 96static void
88ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) 97ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
98 unsigned long frame_pointer)
89{ 99{
90 int index; 100 int index;
91 101
@@ -99,28 +109,52 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
99 return; 109 return;
100 } 110 }
101 111
112#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
113 /*
114 * The arch may choose to record the frame pointer used
115 * and check it here to make sure that it is what we expect it
116 * to be. If gcc does not set the place holder of the return
117 * address in the frame pointer, and does a copy instead, then
118 * the function graph trace will fail. This test detects this
119 * case.
120 *
 121 * Currently, x86_32 compiled with -Os (optimize for size) makes the latest
122 * gcc do the above.
123 */
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n",
128 current->ret_stack[index].fp,
129 frame_pointer,
130 (void *)current->ret_stack[index].func,
131 current->ret_stack[index].ret);
132 *ret = (unsigned long)panic;
133 return;
134 }
135#endif
136
102 *ret = current->ret_stack[index].ret; 137 *ret = current->ret_stack[index].ret;
103 trace->func = current->ret_stack[index].func; 138 trace->func = current->ret_stack[index].func;
104 trace->calltime = current->ret_stack[index].calltime; 139 trace->calltime = current->ret_stack[index].calltime;
105 trace->overrun = atomic_read(&current->trace_overrun); 140 trace->overrun = atomic_read(&current->trace_overrun);
106 trace->depth = index; 141 trace->depth = index;
107 barrier();
108 current->curr_ret_stack--;
109
110} 142}
111 143
112/* 144/*
113 * Send the trace to the ring-buffer. 145 * Send the trace to the ring-buffer.
114 * @return the original return address. 146 * @return the original return address.
115 */ 147 */
116unsigned long ftrace_return_to_handler(void) 148unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
117{ 149{
118 struct ftrace_graph_ret trace; 150 struct ftrace_graph_ret trace;
119 unsigned long ret; 151 unsigned long ret;
120 152
121 ftrace_pop_return_trace(&trace, &ret); 153 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
122 trace.rettime = trace_clock_local(); 154 trace.rettime = trace_clock_local();
123 ftrace_graph_return(&trace); 155 ftrace_graph_return(&trace);
156 barrier();
157 current->curr_ret_stack--;
124 158
125 if (unlikely(!ret)) { 159 if (unlikely(!ret)) {
126 ftrace_graph_stop(); 160 ftrace_graph_stop();
@@ -426,8 +460,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
426 return TRACE_TYPE_HANDLED; 460 return TRACE_TYPE_HANDLED;
427} 461}
428 462
429static enum print_line_t 463enum print_line_t
430print_graph_duration(unsigned long long duration, struct trace_seq *s) 464trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
431{ 465{
432 unsigned long nsecs_rem = do_div(duration, 1000); 466 unsigned long nsecs_rem = do_div(duration, 1000);
433 /* log10(ULONG_MAX) + '\0' */ 467 /* log10(ULONG_MAX) + '\0' */
@@ -464,12 +498,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
464 if (!ret) 498 if (!ret)
465 return TRACE_TYPE_PARTIAL_LINE; 499 return TRACE_TYPE_PARTIAL_LINE;
466 } 500 }
501 return TRACE_TYPE_HANDLED;
502}
503
504static enum print_line_t
505print_graph_duration(unsigned long long duration, struct trace_seq *s)
506{
507 int ret;
508
509 ret = trace_print_graph_duration(duration, s);
510 if (ret != TRACE_TYPE_HANDLED)
511 return ret;
467 512
468 ret = trace_seq_printf(s, "| "); 513 ret = trace_seq_printf(s, "| ");
469 if (!ret) 514 if (!ret)
470 return TRACE_TYPE_PARTIAL_LINE; 515 return TRACE_TYPE_PARTIAL_LINE;
471 return TRACE_TYPE_HANDLED;
472 516
517 return TRACE_TYPE_HANDLED;
473} 518}
474 519
475/* Case of a leaf function on its call entry */ 520/* Case of a leaf function on its call entry */
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347f..ca7d7c4d0c2a 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,10 +1,9 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on BTS
3 * 3 *
4 * Copyright (C) 2008-2009 Intel Corporation. 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7#include <linux/spinlock.h>
8#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
9#include <linux/debugfs.h> 8#include <linux/debugfs.h>
10#include <linux/ftrace.h> 9#include <linux/ftrace.h>
@@ -15,110 +14,119 @@
15 14
16#include <asm/ds.h> 15#include <asm/ds.h>
17 16
18#include "trace.h"
19#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h"
20 19
21 20
22#define SIZEOF_BTS (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
23 22
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
35static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, tracer);
36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
37 25
38#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(tracer, smp_processor_id())
39#define this_buffer per_cpu(buffer, smp_processor_id())
40 27
41static int __read_mostly trace_hw_branches_enabled; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
42static struct trace_array *hw_branch_trace __read_mostly; 30static struct trace_array *hw_branch_trace __read_mostly;
43 31
44 32
45/* 33static void bts_trace_init_cpu(int cpu)
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
51static void bts_trace_start_cpu(void *arg)
52{ 34{
53 if (this_tracer) 35 per_cpu(tracer, cpu) =
54 ds_release_bts(this_tracer); 36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
55 37 NULL, (size_t)-1, BTS_KERNEL);
56 this_tracer = 38
57 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, 39 if (IS_ERR(per_cpu(tracer, cpu)))
58 /* ovfl = */ NULL, /* th = */ (size_t)-1, 40 per_cpu(tracer, cpu) = NULL;
59 BTS_KERNEL);
60 if (IS_ERR(this_tracer)) {
61 this_tracer = NULL;
62 return;
63 }
64} 41}
65 42
66static void bts_trace_start(struct trace_array *tr) 43static int bts_trace_init(struct trace_array *tr)
67{ 44{
68 spin_lock(&bts_tracer_lock); 45 int cpu;
46
47 hw_branch_trace = tr;
48 trace_hw_branches_enabled = 0;
69 49
70 on_each_cpu(bts_trace_start_cpu, NULL, 1); 50 get_online_cpus();
71 trace_hw_branches_enabled = 1; 51 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu);
72 53
73 spin_unlock(&bts_tracer_lock); 54 if (likely(per_cpu(tracer, cpu)))
55 trace_hw_branches_enabled = 1;
56 }
57 trace_hw_branches_suspended = 0;
58 put_online_cpus();
59
60 /* If we could not enable tracing on a single cpu, we fail. */
61 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
74} 62}
75 63
76/* 64static void bts_trace_reset(struct trace_array *tr)
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
82static void bts_trace_stop_cpu(void *arg)
83{ 65{
84 if (this_tracer) { 66 int cpu;
85 ds_release_bts(this_tracer); 67
86 this_tracer = NULL; 68 get_online_cpus();
69 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu));
72 per_cpu(tracer, cpu) = NULL;
73 }
87 } 74 }
75 trace_hw_branches_enabled = 0;
76 trace_hw_branches_suspended = 0;
77 put_online_cpus();
88} 78}
89 79
90static void bts_trace_stop(struct trace_array *tr) 80static void bts_trace_start(struct trace_array *tr)
91{ 81{
92 spin_lock(&bts_tracer_lock); 82 int cpu;
93 83
94 trace_hw_branches_enabled = 0; 84 get_online_cpus();
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1); 85 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu));
88 trace_hw_branches_suspended = 0;
89 put_online_cpus();
90}
96 91
97 spin_unlock(&bts_tracer_lock); 92static void bts_trace_stop(struct trace_array *tr)
93{
94 int cpu;
95
96 get_online_cpus();
97 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu));
100 trace_hw_branches_suspended = 1;
101 put_online_cpus();
98} 102}
99 103
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, 104static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu) 105 unsigned long action, void *hcpu)
102{ 106{
103 unsigned int cpu = (unsigned long)hcpu; 107 int cpu = (long)hcpu;
104
105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109 108
110 switch (action) { 109 switch (action) {
111 case CPU_ONLINE: 110 case CPU_ONLINE:
112 case CPU_DOWN_FAILED: 111 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); 112 /* The notification is sent with interrupts enabled. */
113 if (trace_hw_branches_enabled) {
114 bts_trace_init_cpu(cpu);
115
116 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu));
119 }
114 break; 120 break;
121
115 case CPU_DOWN_PREPARE: 122 case CPU_DOWN_PREPARE:
116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 123 /* The notification is sent with interrupts enabled. */
117 break; 124 if (likely(per_cpu(tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu));
126 per_cpu(tracer, cpu) = NULL;
127 }
118 } 128 }
119 129
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE; 130 return NOTIFY_DONE;
123} 131}
124 132
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler 134 .notifier_call = bts_hotcpu_handler
127}; 135};
128 136
129static int bts_trace_init(struct trace_array *tr)
130{
131 hw_branch_trace = tr;
132
133 bts_trace_start(tr);
134
135 return 0;
136}
137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
143static void bts_trace_print_header(struct seq_file *m) 137static void bts_trace_print_header(struct seq_file *m)
144{ 138{
145 seq_puts(m, "# CPU# TO <- FROM\n"); 139 seq_puts(m, "# CPU# TO <- FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
147 141
148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 142static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
149{ 143{
144 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
150 struct trace_entry *entry = iter->ent; 145 struct trace_entry *entry = iter->ent;
151 struct trace_seq *seq = &iter->seq; 146 struct trace_seq *seq = &iter->seq;
152 struct hw_branch_entry *it; 147 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
154 148
155 trace_assign_type(it, entry); 149 trace_assign_type(it, entry);
156 150
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
168 162
169void trace_hw_branch(u64 from, u64 to) 163void trace_hw_branch(u64 from, u64 to)
170{ 164{
165 struct ftrace_event_call *call = &event_hw_branch;
171 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
172 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
173 struct hw_branch_entry *entry; 168 struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
194 entry->ent.type = TRACE_HW_BRANCHES; 189 entry->ent.type = TRACE_HW_BRANCHES;
195 entry->from = from; 190 entry->from = from;
196 entry->to = to; 191 entry->to = to;
197 trace_buffer_unlock_commit(tr, event, 0, 0); 192 if (!filter_check_discard(call, entry, tr->buffer, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0);
198 194
199 out: 195 out:
200 atomic_dec(&tr->data[cpu]->disabled); 196 atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
224/* 220/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer. 221 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 * 222 *
227 * pre: bts_tracer_lock must be locked 223 * pre: tracing must be suspended on the current cpu
228 */ 224 */
229static void trace_bts_cpu(void *arg) 225static void trace_bts_cpu(void *arg)
230{ 226{
231 struct trace_array *tr = (struct trace_array *) arg; 227 struct trace_array *tr = (struct trace_array *)arg;
232 const struct bts_trace *trace; 228 const struct bts_trace *trace;
233 unsigned char *at; 229 unsigned char *at;
234 230
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
241 if (unlikely(!this_tracer)) 237 if (unlikely(!this_tracer))
242 return; 238 return;
243 239
244 ds_suspend_bts(this_tracer);
245 trace = ds_read_bts(this_tracer); 240 trace = ds_read_bts(this_tracer);
246 if (!trace) 241 if (!trace)
247 goto out; 242 return;
248 243
249 for (at = trace->ds.top; (void *)at < trace->ds.end; 244 for (at = trace->ds.top; (void *)at < trace->ds.end;
250 at += trace->ds.size) 245 at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
253 for (at = trace->ds.begin; (void *)at < trace->ds.top; 248 for (at = trace->ds.begin; (void *)at < trace->ds.top;
254 at += trace->ds.size) 249 at += trace->ds.size)
255 trace_bts_at(trace, at); 250 trace_bts_at(trace, at);
256
257out:
258 ds_resume_bts(this_tracer);
259} 251}
260 252
261static void trace_bts_prepare(struct trace_iterator *iter) 253static void trace_bts_prepare(struct trace_iterator *iter)
262{ 254{
263 spin_lock(&bts_tracer_lock); 255 int cpu;
264 256
257 get_online_cpus();
258 for_each_online_cpu(cpu)
259 if (likely(per_cpu(tracer, cpu)))
260 ds_suspend_bts(per_cpu(tracer, cpu));
261 /*
262 * We need to collect the trace on the respective cpu since ftrace
263 * implicitly adds the record for the current cpu.
264 * Once that is more flexible, we could collect the data from any cpu.
265 */
265 on_each_cpu(trace_bts_cpu, iter->tr, 1); 266 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266 267
267 spin_unlock(&bts_tracer_lock); 268 for_each_online_cpu(cpu)
269 if (likely(per_cpu(tracer, cpu)))
270 ds_resume_bts(per_cpu(tracer, cpu));
271 put_online_cpus();
268} 272}
269 273
270static void trace_bts_close(struct trace_iterator *iter) 274static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
274 278
275void trace_hw_branch_oops(void) 279void trace_hw_branch_oops(void)
276{ 280{
277 spin_lock(&bts_tracer_lock); 281 if (this_tracer) {
278 282 ds_suspend_bts_noirq(this_tracer);
279 trace_bts_cpu(hw_branch_trace); 283 trace_bts_cpu(hw_branch_trace);
280 284 ds_resume_bts_noirq(this_tracer);
281 spin_unlock(&bts_tracer_lock); 285 }
282} 286}
283 287
284struct tracer bts_tracer __read_mostly = 288struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
291 .start = bts_trace_start, 295 .start = bts_trace_start,
292 .stop = bts_trace_stop, 296 .stop = bts_trace_stop,
293 .open = trace_bts_prepare, 297 .open = trace_bts_prepare,
294 .close = trace_bts_close 298 .close = trace_bts_close,
299#ifdef CONFIG_FTRACE_SELFTEST
300 .selftest = trace_selftest_startup_hw_branches,
301#endif /* CONFIG_FTRACE_SELFTEST */
295}; 302};
296 303
297__init static int init_bts_trace(void) 304__init static int init_bts_trace(void)
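The rework above drops the global bts_tracer_lock and the on_each_cpu() IPIs in favour of walking the online-CPU mask under get_online_cpus(), with the hotplug notifier covering CPUs that come and go later. The general shape of that pattern, with invented resource names (my_resource_request() and friends are hypothetical, not the tracer's actual API), is roughly:

static DEFINE_PER_CPU(struct my_resource *, my_res);		/* hypothetical */

static void my_res_init_cpu(int cpu)
{
	per_cpu(my_res, cpu) = my_resource_request(cpu);	/* hypothetical helper */
	if (IS_ERR(per_cpu(my_res, cpu)))
		per_cpu(my_res, cpu) = NULL;
}

static int my_res_init_all(void)
{
	int cpu, enabled = 0;

	get_online_cpus();		/* block CPU hotplug while walking the mask */
	for_each_online_cpu(cpu) {
		my_res_init_cpu(cpu);
		if (per_cpu(my_res, cpu))
			enabled = 1;
	}
	put_online_cpus();

	/* CPUs that come online later are handled from the hotplug notifier. */
	return enabled ? 0 : -EOPNOTSUPP;
}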
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b4..d53b45ed0806 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/time.h>
13
12#include <asm/atomic.h> 14#include <asm/atomic.h>
13 15
14#include "trace.h" 16#include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
174 struct mmiotrace_rw *rw; 176 struct mmiotrace_rw *rw;
175 struct trace_seq *s = &iter->seq; 177 struct trace_seq *s = &iter->seq;
176 unsigned long long t = ns2usecs(iter->ts); 178 unsigned long long t = ns2usecs(iter->ts);
177 unsigned long usec_rem = do_div(t, 1000000ULL); 179 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
178 unsigned secs = (unsigned long)t; 180 unsigned secs = (unsigned long)t;
179 int ret = 1; 181 int ret = 1;
180 182
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
221 struct mmiotrace_map *m; 223 struct mmiotrace_map *m;
222 struct trace_seq *s = &iter->seq; 224 struct trace_seq *s = &iter->seq;
223 unsigned long long t = ns2usecs(iter->ts); 225 unsigned long long t = ns2usecs(iter->ts);
224 unsigned long usec_rem = do_div(t, 1000000ULL); 226 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
225 unsigned secs = (unsigned long)t; 227 unsigned secs = (unsigned long)t;
226 int ret; 228 int ret;
227 229
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64b54a59c55b..7938f3ae93e3 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,11 +14,25 @@
14/* must be a power of 2 */ 14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128 15#define EVENT_HASHSIZE 128
16 16
17static DEFINE_MUTEX(trace_event_mutex); 17DECLARE_RWSEM(trace_event_mutex);
18
19DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
20EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
21
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; 22static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19 23
20static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
21 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29
30 s->buffer[len] = 0;
31 seq_puts(m, s->buffer);
32
33 trace_seq_init(s);
34}
35
22enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 36enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
23{ 37{
24 struct trace_seq *s = &iter->seq; 38 struct trace_seq *s = &iter->seq;
@@ -84,6 +98,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
84 98
85 return len; 99 return len;
86} 100}
101EXPORT_SYMBOL_GPL(trace_seq_printf);
102
103/**
104 * trace_seq_vprintf - sequence printing of trace information
105 * @s: trace sequence descriptor
106 * @fmt: printf format string
107 *
108 * The tracer may use either sequence operations or its own
 109 * copy to user routines. To simplify formatting of a trace,
110 * trace_seq_printf is used to store strings into a special
111 * buffer (@s). Then the output may be either used by
112 * the sequencer or pulled into another buffer.
113 */
114int
115trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
116{
117 int len = (PAGE_SIZE - 1) - s->len;
118 int ret;
119
120 if (!len)
121 return 0;
122
123 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
124
125 /* If we can't write it all, don't bother writing anything */
126 if (ret >= len)
127 return 0;
128
129 s->len += ret;
130
131 return len;
132}
133EXPORT_SYMBOL_GPL(trace_seq_vprintf);
87 134
88int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) 135int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
89{ 136{
@@ -201,6 +248,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
201 return 0; 248 return 0;
202} 249}
203 250
251const char *
252ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
253 unsigned long flags,
254 const struct trace_print_flags *flag_array)
255{
256 unsigned long mask;
257 const char *str;
258 const char *ret = p->buffer + p->len;
259 int i;
260
261 for (i = 0; flag_array[i].name && flags; i++) {
262
263 mask = flag_array[i].mask;
264 if ((flags & mask) != mask)
265 continue;
266
267 str = flag_array[i].name;
268 flags &= ~mask;
269 if (p->len && delim)
270 trace_seq_puts(p, delim);
271 trace_seq_puts(p, str);
272 }
273
274 /* check for left over flags */
275 if (flags) {
276 if (p->len && delim)
277 trace_seq_puts(p, delim);
278 trace_seq_printf(p, "0x%lx", flags);
279 }
280
281 trace_seq_putc(p, 0);
282
283 return ret;
284}
285EXPORT_SYMBOL(ftrace_print_flags_seq);
286
287const char *
288ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
289 const struct trace_print_flags *symbol_array)
290{
291 int i;
292 const char *ret = p->buffer + p->len;
293
294 for (i = 0; symbol_array[i].name; i++) {
295
296 if (val != symbol_array[i].mask)
297 continue;
298
299 trace_seq_puts(p, symbol_array[i].name);
300 break;
301 }
302
303 if (!p->len)
304 trace_seq_printf(p, "0x%lx", val);
305
306 trace_seq_putc(p, 0);
307
308 return ret;
309}
310EXPORT_SYMBOL(ftrace_print_symbols_seq);
311
204#ifdef CONFIG_KRETPROBES 312#ifdef CONFIG_KRETPROBES
205static inline const char *kretprobed(const char *name) 313static inline const char *kretprobed(const char *name)
206{ 314{
@@ -311,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
311 419
312 if (ip == ULONG_MAX || !ret) 420 if (ip == ULONG_MAX || !ret)
313 break; 421 break;
314 if (i && ret) 422 if (ret)
315 ret = trace_seq_puts(s, " <- "); 423 ret = trace_seq_puts(s, " => ");
316 if (!ip) { 424 if (!ip) {
317 if (ret) 425 if (ret)
318 ret = trace_seq_puts(s, "??"); 426 ret = trace_seq_puts(s, "??");
427 if (ret)
428 ret = trace_seq_puts(s, "\n");
319 continue; 429 continue;
320 } 430 }
321 if (!ret) 431 if (!ret)
322 break; 432 break;
323 if (ret) 433 if (ret)
324 ret = seq_print_user_ip(s, mm, ip, sym_flags); 434 ret = seq_print_user_ip(s, mm, ip, sym_flags);
435 ret = trace_seq_puts(s, "\n");
325 } 436 }
326 437
327 if (mm) 438 if (mm)
@@ -455,6 +566,7 @@ static int task_state_char(unsigned long state)
455 * @type: the type of event to look for 566 * @type: the type of event to look for
456 * 567 *
457 * Returns an event of type @type otherwise NULL 568 * Returns an event of type @type otherwise NULL
569 * Called with trace_event_read_lock() held.
458 */ 570 */
459struct trace_event *ftrace_find_event(int type) 571struct trace_event *ftrace_find_event(int type)
460{ 572{
@@ -464,7 +576,7 @@ struct trace_event *ftrace_find_event(int type)
464 576
465 key = type & (EVENT_HASHSIZE - 1); 577 key = type & (EVENT_HASHSIZE - 1);
466 578
467 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { 579 hlist_for_each_entry(event, n, &event_hash[key], node) {
468 if (event->type == type) 580 if (event->type == type)
469 return event; 581 return event;
470 } 582 }
@@ -472,6 +584,46 @@ struct trace_event *ftrace_find_event(int type)
472 return NULL; 584 return NULL;
473} 585}
474 586
587static LIST_HEAD(ftrace_event_list);
588
589static int trace_search_list(struct list_head **list)
590{
591 struct trace_event *e;
592 int last = __TRACE_LAST_TYPE;
593
594 if (list_empty(&ftrace_event_list)) {
595 *list = &ftrace_event_list;
596 return last + 1;
597 }
598
599 /*
600 * We used up all possible max events,
601 * lets see if somebody freed one.
602 */
603 list_for_each_entry(e, &ftrace_event_list, list) {
604 if (e->type != last + 1)
605 break;
606 last++;
607 }
608
 609 /* Did we use up all 65 thousand events? */
610 if ((last + 1) > FTRACE_MAX_EVENT)
611 return 0;
612
613 *list = &e->list;
614 return last + 1;
615}
616
617void trace_event_read_lock(void)
618{
619 down_read(&trace_event_mutex);
620}
621
622void trace_event_read_unlock(void)
623{
624 up_read(&trace_event_mutex);
625}
626
475/** 627/**
476 * register_ftrace_event - register output for an event type 628 * register_ftrace_event - register output for an event type
477 * @event: the event type to register 629 * @event: the event type to register
@@ -492,22 +644,42 @@ int register_ftrace_event(struct trace_event *event)
492 unsigned key; 644 unsigned key;
493 int ret = 0; 645 int ret = 0;
494 646
495 mutex_lock(&trace_event_mutex); 647 down_write(&trace_event_mutex);
496 648
497 if (!event) { 649 if (WARN_ON(!event))
498 ret = next_event_type++;
499 goto out; 650 goto out;
500 }
501 651
502 if (!event->type) 652 INIT_LIST_HEAD(&event->list);
503 event->type = next_event_type++; 653
504 else if (event->type > __TRACE_LAST_TYPE) { 654 if (!event->type) {
655 struct list_head *list = NULL;
656
657 if (next_event_type > FTRACE_MAX_EVENT) {
658
659 event->type = trace_search_list(&list);
660 if (!event->type)
661 goto out;
662
663 } else {
664
665 event->type = next_event_type++;
666 list = &ftrace_event_list;
667 }
668
669 if (WARN_ON(ftrace_find_event(event->type)))
670 goto out;
671
672 list_add_tail(&event->list, list);
673
674 } else if (event->type > __TRACE_LAST_TYPE) {
505 printk(KERN_WARNING "Need to add type to trace.h\n"); 675 printk(KERN_WARNING "Need to add type to trace.h\n");
506 WARN_ON(1); 676 WARN_ON(1);
507 }
508
509 if (ftrace_find_event(event->type))
510 goto out; 677 goto out;
678 } else {
679 /* Is this event already used */
680 if (ftrace_find_event(event->type))
681 goto out;
682 }
511 683
512 if (event->trace == NULL) 684 if (event->trace == NULL)
513 event->trace = trace_nop_print; 685 event->trace = trace_nop_print;
@@ -520,14 +692,25 @@ int register_ftrace_event(struct trace_event *event)
520 692
521 key = event->type & (EVENT_HASHSIZE - 1); 693 key = event->type & (EVENT_HASHSIZE - 1);
522 694
523 hlist_add_head_rcu(&event->node, &event_hash[key]); 695 hlist_add_head(&event->node, &event_hash[key]);
524 696
525 ret = event->type; 697 ret = event->type;
526 out: 698 out:
527 mutex_unlock(&trace_event_mutex); 699 up_write(&trace_event_mutex);
528 700
529 return ret; 701 return ret;
530} 702}
703EXPORT_SYMBOL_GPL(register_ftrace_event);
704
705/*
706 * Used by module code with the trace_event_mutex held for write.
707 */
708int __unregister_ftrace_event(struct trace_event *event)
709{
710 hlist_del(&event->node);
711 list_del(&event->list);
712 return 0;
713}
531 714
532/** 715/**
533 * unregister_ftrace_event - remove a no longer used event 716 * unregister_ftrace_event - remove a no longer used event
@@ -535,12 +718,13 @@ int register_ftrace_event(struct trace_event *event)
535 */ 718 */
536int unregister_ftrace_event(struct trace_event *event) 719int unregister_ftrace_event(struct trace_event *event)
537{ 720{
538 mutex_lock(&trace_event_mutex); 721 down_write(&trace_event_mutex);
539 hlist_del(&event->node); 722 __unregister_ftrace_event(event);
540 mutex_unlock(&trace_event_mutex); 723 up_write(&trace_event_mutex);
541 724
542 return 0; 725 return 0;
543} 726}
727EXPORT_SYMBOL_GPL(unregister_ftrace_event);
544 728
545/* 729/*
546 * Standard events 730 * Standard events
@@ -833,14 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
833 1017
834 trace_assign_type(field, iter->ent); 1018 trace_assign_type(field, iter->ent);
835 1019
1020 if (!trace_seq_puts(s, "<stack trace>\n"))
1021 goto partial;
836 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1022 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
837 if (i) { 1023 if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
838 if (!trace_seq_puts(s, " <= ")) 1024 break;
839 goto partial; 1025 if (!trace_seq_puts(s, " => "))
1026 goto partial;
840 1027
841 if (!seq_print_ip_sym(s, field->caller[i], flags)) 1028 if (!seq_print_ip_sym(s, field->caller[i], flags))
842 goto partial; 1029 goto partial;
843 }
844 if (!trace_seq_puts(s, "\n")) 1030 if (!trace_seq_puts(s, "\n"))
845 goto partial; 1031 goto partial;
846 } 1032 }
@@ -868,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
868 1054
869 trace_assign_type(field, iter->ent); 1055 trace_assign_type(field, iter->ent);
870 1056
871 if (!seq_print_userip_objs(field, s, flags)) 1057 if (!trace_seq_puts(s, "<user stack trace>\n"))
872 goto partial; 1058 goto partial;
873 1059
874 if (!trace_seq_putc(s, '\n')) 1060 if (!seq_print_userip_objs(field, s, flags))
875 goto partial; 1061 goto partial;
876 1062
877 return TRACE_TYPE_HANDLED; 1063 return TRACE_TYPE_HANDLED;
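ftrace_print_flags_seq() above decodes a flags word into a delimiter-separated list of names and prints any bits left without a name as a raw hex value. A minimal user-space illustration of the same decoding loop (the table, names and values are made up):

#include <stdio.h>

struct flag_name { unsigned long mask; const char *name; };

static void print_flags(unsigned long flags, const char *delim,
			const struct flag_name *tbl)
{
	int printed = 0;

	for (; tbl->name && flags; tbl++) {
		if ((flags & tbl->mask) != tbl->mask)
			continue;
		flags &= ~tbl->mask;
		printf("%s%s", printed++ ? delim : "", tbl->name);
	}
	if (flags)				/* leftover, unnamed bits */
		printf("%s0x%lx", printed ? delim : "", flags);
	printf("\n");
}

int main(void)
{
	static const struct flag_name tbl[] = {
		{ 0x1, "IRQS_OFF" }, { 0x2, "NEED_RESCHED" }, { 0, NULL },
	};

	print_flags(0x1 | 0x2 | 0x8, "|", tbl);	/* prints IRQS_OFF|NEED_RESCHED|0x8 */
	return 0;
}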
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index e0bde39c2dd9..d38bec4a9c30 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,41 +1,17 @@
1#ifndef __TRACE_EVENTS_H 1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H 2#define __TRACE_EVENTS_H
3 3
4#include <linux/trace_seq.h>
4#include "trace.h" 5#include "trace.h"
5 6
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern enum print_line_t 7extern enum print_line_t
19trace_print_bprintk_msg_only(struct trace_iterator *iter); 8trace_print_bprintk_msg_only(struct trace_iterator *iter);
20extern enum print_line_t 9extern enum print_line_t
21trace_print_printk_msg_only(struct trace_iterator *iter); 10trace_print_printk_msg_only(struct trace_iterator *iter);
22 11
23extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
24 __attribute__ ((format (printf, 2, 3)));
25extern int
26trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
27extern int 12extern int
28seq_print_ip_sym(struct trace_seq *s, unsigned long ip, 13seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
29 unsigned long sym_flags); 14 unsigned long sym_flags);
30extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
31 size_t cnt);
32extern int trace_seq_puts(struct trace_seq *s, const char *str);
33extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
34extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
35extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
36 size_t len);
37extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
38extern int trace_seq_path(struct trace_seq *s, struct path *path);
39extern int seq_print_userip_objs(const struct userstack_entry *entry, 15extern int seq_print_userip_objs(const struct userstack_entry *entry,
40 struct trace_seq *s, unsigned long sym_flags); 16 struct trace_seq *s, unsigned long sym_flags);
41extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, 17extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
44extern int trace_print_context(struct trace_iterator *iter); 20extern int trace_print_context(struct trace_iterator *iter);
45extern int trace_print_lat_context(struct trace_iterator *iter); 21extern int trace_print_lat_context(struct trace_iterator *iter);
46 22
23extern void trace_event_read_lock(void);
24extern void trace_event_read_unlock(void);
47extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
48extern int register_ftrace_event(struct trace_event *event);
49extern int unregister_ftrace_event(struct trace_event *event);
50 26
51extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
52 int flags); 28 int flags);
53 29
30/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event);
32extern struct rw_semaphore trace_event_mutex;
33
54#define MAX_MEMHEX_BYTES 8 34#define MAX_MEMHEX_BYTES 8
55#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) 35#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
56 36
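With register_ftrace_event() now exported and able to hand out (and recycle) event type ids on its own, a module can plug a custom output routine into the trace stream. A hedged sketch, with an invented event and message, might look like this; as the register path above shows, a NULL ->trace callback would be defaulted to trace_nop_print().

static enum print_line_t my_event_trace(struct trace_iterator *iter, int flags)
{
	struct trace_seq *s = &iter->seq;

	if (!trace_seq_printf(s, "my event fired\n"))
		return TRACE_TYPE_PARTIAL_LINE;
	return TRACE_TYPE_HANDLED;
}

static struct trace_event my_trace_event = {
	.type	= 0,			/* 0: let register_ftrace_event() pick an id */
	.trace	= my_event_trace,
};

static int __init my_event_init(void)
{
	int type = register_ftrace_event(&my_trace_event);

	return type ? 0 : -ENOSPC;	/* 0 means no id could be assigned */
}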
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 118439709fb7..8a30d9874cd4 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
36 36
37static void probe_power_end(struct power_trace *it) 37static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power;
39 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
40 struct trace_power *entry; 41 struct trace_power *entry;
41 struct trace_array_cpu *data; 42 struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
54 goto out; 55 goto out;
55 entry = ring_buffer_event_data(event); 56 entry = ring_buffer_event_data(event);
56 entry->state_data = *it; 57 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0); 58 if (!filter_check_discard(call, entry, tr->buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out: 60 out:
59 preempt_enable(); 61 preempt_enable();
60} 62}
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
62static void probe_power_mark(struct power_trace *it, unsigned int type, 64static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level) 65 unsigned int level)
64{ 66{
67 struct ftrace_event_call *call = &event_power;
65 struct ring_buffer_event *event; 68 struct ring_buffer_event *event;
66 struct trace_power *entry; 69 struct trace_power *entry;
67 struct trace_array_cpu *data; 70 struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
84 goto out; 87 goto out;
85 entry = ring_buffer_event_data(event); 88 entry = ring_buffer_event_data(event);
86 entry->state_data = *it; 89 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0); 90 if (!filter_check_discard(call, entry, tr->buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out: 92 out:
89 preempt_enable(); 93 preempt_enable();
90} 94}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107fe..9bece9687b62 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
245static __init int init_trace_printk_function_export(void) 245static __init int init_trace_printk_function_export(void)
246{ 246{
247 struct dentry *d_tracer; 247 struct dentry *d_tracer;
248 struct dentry *entry;
249 248
250 d_tracer = tracing_init_dentry(); 249 d_tracer = tracing_init_dentry();
251 if (!d_tracer) 250 if (!d_tracer)
252 return 0; 251 return 0;
253 252
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer, 253 trace_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops); 254 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259 255
260 return 0; 256 return 0;
261} 257}
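trace_create_file() replaces the open-coded debugfs_create_file()-plus-warning pattern here and in trace_stack.c below. Its body is not part of this diff; presumably it is a thin wrapper along these lines (a sketch, not the actual implementation):

struct dentry *trace_create_file(const char *name, mode_t mode,
				 struct dentry *parent, void *data,
				 const struct file_operations *fops)
{
	struct dentry *ret;

	ret = debugfs_create_file(name, mode, parent, data, fops);
	if (!ret)
		pr_warning("Could not create debugfs '%s' entry\n", name);

	return ret;
}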
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1ae..a98106dd979c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h> 13#include <trace/events/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
29 int cpu; 29 int cpu;
30 int pc; 30 int pc;
31 31
32 if (!sched_ref || sched_stopped) 32 if (unlikely(!sched_ref))
33 return; 33 return;
34 34
35 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
36 tracing_record_cmdline(next); 36 tracing_record_cmdline(next);
37 37
38 if (!tracer_enabled) 38 if (!tracer_enabled || sched_stopped)
39 return; 39 return;
40 40
41 pc = preempt_count(); 41 pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
56 unsigned long flags; 56 unsigned long flags;
57 int cpu, pc; 57 int cpu, pc;
58 58
59 if (!likely(tracer_enabled)) 59 if (unlikely(!sched_ref))
60 return; 60 return;
61 61
62 pc = preempt_count();
63 tracing_record_cmdline(current); 62 tracing_record_cmdline(current);
64 63
65 if (sched_stopped) 64 if (!tracer_enabled || sched_stopped)
66 return; 65 return;
67 66
67 pc = preempt_count();
68 local_irq_save(flags); 68 local_irq_save(flags);
69 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
70 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153e..eacb27225173 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <trace/sched.h> 18#include <trace/events/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
138 138
139 pc = preempt_count(); 139 pc = preempt_count();
140 140
141 /* The task we are waiting for is waking up */
142 data = wakeup_trace->data[wakeup_cpu];
143
144 /* disable local data, not wakeup_cpu data */ 141 /* disable local data, not wakeup_cpu data */
145 cpu = raw_smp_processor_id(); 142 cpu = raw_smp_processor_id();
146 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); 143 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
154 if (unlikely(!tracer_enabled || next != wakeup_task)) 151 if (unlikely(!tracer_enabled || next != wakeup_task))
155 goto out_unlock; 152 goto out_unlock;
156 153
154 /* The task we are waiting for is waking up */
155 data = wakeup_trace->data[wakeup_cpu];
156
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 159
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 08f4eb2763d1..00dd6485bdd7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 18 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
19 return 1; 20 return 1;
20 } 21 }
21 return 0; 22 return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
188#else 189#else
189# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) 190# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
190#endif /* CONFIG_DYNAMIC_FTRACE */ 191#endif /* CONFIG_DYNAMIC_FTRACE */
192
191/* 193/*
192 * Simple verification test of ftrace function tracer. 194 * Simple verification test of ftrace function tracer.
193 * Enable ftrace, sleep 1/10 second, and then read the trace 195 * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
749 return ret; 751 return ret;
750} 752}
751#endif /* CONFIG_BRANCH_TRACER */ 753#endif /* CONFIG_BRANCH_TRACER */
754
755#ifdef CONFIG_HW_BRANCH_TRACER
756int
757trace_selftest_startup_hw_branches(struct tracer *trace,
758 struct trace_array *tr)
759{
760 struct trace_iterator *iter;
761 struct tracer tracer;
762 unsigned long count;
763 int ret;
764
765 if (!trace->open) {
766 printk(KERN_CONT "missing open function...");
767 return -1;
768 }
769
770 ret = tracer_init(trace, tr);
771 if (ret) {
772 warn_failed_init_tracer(trace, ret);
773 return ret;
774 }
775
776 /*
777 * The hw-branch tracer needs to collect the trace from the various
778 * cpu trace buffers - before tracing is stopped.
779 */
780 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
781 if (!iter)
782 return -ENOMEM;
783
784 memcpy(&tracer, trace, sizeof(tracer));
785
786 iter->trace = &tracer;
787 iter->tr = tr;
788 iter->pos = -1;
789 mutex_init(&iter->mutex);
790
791 trace->open(iter);
792
793 mutex_destroy(&iter->mutex);
794 kfree(iter);
795
796 tracing_stop();
797
798 ret = trace_test_buffer(tr, &count);
799 trace->reset(tr);
800 tracing_start();
801
802 if (!ret && !count) {
803 printk(KERN_CONT "no entries found..");
804 ret = -1;
805 }
806
807 return ret;
808}
809#endif /* CONFIG_HW_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f9661..2d7aebd71dbd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
265 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
266 " (%d entries)\n" 266 " (%d entries)\n"
267 " ----- ---- --------\n", 267 " ----- ---- --------\n",
268 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries - 1);
269 269
270 if (!stack_tracer_enabled && !max_stack_size) 270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m); 271 print_disabled(m);
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
352static __init int stack_trace_init(void) 352static __init int stack_trace_init(void)
353{ 353{
354 struct dentry *d_tracer; 354 struct dentry *d_tracer;
355 struct dentry *entry;
356 355
357 d_tracer = tracing_init_dentry(); 356 d_tracer = tracing_init_dentry();
358 357
359 entry = debugfs_create_file("stack_max_size", 0644, d_tracer, 358 trace_create_file("stack_max_size", 0644, d_tracer,
360 &max_stack_size, &stack_max_size_fops); 359 &max_stack_size, &stack_max_size_fops);
361 if (!entry)
362 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
363 360
364 entry = debugfs_create_file("stack_trace", 0444, d_tracer, 361 trace_create_file("stack_trace", 0444, d_tracer,
365 NULL, &stack_trace_fops); 362 NULL, &stack_trace_fops);
366 if (!entry)
367 pr_warning("Could not create debugfs 'stack_trace' entry\n");
368 363
369 if (stack_tracer_enabled) 364 if (stack_tracer_enabled)
370 register_ftrace_function(&trace_ops); 365 register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index acdebd771a93..c00643733f4c 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Infrastructure for statistic tracing (histogram output). 2 * Infrastructure for statistic tracing (histogram output).
3 * 3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * 5 *
6 * Based on the code from trace_branch.c which is 6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,22 +10,27 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/rbtree.h>
13#include <linux/debugfs.h> 14#include <linux/debugfs.h>
14#include "trace_stat.h" 15#include "trace_stat.h"
15#include "trace.h" 16#include "trace.h"
16 17
17 18
18/* List of stat entries from a tracer */ 19/*
19struct trace_stat_list { 20 * List of stat red-black nodes from a tracer
 20 struct list_head list; 21 * We use such a tree to quickly sort the stat
22 * entries from the tracer.
23 */
24struct stat_node {
25 struct rb_node node;
21 void *stat; 26 void *stat;
22}; 27};
23 28
24/* A stat session is the stats output in one file */ 29/* A stat session is the stats output in one file */
25struct tracer_stat_session { 30struct stat_session {
26 struct list_head session_list; 31 struct list_head session_list;
27 struct tracer_stat *ts; 32 struct tracer_stat *ts;
28 struct list_head stat_list; 33 struct rb_root stat_root;
29 struct mutex stat_mutex; 34 struct mutex stat_mutex;
30 struct dentry *file; 35 struct dentry *file;
31}; 36};
@@ -37,18 +42,48 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
37/* The root directory for all stat files */ 42/* The root directory for all stat files */
38static struct dentry *stat_dir; 43static struct dentry *stat_dir;
39 44
45/*
46 * Iterate through the rbtree using a post order traversal path
47 * to release the next node.
 48 * It won't necessarily release one at each iteration,
49 * but it will at least advance closer to the next one
50 * to be released.
51 */
52static struct rb_node *release_next(struct rb_node *node)
53{
54 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node);
56
57 if (node->rb_left)
58 return node->rb_left;
59 else if (node->rb_right)
60 return node->rb_right;
61 else {
62 if (!parent)
63 ;
64 else if (parent->rb_left == node)
65 parent->rb_left = NULL;
66 else
67 parent->rb_right = NULL;
68
69 snode = container_of(node, struct stat_node, node);
70 kfree(snode);
71
72 return parent;
73 }
74}
40 75
41static void reset_stat_session(struct tracer_stat_session *session) 76static void reset_stat_session(struct stat_session *session)
42{ 77{
43 struct trace_stat_list *node, *next; 78 struct rb_node *node = session->stat_root.rb_node;
44 79
45 list_for_each_entry_safe(node, next, &session->stat_list, list) 80 while (node)
46 kfree(node); 81 node = release_next(node);
47 82
48 INIT_LIST_HEAD(&session->stat_list); 83 session->stat_root = RB_ROOT;
49} 84}
50 85
51static void destroy_session(struct tracer_stat_session *session) 86static void destroy_session(struct stat_session *session)
52{ 87{
53 debugfs_remove(session->file); 88 debugfs_remove(session->file);
54 reset_stat_session(session); 89 reset_stat_session(session);
@@ -56,25 +91,60 @@ static void destroy_session(struct tracer_stat_session *session)
56 kfree(session); 91 kfree(session);
57} 92}
58 93
94typedef int (*cmp_stat_t)(void *, void *);
95
96static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
97{
98 struct rb_node **new = &(root->rb_node), *parent = NULL;
99 struct stat_node *data;
100
101 data = kzalloc(sizeof(*data), GFP_KERNEL);
102 if (!data)
103 return -ENOMEM;
104 data->stat = stat;
105
106 /*
107 * Figure out where to put new node
 108 * This is a descending sort
109 */
110 while (*new) {
111 struct stat_node *this;
112 int result;
113
114 this = container_of(*new, struct stat_node, node);
115 result = cmp(data->stat, this->stat);
116
117 parent = *new;
118 if (result >= 0)
119 new = &((*new)->rb_left);
120 else
121 new = &((*new)->rb_right);
122 }
123
124 rb_link_node(&data->node, parent, new);
125 rb_insert_color(&data->node, root);
126 return 0;
127}
128
59/* 129/*
60 * For tracers that don't provide a stat_cmp callback. 130 * For tracers that don't provide a stat_cmp callback.
61 * This one will force an immediate insertion on tail of 131 * This one will force an insertion as the right-most node
62 * the list. 132 * in the rbtree.
63 */ 133 */
64static int dummy_cmp(void *p1, void *p2) 134static int dummy_cmp(void *p1, void *p2)
65{ 135{
66 return 1; 136 return -1;
67} 137}
68 138
69/* 139/*
70 * Initialize the stat list at each trace_stat file opening. 140 * Initialize the stat rbtree at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening 141 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions. 142 * since the stats could have changed between two file sessions.
73 */ 143 */
74static int stat_seq_init(struct tracer_stat_session *session) 144static int stat_seq_init(struct stat_session *session)
75{ 145{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts; 146 struct tracer_stat *ts = session->ts;
147 struct rb_root *root = &session->stat_root;
78 void *stat; 148 void *stat;
79 int ret = 0; 149 int ret = 0;
80 int i; 150 int i;
@@ -85,29 +155,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
85 if (!ts->stat_cmp) 155 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp; 156 ts->stat_cmp = dummy_cmp;
87 157
88 stat = ts->stat_start(); 158 stat = ts->stat_start(ts);
89 if (!stat) 159 if (!stat)
90 goto exit; 160 goto exit;
91 161
92 /* 162 ret = insert_stat(root, stat, ts->stat_cmp);
93 * The first entry. Actually this is the second, but the first 163 if (ret)
94 * one (the stat_list head) is pointless.
95 */
96 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
97 if (!new_entry) {
98 ret = -ENOMEM;
99 goto exit; 164 goto exit;
100 }
101
102 INIT_LIST_HEAD(&new_entry->list);
103
104 list_add(&new_entry->list, &session->stat_list);
105
106 new_entry->stat = stat;
107 165
108 /* 166 /*
109 * Iterate over the tracer stat entries and store them in a sorted 167 * Iterate over the tracer stat entries and store them in an rbtree.
110 * list.
111 */ 168 */
112 for (i = 1; ; i++) { 169 for (i = 1; ; i++) {
113 stat = ts->stat_next(stat, i); 170 stat = ts->stat_next(stat, i);
@@ -116,36 +173,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
116 if (!stat) 173 if (!stat)
117 break; 174 break;
118 175
119 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); 176 ret = insert_stat(root, stat, ts->stat_cmp);
120 if (!new_entry) { 177 if (ret)
121 ret = -ENOMEM; 178 goto exit_free_rbtree;
122 goto exit_free_list;
123 }
124
125 INIT_LIST_HEAD(&new_entry->list);
126 new_entry->stat = stat;
127
128 list_for_each_entry_reverse(iter_entry, &session->stat_list,
129 list) {
130
131 /* Insertion with a descendent sorting */
132 if (ts->stat_cmp(iter_entry->stat,
133 new_entry->stat) >= 0) {
134
135 list_add(&new_entry->list, &iter_entry->list);
136 break;
137 }
138 }
139
140 /* The current larger value */
141 if (list_empty(&new_entry->list))
142 list_add(&new_entry->list, &session->stat_list);
143 } 179 }
180
144exit: 181exit:
145 mutex_unlock(&session->stat_mutex); 182 mutex_unlock(&session->stat_mutex);
146 return ret; 183 return ret;
147 184
148exit_free_list: 185exit_free_rbtree:
149 reset_stat_session(session); 186 reset_stat_session(session);
150 mutex_unlock(&session->stat_mutex); 187 mutex_unlock(&session->stat_mutex);
151 return ret; 188 return ret;
@@ -154,38 +191,51 @@ exit_free_list:
154 191
155static void *stat_seq_start(struct seq_file *s, loff_t *pos) 192static void *stat_seq_start(struct seq_file *s, loff_t *pos)
156{ 193{
157 struct tracer_stat_session *session = s->private; 194 struct stat_session *session = s->private;
195 struct rb_node *node;
196 int i;
158 197
159 /* Prevent from tracer switch or stat_list modification */ 198 /* Prevent a tracer switch or rbtree modification */
160 mutex_lock(&session->stat_mutex); 199 mutex_lock(&session->stat_mutex);
161 200
162 /* If we are in the beginning of the file, print the headers */ 201 /* If we are in the beginning of the file, print the headers */
163 if (!*pos && session->ts->stat_headers) 202 if (!*pos && session->ts->stat_headers) {
203 (*pos)++;
164 return SEQ_START_TOKEN; 204 return SEQ_START_TOKEN;
205 }
165 206
166 return seq_list_start(&session->stat_list, *pos); 207 node = rb_first(&session->stat_root);
208 for (i = 0; node && i < *pos; i++)
209 node = rb_next(node);
210
211 (*pos)++;
212
213 return node;
167} 214}
168 215
169static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) 216static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
170{ 217{
171 struct tracer_stat_session *session = s->private; 218 struct stat_session *session = s->private;
219 struct rb_node *node = p;
220
221 (*pos)++;
172 222
173 if (p == SEQ_START_TOKEN) 223 if (p == SEQ_START_TOKEN)
174 return seq_list_start(&session->stat_list, *pos); 224 return rb_first(&session->stat_root);
175 225
176 return seq_list_next(p, &session->stat_list, pos); 226 return rb_next(node);
177} 227}
178 228
179static void stat_seq_stop(struct seq_file *s, void *p) 229static void stat_seq_stop(struct seq_file *s, void *p)
180{ 230{
181 struct tracer_stat_session *session = s->private; 231 struct stat_session *session = s->private;
182 mutex_unlock(&session->stat_mutex); 232 mutex_unlock(&session->stat_mutex);
183} 233}
184 234
185static int stat_seq_show(struct seq_file *s, void *v) 235static int stat_seq_show(struct seq_file *s, void *v)
186{ 236{
187 struct tracer_stat_session *session = s->private; 237 struct stat_session *session = s->private;
188 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); 238 struct stat_node *l = container_of(v, struct stat_node, node);
189 239
190 if (v == SEQ_START_TOKEN) 240 if (v == SEQ_START_TOKEN)
191 return session->ts->stat_headers(s); 241 return session->ts->stat_headers(s);
@@ -205,7 +255,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
205{ 255{
206 int ret; 256 int ret;
207 257
208 struct tracer_stat_session *session = inode->i_private; 258 struct stat_session *session = inode->i_private;
209 259
210 ret = seq_open(file, &trace_stat_seq_ops); 260 ret = seq_open(file, &trace_stat_seq_ops);
211 if (!ret) { 261 if (!ret) {
@@ -218,11 +268,11 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
218} 268}
219 269
220/* 270/*
221 * Avoid consuming memory with our now useless list. 271 * Avoid consuming memory with our now useless rbtree.
222 */ 272 */
223static int tracing_stat_release(struct inode *i, struct file *f) 273static int tracing_stat_release(struct inode *i, struct file *f)
224{ 274{
225 struct tracer_stat_session *session = i->i_private; 275 struct stat_session *session = i->i_private;
226 276
227 mutex_lock(&session->stat_mutex); 277 mutex_lock(&session->stat_mutex);
228 reset_stat_session(session); 278 reset_stat_session(session);
@@ -251,7 +301,7 @@ static int tracing_stat_init(void)
251 return 0; 301 return 0;
252} 302}
253 303
254static int init_stat_file(struct tracer_stat_session *session) 304static int init_stat_file(struct stat_session *session)
255{ 305{
256 if (!stat_dir && tracing_stat_init()) 306 if (!stat_dir && tracing_stat_init())
257 return -ENODEV; 307 return -ENODEV;
@@ -266,7 +316,7 @@ static int init_stat_file(struct tracer_stat_session *session)
266 316
267int register_stat_tracer(struct tracer_stat *trace) 317int register_stat_tracer(struct tracer_stat *trace)
268{ 318{
269 struct tracer_stat_session *session, *node, *tmp; 319 struct stat_session *session, *node;
270 int ret; 320 int ret;
271 321
272 if (!trace) 322 if (!trace)
@@ -277,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
277 327
278 /* Already registered? */ 328 /* Already registered? */
279 mutex_lock(&all_stat_sessions_mutex); 329 mutex_lock(&all_stat_sessions_mutex);
280 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 330 list_for_each_entry(node, &all_stat_sessions, session_list) {
281 if (node->ts == trace) { 331 if (node->ts == trace) {
282 mutex_unlock(&all_stat_sessions_mutex); 332 mutex_unlock(&all_stat_sessions_mutex);
283 return -EINVAL; 333 return -EINVAL;
@@ -286,15 +336,13 @@ int register_stat_tracer(struct tracer_stat *trace)
286 mutex_unlock(&all_stat_sessions_mutex); 336 mutex_unlock(&all_stat_sessions_mutex);
287 337
288 /* Init the session */ 338 /* Init the session */
289 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); 339 session = kzalloc(sizeof(*session), GFP_KERNEL);
290 if (!session) 340 if (!session)
291 return -ENOMEM; 341 return -ENOMEM;
292 342
293 session->ts = trace; 343 session->ts = trace;
294 INIT_LIST_HEAD(&session->session_list); 344 INIT_LIST_HEAD(&session->session_list);
295 INIT_LIST_HEAD(&session->stat_list);
296 mutex_init(&session->stat_mutex); 345 mutex_init(&session->stat_mutex);
297 session->file = NULL;
298 346
299 ret = init_stat_file(session); 347 ret = init_stat_file(session);
300 if (ret) { 348 if (ret) {
@@ -312,7 +360,7 @@ int register_stat_tracer(struct tracer_stat *trace)
312 360
313void unregister_stat_tracer(struct tracer_stat *trace) 361void unregister_stat_tracer(struct tracer_stat *trace)
314{ 362{
315 struct tracer_stat_session *node, *tmp; 363 struct stat_session *node, *tmp;
316 364
317 mutex_lock(&all_stat_sessions_mutex); 365 mutex_lock(&all_stat_sessions_mutex);
318 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { 366 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
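insert_stat() above is the canonical kernel rbtree insertion idiom: walk down from the root comparing against each node, remember the last parent, then rb_link_node() and rb_insert_color(). A minimal self-contained sketch of that idiom follows; example_node and example_insert are illustrative names, not part of the patch:

	#include <linux/rbtree.h>
	#include <linux/slab.h>

	struct example_node {			/* illustrative only */
		struct rb_node	node;
		void		*stat;
	};

	/* Keep the tree sorted in descending order of cmp(). */
	static int example_insert(struct rb_root *root, void *stat,
				  int (*cmp)(void *, void *))
	{
		struct rb_node **new = &root->rb_node, *parent = NULL;
		struct example_node *data;

		data = kzalloc(sizeof(*data), GFP_KERNEL);
		if (!data)
			return -ENOMEM;
		data->stat = stat;

		while (*new) {
			struct example_node *this;

			this = rb_entry(*new, struct example_node, node);
			parent = *new;
			if (cmp(stat, this->stat) >= 0)	/* larger or equal: go left */
				new = &(*new)->rb_left;
			else
				new = &(*new)->rb_right;
		}

		rb_link_node(&data->node, parent, new);
		rb_insert_color(&data->node, root);
		return 0;
	}

An in-order walk with rb_first()/rb_next() then yields the entries from largest to smallest, which is what the reworked stat_seq_start()/stat_seq_next() rely on, and a comparator that always returns -1 (the new dummy_cmp) sends every new entry to the right-most position, preserving the tracer's own ordering.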
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3d..f3546a2cd826 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
12 /* The name of your stat file */ 12 /* The name of your stat file */
13 const char *name; 13 const char *name;
14 /* Iteration over statistic entries */ 14 /* Iteration over statistic entries */
15 void *(*stat_start)(void); 15 void *(*stat_start)(struct tracer_stat *trace);
16 void *(*stat_next)(void *prev, int idx); 16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */ 17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
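The only interface change is that stat_start() now receives the struct tracer_stat it was registered with, so one set of callbacks can serve several stat files. Below is a hedged sketch of a user of the updated interface, assuming the remaining tracer_stat fields (stat_show, stat_headers, stat_cmp) are unchanged; the my_* names and the static array are illustrative only:

	#include <linux/kernel.h>
	#include <linux/seq_file.h>
	#include "trace_stat.h"

	static unsigned long my_counts[4];	/* illustrative per-event counters */

	static void *my_stat_start(struct tracer_stat *trace)
	{
		/* @trace tells us which registered tracer_stat is being read */
		return &my_counts[0];
	}

	static void *my_stat_next(void *prev, int idx)
	{
		if (idx >= (int)ARRAY_SIZE(my_counts))
			return NULL;
		return &my_counts[idx];
	}

	static int my_stat_show(struct seq_file *s, void *p)
	{
		seq_printf(s, "%lu\n", *(unsigned long *)p);
		return 0;
	}

	static struct tracer_stat my_stats = {
		.name		= "my_stats",
		.stat_start	= my_stat_start,
		.stat_next	= my_stat_next,
		.stat_show	= my_stat_show,
		/* no .stat_cmp: dummy_cmp keeps the entries in this order */
	};

	/* typically called from an __init function: */
	/* register_stat_tracer(&my_stats); */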
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149f..f6693969287d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -203,7 +203,8 @@ static void start_stack_timer(void *unused)
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 205
206 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 206 hrtimer_start(hrtimer, ns_to_ktime(sample_period),
207 HRTIMER_MODE_REL_PINNED);
207} 208}
208 209
209static void start_stack_timers(void) 210static void start_stack_timers(void)
@@ -321,11 +322,7 @@ static const struct file_operations sysprof_sample_fops = {
321 322
322void init_tracer_sysprof_debugfs(struct dentry *d_tracer) 323void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
323{ 324{
324 struct dentry *entry;
325 325
326 entry = debugfs_create_file("sysprof_sample_period", 0644, 326 trace_create_file("sysprof_sample_period", 0644,
327 d_tracer, NULL, &sysprof_sample_fops); 327 d_tracer, NULL, &sysprof_sample_fops);
328 if (entry)
329 return;
330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
331} 328}
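Two independent cleanups here: trace_create_file() warns by itself when the debugfs entry cannot be created, so the open-coded pr_warning() is redundant, and HRTIMER_MODE_REL_PINNED starts the relative timer pinned to the CPU it is armed on, keeping the per-CPU sampling timer out of the power-aware timer-migration logic added in this release. A minimal sketch of the pinned-start pattern; sample_fn and the 1 ms period are illustrative:

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static enum hrtimer_restart sample_fn(struct hrtimer *t)
	{
		/* ... take a sample on this CPU ... */
		hrtimer_forward_now(t, ns_to_ktime(1000000));	/* re-arm: 1 ms */
		return HRTIMER_RESTART;
	}

	static void start_pinned_sampler(struct hrtimer *t)
	{
		hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		t->function = sample_fn;
		/* _PINNED keeps the timer on this CPU instead of the
		 * "best" CPU chosen by the timer-migration code */
		hrtimer_start(t, ns_to_ktime(1000000), HRTIMER_MODE_REL_PINNED);
	}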
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 797201e4a137..97fcea4acce1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8 8
9#include <trace/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include "trace_stat.h" 12#include "trace_stat.h"
@@ -16,8 +16,6 @@
16/* A cpu workqueue thread */ 16/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 17struct cpu_workqueue_stats {
18 struct list_head list; 18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu; 19 int cpu;
22 pid_t pid; 20 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */ 21/* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work) 45 struct work_struct *work)
48{ 46{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed); 47 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next; 48 struct cpu_workqueue_stats *node;
51 unsigned long flags; 49 unsigned long flags;
52 50
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 51 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 52 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
55 list) {
56 if (node->pid == wq_thread->pid) { 53 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted); 54 atomic_inc(&node->inserted);
58 goto found; 55 goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work) 66 struct work_struct *work)
70{ 67{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed); 68 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next; 69 struct cpu_workqueue_stats *node;
73 unsigned long flags; 70 unsigned long flags;
74 71
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 72 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, 73 list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
77 list) {
78 if (node->pid == wq_thread->pid) { 74 if (node->pid == wq_thread->pid) {
79 node->executed++; 75 node->executed++;
80 goto found; 76 goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
105 cws->pid = wq_thread->pid; 101 cws->pid = wq_thread->pid;
106 102
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); 104 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 105 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112} 106}
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
152 return ret; 146 return ret;
153} 147}
154 148
155static void *workqueue_stat_start(void) 149static void *workqueue_stat_start(struct tracer_stat *trace)
156{ 150{
157 int cpu; 151 int cpu;
158 void *ret = NULL; 152 void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
191static int workqueue_stat_show(struct seq_file *s, void *p) 185static int workqueue_stat_show(struct seq_file *s, void *p)
192{ 186{
193 struct cpu_workqueue_stats *cws = p; 187 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid; 188 struct pid *pid;
197 struct task_struct *tsk; 189 struct task_struct *tsk;
198 190
199 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
200 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
201 seq_printf(s, "\n");
202 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
203
204 pid = find_get_pid(cws->pid); 191 pid = find_get_pid(cws->pid);
205 if (pid) { 192 if (pid) {
206 tsk = get_pid_task(pid, PIDTYPE_PID); 193 tsk = get_pid_task(pid, PIDTYPE_PID);
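The header move from <trace/workqueue.h> to <trace/events/workqueue.h> switches the workqueue tracepoints to the TRACE_EVENT() infrastructure: the events header declares the tracepoints everywhere, and emits their definitions in the one file that defines CREATE_TRACE_POINTS before including it, which is why the DEFINE_TRACE() lines disappear from kernel/workqueue.c in the hunks further below. Roughly, as a sketch of the convention rather than the patch itself:

	/* kernel/workqueue.c: the single translation unit that
	 * instantiates the tracepoint definitions */
	#define CREATE_TRACE_POINTS
	#include <trace/events/workqueue.h>

	/* every other user (e.g. kernel/trace/trace_workqueue.c)
	 * just includes the header to get the declarations */
	#include <trace/events/workqueue.h>

	/* probes attach as before, e.g.:
	 *	ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
	 */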
diff --git a/kernel/user.c b/kernel/user.c
index 850e0ba41c1e..2c000e7132ac 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -75,21 +75,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 75 put_user_ns(up->user_ns);
76} 76}
77 77
78static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
79{
80 struct user_struct *user;
81 struct hlist_node *h;
82
83 hlist_for_each_entry(user, h, hashent, uidhash_node) {
84 if (user->uid == uid) {
85 atomic_inc(&user->__count);
86 return user;
87 }
88 }
89
90 return NULL;
91}
92
93#ifdef CONFIG_USER_SCHED 78#ifdef CONFIG_USER_SCHED
94 79
95static void sched_destroy_user(struct user_struct *up) 80static void sched_destroy_user(struct user_struct *up)
@@ -119,6 +104,23 @@ static int sched_create_user(struct user_struct *up) { return 0; }
119 104
120#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS) 105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
121 106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
122static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ 124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
123static DEFINE_MUTEX(uids_mutex); 125static DEFINE_MUTEX(uids_mutex);
124 126
@@ -283,12 +285,12 @@ int __init uids_sysfs_init(void)
283 return uids_user_create(&root_user); 285 return uids_user_create(&root_user);
284} 286}
285 287
286/* work function to remove sysfs directory for a user and free up 288/* delayed work function to remove sysfs directory for a user and free up
287 * corresponding structures. 289 * corresponding structures.
288 */ 290 */
289static void cleanup_user_struct(struct work_struct *w) 291static void cleanup_user_struct(struct work_struct *w)
290{ 292{
291 struct user_struct *up = container_of(w, struct user_struct, work); 293 struct user_struct *up = container_of(w, struct user_struct, work.work);
292 unsigned long flags; 294 unsigned long flags;
293 int remove_user = 0; 295 int remove_user = 0;
294 296
@@ -297,15 +299,12 @@ static void cleanup_user_struct(struct work_struct *w)
297 */ 299 */
298 uids_mutex_lock(); 300 uids_mutex_lock();
299 301
300 local_irq_save(flags); 302 spin_lock_irqsave(&uidhash_lock, flags);
301 303 if (atomic_read(&up->__count) == 0) {
302 if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
303 uid_hash_remove(up); 304 uid_hash_remove(up);
304 remove_user = 1; 305 remove_user = 1;
305 spin_unlock_irqrestore(&uidhash_lock, flags);
306 } else {
307 local_irq_restore(flags);
308 } 306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
309 308
310 if (!remove_user) 309 if (!remove_user)
311 goto done; 310 goto done;
@@ -331,16 +330,28 @@ done:
331 */ 330 */
332static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
333{ 332{
334 /* restore back the count */
335 atomic_inc(&up->__count);
336 spin_unlock_irqrestore(&uidhash_lock, flags); 333 spin_unlock_irqrestore(&uidhash_lock, flags);
337 334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
338 INIT_WORK(&up->work, cleanup_user_struct); 335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
339 schedule_work(&up->work);
340} 336}
341 337
342#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
343 339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{
342 struct user_struct *user;
343 struct hlist_node *h;
344
345 hlist_for_each_entry(user, h, hashent, uidhash_node) {
346 if (user->uid == uid) {
347 atomic_inc(&user->__count);
348 return user;
349 }
350 }
351
352 return NULL;
353}
354
344int uids_sysfs_init(void) { return 0; } 355int uids_sysfs_init(void) { return 0; }
345static inline int uids_user_create(struct user_struct *up) { return 0; } 356static inline int uids_user_create(struct user_struct *up) { return 0; }
346static inline void uids_mutex_lock(void) { } 357static inline void uids_mutex_lock(void) { }
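The user.c rework stops freeing the user_struct synchronously: when the refcount hits zero the object stays in the hash and a delayed work is scheduled, and a later lookup can resurrect it, which is what the atomic_inc_return() == 1 test plus cancel_delayed_work() above detect. A condensed, generic sketch of that lifecycle, assuming a single lookup lock playing the role of uidhash_lock; struct obj and the obj_* functions are illustrative, not kernel API:

	#include <linux/kernel.h>
	#include <linux/workqueue.h>
	#include <linux/jiffies.h>
	#include <linux/slab.h>
	#include <asm/atomic.h>

	struct obj {
		atomic_t		count;
		struct delayed_work	work;
	};

	static void obj_cleanup(struct work_struct *w)
	{
		struct obj *o = container_of(w, struct obj, work.work);

		/* the real code re-checks under uidhash_lock and also
		 * unhashes the object: a concurrent obj_get() may have
		 * resurrected it meanwhile */
		if (atomic_read(&o->count) == 0)
			kfree(o);
	}

	static void obj_put(struct obj *o)
	{
		/* last reference: defer the actual teardown */
		if (atomic_dec_and_test(&o->count)) {
			INIT_DELAYED_WORK(&o->work, obj_cleanup);
			schedule_delayed_work(&o->work, msecs_to_jiffies(1000));
		}
	}

	static struct obj *obj_get(struct obj *o)
	{
		/* 0 -> 1 means a pending cleanup must be cancelled */
		if (atomic_inc_return(&o->count) == 1)
			cancel_delayed_work(&o->work);
		return o;
	}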
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 815237a55af8..8a82b4b8ea52 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,16 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17 17
18static struct uts_namespace *create_uts_ns(void)
19{
20 struct uts_namespace *uts_ns;
21
22 uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
23 if (uts_ns)
24 kref_init(&uts_ns->kref);
25 return uts_ns;
26}
27
18/* 28/*
19 * Clone a new ns copying an original utsname, setting refcount to 1 29 * Clone a new ns copying an original utsname, setting refcount to 1
20 * @old_ns: namespace to clone 30 * @old_ns: namespace to clone
@@ -24,14 +34,13 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
24{ 34{
25 struct uts_namespace *ns; 35 struct uts_namespace *ns;
26 36
27 ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); 37 ns = create_uts_ns();
28 if (!ns) 38 if (!ns)
29 return ERR_PTR(-ENOMEM); 39 return ERR_PTR(-ENOMEM);
30 40
31 down_read(&uts_sem); 41 down_read(&uts_sem);
32 memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); 42 memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
33 up_read(&uts_sem); 43 up_read(&uts_sem);
34 kref_init(&ns->kref);
35 return ns; 44 return ns;
36} 45}
37 46
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
154 if (!list_empty(&wait->task_list)) 154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list); 155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q)) 156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key); 157 __wake_up_locked_key(q, mode, key);
158 spin_unlock_irqrestore(&q->lock, flags); 158 spin_unlock_irqrestore(&q->lock, flags);
159} 159}
160EXPORT_SYMBOL(abort_exclusive_wait); 160EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a08950..0668795d8818 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h> 36#define CREATE_TRACE_POINTS
37#include <trace/events/workqueue.h>
37 38
38/* 39/*
39 * The per-CPU workqueue (if single thread, we always use the first 40 * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
124 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 125 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
125} 126}
126 127
127DEFINE_TRACE(workqueue_insertion);
128
129static void insert_work(struct cpu_workqueue_struct *cwq, 128static void insert_work(struct cpu_workqueue_struct *cwq,
130 struct work_struct *work, struct list_head *head) 129 struct work_struct *work, struct list_head *head)
131{ 130{
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
262} 261}
263EXPORT_SYMBOL_GPL(queue_delayed_work_on); 262EXPORT_SYMBOL_GPL(queue_delayed_work_on);
264 263
265DEFINE_TRACE(workqueue_execution);
266
267static void run_workqueue(struct cpu_workqueue_struct *cwq) 264static void run_workqueue(struct cpu_workqueue_struct *cwq)
268{ 265{
269 spin_lock_irq(&cwq->lock); 266 spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
753 return cwq; 750 return cwq;
754} 751}
755 752
756DEFINE_TRACE(workqueue_creation);
757
758static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 753static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
759{ 754{
760 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 755 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
860} 855}
861EXPORT_SYMBOL_GPL(__create_workqueue_key); 856EXPORT_SYMBOL_GPL(__create_workqueue_key);
862 857
863DEFINE_TRACE(workqueue_destruction);
864
865static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 858static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
866{ 859{
867 /* 860 /*