author     Frederic Weisbecker <fweisbec@gmail.com>    2009-10-17 19:09:09 -0400
committer  Frederic Weisbecker <fweisbec@gmail.com>    2009-10-17 19:12:33 -0400
commit     0f8f86c7bdd1c954fbe153af437a0d91a6c5721a (patch)
tree       94a8d419a470a4f9852ca397bb9bbe48db92ff5c /kernel
parent     dca2d6ac09d9ef59ff46820d4f0c94b08a671202 (diff)
parent     f39cdf25bf77219676ec5360980ac40b1a7e144a (diff)
Merge commit 'perf/core' into perf/hw-breakpoint
Conflicts:
	kernel/Makefile
	kernel/trace/Makefile
	kernel/trace/trace.h
	samples/Makefile

Merge reason: We need to be up to date with the perf events development branch because we plan to rewrite the breakpoints API on top of perf events.
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 5
-rw-r--r--  kernel/audit.c | 18
-rw-r--r--  kernel/audit_watch.c | 2
-rw-r--r--  kernel/auditsc.c | 6
-rw-r--r--  kernel/cgroup.c | 1128
-rw-r--r--  kernel/cgroup_debug.c | 105
-rw-r--r--  kernel/cgroup_freezer.c | 15
-rw-r--r--  kernel/cpu.c | 15
-rw-r--r--  kernel/cpuset.c | 66
-rw-r--r--  kernel/cred.c | 19
-rw-r--r--  kernel/delayacct.c | 1
-rw-r--r--  kernel/dma-coherent.c | 176
-rw-r--r--  kernel/exit.c | 166
-rw-r--r--  kernel/fork.c | 85
-rw-r--r--  kernel/futex.c | 140
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 150
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/handle.c | 1
-rw-r--r--  kernel/itimer.c | 169
-rw-r--r--  kernel/kallsyms.c | 3
-rw-r--r--  kernel/kfifo.c | 2
-rw-r--r--  kernel/kprobes.c | 6
-rw-r--r--  kernel/lockdep.c | 23
-rw-r--r--  kernel/lockdep_proc.c | 2
-rw-r--r--  kernel/marker.c | 930
-rw-r--r--  kernel/module.c | 190
-rw-r--r--  kernel/mutex-debug.c | 1
-rw-r--r--  kernel/ns_cgroup.c | 16
-rw-r--r--  kernel/panic.c | 5
-rw-r--r--  kernel/params.c | 7
-rw-r--r--  kernel/perf_counter.c | 4962
-rw-r--r--  kernel/perf_event.c | 5174
-rw-r--r--  kernel/pid.c | 15
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/posix-cpu-timers.c | 155
-rw-r--r--  kernel/posix-timers.c | 35
-rw-r--r--  kernel/power/console.c | 63
-rw-r--r--  kernel/power/process.c | 1
-rw-r--r--  kernel/power/snapshot.c | 2
-rw-r--r--  kernel/power/swap.c | 1
-rw-r--r--  kernel/printk.c | 27
-rw-r--r--  kernel/profile.c | 45
-rw-r--r--  kernel/ptrace.c | 11
-rw-r--r--  kernel/rcupdate.c | 188
-rw-r--r--  kernel/rcutorture.c | 47
-rw-r--r--  kernel/rcutree.c | 431
-rw-r--r--  kernel/rcutree.h | 88
-rw-r--r--  kernel/rcutree_plugin.h | 191
-rw-r--r--  kernel/rcutree_trace.c | 16
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/res_counter.c | 3
-rw-r--r--  kernel/resource.c | 23
-rw-r--r--  kernel/sched.c | 587
-rw-r--r--  kernel/sched_clock.c | 122
-rw-r--r--  kernel/sched_debug.c | 1
-rw-r--r--  kernel/sched_fair.c | 468
-rw-r--r--  kernel/sched_features.h | 122
-rw-r--r--  kernel/sched_idletask.c | 11
-rw-r--r--  kernel/sched_rt.c | 20
-rw-r--r--  kernel/signal.c | 168
-rw-r--r--  kernel/slow-work.c | 12
-rw-r--r--  kernel/smp.c | 76
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/softlockup.c | 4
-rw-r--r--  kernel/sys.c | 46
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 153
-rw-r--r--  kernel/time.c | 9
-rw-r--r--  kernel/time/Makefile | 2
-rw-r--r--  kernel/time/clocksource.c | 529
-rw-r--r--  kernel/time/jiffies.c | 6
-rw-r--r--  kernel/time/ntp.c | 7
-rw-r--r--  kernel/time/tick-sched.c | 9
-rw-r--r--  kernel/time/timeconv.c | 127
-rw-r--r--  kernel/time/timekeeping.c | 536
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/time/timer_stats.c | 2
-rw-r--r--  kernel/timer.c | 64
-rw-r--r--  kernel/trace/Kconfig | 30
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/blktrace.c | 39
-rw-r--r--  kernel/trace/ftrace.c | 603
-rw-r--r--  kernel/trace/kmemtrace.c | 2
-rw-r--r--  kernel/trace/power-traces.c | 20
-rw-r--r--  kernel/trace/ring_buffer.c | 34
-rw-r--r--  kernel/trace/trace.c | 201
-rw-r--r--  kernel/trace/trace.h | 342
-rw-r--r--  kernel/trace/trace_boot.c | 8
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 24
-rw-r--r--  kernel/trace/trace_entries.h | 382
-rw-r--r--  kernel/trace/trace_event_profile.c | 92
-rw-r--r--  kernel/trace/trace_event_types.h | 178
-rw-r--r--  kernel/trace/trace_events.c | 184
-rw-r--r--  kernel/trace/trace_events_filter.c | 467
-rw-r--r--  kernel/trace/trace_export.c | 287
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 66
-rw-r--r--  kernel/trace/trace_hw_branches.c | 10
-rw-r--r--  kernel/trace/trace_irqsoff.c | 16
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 10
-rw-r--r--  kernel/trace/trace_output.c | 50
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_power.c | 218
-rw-r--r--  kernel/trace/trace_printk.c | 1
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 52
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_syscalls.c | 207
-rw-r--r--  kernel/tracepoint.c | 2
-rw-r--r--  kernel/uid16.c | 1
-rw-r--r--  kernel/utsname_sysctl.c | 4
-rw-r--r--  kernel/workqueue.c | 18
113 files changed, 11434 insertions, 10160 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 69943fdd7a41..17b575ec7d07 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
59obj-$(CONFIG_COMPAT) += compat.o 59obj-$(CONFIG_COMPAT) += compat.o
60obj-$(CONFIG_CGROUPS) += cgroup.o 60obj-$(CONFIG_CGROUPS) += cgroup.o
61obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 61obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
63obj-$(CONFIG_CPUSETS) += cpuset.o 62obj-$(CONFIG_CPUSETS) += cpuset.o
64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 63obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -87,18 +86,16 @@ obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
89obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 88obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
90obj-$(CONFIG_MARKERS) += marker.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 89obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 90obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 91obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 92obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 93obj-$(CONFIG_X86_DS) += trace/
97obj-$(CONFIG_RING_BUFFER) += trace/ 94obj-$(CONFIG_RING_BUFFER) += trace/
98obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
99obj-$(CONFIG_SLOW_WORK) += slow-work.o 96obj-$(CONFIG_SLOW_WORK) += slow-work.o
97obj-$(CONFIG_PERF_EVENTS) += perf_event.o
100obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 98obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
101obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
102 99
103ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
104# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3b..5feed232be9d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
855 break; 855 break;
856 } 856 }
857 case AUDIT_SIGNAL_INFO: 857 case AUDIT_SIGNAL_INFO:
858 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); 858 len = 0;
859 if (err) 859 if (audit_sig_sid) {
860 return err; 860 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
861 if (err)
862 return err;
863 }
861 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 864 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
862 if (!sig_data) { 865 if (!sig_data) {
863 security_release_secctx(ctx, len); 866 if (audit_sig_sid)
867 security_release_secctx(ctx, len);
864 return -ENOMEM; 868 return -ENOMEM;
865 } 869 }
866 sig_data->uid = audit_sig_uid; 870 sig_data->uid = audit_sig_uid;
867 sig_data->pid = audit_sig_pid; 871 sig_data->pid = audit_sig_pid;
868 memcpy(sig_data->ctx, ctx, len); 872 if (audit_sig_sid) {
869 security_release_secctx(ctx, len); 873 memcpy(sig_data->ctx, ctx, len);
874 security_release_secctx(ctx, len);
875 }
870 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 876 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
871 0, 0, sig_data, sizeof(*sig_data) + len); 877 0, 0, sig_data, sizeof(*sig_data) + len);
872 kfree(sig_data); 878 kfree(sig_data);
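
For readability, here is the AUDIT_SIGNAL_INFO case as it stands after this hunk, consolidated from the new (right-hand) side above. This is a sketch for orientation only: declarations, the enclosing switch and the rest of audit_receive_msg() are omitted.

	case AUDIT_SIGNAL_INFO:
		len = 0;
		/* fetch a security context only if a signal sender was recorded */
		if (audit_sig_sid) {
			err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
			if (err)
				return err;
		}
		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
		if (!sig_data) {
			/* mirror the guard on the error path */
			if (audit_sig_sid)
				security_release_secctx(ctx, len);
			return -ENOMEM;
		}
		sig_data->uid = audit_sig_uid;
		sig_data->pid = audit_sig_pid;
		if (audit_sig_sid) {
			memcpy(sig_data->ctx, ctx, len);
			security_release_secctx(ctx, len);
		}
		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
				 0, 0, sig_data, sizeof(*sig_data) + len);
		kfree(sig_data);
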
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea9..cc7e87936cbc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
45 45
46struct audit_watch { 46struct audit_watch {
47 atomic_t count; /* reference count */ 47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */ 48 dev_t dev; /* associated superblock device */
49 char *path; /* insertion path */
50 unsigned long ino; /* associated inode number */ 50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */ 51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */ 52 struct list_head wlist; /* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd6..267e484f0198 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
168 int in_syscall; /* 1 if task is in a syscall */ 168 int in_syscall; /* 1 if task is in a syscall */
169 enum audit_state state, current_state; 169 enum audit_state state, current_state;
170 unsigned int serial; /* serial number for record */ 170 unsigned int serial; /* serial number for record */
171 struct timespec ctime; /* time of syscall entry */
172 int major; /* syscall number */ 171 int major; /* syscall number */
172 struct timespec ctime; /* time of syscall entry */
173 unsigned long argv[4]; /* syscall arguments */ 173 unsigned long argv[4]; /* syscall arguments */
174 int return_valid; /* return code is valid */
175 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
176 u64 prio; 175 u64 prio;
176 int return_valid; /* return code is valid */
177 int name_count; 177 int name_count;
178 struct audit_names names[AUDIT_NAMES]; 178 struct audit_names names[AUDIT_NAMES];
179 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
198 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count;
202 struct list_head killed_trees; 201 struct list_head killed_trees;
202 int tree_count;
203 203
204 int type; 204 int type;
205 union { 205 union {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c7ece8f027f2..ca83b73fba19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
364 * different cgroups in heirarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -596,8 +702,8 @@ void cgroup_unlock(void)
596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 702static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 703static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 704static int cgroup_populate_dir(struct cgroup *cgrp);
599static struct inode_operations cgroup_dir_inode_operations; 705static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 706static const struct file_operations proc_cgroupstats_operations;
601 707
602static struct backing_dev_info cgroup_backing_dev_info = { 708static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup", 709 .name = "cgroup",
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
677 */ 783 */
678 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
679 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
680 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
681 } 793 }
682 iput(inode); 794 iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
841 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
842 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
843 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
844 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
845 return 0; 959 return 0;
846} 960}
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
849 unsigned long subsys_bits; 963 unsigned long subsys_bits;
850 unsigned long flags; 964 unsigned long flags;
851 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
852}; 972};
853 973
854/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
863 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
864#endif 984#endif
865 985
866 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
867 opts->flags = 0;
868 opts->release_agent = NULL;
869 987
870 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
871 if (!*token) 989 if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
879 if (!ss->disabled) 997 if (!ss->disabled)
880 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
881 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
882 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
883 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
884 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
885 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
886 if (opts->release_agent) 1007 if (opts->release_agent)
887 return -EINVAL; 1008 return -EINVAL;
888 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
889 if (!opts->release_agent) 1011 if (!opts->release_agent)
890 return -ENOMEM; 1012 return -ENOMEM;
891 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
892 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
893 } else { 1036 } else {
894 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
895 int i; 1038 int i;
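
As a quick illustration of the name= check added above, the sketch below applies the same rule: a hierarchy name must be non-empty and contain only alphanumerics, '.', '-' and '_'. This is plain userspace C, not kernel code, and valid_hierarchy_name() is a hypothetical stand-in used only for this example.

	#include <ctype.h>
	#include <stdio.h>
	#include <string.h>

	/* mirrors the "[\w.-]+" check in parse_cgroupfs_options() above */
	static int valid_hierarchy_name(const char *name)
	{
		size_t i;

		if (!strlen(name))
			return 0;		/* empty names are rejected (-EINVAL) */
		for (i = 0; i < strlen(name); i++) {
			char c = name[i];

			if (isalnum((unsigned char)c))
				continue;
			if (c == '.' || c == '-' || c == '_')
				continue;
			return 0;		/* any other character is rejected */
		}
		return 1;
	}

	int main(void)
	{
		printf("%d\n", valid_hierarchy_name("noprefix-test_1"));	/* 1 */
		printf("%d\n", valid_hierarchy_name("bad name"));		/* 0: space */
		printf("%d\n", valid_hierarchy_name(""));			/* 0: empty */
		return 0;
	}
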
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
906 } 1049 }
907 } 1050 }
908 1051
1052 /* Consistency checks */
1053
909 /* 1054 /*
910 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
915 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
916 return -EINVAL; 1061 return -EINVAL;
917 1062
918 /* We can't have an empty hierarchy */ 1063
919 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
920 return -EINVAL; 1073 return -EINVAL;
921 1074
922 return 0; 1075 return 0;
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
944 goto out_unlock; 1097 goto out_unlock;
945 } 1098 }
946 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
947 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
948 if (ret) 1107 if (ret)
949 goto out_unlock; 1108 goto out_unlock;
@@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
955 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
956 out_unlock: 1115 out_unlock:
957 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
958 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
959 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
960 unlock_kernel(); 1120 unlock_kernel();
961 return ret; 1121 return ret;
962} 1122}
963 1123
964static struct super_operations cgroup_ops = { 1124static const struct super_operations cgroup_ops = {
965 .statfs = simple_statfs, 1125 .statfs = simple_statfs,
966 .drop_inode = generic_delete_inode, 1126 .drop_inode = generic_delete_inode,
967 .show_options = cgroup_show_options, 1127 .show_options = cgroup_show_options,
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
974 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
975 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
976 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
978 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
979} 1139}
1140
980static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
981{ 1142{
982 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
988 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
989} 1150}
990 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
991static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
992{ 1178{
993 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
994 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
995 1181
996 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
997 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
998 return 0; 1184 return 0;
999 1185
1000 /* Next check flags */ 1186 /*
1001 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1002 return 0; 1192 return 0;
1003 1193
1004 return 1; 1194 return 1;
1005} 1195}
1006 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1007static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1008{ 1236{
1009 int ret; 1237 int ret;
1010 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1011 1245
1012 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1013 if (ret) 1247 if (ret)
1014 return ret; 1248 return ret;
1015 1249
1016 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1017 root->sb = sb; 1251 opts->new_root->sb = sb;
1018 1252
1019 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1020 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1051 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1052{ 1286{
1053 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1054 int ret = 0; 1289 int ret = 0;
1055 struct super_block *sb; 1290 struct super_block *sb;
1056 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1057 struct list_head tmp_cg_links;
1058 1292
1059 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1060 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1061 if (ret) { 1295 if (ret)
1062 kfree(opts.release_agent); 1296 goto out_err;
1063 return ret;
1064 }
1065
1066 root = kzalloc(sizeof(*root), GFP_KERNEL);
1067 if (!root) {
1068 kfree(opts.release_agent);
1069 return -ENOMEM;
1070 }
1071 1297
1072 init_cgroup_root(root); 1298 /*
1073 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1074 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1075 if (opts.release_agent) { 1301 */
1076 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1077 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1078 } 1306 }
1307 opts.new_root = new_root;
1079 1308
1080 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1081 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1082 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1083 kfree(root); 1312 ret = PTR_ERR(sb);
1084 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1085 } 1315 }
1086 1316
1087 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1088 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1089 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1090 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1091 root = NULL; 1321 struct list_head tmp_cg_links;
1092 } else {
1093 /* New superblock */
1094 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1095 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1096 int i; 1325 int i;
1097 1326
1098 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1105 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1106 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1107 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1108 /* 1349 /*
1109 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1110 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1123 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1124 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1125 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1126 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1127 } 1369 }
1128 1370
1129 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1155 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1156 1398
1157 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1158 mutex_unlock(&inode->i_mutex);
1159 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1160 } 1408 }
1161 1409
1162 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1163 return 0; 1413 return 0;
1164 1414
1165 free_cg_links:
1166 free_cg_links(&tmp_cg_links);
1167 drop_new_super: 1415 drop_new_super:
1168 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1169 return ret; 1421 return ret;
1170} 1422}
1171 1423
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1211 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1212 1464
1213 kill_litter_super(sb); 1465 kill_litter_super(sb);
1214 kfree(root); 1466 cgroup_drop_root(root);
1215} 1467}
1216 1468
1217static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1276 return 0; 1528 return 0;
1277} 1529}
1278 1530
1279/*
1280 * Return the first subsystem attached to a cgroup's hierarchy, and
1281 * its subsystem id.
1282 */
1283
1284static void get_first_subsys(const struct cgroup *cgrp,
1285 struct cgroup_subsys_state **css, int *subsys_id)
1286{
1287 const struct cgroupfs_root *root = cgrp->root;
1288 const struct cgroup_subsys *test_ss;
1289 BUG_ON(list_empty(&root->subsys_list));
1290 test_ss = list_entry(root->subsys_list.next,
1291 struct cgroup_subsys, sibling);
1292 if (css) {
1293 *css = cgrp->subsys[test_ss->subsys_id];
1294 BUG_ON(!*css);
1295 }
1296 if (subsys_id)
1297 *subsys_id = test_ss->subsys_id;
1298}
1299
1300/** 1531/**
1301 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1302 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1313 struct css_set *cg; 1544 struct css_set *cg;
1314 struct css_set *newcg; 1545 struct css_set *newcg;
1315 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1316 int subsys_id;
1317
1318 get_first_subsys(cgrp, NULL, &subsys_id);
1319 1547
1320 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1321 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1322 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1323 return 0; 1551 return 0;
1324 1552
1325 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1326 if (ss->can_attach) { 1554 if (ss->can_attach) {
1327 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1328 if (retval) 1556 if (retval)
1329 return retval; 1557 return retval;
1330 } 1558 }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1362 1590
1363 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1364 if (ss->attach) 1592 if (ss->attach)
1365 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1366 } 1594 }
1367 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1368 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1423 return ret; 1651 return ret;
1424} 1652}
1425 1653
1426/* The various types of files and directories in a cgroup file system */
1427enum cgroup_filetype {
1428 FILE_ROOT,
1429 FILE_DIR,
1430 FILE_TASKLIST,
1431 FILE_NOTIFY_ON_RELEASE,
1432 FILE_RELEASE_AGENT,
1433};
1434
1435/** 1654/**
1436 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1437 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1644,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1644 return single_release(inode, file); 1863 return single_release(inode, file);
1645} 1864}
1646 1865
1647static struct file_operations cgroup_seqfile_operations = { 1866static const struct file_operations cgroup_seqfile_operations = {
1648 .read = seq_read, 1867 .read = seq_read,
1649 .write = cgroup_file_write, 1868 .write = cgroup_file_write,
1650 .llseek = seq_lseek, 1869 .llseek = seq_lseek,
@@ -1703,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1703 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1922 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1704} 1923}
1705 1924
1706static struct file_operations cgroup_file_operations = { 1925static const struct file_operations cgroup_file_operations = {
1707 .read = cgroup_file_read, 1926 .read = cgroup_file_read,
1708 .write = cgroup_file_write, 1927 .write = cgroup_file_write,
1709 .llseek = generic_file_llseek, 1928 .llseek = generic_file_llseek,
@@ -1711,7 +1930,7 @@ static struct file_operations cgroup_file_operations = {
1711 .release = cgroup_file_release, 1930 .release = cgroup_file_release,
1712}; 1931};
1713 1932
1714static struct inode_operations cgroup_dir_inode_operations = { 1933static const struct inode_operations cgroup_dir_inode_operations = {
1715 .lookup = simple_lookup, 1934 .lookup = simple_lookup,
1716 .mkdir = cgroup_mkdir, 1935 .mkdir = cgroup_mkdir,
1717 .rmdir = cgroup_rmdir, 1936 .rmdir = cgroup_rmdir,
@@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1876 * the start of a css_set 2095 * the start of a css_set
1877 */ 2096 */
1878static void cgroup_advance_iter(struct cgroup *cgrp, 2097static void cgroup_advance_iter(struct cgroup *cgrp,
1879 struct cgroup_iter *it) 2098 struct cgroup_iter *it)
1880{ 2099{
1881 struct list_head *l = it->cg_link; 2100 struct list_head *l = it->cg_link;
1882 struct cg_cgroup_link *link; 2101 struct cg_cgroup_link *link;
@@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2129} 2348}
2130 2349
2131/* 2350/*
2132 * Stuff for reading the 'tasks' file. 2351 * Stuff for reading the 'tasks'/'procs' files.
2133 * 2352 *
2134 * Reading this file can return large amounts of data if a cgroup has 2353 * Reading this file can return large amounts of data if a cgroup has
2135 * *lots* of attached tasks. So it may need several calls to read(), 2354 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2139 */ 2358 */
2140 2359
2141/* 2360/*
2142 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2361 * The following two functions "fix" the issue where there are more pids
2143 * 'cgrp'. Return actual number of pids loaded. No need to 2362 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2144 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2363 * TODO: replace with a kernel-wide solution to this problem
2145 * read section, so the css_set can't go away, and is 2364 */
2146 * immutable after creation. 2365#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2366static void *pidlist_allocate(int count)
2367{
2368 if (PIDLIST_TOO_LARGE(count))
2369 return vmalloc(count * sizeof(pid_t));
2370 else
2371 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2372}
2373static void pidlist_free(void *p)
2374{
2375 if (is_vmalloc_addr(p))
2376 vfree(p);
2377 else
2378 kfree(p);
2379}
2380static void *pidlist_resize(void *p, int newcount)
2381{
2382 void *newlist;
2383 /* note: if new alloc fails, old p will still be valid either way */
2384 if (is_vmalloc_addr(p)) {
2385 newlist = vmalloc(newcount * sizeof(pid_t));
2386 if (!newlist)
2387 return NULL;
2388 memcpy(newlist, p, newcount * sizeof(pid_t));
2389 vfree(p);
2390 } else {
2391 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2392 }
2393 return newlist;
2394}
2395
2396/*
2397 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2398 * If the new stripped list is sufficiently smaller and there's enough memory
2399 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2400 * number of unique elements.
2401 */
2402/* is the size difference enough that we should re-allocate the array? */
2403#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2404static int pidlist_uniq(pid_t **p, int length)
2405{
2406 int src, dest = 1;
2407 pid_t *list = *p;
2408 pid_t *newlist;
2409
2410 /*
2411 * we presume the 0th element is unique, so i starts at 1. trivial
2412 * edge cases first; no work needs to be done for either
2413 */
2414 if (length == 0 || length == 1)
2415 return length;
2416 /* src and dest walk down the list; dest counts unique elements */
2417 for (src = 1; src < length; src++) {
2418 /* find next unique element */
2419 while (list[src] == list[src-1]) {
2420 src++;
2421 if (src == length)
2422 goto after;
2423 }
2424 /* dest always points to where the next unique element goes */
2425 list[dest] = list[src];
2426 dest++;
2427 }
2428after:
2429 /*
2430 * if the length difference is large enough, we want to allocate a
2431 * smaller buffer to save memory. if this fails due to out of memory,
2432 * we'll just stay with what we've got.
2433 */
2434 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2435 newlist = pidlist_resize(list, dest);
2436 if (newlist)
2437 *p = newlist;
2438 }
2439 return dest;
2440}
2441
2442static int cmppid(const void *a, const void *b)
2443{
2444 return *(pid_t *)a - *(pid_t *)b;
2445}
2446
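
The pidlist code above relies on the array being sorted (via cmppid) before pidlist_uniq() strips out duplicate tgids. A standalone illustration of that sort-then-uniq pattern, written as plain userspace C rather than kernel code:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/types.h>

	/* same comparator shape as cmppid() above */
	static int cmppid(const void *a, const void *b)
	{
		return *(const pid_t *)a - *(const pid_t *)b;
	}

	/* in-place walk as in pidlist_uniq(): "dest" is where the next unique pid goes */
	static int uniq(pid_t *list, int length)
	{
		int src, dest = 1;

		if (length < 2)
			return length;
		for (src = 1; src < length; src++) {
			if (list[src] == list[src - 1])
				continue;	/* skip duplicates of the previous value */
			list[dest++] = list[src];
		}
		return dest;			/* number of unique entries */
	}

	int main(void)
	{
		pid_t tgids[] = { 7, 3, 7, 9, 3, 7 };
		int i, n = sizeof(tgids) / sizeof(tgids[0]);

		qsort(tgids, n, sizeof(tgids[0]), cmppid);
		n = uniq(tgids, n);
		for (i = 0; i < n; i++)
			printf("%d\n", (int)tgids[i]);	/* prints 3, 7, 9 */
		return 0;
	}
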
2447/*
2448 * find the appropriate pidlist for our purpose (given procs vs tasks)
2449 * returns with the lock on that pidlist already held, and takes care
2450 * of the use count, or returns NULL with no locks held if we're out of
2451 * memory.
2147 */ 2452 */
2148static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2453static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2454 enum cgroup_filetype type)
2149{ 2455{
2150 int n = 0, pid; 2456 struct cgroup_pidlist *l;
2457 /* don't need task_nsproxy() if we're looking at ourself */
2458 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2459 /*
2460 * We can't drop the pidlist_mutex before taking the l->mutex in case
2461 * the last ref-holder is trying to remove l from the list at the same
2462 * time. Holding the pidlist_mutex precludes somebody taking whichever
2463 * list we find out from under us - compare release_pid_array().
2464 */
2465 mutex_lock(&cgrp->pidlist_mutex);
2466 list_for_each_entry(l, &cgrp->pidlists, links) {
2467 if (l->key.type == type && l->key.ns == ns) {
2468 /* found a matching list - drop the extra refcount */
2469 put_pid_ns(ns);
2470 /* make sure l doesn't vanish out from under us */
2471 down_write(&l->mutex);
2472 mutex_unlock(&cgrp->pidlist_mutex);
2473 l->use_count++;
2474 return l;
2475 }
2476 }
2477 /* entry not found; create a new one */
2478 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2479 if (!l) {
2480 mutex_unlock(&cgrp->pidlist_mutex);
2481 put_pid_ns(ns);
2482 return l;
2483 }
2484 init_rwsem(&l->mutex);
2485 down_write(&l->mutex);
2486 l->key.type = type;
2487 l->key.ns = ns;
2488 l->use_count = 0; /* don't increment here */
2489 l->list = NULL;
2490 l->owner = cgrp;
2491 list_add(&l->links, &cgrp->pidlists);
2492 mutex_unlock(&cgrp->pidlist_mutex);
2493 return l;
2494}
2495
2496/*
2497 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2498 */
2499static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2500 struct cgroup_pidlist **lp)
2501{
2502 pid_t *array;
2503 int length;
2504 int pid, n = 0; /* used for populating the array */
2151 struct cgroup_iter it; 2505 struct cgroup_iter it;
2152 struct task_struct *tsk; 2506 struct task_struct *tsk;
2507 struct cgroup_pidlist *l;
2508
2509 /*
2510 * If cgroup gets more users after we read count, we won't have
2511 * enough space - tough. This race is indistinguishable to the
2512 * caller from the case that the additional cgroup users didn't
2513 * show up until sometime later on.
2514 */
2515 length = cgroup_task_count(cgrp);
2516 array = pidlist_allocate(length);
2517 if (!array)
2518 return -ENOMEM;
2519 /* now, populate the array */
2153 cgroup_iter_start(cgrp, &it); 2520 cgroup_iter_start(cgrp, &it);
2154 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2521 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2155 if (unlikely(n == npids)) 2522 if (unlikely(n == length))
2156 break; 2523 break;
2157 pid = task_pid_vnr(tsk); 2524 /* get tgid or pid for procs or tasks file respectively */
2158 if (pid > 0) 2525 if (type == CGROUP_FILE_PROCS)
2159 pidarray[n++] = pid; 2526 pid = task_tgid_vnr(tsk);
2527 else
2528 pid = task_pid_vnr(tsk);
2529 if (pid > 0) /* make sure to only use valid results */
2530 array[n++] = pid;
2160 } 2531 }
2161 cgroup_iter_end(cgrp, &it); 2532 cgroup_iter_end(cgrp, &it);
2162 return n; 2533 length = n;
2534 /* now sort & (if procs) strip out duplicates */
2535 sort(array, length, sizeof(pid_t), cmppid, NULL);
2536 if (type == CGROUP_FILE_PROCS)
2537 length = pidlist_uniq(&array, length);
2538 l = cgroup_pidlist_find(cgrp, type);
2539 if (!l) {
2540 pidlist_free(array);
2541 return -ENOMEM;
2542 }
2543 /* store array, freeing old if necessary - lock already held */
2544 pidlist_free(l->list);
2545 l->list = array;
2546 l->length = length;
2547 l->use_count++;
2548 up_write(&l->mutex);
2549 *lp = l;
2550 return 0;
2163} 2551}
2164 2552
2165/** 2553/**
@@ -2216,37 +2604,14 @@ err:
2216 return ret; 2604 return ret;
2217} 2605}
2218 2606
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228 /* The namepsace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232 /* How many files are using the this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2238static int cmppid(const void *a, const void *b)
2239{
2240 return *(pid_t *)a - *(pid_t *)b;
2241}
2242 2607
2243/* 2608/*
2244 * seq_file methods for the "tasks" file. The seq_file position is the 2609 * seq_file methods for the tasks/procs files. The seq_file position is the
2245 * next pid to display; the seq_file iterator is a pointer to the pid 2610 * next pid to display; the seq_file iterator is a pointer to the pid
2246 * in the cgroup->tasks_pids array. 2611 * in the cgroup->l->list array.
2247 */ 2612 */
2248 2613
2249static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2614static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2250{ 2615{
2251 /* 2616 /*
2252 * Initially we receive a position value that corresponds to 2617 * Initially we receive a position value that corresponds to
@@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2254 * after a seek to the start). Use a binary-search to find the 2619 * after a seek to the start). Use a binary-search to find the
2255 * next pid to display, if any 2620 * next pid to display, if any
2256 */ 2621 */
2257 struct cgroup_pids *cp = s->private; 2622 struct cgroup_pidlist *l = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2259 int index = 0, pid = *pos; 2623 int index = 0, pid = *pos;
2260 int *iter; 2624 int *iter;
2261 2625
2262 down_read(&cgrp->pids_mutex); 2626 down_read(&l->mutex);
2263 if (pid) { 2627 if (pid) {
2264 int end = cp->length; 2628 int end = l->length;
2265 2629
2266 while (index < end) { 2630 while (index < end) {
2267 int mid = (index + end) / 2; 2631 int mid = (index + end) / 2;
2268 if (cp->tasks_pids[mid] == pid) { 2632 if (l->list[mid] == pid) {
2269 index = mid; 2633 index = mid;
2270 break; 2634 break;
2271 } else if (cp->tasks_pids[mid] <= pid) 2635 } else if (l->list[mid] <= pid)
2272 index = mid + 1; 2636 index = mid + 1;
2273 else 2637 else
2274 end = mid; 2638 end = mid;
2275 } 2639 }
2276 } 2640 }
2277 /* If we're off the end of the array, we're done */ 2641 /* If we're off the end of the array, we're done */
2278 if (index >= cp->length) 2642 if (index >= l->length)
2279 return NULL; 2643 return NULL;
2280 /* Update the abstract position to be the actual pid that we found */ 2644 /* Update the abstract position to be the actual pid that we found */
2281 iter = cp->tasks_pids + index; 2645 iter = l->list + index;
2282 *pos = *iter; 2646 *pos = *iter;
2283 return iter; 2647 return iter;
2284} 2648}
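The start method's binary search is worth reading on its own: it resumes at the saved pid if that task is still listed, or at the first larger pid if it has since exited. A self-contained rendering of the same loop, with an invented pid array:

#include <stdio.h>
#include <sys/types.h>

/*
 * Return the index of 'pid' in the sorted 'list', or of the first entry
 * greater than it; returns 'length' if we fall off the end of the array.
 */
static int pidlist_seek(const pid_t *list, int length, pid_t pid)
{
        int index = 0, end = length;

        while (index < end) {
                int mid = (index + end) / 2;

                if (list[mid] == pid) {
                        index = mid;
                        break;
                } else if (list[mid] <= pid)
                        index = mid + 1;
                else
                        end = mid;
        }
        return index;
}

int main(void)
{
        pid_t list[] = { 118, 311, 4012, 4099 };
        int len = sizeof(list) / sizeof(list[0]);

        printf("%d\n", pidlist_seek(list, len, 311));   /* 1: exact hit */
        printf("%d\n", pidlist_seek(list, len, 1000));  /* 2: next larger pid */
        printf("%d\n", pidlist_seek(list, len, 9999));  /* 4: off the end */
        return 0;
}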
2285 2649
2286static void cgroup_tasks_stop(struct seq_file *s, void *v) 2650static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2287{ 2651{
2288 struct cgroup_pids *cp = s->private; 2652 struct cgroup_pidlist *l = s->private;
2289 struct cgroup *cgrp = cp->cgrp; 2653 up_read(&l->mutex);
2290 up_read(&cgrp->pids_mutex);
2291} 2654}
2292 2655
2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2656static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2294{ 2657{
2295 struct cgroup_pids *cp = s->private; 2658 struct cgroup_pidlist *l = s->private;
2296 int *p = v; 2659 pid_t *p = v;
2297 int *end = cp->tasks_pids + cp->length; 2660 pid_t *end = l->list + l->length;
2298
2299 /* 2661 /*
2300 * Advance to the next pid in the array. If this goes off the 2662 * Advance to the next pid in the array. If this goes off the
2301 * end, we're done 2663 * end, we're done
@@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2309 } 2671 }
2310} 2672}
2311 2673
2312static int cgroup_tasks_show(struct seq_file *s, void *v) 2674static int cgroup_pidlist_show(struct seq_file *s, void *v)
2313{ 2675{
2314 return seq_printf(s, "%d\n", *(int *)v); 2676 return seq_printf(s, "%d\n", *(int *)v);
2315} 2677}
2316 2678
2317static struct seq_operations cgroup_tasks_seq_operations = { 2679/*
2318 .start = cgroup_tasks_start, 2680 * seq_operations functions for iterating on pidlists through seq_file -
2319 .stop = cgroup_tasks_stop, 2681 * independent of whether it's tasks or procs
2320 .next = cgroup_tasks_next, 2682 */
2321 .show = cgroup_tasks_show, 2683static const struct seq_operations cgroup_pidlist_seq_operations = {
2684 .start = cgroup_pidlist_start,
2685 .stop = cgroup_pidlist_stop,
2686 .next = cgroup_pidlist_next,
2687 .show = cgroup_pidlist_show,
2322}; 2688};
2323 2689
2324static void release_cgroup_pid_array(struct cgroup_pids *cp) 2690static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2325{ 2691{
2326 struct cgroup *cgrp = cp->cgrp; 2692 /*
2327 2693 * the case where we're the last user of this particular pidlist will
2328 down_write(&cgrp->pids_mutex); 2694 * have us remove it from the cgroup's list, which entails taking the
2329 BUG_ON(!cp->use_count); 2695 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2330 if (!--cp->use_count) { 2696 * pidlist_mutex, we have to take pidlist_mutex first.
2331 list_del(&cp->list); 2697 */
2332 put_pid_ns(cp->ns); 2698 mutex_lock(&l->owner->pidlist_mutex);
2333 kfree(cp->tasks_pids); 2699 down_write(&l->mutex);
2334 kfree(cp); 2700 BUG_ON(!l->use_count);
2701 if (!--l->use_count) {
2702 /* we're the last user if refcount is 0; remove and free */
2703 list_del(&l->links);
2704 mutex_unlock(&l->owner->pidlist_mutex);
2705 pidlist_free(l->list);
2706 put_pid_ns(l->key.ns);
2707 up_write(&l->mutex);
2708 kfree(l);
2709 return;
2335 } 2710 }
2336 up_write(&cgrp->pids_mutex); 2711 mutex_unlock(&l->owner->pidlist_mutex);
2712 up_write(&l->mutex);
2337} 2713}
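The ordering comment above (pidlist_mutex before l->mutex) is the whole point of cgroup_release_pid_array(): the final put has to unlink the entry from its owner's list, so the list lock must be taken before the per-entry lock. A rough userspace analogue using plain pthread mutexes in place of the kernel's mutex/rw-semaphore pair; the structures here are invented for illustration only (build with cc -pthread):

#include <pthread.h>
#include <stdlib.h>

struct entry {
        struct entry *next;
        pthread_mutex_t lock;           /* stands in for l->mutex */
        int use_count;
        int *data;
};

struct owner {
        pthread_mutex_t list_lock;      /* stands in for pidlist_mutex */
        struct entry *head;
};

/* Drop one reference; list lock first, because the last put unlinks. */
static void entry_put(struct owner *o, struct entry *e)
{
        pthread_mutex_lock(&o->list_lock);
        pthread_mutex_lock(&e->lock);
        if (--e->use_count == 0) {
                struct entry **pp;

                for (pp = &o->head; *pp; pp = &(*pp)->next)
                        if (*pp == e) {
                                *pp = e->next;
                                break;
                        }
                pthread_mutex_unlock(&o->list_lock);
                free(e->data);
                pthread_mutex_unlock(&e->lock);
                pthread_mutex_destroy(&e->lock);
                free(e);
                return;
        }
        pthread_mutex_unlock(&o->list_lock);
        pthread_mutex_unlock(&e->lock);
}

int main(void)
{
        struct owner o = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct entry *e = calloc(1, sizeof(*e));

        pthread_mutex_init(&e->lock, NULL);
        e->use_count = 1;
        e->data = malloc(16 * sizeof(int));
        e->next = o.head;
        o.head = e;

        entry_put(&o, e);       /* last user: unlinks and frees */
        return 0;
}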
2338 2714
2339static int cgroup_tasks_release(struct inode *inode, struct file *file) 2715static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2340{ 2716{
2341 struct seq_file *seq; 2717 struct cgroup_pidlist *l;
2342 struct cgroup_pids *cp;
2343
2344 if (!(file->f_mode & FMODE_READ)) 2718 if (!(file->f_mode & FMODE_READ))
2345 return 0; 2719 return 0;
2346 2720 /*
2347 seq = file->private_data; 2721 * the seq_file will only be initialized if the file was opened for
2348 cp = seq->private; 2722 * reading; hence we check if it's not null only in that case.
2349 2723 */
2350 release_cgroup_pid_array(cp); 2724 l = ((struct seq_file *)file->private_data)->private;
2725 cgroup_release_pid_array(l);
2351 return seq_release(inode, file); 2726 return seq_release(inode, file);
2352} 2727}
2353 2728
2354static struct file_operations cgroup_tasks_operations = { 2729static const struct file_operations cgroup_pidlist_operations = {
2355 .read = seq_read, 2730 .read = seq_read,
2356 .llseek = seq_lseek, 2731 .llseek = seq_lseek,
2357 .write = cgroup_file_write, 2732 .write = cgroup_file_write,
2358 .release = cgroup_tasks_release, 2733 .release = cgroup_pidlist_release,
2359}; 2734};
2360 2735
2361/* 2736/*
2362 * Handle an open on 'tasks' file. Prepare an array containing the 2737 * The following functions handle opens on a file that displays a pidlist
2363 * process id's of tasks currently attached to the cgroup being opened. 2738 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2739 * in the cgroup.
2364 */ 2740 */
2365 2741/* helper function for the two below it */
2366static int cgroup_tasks_open(struct inode *unused, struct file *file) 2742static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2367{ 2743{
2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2744 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns; 2745 struct cgroup_pidlist *l;
2370 struct cgroup_pids *cp;
2371 pid_t *pidarray;
2372 int npids;
2373 int retval; 2746 int retval;
2374 2747
2375 /* Nothing to do for write-only files */ 2748 /* Nothing to do for write-only files */
2376 if (!(file->f_mode & FMODE_READ)) 2749 if (!(file->f_mode & FMODE_READ))
2377 return 0; 2750 return 0;
2378 2751
2379 /* 2752 /* have the array populated */
2380 * If cgroup gets more users after we read count, we won't have 2753 retval = pidlist_array_load(cgrp, type, &l);
2381 * enough space - tough. This race is indistinguishable to the 2754 if (retval)
2382 * caller from the case that the additional cgroup users didn't 2755 return retval;
2383 * show up until sometime later on. 2756 /* configure file information */
2384 */ 2757 file->f_op = &cgroup_pidlist_operations;
2385 npids = cgroup_task_count(cgrp);
2386 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2387 if (!pidarray)
2388 return -ENOMEM;
2389 npids = pid_array_load(pidarray, npids, cgrp);
2390 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2391
2392 /*
2393 * Store the array in the cgroup, freeing the old
2394 * array if necessary
2395 */
2396 down_write(&cgrp->pids_mutex);
2397
2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2399 if (ns == cp->ns)
2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2418 up_write(&cgrp->pids_mutex);
2419
2420 file->f_op = &cgroup_tasks_operations;
2421 2758
2422 retval = seq_open(file, &cgroup_tasks_seq_operations); 2759 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2423 if (retval) { 2760 if (retval) {
2424 release_cgroup_pid_array(cp); 2761 cgroup_release_pid_array(l);
2425 return retval; 2762 return retval;
2426 } 2763 }
2427 ((struct seq_file *)file->private_data)->private = cp; 2764 ((struct seq_file *)file->private_data)->private = l;
2428 return 0; 2765 return 0;
2429} 2766}
2767static int cgroup_tasks_open(struct inode *unused, struct file *file)
2768{
2769 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2770}
2771static int cgroup_procs_open(struct inode *unused, struct file *file)
2772{
2773 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2774}
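From userspace, the result of this plumbing is simply two readable files per cgroup directory: "tasks" (one line per thread) and the new "cgroup.procs" (one line per thread-group leader, sorted and de-duplicated; writes to it are still a TODO in this patch). A small example reader; the default path "/cgroup" is only an assumption about where the hierarchy happens to be mounted:

#include <stdio.h>

int main(int argc, char **argv)
{
        /* path is an example; pass the real cgroup directory as argv[1] */
        const char *dir = argc > 1 ? argv[1] : "/cgroup";
        char path[512];
        char line[64];
        FILE *f;

        snprintf(path, sizeof(path), "%s/cgroup.procs", dir);
        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                printf("tgid %s", line);        /* each line is one pid */
        fclose(f);
        return 0;
}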
2430 2775
2431static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2776static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2432 struct cftype *cft) 2777 struct cftype *cft)
@@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2449/* 2794/*
2450 * for the common functions, 'private' gives the type of file 2795 * for the common functions, 'private' gives the type of file
2451 */ 2796 */
2797/* for hysterical raisins, we can't put this on the older files */
2798#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2452static struct cftype files[] = { 2799static struct cftype files[] = {
2453 { 2800 {
2454 .name = "tasks", 2801 .name = "tasks",
2455 .open = cgroup_tasks_open, 2802 .open = cgroup_tasks_open,
2456 .write_u64 = cgroup_tasks_write, 2803 .write_u64 = cgroup_tasks_write,
2457 .release = cgroup_tasks_release, 2804 .release = cgroup_pidlist_release,
2458 .private = FILE_TASKLIST,
2459 .mode = S_IRUGO | S_IWUSR, 2805 .mode = S_IRUGO | S_IWUSR,
2460 }, 2806 },
2461 2807 {
2808 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2809 .open = cgroup_procs_open,
2810 /* .write_u64 = cgroup_procs_write, TODO */
2811 .release = cgroup_pidlist_release,
2812 .mode = S_IRUGO,
2813 },
2462 { 2814 {
2463 .name = "notify_on_release", 2815 .name = "notify_on_release",
2464 .read_u64 = cgroup_read_notify_on_release, 2816 .read_u64 = cgroup_read_notify_on_release,
2465 .write_u64 = cgroup_write_notify_on_release, 2817 .write_u64 = cgroup_write_notify_on_release,
2466 .private = FILE_NOTIFY_ON_RELEASE,
2467 }, 2818 },
2468}; 2819};
2469 2820
@@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = {
2472 .read_seq_string = cgroup_release_agent_show, 2823 .read_seq_string = cgroup_release_agent_show,
2473 .write_string = cgroup_release_agent_write, 2824 .write_string = cgroup_release_agent_write,
2474 .max_write_len = PATH_MAX, 2825 .max_write_len = PATH_MAX,
2475 .private = FILE_RELEASE_AGENT,
2476}; 2826};
2477 2827
2478static int cgroup_populate_dir(struct cgroup *cgrp) 2828static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void)
2879 init_task.cgroups = &init_css_set; 3229 init_task.cgroups = &init_css_set;
2880 3230
2881 init_css_set_link.cg = &init_css_set; 3231 init_css_set_link.cg = &init_css_set;
3232 init_css_set_link.cgrp = dummytop;
2882 list_add(&init_css_set_link.cgrp_link_list, 3233 list_add(&init_css_set_link.cgrp_link_list,
2883 &rootnode.top_cgroup.css_sets); 3234 &rootnode.top_cgroup.css_sets);
2884 list_add(&init_css_set_link.cg_link_list, 3235 list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3284,7 @@ int __init cgroup_init(void)
2933 /* Add init_css_set to the hash table */ 3284 /* Add init_css_set to the hash table */
2934 hhead = css_set_hash(init_css_set.subsys); 3285 hhead = css_set_hash(init_css_set.subsys);
2935 hlist_add_head(&init_css_set.hlist, hhead); 3286 hlist_add_head(&init_css_set.hlist, hhead);
2936 3287 BUG_ON(!init_root_id(&rootnode));
2937 err = register_filesystem(&cgroup_fs_type); 3288 err = register_filesystem(&cgroup_fs_type);
2938 if (err < 0) 3289 if (err < 0)
2939 goto out; 3290 goto out;
@@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2986 for_each_active_root(root) { 3337 for_each_active_root(root) {
2987 struct cgroup_subsys *ss; 3338 struct cgroup_subsys *ss;
2988 struct cgroup *cgrp; 3339 struct cgroup *cgrp;
2989 int subsys_id;
2990 int count = 0; 3340 int count = 0;
2991 3341
2992 seq_printf(m, "%lu:", root->subsys_bits); 3342 seq_printf(m, "%d:", root->hierarchy_id);
2993 for_each_subsys(root, ss) 3343 for_each_subsys(root, ss)
2994 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3344 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3345 if (strlen(root->name))
3346 seq_printf(m, "%sname=%s", count ? "," : "",
3347 root->name);
2995 seq_putc(m, ':'); 3348 seq_putc(m, ':');
2996 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3349 cgrp = task_cgroup_from_root(tsk, root);
2997 cgrp = task_cgroup(tsk, subsys_id);
2998 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3350 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2999 if (retval < 0) 3351 if (retval < 0)
3000 goto out_unlock; 3352 goto out_unlock;
@@ -3017,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
3017 return single_open(file, proc_cgroup_show, pid); 3369 return single_open(file, proc_cgroup_show, pid);
3018} 3370}
3019 3371
3020struct file_operations proc_cgroup_operations = { 3372const struct file_operations proc_cgroup_operations = {
3021 .open = cgroup_open, 3373 .open = cgroup_open,
3022 .read = seq_read, 3374 .read = seq_read,
3023 .llseek = seq_lseek, 3375 .llseek = seq_lseek,
@@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3033 mutex_lock(&cgroup_mutex); 3385 mutex_lock(&cgroup_mutex);
3034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3035 struct cgroup_subsys *ss = subsys[i]; 3387 struct cgroup_subsys *ss = subsys[i];
3036 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3388 seq_printf(m, "%s\t%d\t%d\t%d\n",
3037 ss->name, ss->root->subsys_bits, 3389 ss->name, ss->root->hierarchy_id,
3038 ss->root->number_of_cgroups, !ss->disabled); 3390 ss->root->number_of_cgroups, !ss->disabled);
3039 } 3391 }
3040 mutex_unlock(&cgroup_mutex); 3392 mutex_unlock(&cgroup_mutex);
@@ -3046,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
3046 return single_open(file, proc_cgroupstats_show, NULL); 3398 return single_open(file, proc_cgroupstats_show, NULL);
3047} 3399}
3048 3400
3049static struct file_operations proc_cgroupstats_operations = { 3401static const struct file_operations proc_cgroupstats_operations = {
3050 .open = cgroupstats_open, 3402 .open = cgroupstats_open,
3051 .read = seq_read, 3403 .read = seq_read,
3052 .llseek = seq_lseek, 3404 .llseek = seq_lseek,
@@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3320{ 3672{
3321 int ret; 3673 int ret;
3322 struct cgroup *target; 3674 struct cgroup *target;
3323 int subsys_id;
3324 3675
3325 if (cgrp == dummytop) 3676 if (cgrp == dummytop)
3326 return 1; 3677 return 1;
3327 3678
3328 get_first_subsys(cgrp, NULL, &subsys_id); 3679 target = task_cgroup_from_root(task, cgrp->root);
3329 target = task_cgroup(task, subsys_id);
3330 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3680 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3331 cgrp = cgrp->parent; 3681 cgrp = cgrp->parent;
3332 ret = (cgrp == target); 3682 ret = (cgrp == target);
@@ -3358,8 +3708,10 @@ static void check_for_release(struct cgroup *cgrp)
3358void __css_put(struct cgroup_subsys_state *css) 3708void __css_put(struct cgroup_subsys_state *css)
3359{ 3709{
3360 struct cgroup *cgrp = css->cgroup; 3710 struct cgroup *cgrp = css->cgroup;
3711 int val;
3361 rcu_read_lock(); 3712 rcu_read_lock();
3362 if (atomic_dec_return(&css->refcnt) == 1) { 3713 val = atomic_dec_return(&css->refcnt);
3714 if (val == 1) {
3363 if (notify_on_release(cgrp)) { 3715 if (notify_on_release(cgrp)) {
3364 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3716 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3365 check_for_release(cgrp); 3717 check_for_release(cgrp);
@@ -3367,6 +3719,7 @@ void __css_put(struct cgroup_subsys_state *css)
3367 cgroup_wakeup_rmdir_waiter(cgrp); 3719 cgroup_wakeup_rmdir_waiter(cgrp);
3368 } 3720 }
3369 rcu_read_unlock(); 3721 rcu_read_unlock();
3722 WARN_ON_ONCE(val < 1);
3370} 3723}
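The __css_put() hunk only captures the atomic_dec_return() result in a local so that an underflow can be reported once the RCU section is finished. The same decrement-and-check pattern in miniature, using C11 atomics as a stand-in for the kernel's atomic_t; the bias of 1 held by the cgroup itself is mimicked by the initial value:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int refcnt = 2;   /* 1 is the bias held by the owner itself */

static void put(void)
{
        /* fetch_sub returns the old value, so -1 gives the new one,
         * just like atomic_dec_return() */
        int val = atomic_fetch_sub(&refcnt, 1) - 1;

        if (val == 1)
                printf("last external reference dropped, notify owner\n");
        if (val < 1)
                fprintf(stderr, "warning: refcount underflow (%d)\n", val);
}

int main(void)
{
        put();          /* 2 -> 1: triggers the notification path */
        put();          /* 1 -> 0: trips the underflow warning */
        return 0;
}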
3371 3724
3372/* 3725/*
@@ -3693,3 +4046,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3693 return ret; 4046 return ret;
3694} 4047}
3695 4048
4049#ifdef CONFIG_CGROUP_DEBUG
4050static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4051 struct cgroup *cont)
4052{
4053 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4054
4055 if (!css)
4056 return ERR_PTR(-ENOMEM);
4057
4058 return css;
4059}
4060
4061static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4062{
4063 kfree(cont->subsys[debug_subsys_id]);
4064}
4065
4066static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4067{
4068 return atomic_read(&cont->count);
4069}
4070
4071static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4072{
4073 return cgroup_task_count(cont);
4074}
4075
4076static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4077{
4078 return (u64)(unsigned long)current->cgroups;
4079}
4080
4081static u64 current_css_set_refcount_read(struct cgroup *cont,
4082 struct cftype *cft)
4083{
4084 u64 count;
4085
4086 rcu_read_lock();
4087 count = atomic_read(&current->cgroups->refcount);
4088 rcu_read_unlock();
4089 return count;
4090}
4091
4092static int current_css_set_cg_links_read(struct cgroup *cont,
4093 struct cftype *cft,
4094 struct seq_file *seq)
4095{
4096 struct cg_cgroup_link *link;
4097 struct css_set *cg;
4098
4099 read_lock(&css_set_lock);
4100 rcu_read_lock();
4101 cg = rcu_dereference(current->cgroups);
4102 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4103 struct cgroup *c = link->cgrp;
4104 const char *name;
4105
4106 if (c->dentry)
4107 name = c->dentry->d_name.name;
4108 else
4109 name = "?";
4110 seq_printf(seq, "Root %d group %s\n",
4111 c->root->hierarchy_id, name);
4112 }
4113 rcu_read_unlock();
4114 read_unlock(&css_set_lock);
4115 return 0;
4116}
4117
4118#define MAX_TASKS_SHOWN_PER_CSS 25
4119static int cgroup_css_links_read(struct cgroup *cont,
4120 struct cftype *cft,
4121 struct seq_file *seq)
4122{
4123 struct cg_cgroup_link *link;
4124
4125 read_lock(&css_set_lock);
4126 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4127 struct css_set *cg = link->cg;
4128 struct task_struct *task;
4129 int count = 0;
4130 seq_printf(seq, "css_set %p\n", cg);
4131 list_for_each_entry(task, &cg->tasks, cg_list) {
4132 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4133 seq_puts(seq, " ...\n");
4134 break;
4135 } else {
4136 seq_printf(seq, " task %d\n",
4137 task_pid_vnr(task));
4138 }
4139 }
4140 }
4141 read_unlock(&css_set_lock);
4142 return 0;
4143}
4144
4145static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4146{
4147 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4148}
4149
4150static struct cftype debug_files[] = {
4151 {
4152 .name = "cgroup_refcount",
4153 .read_u64 = cgroup_refcount_read,
4154 },
4155 {
4156 .name = "taskcount",
4157 .read_u64 = debug_taskcount_read,
4158 },
4159
4160 {
4161 .name = "current_css_set",
4162 .read_u64 = current_css_set_read,
4163 },
4164
4165 {
4166 .name = "current_css_set_refcount",
4167 .read_u64 = current_css_set_refcount_read,
4168 },
4169
4170 {
4171 .name = "current_css_set_cg_links",
4172 .read_seq_string = current_css_set_cg_links_read,
4173 },
4174
4175 {
4176 .name = "cgroup_css_links",
4177 .read_seq_string = cgroup_css_links_read,
4178 },
4179
4180 {
4181 .name = "releasable",
4182 .read_u64 = releasable_read,
4183 },
4184};
4185
4186static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4187{
4188 return cgroup_add_files(cont, ss, debug_files,
4189 ARRAY_SIZE(debug_files));
4190}
4191
4192struct cgroup_subsys debug_subsys = {
4193 .name = "debug",
4194 .create = debug_create,
4195 .destroy = debug_destroy,
4196 .populate = debug_populate,
4197 .subsys_id = debug_subsys_id,
4198};
4199#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
401 break; 401 break;
402 } 402 }
403 } 403 }
404
404 if (!error) { 405 if (!error) {
405 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
406 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
413 return error; 414 return error;
414} 415}
415 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
416void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
417{ 426{
418 int cpu, error; 427 int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
424 goto out; 433 goto out;
425 434
426 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
427 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
428 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
429 if (!error) { 441 if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
432 } 444 }
433 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
434 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
435 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
436out: 451out:
437 cpu_maps_update_done(); 452 cpu_maps_update_done();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..b5cb469d2545 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1324static cpumask_var_t cpus_attach;
1325 1325
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1327static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1328 struct task_struct *tsk, bool threadgroup)
1329{ 1329{
1330 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1331 struct cpuset *cs = cgroup_cs(cont);
1331 1332
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1333 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1344 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1345 return -EINVAL;
1345 1346
1346 return security_task_setscheduler(tsk, 0, NULL); 1347 ret = security_task_setscheduler(tsk, 0, NULL);
1348 if (ret)
1349 return ret;
1350 if (threadgroup) {
1351 struct task_struct *c;
1352
1353 rcu_read_lock();
1354 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1355 ret = security_task_setscheduler(c, 0, NULL);
1356 if (ret) {
1357 rcu_read_unlock();
1358 return ret;
1359 }
1360 }
1361 rcu_read_unlock();
1362 }
1363 return 0;
1364}
1365
1366static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1367 struct cpuset *cs)
1368{
1369 int err;
1370 /*
1371 * can_attach beforehand should guarantee that this doesn't fail.
1372 * TODO: have a better way to handle failure here
1373 */
1374 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1375 WARN_ON_ONCE(err);
1376
1377 task_lock(tsk);
1378 cpuset_change_task_nodemask(tsk, to);
1379 task_unlock(tsk);
1380 cpuset_update_task_spread_flag(cs, tsk);
1381
1347} 1382}
1348 1383
1349static void cpuset_attach(struct cgroup_subsys *ss, 1384static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1350 struct cgroup *cont, struct cgroup *oldcont, 1385 struct cgroup *oldcont, struct task_struct *tsk,
1351 struct task_struct *tsk) 1386 bool threadgroup)
1352{ 1387{
1353 nodemask_t from, to; 1388 nodemask_t from, to;
1354 struct mm_struct *mm; 1389 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1390 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1391 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1392
1359 if (cs == &top_cpuset) { 1393 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1394 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1397 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1398 guarantee_online_mems(cs, &to);
1365 } 1399 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1400
1370 task_lock(tsk); 1401 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1402 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1403 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1404 struct task_struct *c;
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 cpuset_attach_task(c, &to, cs);
1408 }
1409 rcu_read_unlock();
1410 }
1374 1411
1412 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1413 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1414 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1415 mm = get_task_mm(tsk);
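With the new threadgroup flag, can_attach() and attach() apply the same per-task work to every thread on tsk->thread_group. A loose userspace analogue of "apply a CPU mask to a whole thread group" walks /proc/<pid>/task and calls sched_setaffinity() for each tid; the pinning to CPU 0 and the /proc paths are illustrative only:

#define _GNU_SOURCE
#include <dirent.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        pid_t pid = argc > 1 ? atoi(argv[1]) : getpid();
        struct dirent *d;
        char path[64];
        cpu_set_t set;
        DIR *dir;

        CPU_ZERO(&set);
        CPU_SET(0, &set);       /* confine the whole group to CPU 0 */

        snprintf(path, sizeof(path), "/proc/%d/task", pid);
        dir = opendir(path);
        if (!dir) {
                perror(path);
                return 1;
        }
        while ((d = readdir(dir)) != NULL) {
                pid_t tid = atoi(d->d_name);

                if (tid <= 0)   /* skips "." and ".." */
                        continue;
                if (sched_setaffinity(tid, sizeof(set), &set))
                        perror("sched_setaffinity");
        }
        closedir(dir);
        return 0;
}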
diff --git a/kernel/cred.c b/kernel/cred.c
index d7f7a01082eb..dd76cfe5f5b0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
782 782
783#ifdef CONFIG_DEBUG_CREDENTIALS 783#ifdef CONFIG_DEBUG_CREDENTIALS
784 784
785bool creds_are_invalid(const struct cred *cred)
786{
787 if (cred->magic != CRED_MAGIC)
788 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE)
794 return true;
795 if ((*(u32 *)cred->security & 0xffffff00) ==
796 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
797 return true;
798 }
799#endif
800 return false;
801}
802EXPORT_SYMBOL(creds_are_invalid);
803
785/* 804/*
786 * dump invalid credentials 805 * dump invalid credentials
787 */ 806 */
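creds_are_invalid() is a cheap consistency probe: the magic field catches structures that were never properly initialised, and the 0x6b byte pattern catches a security blob that was freed under slab poisoning. The same two checks in a self-contained form; the structure and constants below are invented stand-ins for struct cred, CRED_MAGIC and POISON_FREE:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OBJ_MAGIC       0x43736564u     /* stand-in for CRED_MAGIC */
#define POISON_FREE     0x6b            /* byte used by slab poisoning */

struct obj {
        uint32_t magic;
        uint32_t payload;
};

static bool obj_is_invalid(const struct obj *o)
{
        if (o->magic != OBJ_MAGIC)
                return true;
        /* freed memory tends to be filled with POISON_FREE bytes */
        if ((o->payload & 0xffffff00u) ==
            ((uint32_t)POISON_FREE << 24 |
             (uint32_t)POISON_FREE << 16 |
             (uint32_t)POISON_FREE << 8))
                return true;
        return false;
}

int main(void)
{
        struct obj good = { OBJ_MAGIC, 42 };
        struct obj poisoned = { OBJ_MAGIC, 0x6b6b6b00u };

        printf("good: %s\n", obj_is_invalid(&good) ? "invalid" : "ok");
        printf("poisoned: %s\n", obj_is_invalid(&poisoned) ? "invalid" : "ok");
        return 0;
}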
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index ae5d8660ddff..e61891f80123 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,7 @@
47#include <linux/tracehook.h> 47#include <linux/tracehook.h>
48#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_counter.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 154{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 156
157#ifdef CONFIG_PERF_COUNTERS 157#ifdef CONFIG_PERF_EVENTS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp); 158 WARN_ON_ONCE(tsk->perf_event_ctxp);
159#endif 159#endif
160 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
161 put_task_struct(tsk); 161 put_task_struct(tsk);
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
359{ 359{
360 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
361 361
362 if (task_session(curr) != pid) 362 if (task_session(curr) != pid) {
363 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
364 366
365 if (task_pgrp(curr) != pid) 367 if (task_pgrp(curr) != pid)
366 change_pid(curr, PIDTYPE_PGID, pid); 368 change_pid(curr, PIDTYPE_PGID, pid);
@@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code)
945 if (group_dead) { 947 if (group_dead) {
946 hrtimer_cancel(&tsk->signal->real_timer); 948 hrtimer_cancel(&tsk->signal->real_timer);
947 exit_itimers(tsk->signal); 949 exit_itimers(tsk->signal);
950 if (tsk->mm)
951 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
948 } 952 }
949 acct_collect(code, group_dead); 953 acct_collect(code, group_dead);
950 if (group_dead) 954 if (group_dead)
@@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code)
972 disassociate_ctty(1); 976 disassociate_ctty(1);
973 977
974 module_put(task_thread_info(tsk)->exec_domain->module); 978 module_put(task_thread_info(tsk)->exec_domain->module);
975 if (tsk->binfmt)
976 module_put(tsk->binfmt->module);
977 979
978 proc_exit_connector(tsk); 980 proc_exit_connector(tsk);
979 981
@@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code)
981 * Flush inherited counters to the parent - before the parent 983 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications. 984 * gets woken up by child-exit notifications.
983 */ 985 */
984 perf_counter_exit_task(tsk); 986 perf_event_exit_task(tsk);
985 987
986 exit_notify(tsk, group_dead); 988 exit_notify(tsk, group_dead);
987#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
@@ -989,8 +991,6 @@ NORET_TYPE void do_exit(long code)
989 tsk->mempolicy = NULL; 991 tsk->mempolicy = NULL;
990#endif 992#endif
991#ifdef CONFIG_FUTEX 993#ifdef CONFIG_FUTEX
992 if (unlikely(!list_empty(&tsk->pi_state_list)))
993 exit_pi_state_list(tsk);
994 if (unlikely(current->pi_state_cache)) 994 if (unlikely(current->pi_state_cache))
995 kfree(current->pi_state_cache); 995 kfree(current->pi_state_cache);
996#endif 996#endif
@@ -1093,28 +1093,28 @@ struct wait_opts {
1093 int __user *wo_stat; 1093 int __user *wo_stat;
1094 struct rusage __user *wo_rusage; 1094 struct rusage __user *wo_rusage;
1095 1095
1096 wait_queue_t child_wait;
1096 int notask_error; 1097 int notask_error;
1097}; 1098};
1098 1099
1099static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1100static inline
1101struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1100{ 1102{
1101 struct pid *pid = NULL; 1103 if (type != PIDTYPE_PID)
1102 if (type == PIDTYPE_PID) 1104 task = task->group_leader;
1103 pid = task->pids[type].pid; 1105 return task->pids[type].pid;
1104 else if (type < PIDTYPE_MAX)
1105 pid = task->group_leader->pids[type].pid;
1106 return pid;
1107} 1106}
1108 1107
1109static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1108static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1110{ 1109{
1111 int err; 1110 return wo->wo_type == PIDTYPE_MAX ||
1112 1111 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1113 if (wo->wo_type < PIDTYPE_MAX) { 1112}
1114 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1115 return 0;
1116 }
1117 1113
1114static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1115{
1116 if (!eligible_pid(wo, p))
1117 return 0;
1118 /* Wait for all children (clone and not) if __WALL is set; 1118 /* Wait for all children (clone and not) if __WALL is set;
1119 * otherwise, wait for clone children *only* if __WCLONE is 1119 * otherwise, wait for clone children *only* if __WCLONE is
1120 * set; otherwise, wait for non-clone children *only*. (Note: 1120 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1124,10 +1124,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1124 && !(wo->wo_flags & __WALL)) 1124 && !(wo->wo_flags & __WALL))
1125 return 0; 1125 return 0;
1126 1126
1127 err = security_task_wait(p);
1128 if (err)
1129 return err;
1130
1131 return 1; 1127 return 1;
1132} 1128}
1133 1129
@@ -1140,18 +1136,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1140 1136
1141 put_task_struct(p); 1137 put_task_struct(p);
1142 infop = wo->wo_info; 1138 infop = wo->wo_info;
1143 if (!retval) 1139 if (infop) {
1144 retval = put_user(SIGCHLD, &infop->si_signo); 1140 if (!retval)
1145 if (!retval) 1141 retval = put_user(SIGCHLD, &infop->si_signo);
1146 retval = put_user(0, &infop->si_errno); 1142 if (!retval)
1147 if (!retval) 1143 retval = put_user(0, &infop->si_errno);
1148 retval = put_user((short)why, &infop->si_code); 1144 if (!retval)
1149 if (!retval) 1145 retval = put_user((short)why, &infop->si_code);
1150 retval = put_user(pid, &infop->si_pid); 1146 if (!retval)
1151 if (!retval) 1147 retval = put_user(pid, &infop->si_pid);
1152 retval = put_user(uid, &infop->si_uid); 1148 if (!retval)
1153 if (!retval) 1149 retval = put_user(uid, &infop->si_uid);
1154 retval = put_user(status, &infop->si_status); 1150 if (!retval)
1151 retval = put_user(status, &infop->si_status);
1152 }
1155 if (!retval) 1153 if (!retval)
1156 retval = pid; 1154 retval = pid;
1157 return retval; 1155 return retval;
@@ -1208,6 +1206,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1208 if (likely(!traced) && likely(!task_detached(p))) { 1206 if (likely(!traced) && likely(!task_detached(p))) {
1209 struct signal_struct *psig; 1207 struct signal_struct *psig;
1210 struct signal_struct *sig; 1208 struct signal_struct *sig;
1209 unsigned long maxrss;
1211 1210
1212 /* 1211 /*
1213 * The resource counters for the group leader are in its 1212 * The resource counters for the group leader are in its
@@ -1256,6 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1256 psig->coublock += 1255 psig->coublock +=
1257 task_io_get_oublock(p) + 1256 task_io_get_oublock(p) +
1258 sig->oublock + sig->coublock; 1257 sig->oublock + sig->coublock;
1258 maxrss = max(sig->maxrss, sig->cmaxrss);
1259 if (psig->cmaxrss < maxrss)
1260 psig->cmaxrss = maxrss;
1259 task_io_accounting_add(&psig->ioac, &p->ioac); 1261 task_io_accounting_add(&psig->ioac, &p->ioac);
1260 task_io_accounting_add(&psig->ioac, &sig->ioac); 1262 task_io_accounting_add(&psig->ioac, &sig->ioac);
1261 spin_unlock_irq(&p->real_parent->sighand->siglock); 1263 spin_unlock_irq(&p->real_parent->sighand->siglock);
@@ -1477,13 +1479,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1477 * then ->notask_error is 0 if @p is an eligible child, 1479 * then ->notask_error is 0 if @p is an eligible child,
1478 * or another error from security_task_wait(), or still -ECHILD. 1480 * or another error from security_task_wait(), or still -ECHILD.
1479 */ 1481 */
1480static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1482static int wait_consider_task(struct wait_opts *wo, int ptrace,
1481 int ptrace, struct task_struct *p) 1483 struct task_struct *p)
1482{ 1484{
1483 int ret = eligible_child(wo, p); 1485 int ret = eligible_child(wo, p);
1484 if (!ret) 1486 if (!ret)
1485 return ret; 1487 return ret;
1486 1488
1489 ret = security_task_wait(p);
1487 if (unlikely(ret < 0)) { 1490 if (unlikely(ret < 0)) {
1488 /* 1491 /*
1489 * If we have not yet seen any eligible child, 1492 * If we have not yet seen any eligible child,
@@ -1545,7 +1548,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1545 * Do not consider detached threads. 1548 * Do not consider detached threads.
1546 */ 1549 */
1547 if (!task_detached(p)) { 1550 if (!task_detached(p)) {
1548 int ret = wait_consider_task(wo, tsk, 0, p); 1551 int ret = wait_consider_task(wo, 0, p);
1549 if (ret) 1552 if (ret)
1550 return ret; 1553 return ret;
1551 } 1554 }
@@ -1559,7 +1562,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1559 struct task_struct *p; 1562 struct task_struct *p;
1560 1563
1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1564 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1562 int ret = wait_consider_task(wo, tsk, 1, p); 1565 int ret = wait_consider_task(wo, 1, p);
1563 if (ret) 1566 if (ret)
1564 return ret; 1567 return ret;
1565 } 1568 }
@@ -1567,15 +1570,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1567 return 0; 1570 return 0;
1568} 1571}
1569 1572
1573static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1574 int sync, void *key)
1575{
1576 struct wait_opts *wo = container_of(wait, struct wait_opts,
1577 child_wait);
1578 struct task_struct *p = key;
1579
1580 if (!eligible_pid(wo, p))
1581 return 0;
1582
1583 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1584 return 0;
1585
1586 return default_wake_function(wait, mode, sync, key);
1587}
1588
1589void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1590{
1591 __wake_up_sync_key(&parent->signal->wait_chldexit,
1592 TASK_INTERRUPTIBLE, 1, p);
1593}
1594
1570static long do_wait(struct wait_opts *wo) 1595static long do_wait(struct wait_opts *wo)
1571{ 1596{
1572 DECLARE_WAITQUEUE(wait, current);
1573 struct task_struct *tsk; 1597 struct task_struct *tsk;
1574 int retval; 1598 int retval;
1575 1599
1576 trace_sched_process_wait(wo->wo_pid); 1600 trace_sched_process_wait(wo->wo_pid);
1577 1601
1578 add_wait_queue(&current->signal->wait_chldexit,&wait); 1602 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1603 wo->child_wait.private = current;
1604 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1579repeat: 1605repeat:
1580 /* 1606 /*
1581 * If there is nothing that can match our critiera just get out. 1607 * If there is nothing that can match our critiera just get out.
@@ -1616,32 +1642,7 @@ notask:
1616 } 1642 }
1617end: 1643end:
1618 __set_current_state(TASK_RUNNING); 1644 __set_current_state(TASK_RUNNING);
1619 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1645 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1620 if (wo->wo_info) {
1621 struct siginfo __user *infop = wo->wo_info;
1622
1623 if (retval > 0)
1624 retval = 0;
1625 else {
1626 /*
1627 * For a WNOHANG return, clear out all the fields
1628 * we would set so the user can easily tell the
1629 * difference.
1630 */
1631 if (!retval)
1632 retval = put_user(0, &infop->si_signo);
1633 if (!retval)
1634 retval = put_user(0, &infop->si_errno);
1635 if (!retval)
1636 retval = put_user(0, &infop->si_code);
1637 if (!retval)
1638 retval = put_user(0, &infop->si_pid);
1639 if (!retval)
1640 retval = put_user(0, &infop->si_uid);
1641 if (!retval)
1642 retval = put_user(0, &infop->si_status);
1643 }
1644 }
1645 return retval; 1646 return retval;
1646} 1647}
1647 1648
@@ -1686,6 +1687,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1686 wo.wo_stat = NULL; 1687 wo.wo_stat = NULL;
1687 wo.wo_rusage = ru; 1688 wo.wo_rusage = ru;
1688 ret = do_wait(&wo); 1689 ret = do_wait(&wo);
1690
1691 if (ret > 0) {
1692 ret = 0;
1693 } else if (infop) {
1694 /*
1695 * For a WNOHANG return, clear out all the fields
1696 * we would set so the user can easily tell the
1697 * difference.
1698 */
1699 if (!ret)
1700 ret = put_user(0, &infop->si_signo);
1701 if (!ret)
1702 ret = put_user(0, &infop->si_errno);
1703 if (!ret)
1704 ret = put_user(0, &infop->si_code);
1705 if (!ret)
1706 ret = put_user(0, &infop->si_pid);
1707 if (!ret)
1708 ret = put_user(0, &infop->si_uid);
1709 if (!ret)
1710 ret = put_user(0, &infop->si_status);
1711 }
1712
1689 put_pid(pid); 1713 put_pid(pid);
1690 1714
1691 /* avoid REGPARM breakage on x86: */ 1715 /* avoid REGPARM breakage on x86: */
diff --git a/kernel/fork.c b/kernel/fork.c
index bfee931ee3fb..4c20fff8c13a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/ftrace.h> 49#include <linux/ftrace.h>
50#include <linux/profile.h> 50#include <linux/profile.h>
51#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
53#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
54#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -61,7 +62,8 @@
61#include <linux/blkdev.h> 62#include <linux/blkdev.h>
62#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
63#include <linux/magic.h> 64#include <linux/magic.h>
64#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h>
65 67
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
136/* SLAB cache for mm_struct structures (tsk->mm) */ 138/* SLAB cache for mm_struct structures (tsk->mm) */
137static struct kmem_cache *mm_cachep; 139static struct kmem_cache *mm_cachep;
138 140
141static void account_kernel_stack(struct thread_info *ti, int account)
142{
143 struct zone *zone = page_zone(virt_to_page(ti));
144
145 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
146}
147
139void free_task(struct task_struct *tsk) 148void free_task(struct task_struct *tsk)
140{ 149{
141 prop_local_destroy_single(&tsk->dirties); 150 prop_local_destroy_single(&tsk->dirties);
151 account_kernel_stack(tsk->stack, -1);
142 free_thread_info(tsk->stack); 152 free_thread_info(tsk->stack);
143 rt_mutex_debug_task_free(tsk); 153 rt_mutex_debug_task_free(tsk);
144 ftrace_graph_exit_task(tsk); 154 ftrace_graph_exit_task(tsk);
@@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
253 tsk->btrace_seq = 0; 263 tsk->btrace_seq = 0;
254#endif 264#endif
255 tsk->splice_pipe = NULL; 265 tsk->splice_pipe = NULL;
266
267 account_kernel_stack(ti, 1);
268
256 return tsk; 269 return tsk;
257 270
258out: 271out:
@@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
288 rb_link = &mm->mm_rb.rb_node; 301 rb_link = &mm->mm_rb.rb_node;
289 rb_parent = NULL; 302 rb_parent = NULL;
290 pprev = &mm->mmap; 303 pprev = &mm->mmap;
304 retval = ksm_fork(mm, oldmm);
305 if (retval)
306 goto out;
291 307
292 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 308 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
293 struct file *file; 309 struct file *file;
@@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
418 434
419#include <linux/init_task.h> 435#include <linux/init_task.h>
420 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
421static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
422{ 446{
423 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
424 atomic_set(&mm->mm_count, 1); 448 atomic_set(&mm->mm_count, 1);
425 init_rwsem(&mm->mmap_sem); 449 init_rwsem(&mm->mmap_sem);
426 INIT_LIST_HEAD(&mm->mmlist); 450 INIT_LIST_HEAD(&mm->mmlist);
427 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 451 mm->flags = (current->mm) ?
452 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
428 mm->core_state = NULL; 453 mm->core_state = NULL;
429 mm->nr_ptes = 0; 454 mm->nr_ptes = 0;
430 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
431 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
432 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
433 spin_lock_init(&mm->ioctx_lock);
434 INIT_HLIST_HEAD(&mm->ioctx_list);
435 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
436 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
437 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
438 462
439 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm)
485 509
486 if (atomic_dec_and_test(&mm->mm_users)) { 510 if (atomic_dec_and_test(&mm->mm_users)) {
487 exit_aio(mm); 511 exit_aio(mm);
512 ksm_exit(mm);
488 exit_mmap(mm); 513 exit_mmap(mm);
489 set_mm_exe_file(mm, NULL); 514 set_mm_exe_file(mm, NULL);
490 if (!list_empty(&mm->mmlist)) { 515 if (!list_empty(&mm->mmlist)) {
@@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm)
493 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
494 } 519 }
495 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
496 mmdrop(mm); 523 mmdrop(mm);
497 } 524 }
498} 525}
@@ -543,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
543 570
544 /* Get rid of any futexes when releasing the mm */ 571 /* Get rid of any futexes when releasing the mm */
545#ifdef CONFIG_FUTEX 572#ifdef CONFIG_FUTEX
546 if (unlikely(tsk->robust_list)) 573 if (unlikely(tsk->robust_list)) {
547 exit_robust_list(tsk); 574 exit_robust_list(tsk);
575 tsk->robust_list = NULL;
576 }
548#ifdef CONFIG_COMPAT 577#ifdef CONFIG_COMPAT
549 if (unlikely(tsk->compat_robust_list)) 578 if (unlikely(tsk->compat_robust_list)) {
550 compat_exit_robust_list(tsk); 579 compat_exit_robust_list(tsk);
580 tsk->compat_robust_list = NULL;
581 }
551#endif 582#endif
583 if (unlikely(!list_empty(&tsk->pi_state_list)))
584 exit_pi_state_list(tsk);
552#endif 585#endif
553 586
554 /* Get rid of any cached register state */ 587 /* Get rid of any cached register state */
@@ -618,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
618 mm->hiwater_rss = get_mm_rss(mm); 651 mm->hiwater_rss = get_mm_rss(mm);
619 mm->hiwater_vm = mm->total_vm; 652 mm->hiwater_vm = mm->total_vm;
620 653
654 if (mm->binfmt && !try_module_get(mm->binfmt->module))
655 goto free_pt;
656
621 return mm; 657 return mm;
622 658
623free_pt: 659free_pt:
660 /* don't put binfmt in mmput, we haven't got module yet */
661 mm->binfmt = NULL;
624 mmput(mm); 662 mmput(mm);
625 663
626fail_nomem: 664fail_nomem:
@@ -788,10 +826,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
788 thread_group_cputime_init(sig); 826 thread_group_cputime_init(sig);
789 827
790 /* Expiration times and increments. */ 828 /* Expiration times and increments. */
791 sig->it_virt_expires = cputime_zero; 829 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
792 sig->it_virt_incr = cputime_zero; 830 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
793 sig->it_prof_expires = cputime_zero; 831 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
794 sig->it_prof_incr = cputime_zero; 832 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
795 833
796 /* Cached expiration times. */ 834 /* Cached expiration times. */
797 sig->cputime_expires.prof_exp = cputime_zero; 835 sig->cputime_expires.prof_exp = cputime_zero;
@@ -849,6 +887,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
849 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
850 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
851 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
890 sig->maxrss = sig->cmaxrss = 0;
852 task_io_accounting_init(&sig->ioac); 891 task_io_accounting_init(&sig->ioac);
853 sig->sum_sched_runtime = 0; 892 sig->sum_sched_runtime = 0;
854 taskstats_tgid_init(sig); 893 taskstats_tgid_init(sig);
@@ -863,6 +902,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
863 902
864 tty_audit_fork(sig); 903 tty_audit_fork(sig);
865 904
905 sig->oom_adj = current->signal->oom_adj;
906
866 return 0; 907 return 0;
867} 908}
868 909
@@ -958,6 +999,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
958 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 999 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
959 return ERR_PTR(-EINVAL); 1000 return ERR_PTR(-EINVAL);
960 1001
1002 /*
1003 * Siblings of global init remain as zombies on exit since they are
1004 * not reaped by their parent (swapper). To solve this and to avoid
1005 * multi-rooted process trees, prevent global and container-inits
1006 * from creating siblings.
1007 */
1008 if ((clone_flags & CLONE_PARENT) &&
1009 current->signal->flags & SIGNAL_UNKILLABLE)
1010 return ERR_PTR(-EINVAL);
1011
961 retval = security_task_create(clone_flags); 1012 retval = security_task_create(clone_flags);
962 if (retval) 1013 if (retval)
963 goto fork_out; 1014 goto fork_out;
@@ -999,9 +1050,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
999 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1050 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1000 goto bad_fork_cleanup_count; 1051 goto bad_fork_cleanup_count;
1001 1052
1002 if (p->binfmt && !try_module_get(p->binfmt->module))
1003 goto bad_fork_cleanup_put_domain;
1004
1005 p->did_exec = 0; 1053 p->did_exec = 0;
1006 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1054 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1007 copy_flags(clone_flags, p); 1055 copy_flags(clone_flags, p);
@@ -1075,10 +1123,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 1123
1076 p->bts = NULL; 1124 p->bts = NULL;
1077 1125
1126 p->stack_start = stack_start;
1127
1078 /* Perform scheduler related setup. Assign this task to a CPU. */ 1128 /* Perform scheduler related setup. Assign this task to a CPU. */
1079 sched_fork(p, clone_flags); 1129 sched_fork(p, clone_flags);
1080 1130
1081 retval = perf_counter_init_task(p); 1131 retval = perf_event_init_task(p);
1082 if (retval) 1132 if (retval)
1083 goto bad_fork_cleanup_policy; 1133 goto bad_fork_cleanup_policy;
1084 1134
@@ -1253,7 +1303,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1253 write_unlock_irq(&tasklist_lock); 1303 write_unlock_irq(&tasklist_lock);
1254 proc_fork_connector(p); 1304 proc_fork_connector(p);
1255 cgroup_post_fork(p); 1305 cgroup_post_fork(p);
1256 perf_counter_fork(p); 1306 perf_event_fork(p);
1257 return p; 1307 return p;
1258 1308
1259bad_fork_free_pid: 1309bad_fork_free_pid:
@@ -1280,16 +1330,13 @@ bad_fork_cleanup_semundo:
1280bad_fork_cleanup_audit: 1330bad_fork_cleanup_audit:
1281 audit_free(p); 1331 audit_free(p);
1282bad_fork_cleanup_policy: 1332bad_fork_cleanup_policy:
1283 perf_counter_free_task(p); 1333 perf_event_free_task(p);
1284#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
1285 mpol_put(p->mempolicy); 1335 mpol_put(p->mempolicy);
1286bad_fork_cleanup_cgroup: 1336bad_fork_cleanup_cgroup:
1287#endif 1337#endif
1288 cgroup_exit(p, cgroup_callbacks_done); 1338 cgroup_exit(p, cgroup_callbacks_done);
1289 delayacct_tsk_free(p); 1339 delayacct_tsk_free(p);
1290 if (p->binfmt)
1291 module_put(p->binfmt->module);
1292bad_fork_cleanup_put_domain:
1293 module_put(task_thread_info(p)->exec_domain->module); 1340 module_put(task_thread_info(p)->exec_domain->module);
1294bad_fork_cleanup_count: 1341bad_fork_cleanup_count:
1295 atomic_dec(&p->cred->user->processes); 1342 atomic_dec(&p->cred->user->processes);
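Taken together, the fork.c hunks move ownership of the binfmt module reference from the task to the mm: copy_process() no longer takes it, dup_mm() takes its own reference (bailing out to free_pt with binfmt cleared if that fails), and mmput() drops it. A toy userspace analogue of that ownership pattern; every type and name below is invented for the sketch:

#include <stdio.h>
#include <stdlib.h>

struct module { int refs; };
struct binfmt { struct module *owner; };
struct mm     { struct binfmt *binfmt; };

static int try_module_get(struct module *m) { if (!m) return 0; m->refs++; return 1; }
static void module_put(struct module *m)    { if (m) m->refs--; }

static struct mm *dup_mm(const struct mm *oldmm)
{
        struct mm *mm = malloc(sizeof(*mm));

        if (!mm)
                return NULL;
        *mm = *oldmm;
        /* mirrors: if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; */
        if (mm->binfmt && !try_module_get(mm->binfmt->owner)) {
                mm->binfmt = NULL;      /* "don't put binfmt in mmput" */
                free(mm);
                return NULL;
        }
        return mm;
}

static void mmput(struct mm *mm)
{
        if (mm->binfmt)                 /* mirrors the new put in mmput() */
                module_put(mm->binfmt->owner);
        free(mm);
}

int main(void)
{
        struct module mod = { .refs = 1 };      /* 1 = the parent mm's reference */
        struct binfmt fmt = { .owner = &mod };
        struct mm parent  = { .binfmt = &fmt };
        struct mm *child  = dup_mm(&parent);

        if (!child)
                return 1;
        printf("refs after dup_mm: %d\n", mod.refs);    /* 2 */
        mmput(child);
        printf("refs after mmput:  %d\n", mod.refs);    /* 1 */
        return 0;
}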
diff --git a/kernel/futex.c b/kernel/futex.c
index 248dd119a86e..4949d336d88d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,36 +89,36 @@ struct futex_pi_state {
89 union futex_key key; 89 union futex_key key;
90}; 90};
91 91
92/* 92/**
93 * We use this hashed waitqueue instead of a normal wait_queue_t, so 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on
97 * @pi_state: optional priority inheritance state
98 * @rt_waiter: rt_waiter storage for use with requeue_pi
99 * @requeue_pi_key: the requeue_pi target futex key
100 * @bitset: bitset for the optional bitmasked wakeup
101 *
102 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
94 * we can wake only the relevant ones (hashed queues may be shared). 103 * we can wake only the relevant ones (hashed queues may be shared).
95 * 104 *
96 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 105 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
97 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
98 * The order of wakup is always to make the first condition true, then 107 * The order of wakup is always to make the first condition true, then
99 * wake up q->waiter, then make the second condition true. 108 * the second.
109 *
110 * PI futexes are typically woken before they are removed from the hash list via
111 * the rt_mutex code. See unqueue_me_pi().
100 */ 112 */
101struct futex_q { 113struct futex_q {
102 struct plist_node list; 114 struct plist_node list;
103 /* Waiter reference */
104 struct task_struct *task;
105 115
106 /* Which hash list lock to use: */ 116 struct task_struct *task;
107 spinlock_t *lock_ptr; 117 spinlock_t *lock_ptr;
108
109 /* Key which the futex is hashed on: */
110 union futex_key key; 118 union futex_key key;
111
112 /* Optional priority inheritance state: */
113 struct futex_pi_state *pi_state; 119 struct futex_pi_state *pi_state;
114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 120 struct rt_mutex_waiter *rt_waiter;
117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key; 121 union futex_key *requeue_pi_key;
120
121 /* Bitset for the optional bitmasked wakeup */
122 u32 bitset; 122 u32 bitset;
123}; 123};
124 124
@@ -198,11 +198,12 @@ static void drop_futex_key_refs(union futex_key *key)
198} 198}
199 199
200/** 200/**
201 * get_futex_key - Get parameters which are the keys for a futex. 201 * get_futex_key() - Get parameters which are the keys for a futex
202 * @uaddr: virtual address of the futex 202 * @uaddr: virtual address of the futex
203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
204 * @key: address where result is stored. 204 * @key: address where result is stored.
205 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) 205 * @rw: mapping needs to be read/write (values: VERIFY_READ,
206 * VERIFY_WRITE)
206 * 207 *
207 * Returns a negative error code or 0 208 * Returns a negative error code or 0
208 * The key words are stored in *key on success. 209 * The key words are stored in *key on success.
@@ -288,8 +289,8 @@ void put_futex_key(int fshared, union futex_key *key)
288 drop_futex_key_refs(key); 289 drop_futex_key_refs(key);
289} 290}
290 291
291/* 292/**
292 * fault_in_user_writeable - fault in user address and verify RW access 293 * fault_in_user_writeable() - Fault in user address and verify RW access
293 * @uaddr: pointer to faulting user space address 294 * @uaddr: pointer to faulting user space address
294 * 295 *
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
@@ -309,8 +310,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
309 310
310/** 311/**
311 * futex_top_waiter() - Return the highest priority waiter on a futex 312 * futex_top_waiter() - Return the highest priority waiter on a futex
312 * @hb: the hash bucket the futex_q's reside in 313 * @hb: the hash bucket the futex_q's reside in
313 * @key: the futex key (to distinguish it from other futex futex_q's) 314 * @key: the futex key (to distinguish it from other futex futex_q's)
314 * 315 *
315 * Must be called with the hb lock held. 316 * Must be called with the hb lock held.
316 */ 317 */
@@ -588,7 +589,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
588} 589}
589 590
590/** 591/**
591 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex 592 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
592 * @uaddr: the pi futex user address 593 * @uaddr: the pi futex user address
593 * @hb: the pi futex hash bucket 594 * @hb: the pi futex hash bucket
594 * @key: the futex key associated with uaddr and hb 595 * @key: the futex key associated with uaddr and hb
@@ -915,8 +916,8 @@ retry:
915 hb1 = hash_futex(&key1); 916 hb1 = hash_futex(&key1);
916 hb2 = hash_futex(&key2); 917 hb2 = hash_futex(&key2);
917 918
918 double_lock_hb(hb1, hb2);
919retry_private: 919retry_private:
920 double_lock_hb(hb1, hb2);
920 op_ret = futex_atomic_op_inuser(op, uaddr2); 921 op_ret = futex_atomic_op_inuser(op, uaddr2);
921 if (unlikely(op_ret < 0)) { 922 if (unlikely(op_ret < 0)) {
922 923
@@ -1011,9 +1012,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1011 1012
1012/** 1013/**
1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1014 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1014 * q: the futex_q 1015 * @q: the futex_q
1015 * key: the key of the requeue target futex 1016 * @key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex 1017 * @hb: the hash_bucket of the requeue target futex
1017 * 1018 *
1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1019 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1020 * target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1350,6 +1351,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1350 return hb; 1351 return hb;
1351} 1352}
1352 1353
1354static inline void
1355queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1356{
1357 spin_unlock(&hb->lock);
1358 drop_futex_key_refs(&q->key);
1359}
1360
1361/**
1362 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1363 * @q: The futex_q to enqueue
1364 * @hb: The destination hash bucket
1365 *
1366 * The hb->lock must be held by the caller, and is released here. A call to
1367 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1368 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1369 * or nothing if the unqueue is done as part of the wake process and the unqueue
1370 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1371 * an example).
1372 */
1353static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1373static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1354{ 1374{
1355 int prio; 1375 int prio;
@@ -1373,19 +1393,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1373 spin_unlock(&hb->lock); 1393 spin_unlock(&hb->lock);
1374} 1394}
1375 1395
1376static inline void 1396/**
1377queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1397 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1378{ 1398 * @q: The futex_q to unqueue
1379 spin_unlock(&hb->lock); 1399 *
1380 drop_futex_key_refs(&q->key); 1400 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1381} 1401 * be paired with exactly one earlier call to queue_me().
1382 1402 *
1383/* 1403 * Returns:
1384 * queue_me and unqueue_me must be called as a pair, each 1404 * 1 - if the futex_q was still queued (and we removed unqueued it)
1385 * exactly once. They are called with the hashed spinlock held. 1405 * 0 - if the futex_q was already removed by the waking thread
1386 */ 1406 */
1387
1388/* Return 1 if we were still queued (ie. 0 means we were woken) */
1389static int unqueue_me(struct futex_q *q) 1407static int unqueue_me(struct futex_q *q)
1390{ 1408{
1391 spinlock_t *lock_ptr; 1409 spinlock_t *lock_ptr;
@@ -1638,17 +1656,14 @@ out:
1638static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1656static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1639 struct hrtimer_sleeper *timeout) 1657 struct hrtimer_sleeper *timeout)
1640{ 1658{
1641 queue_me(q, hb);
1642
1643 /* 1659 /*
1644 * There might have been scheduling since the queue_me(), as we 1660 * The task state is guaranteed to be set before another task can
1645 * cannot hold a spinlock across the get_user() in case it 1661 * wake it. set_current_state() is implemented using set_mb() and
1646 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1662 * queue_me() calls spin_unlock() upon completion, both serializing
1647 * queueing ourselves into the futex hash. This code thus has to 1663 * access to the hash list and forcing another memory barrier.
1648 * rely on the futex_wake() code removing us from hash when it
1649 * wakes us up.
1650 */ 1664 */
1651 set_current_state(TASK_INTERRUPTIBLE); 1665 set_current_state(TASK_INTERRUPTIBLE);
1666 queue_me(q, hb);
1652 1667
1653 /* Arm the timer */ 1668 /* Arm the timer */
1654 if (timeout) { 1669 if (timeout) {
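The rewritten futex_wait_queue_me() above sets TASK_INTERRUPTIBLE before calling queue_me(), relying on the set_mb() inside set_current_state() and the spin_unlock() at the end of queue_me() for ordering, instead of depending on futex_wake() to dequeue a waiter that raced with the state change. A loose userspace analogue of that ordering argument, built on C11 atomics; the names and the yield-based "schedule" are invented for the sketch:

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int queued;       /* "on the futex hash list" */
static atomic_int sleeping;     /* "task state is TASK_INTERRUPTIBLE" */

static void *waiter(void *arg)
{
        (void)arg;
        atomic_store(&sleeping, 1);     /* set_current_state(): state first... */
        atomic_store(&queued, 1);       /* ...then queue_me() publishes us */
        while (atomic_load(&sleeping))  /* schedule(): sleep only while marked */
                sched_yield();
        puts("waiter: woken, wakeup not lost");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);

        /* waker: once the waiter is visible on the "list", the state store
         * below cannot be overwritten, because the waiter set it earlier. */
        while (!atomic_load(&queued))
                sched_yield();
        atomic_store(&sleeping, 0);     /* wake_up analogue */

        pthread_join(t, NULL);
        return 0;
}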
@@ -1658,8 +1673,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1658 } 1673 }
1659 1674
1660 /* 1675 /*
1661 * !plist_node_empty() is safe here without any lock. 1676 * If we have been removed from the hash list, then another task
1662 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1677 * has tried to wake us, and we can skip the call to schedule().
1663 */ 1678 */
1664 if (likely(!plist_node_empty(&q->list))) { 1679 if (likely(!plist_node_empty(&q->list))) {
1665 /* 1680 /*
@@ -2102,7 +2117,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2102 * Unqueue the futex_q and determine which it was. 2117 * Unqueue the futex_q and determine which it was.
2103 */ 2118 */
2104 plist_del(&q->list, &q->list.plist); 2119 plist_del(&q->list, &q->list.plist);
2105 drop_futex_key_refs(&q->key);
2106 2120
2107 if (timeout && !timeout->task) 2121 if (timeout && !timeout->task)
2108 ret = -ETIMEDOUT; 2122 ret = -ETIMEDOUT;
@@ -2114,12 +2128,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2114 2128
2115/** 2129/**
2116 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2130 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2117 * @uaddr: the futex we initialyl wait on (non-pi) 2131 * @uaddr: the futex we initially wait on (non-pi)
2118 * @fshared: whether the futexes are shared (1) or not (0). They must be 2132 * @fshared: whether the futexes are shared (1) or not (0). They must be
2119 * the same type, no requeueing from private to shared, etc. 2133 * the same type, no requeueing from private to shared, etc.
2120 * @val: the expected value of uaddr 2134 * @val: the expected value of uaddr
2121 * @abs_time: absolute timeout 2135 * @abs_time: absolute timeout
2122 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. 2136 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2123 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) 2137 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2124 * @uaddr2: the pi futex we will take prior to returning to user-space 2138 * @uaddr2: the pi futex we will take prior to returning to user-space
2125 * 2139 *
@@ -2246,7 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2246 res = fixup_owner(uaddr2, fshared, &q, !ret); 2260 res = fixup_owner(uaddr2, fshared, &q, !ret);
2247 /* 2261 /*
2248 * If fixup_owner() returned an error, proprogate that. If it 2262 * If fixup_owner() returned an error, proprogate that. If it
2249 * acquired the lock, clear our -ETIMEDOUT or -EINTR. 2263 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2250 */ 2264 */
2251 if (res) 2265 if (res)
2252 ret = (res < 0) ? res : 0; 2266 ret = (res < 0) ? res : 0;
@@ -2302,9 +2316,9 @@ out:
2302 */ 2316 */
2303 2317
2304/** 2318/**
2305 * sys_set_robust_list - set the robust-futex list head of a task 2319 * sys_set_robust_list() - Set the robust-futex list head of a task
2306 * @head: pointer to the list-head 2320 * @head: pointer to the list-head
2307 * @len: length of the list-head, as userspace expects 2321 * @len: length of the list-head, as userspace expects
2308 */ 2322 */
2309SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2323SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2310 size_t, len) 2324 size_t, len)
@@ -2323,10 +2337,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2323} 2337}
2324 2338
2325/** 2339/**
2326 * sys_get_robust_list - get the robust-futex list head of a task 2340 * sys_get_robust_list() - Get the robust-futex list head of a task
2327 * @pid: pid of the process [zero for current task] 2341 * @pid: pid of the process [zero for current task]
2328 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2342 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2329 * @len_ptr: pointer to a length field, the kernel fills in the header size 2343 * @len_ptr: pointer to a length field, the kernel fills in the header size
2330 */ 2344 */
2331SYSCALL_DEFINE3(get_robust_list, int, pid, 2345SYSCALL_DEFINE3(get_robust_list, int, pid,
2332 struct robust_list_head __user * __user *, head_ptr, 2346 struct robust_list_head __user * __user *, head_ptr,
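The last two hunks only retouch the kernel-doc for sys_set_robust_list() and sys_get_robust_list(). For reference, the documented interface can be exercised directly from userspace (Linux only; this replaces the head glibc registered for the thread, which is harmless here because the program uses no robust mutexes and exits immediately):

#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct robust_list_head head, *out = NULL;
        size_t len = 0;

        memset(&head, 0, sizeof(head));
        head.list.next = &head.list;            /* empty circular list */

        if (syscall(SYS_set_robust_list, &head, sizeof(head)) != 0) {
                perror("set_robust_list");
                return 1;
        }
        if (syscall(SYS_get_robust_list, 0 /* current */, &out, &len) != 0) {
                perror("get_robust_list");
                return 1;
        }
        printf("robust list head %p (expected %p), header size %zu\n",
               (void *)out, (void *)&head, len);
        return 0;
}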
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..70a298d6da71 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 05071bf6a37b..3e1c36e7998f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,36 +48,7 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/** 51#include <trace/events/timer.h>
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81 52
82/* 53/*
83 * The timer bases: 54 * The timer bases:
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 77 }
107}; 78};
108 79
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 80/*
135 * Get the coarse grained time at the softirq based on xtime and 81 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 82 * wall_to_monotonic.
@@ -498,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
498static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 444static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
499#endif 445#endif
500 446
447static inline void
448debug_init(struct hrtimer *timer, clockid_t clockid,
449 enum hrtimer_mode mode)
450{
451 debug_hrtimer_init(timer);
452 trace_hrtimer_init(timer, clockid, mode);
453}
454
455static inline void debug_activate(struct hrtimer *timer)
456{
457 debug_hrtimer_activate(timer);
458 trace_hrtimer_start(timer);
459}
460
461static inline void debug_deactivate(struct hrtimer *timer)
462{
463 debug_hrtimer_deactivate(timer);
464 trace_hrtimer_cancel(timer);
465}
466
501/* High resolution timer related functions */ 467/* High resolution timer related functions */
502#ifdef CONFIG_HIGH_RES_TIMERS 468#ifdef CONFIG_HIGH_RES_TIMERS
503 469
@@ -543,13 +509,14 @@ static inline int hrtimer_hres_active(void)
543 * next event 509 * next event
544 * Called with interrupts disabled and base->lock held 510 * Called with interrupts disabled and base->lock held
545 */ 511 */
546static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) 512static void
513hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
547{ 514{
548 int i; 515 int i;
549 struct hrtimer_clock_base *base = cpu_base->clock_base; 516 struct hrtimer_clock_base *base = cpu_base->clock_base;
550 ktime_t expires; 517 ktime_t expires, expires_next;
551 518
552 cpu_base->expires_next.tv64 = KTIME_MAX; 519 expires_next.tv64 = KTIME_MAX;
553 520
554 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 521 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
555 struct hrtimer *timer; 522 struct hrtimer *timer;
@@ -565,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
565 */ 532 */
566 if (expires.tv64 < 0) 533 if (expires.tv64 < 0)
567 expires.tv64 = 0; 534 expires.tv64 = 0;
568 if (expires.tv64 < cpu_base->expires_next.tv64) 535 if (expires.tv64 < expires_next.tv64)
569 cpu_base->expires_next = expires; 536 expires_next = expires;
570 } 537 }
571 538
539 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
540 return;
541
542 cpu_base->expires_next.tv64 = expires_next.tv64;
543
572 if (cpu_base->expires_next.tv64 != KTIME_MAX) 544 if (cpu_base->expires_next.tv64 != KTIME_MAX)
573 tick_program_event(cpu_base->expires_next, 1); 545 tick_program_event(cpu_base->expires_next, 1);
574} 546}
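hrtimer_force_reprogram() now accumulates the earliest expiry in a local and, when skip_equal is set, returns before touching cpu_base->expires_next if nothing changed, so tick_program_event() is not re-issued needlessly; the later hunks pass 0 from retrigger_next_event() and 1 from the timer-removal path. A self-contained toy model of that short-circuit (KTIME_MAX, the array and the counter are stand-ins, not kernel API):

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX INT64_MAX

static int64_t programmed = KTIME_MAX;  /* cpu_base->expires_next stand-in */
static int hw_writes;                   /* tick_program_event() counter */

static void force_reprogram(const int64_t *expiry, int n, int skip_equal)
{
        int64_t expires_next = KTIME_MAX;
        int i;

        for (i = 0; i < n; i++)
                if (expiry[i] < expires_next)
                        expires_next = expiry[i];

        if (skip_equal && expires_next == programmed)
                return;                 /* nothing changed: skip the reprogram */

        programmed = expires_next;
        if (programmed != KTIME_MAX)
                hw_writes++;
}

int main(void)
{
        int64_t timers[] = { 500, 300, 900 };

        force_reprogram(timers, 3, 1);  /* programs 300 */
        force_reprogram(timers, 3, 1);  /* earliest unchanged: skipped */
        timers[1] = 1000;
        force_reprogram(timers, 3, 1);  /* earliest is now 500: reprogrammed */

        printf("programmed=%lld after %d event-device writes\n",
               (long long)programmed, hw_writes);
        return 0;
}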
@@ -651,7 +623,7 @@ static void retrigger_next_event(void *arg)
651 base->clock_base[CLOCK_REALTIME].offset = 623 base->clock_base[CLOCK_REALTIME].offset =
652 timespec_to_ktime(realtime_offset); 624 timespec_to_ktime(realtime_offset);
653 625
654 hrtimer_force_reprogram(base); 626 hrtimer_force_reprogram(base, 0);
655 spin_unlock(&base->lock); 627 spin_unlock(&base->lock);
656} 628}
657 629
@@ -754,8 +726,6 @@ static int hrtimer_switch_to_hres(void)
754 /* "Retrigger" the interrupt to get things going */ 726 /* "Retrigger" the interrupt to get things going */
755 retrigger_next_event(NULL); 727 retrigger_next_event(NULL);
756 local_irq_restore(flags); 728 local_irq_restore(flags);
757 printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
758 smp_processor_id());
759 return 1; 729 return 1;
760} 730}
761 731
@@ -764,7 +734,8 @@ static int hrtimer_switch_to_hres(void)
764static inline int hrtimer_hres_active(void) { return 0; } 734static inline int hrtimer_hres_active(void) { return 0; }
765static inline int hrtimer_is_hres_enabled(void) { return 0; } 735static inline int hrtimer_is_hres_enabled(void) { return 0; }
766static inline int hrtimer_switch_to_hres(void) { return 0; } 736static inline int hrtimer_switch_to_hres(void) { return 0; }
767static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 737static inline void
738hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
768static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 739static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
769 struct hrtimer_clock_base *base, 740 struct hrtimer_clock_base *base,
770 int wakeup) 741 int wakeup)
@@ -854,7 +825,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
854 struct hrtimer *entry; 825 struct hrtimer *entry;
855 int leftmost = 1; 826 int leftmost = 1;
856 827
857 debug_hrtimer_activate(timer); 828 debug_activate(timer);
858 829
859 /* 830 /*
860 * Find the right place in the rbtree: 831 * Find the right place in the rbtree:
@@ -907,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
907 struct hrtimer_clock_base *base, 878 struct hrtimer_clock_base *base,
908 unsigned long newstate, int reprogram) 879 unsigned long newstate, int reprogram)
909{ 880{
910 if (timer->state & HRTIMER_STATE_ENQUEUED) { 881 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
911 /* 882 goto out;
912 * Remove the timer from the rbtree and replace the 883
913 * first entry pointer if necessary. 884 /*
914 */ 885 * Remove the timer from the rbtree and replace the first
915 if (base->first == &timer->node) { 886 * entry pointer if necessary.
916 base->first = rb_next(&timer->node); 887 */
917 /* Reprogram the clock event device. if enabled */ 888 if (base->first == &timer->node) {
918 if (reprogram && hrtimer_hres_active()) 889 base->first = rb_next(&timer->node);
919 hrtimer_force_reprogram(base->cpu_base); 890#ifdef CONFIG_HIGH_RES_TIMERS
891 /* Reprogram the clock event device. if enabled */
892 if (reprogram && hrtimer_hres_active()) {
893 ktime_t expires;
894
895 expires = ktime_sub(hrtimer_get_expires(timer),
896 base->offset);
897 if (base->cpu_base->expires_next.tv64 == expires.tv64)
898 hrtimer_force_reprogram(base->cpu_base, 1);
920 } 899 }
921 rb_erase(&timer->node, &base->active); 900#endif
922 } 901 }
902 rb_erase(&timer->node, &base->active);
903out:
923 timer->state = newstate; 904 timer->state = newstate;
924} 905}
925 906
@@ -940,7 +921,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
940 * reprogramming happens in the interrupt handler. This is a 921 * reprogramming happens in the interrupt handler. This is a
941 * rare case and less expensive than a smp call. 922 * rare case and less expensive than a smp call.
942 */ 923 */
943 debug_hrtimer_deactivate(timer); 924 debug_deactivate(timer);
944 timer_stats_hrtimer_clear_start_info(timer); 925 timer_stats_hrtimer_clear_start_info(timer);
945 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 926 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
946 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 927 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -1155,7 +1136,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1155 clock_id = CLOCK_MONOTONIC; 1136 clock_id = CLOCK_MONOTONIC;
1156 1137
1157 timer->base = &cpu_base->clock_base[clock_id]; 1138 timer->base = &cpu_base->clock_base[clock_id];
1158 INIT_LIST_HEAD(&timer->cb_entry);
1159 hrtimer_init_timer_hres(timer); 1139 hrtimer_init_timer_hres(timer);
1160 1140
1161#ifdef CONFIG_TIMER_STATS 1141#ifdef CONFIG_TIMER_STATS
@@ -1174,7 +1154,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1174void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1154void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1175 enum hrtimer_mode mode) 1155 enum hrtimer_mode mode)
1176{ 1156{
1177 debug_hrtimer_init(timer); 1157 debug_init(timer, clock_id, mode);
1178 __hrtimer_init(timer, clock_id, mode); 1158 __hrtimer_init(timer, clock_id, mode);
1179} 1159}
1180EXPORT_SYMBOL_GPL(hrtimer_init); 1160EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -1198,7 +1178,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1198} 1178}
1199EXPORT_SYMBOL_GPL(hrtimer_get_res); 1179EXPORT_SYMBOL_GPL(hrtimer_get_res);
1200 1180
1201static void __run_hrtimer(struct hrtimer *timer) 1181static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1202{ 1182{
1203 struct hrtimer_clock_base *base = timer->base; 1183 struct hrtimer_clock_base *base = timer->base;
1204 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1184 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
@@ -1207,7 +1187,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1207 1187
1208 WARN_ON(!irqs_disabled()); 1188 WARN_ON(!irqs_disabled());
1209 1189
1210 debug_hrtimer_deactivate(timer); 1190 debug_deactivate(timer);
1211 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1191 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1212 timer_stats_account_hrtimer(timer); 1192 timer_stats_account_hrtimer(timer);
1213 fn = timer->function; 1193 fn = timer->function;
@@ -1218,7 +1198,9 @@ static void __run_hrtimer(struct hrtimer *timer)
1218 * the timer base. 1198 * the timer base.
1219 */ 1199 */
1220 spin_unlock(&cpu_base->lock); 1200 spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now);
1221 restart = fn(timer); 1202 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer);
1222 spin_lock(&cpu_base->lock); 1204 spin_lock(&cpu_base->lock);
1223 1205
1224 /* 1206 /*
@@ -1329,7 +1311,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1329 break; 1311 break;
1330 } 1312 }
1331 1313
1332 __run_hrtimer(timer); 1314 __run_hrtimer(timer, &basenow);
1333 } 1315 }
1334 base++; 1316 base++;
1335 } 1317 }
@@ -1451,7 +1433,7 @@ void hrtimer_run_queues(void)
1451 hrtimer_get_expires_tv64(timer)) 1433 hrtimer_get_expires_tv64(timer))
1452 break; 1434 break;
1453 1435
1454 __run_hrtimer(timer); 1436 __run_hrtimer(timer, &base->softirq_time);
1455 } 1437 }
1456 spin_unlock(&cpu_base->lock); 1438 spin_unlock(&cpu_base->lock);
1457 } 1439 }
@@ -1628,7 +1610,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1628 while ((node = rb_first(&old_base->active))) { 1610 while ((node = rb_first(&old_base->active))) {
1629 timer = rb_entry(node, struct hrtimer, node); 1611 timer = rb_entry(node, struct hrtimer, node);
1630 BUG_ON(hrtimer_callback_running(timer)); 1612 BUG_ON(hrtimer_callback_running(timer));
1631 debug_hrtimer_deactivate(timer); 1613 debug_deactivate(timer);
1632 1614
1633 /* 1615 /*
1634 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1616 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
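Among the hrtimer.c changes, __run_hrtimer() now receives the expiry time and brackets the handler, which runs with the base lock dropped, between trace_hrtimer_expire_entry() and trace_hrtimer_expire_exit(), so a tracer can attribute latency to the callback itself. A minimal userspace model of that bracketing; the "tracepoints" are printf stubs and every name is invented:

#include <stdio.h>
#include <time.h>

typedef int (*hrtimer_fn)(void *timer);

static void trace_hrtimer_expire_entry(void *timer, const struct timespec *now)
{
        printf("trace: expire_entry timer=%p at %ld.%09ld\n",
               timer, (long)now->tv_sec, now->tv_nsec);
}

static void trace_hrtimer_expire_exit(void *timer)
{
        printf("trace: expire_exit  timer=%p\n", timer);
}

static void run_hrtimer(void *timer, hrtimer_fn fn, const struct timespec *now)
{
        /* base lock is dropped here in the real __run_hrtimer() */
        trace_hrtimer_expire_entry(timer, now);
        fn(timer);
        trace_hrtimer_expire_exit(timer);
        /* ...and re-taken here */
}

static int demo_handler(void *timer)
{
        (void)timer;
        puts("handler ran");
        return 0;       /* HRTIMER_NORESTART analogue */
}

int main(void)
{
        struct timespec now;
        int dummy;

        clock_gettime(CLOCK_MONOTONIC, &now);   /* the "basenow" passed down */
        run_hrtimer(&dummy, demo_handler, &now);
        return 0;
}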
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..d4e841747400 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a81cf80554db..17c71bb565c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/random.h> 17#include <linux/random.h>
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..b03451ede528 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/posix-timers.h> 13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
41 return ktime_to_timeval(rem); 42 return ktime_to_timeval(rem);
42} 43}
43 44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime);
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cputime_le(cval, t))
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cputime_sub(cval, t);
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
44int do_getitimer(int which, struct itimerval *value) 79int do_getitimer(int which, struct itimerval *value)
45{ 80{
46 struct task_struct *tsk = current; 81 struct task_struct *tsk = current;
47 cputime_t cinterval, cval;
48 82
49 switch (which) { 83 switch (which) {
50 case ITIMER_REAL: 84 case ITIMER_REAL:
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 89 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 90 break;
57 case ITIMER_VIRTUAL: 91 case ITIMER_VIRTUAL:
58 spin_lock_irq(&tsk->sighand->siglock); 92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
59 cval = tsk->signal->it_virt_expires;
60 cinterval = tsk->signal->it_virt_incr;
61 if (!cputime_eq(cval, cputime_zero)) {
62 struct task_cputime cputime;
63 cputime_t utime;
64
65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1);
69 } else {
70 cval = cputime_sub(cval, utime);
71 }
72 }
73 spin_unlock_irq(&tsk->sighand->siglock);
74 cputime_to_timeval(cval, &value->it_value);
75 cputime_to_timeval(cinterval, &value->it_interval);
76 break; 93 break;
77 case ITIMER_PROF: 94 case ITIMER_PROF:
78 spin_lock_irq(&tsk->sighand->siglock); 95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
79 cval = tsk->signal->it_prof_expires;
80 cinterval = tsk->signal->it_prof_incr;
81 if (!cputime_eq(cval, cputime_zero)) {
82 struct task_cputime times;
83 cputime_t ptime;
84
85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1);
89 } else {
90 cval = cputime_sub(cval, ptime);
91 }
92 }
93 spin_unlock_irq(&tsk->sighand->siglock);
94 cputime_to_timeval(cval, &value->it_value);
95 cputime_to_timeval(cinterval, &value->it_interval);
96 break; 96 break;
97 default: 97 default:
98 return(-EINVAL); 98 return(-EINVAL);
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
123 struct signal_struct *sig = 123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer); 124 container_of(timer, struct signal_struct, real_timer);
125 125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
126 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); 127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
127 128
128 return HRTIMER_NORESTART; 129 return HRTIMER_NORESTART;
129} 130}
130 131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150
151 nval = timeval_to_cputime(&value->it_value);
152 ns_nval = timeval_to_ns(&value->it_value);
153 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval);
155
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
157 it->error = cputime_sub_ns(nval, ns_nval);
158
159 spin_lock_irq(&tsk->sighand->siglock);
160
161 cval = it->expires;
162 cinterval = it->incr;
163 if (!cputime_eq(cval, cputime_zero) ||
164 !cputime_eq(nval, cputime_zero)) {
165 if (cputime_gt(nval, cputime_zero))
166 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173
174 spin_unlock_irq(&tsk->sighand->siglock);
175
176 if (ovalue) {
177 cputime_to_timeval(cval, &ovalue->it_value);
178 cputime_to_timeval(cinterval, &ovalue->it_interval);
179 }
180}
181
131/* 182/*
132 * Returns true if the timeval is in canonical form 183 * Returns true if the timeval is in canonical form
133 */ 184 */
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
139 struct task_struct *tsk = current; 190 struct task_struct *tsk = current;
140 struct hrtimer *timer; 191 struct hrtimer *timer;
141 ktime_t expires; 192 ktime_t expires;
142 cputime_t cval, cinterval, nval, ninterval;
143 193
144 /* 194 /*
145 * Validate the timevals in value. 195 * Validate the timevals in value.
@@ -171,51 +221,14 @@ again:
171 } else 221 } else
172 tsk->signal->it_real_incr.tv64 = 0; 222 tsk->signal->it_real_incr.tv64 = 0;
173 223
224 trace_itimer_state(ITIMER_REAL, value, 0);
174 spin_unlock_irq(&tsk->sighand->siglock); 225 spin_unlock_irq(&tsk->sighand->siglock);
175 break; 226 break;
176 case ITIMER_VIRTUAL: 227 case ITIMER_VIRTUAL:
177 nval = timeval_to_cputime(&value->it_value); 228 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
178 ninterval = timeval_to_cputime(&value->it_interval);
179 spin_lock_irq(&tsk->sighand->siglock);
180 cval = tsk->signal->it_virt_expires;
181 cinterval = tsk->signal->it_virt_incr;
182 if (!cputime_eq(cval, cputime_zero) ||
183 !cputime_eq(nval, cputime_zero)) {
184 if (cputime_gt(nval, cputime_zero))
185 nval = cputime_add(nval,
186 jiffies_to_cputime(1));
187 set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
188 &nval, &cval);
189 }
190 tsk->signal->it_virt_expires = nval;
191 tsk->signal->it_virt_incr = ninterval;
192 spin_unlock_irq(&tsk->sighand->siglock);
193 if (ovalue) {
194 cputime_to_timeval(cval, &ovalue->it_value);
195 cputime_to_timeval(cinterval, &ovalue->it_interval);
196 }
197 break; 229 break;
198 case ITIMER_PROF: 230 case ITIMER_PROF:
199 nval = timeval_to_cputime(&value->it_value); 231 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
200 ninterval = timeval_to_cputime(&value->it_interval);
201 spin_lock_irq(&tsk->sighand->siglock);
202 cval = tsk->signal->it_prof_expires;
203 cinterval = tsk->signal->it_prof_incr;
204 if (!cputime_eq(cval, cputime_zero) ||
205 !cputime_eq(nval, cputime_zero)) {
206 if (cputime_gt(nval, cputime_zero))
207 nval = cputime_add(nval,
208 jiffies_to_cputime(1));
209 set_process_cpu_timer(tsk, CPUCLOCK_PROF,
210 &nval, &cval);
211 }
212 tsk->signal->it_prof_expires = nval;
213 tsk->signal->it_prof_incr = ninterval;
214 spin_unlock_irq(&tsk->sighand->siglock);
215 if (ovalue) {
216 cputime_to_timeval(cval, &ovalue->it_value);
217 cputime_to_timeval(cinterval, &ovalue->it_interval);
218 }
219 break; 232 break;
220 default: 233 default:
221 return -EINVAL; 234 return -EINVAL;
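The itimer.c diff folds the duplicated ITIMER_VIRTUAL and ITIMER_PROF branches into get_cpu_itimer()/set_cpu_itimer(), keyed by CPUCLOCK_VIRT/CPUCLOCK_PROF and backed by the signal->it[] array seen in the fork.c hunk earlier. The userspace interface is unchanged; a quick sanity check of the setitimer()/getitimer() calls that end up in these helpers (ITIMER_PROF counts CPU time, so the value read back is still close to what was armed):

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
        struct itimerval set = {
                .it_value    = { .tv_sec = 1, .tv_usec = 0 },
                .it_interval = { .tv_sec = 0, .tv_usec = 500000 },
        };
        struct itimerval got, off = { { 0, 0 }, { 0, 0 } };

        if (setitimer(ITIMER_PROF, &set, NULL) != 0) {
                perror("setitimer");
                return 1;
        }
        if (getitimer(ITIMER_PROF, &got) != 0) {
                perror("getitimer");
                return 1;
        }
        printf("ITIMER_PROF: value %ld.%06lds, interval %ld.%06lds\n",
               (long)got.it_value.tv_sec, (long)got.it_value.tv_usec,
               (long)got.it_interval.tv_sec, (long)got.it_interval.tv_usec);

        setitimer(ITIMER_PROF, &off, NULL);     /* disarm before exiting */
        return 0;
}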
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3a29dbe7898e..8b6b8b697c68 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
59 59
60static inline int is_kernel_text(unsigned long addr) 60static inline int is_kernel_text(unsigned long addr)
61{ 61{
62 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 62 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
63 arch_is_kernel_text(addr))
63 return 1; 64 return 1;
64 return in_gate_area_no_task(addr); 65 return in_gate_area_no_task(addr);
65} 66}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ef177d653b2c..5240d75f4c60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1321 return 0; 1321 return 0;
1322} 1322}
1323 1323
1324static struct seq_operations kprobes_seq_ops = { 1324static const struct seq_operations kprobes_seq_ops = {
1325 .start = kprobe_seq_start, 1325 .start = kprobe_seq_start,
1326 .next = kprobe_seq_next, 1326 .next = kprobe_seq_next,
1327 .stop = kprobe_seq_stop, 1327 .stop = kprobe_seq_stop,
@@ -1333,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
1333 return seq_open(filp, &kprobes_seq_ops); 1333 return seq_open(filp, &kprobes_seq_ops);
1334} 1334}
1335 1335
1336static struct file_operations debugfs_kprobes_operations = { 1336static const struct file_operations debugfs_kprobes_operations = {
1337 .open = kprobes_open, 1337 .open = kprobes_open,
1338 .read = seq_read, 1338 .read = seq_read,
1339 .llseek = seq_lseek, 1339 .llseek = seq_lseek,
@@ -1515,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1515 return count; 1515 return count;
1516} 1516}
1517 1517
1518static struct file_operations fops_kp = { 1518static const struct file_operations fops_kp = {
1519 .read = read_enabled_file_bool, 1519 .read = read_enabled_file_bool,
1520 .write = write_enabled_file_bool, 1520 .write = write_enabled_file_bool,
1521}; 1521};
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f74d2d7aa605..9af56723c096 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
144 144
145static inline u64 lockstat_clock(void)
146{
147 return cpu_clock(smp_processor_id());
148}
149
145static int lock_point(unsigned long points[], unsigned long ip) 150static int lock_point(unsigned long points[], unsigned long ip)
146{ 151{
147 int i; 152 int i;
@@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip)
158 return i; 163 return i;
159} 164}
160 165
161static void lock_time_inc(struct lock_time *lt, s64 time) 166static void lock_time_inc(struct lock_time *lt, u64 time)
162{ 167{
163 if (time > lt->max) 168 if (time > lt->max)
164 lt->max = time; 169 lt->max = time;
@@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats)
234static void lock_release_holdtime(struct held_lock *hlock) 239static void lock_release_holdtime(struct held_lock *hlock)
235{ 240{
236 struct lock_class_stats *stats; 241 struct lock_class_stats *stats;
237 s64 holdtime; 242 u64 holdtime;
238 243
239 if (!lock_stat) 244 if (!lock_stat)
240 return; 245 return;
241 246
242 holdtime = sched_clock() - hlock->holdtime_stamp; 247 holdtime = lockstat_clock() - hlock->holdtime_stamp;
243 248
244 stats = get_lock_stats(hlock_class(hlock)); 249 stats = get_lock_stats(hlock_class(hlock));
245 if (hlock->read) 250 if (hlock->read)
@@ -578,6 +583,9 @@ static int static_obj(void *obj)
578 if ((addr >= start) && (addr < end)) 583 if ((addr >= start) && (addr < end))
579 return 1; 584 return 1;
580 585
586 if (arch_is_kernel_data(addr))
587 return 1;
588
581#ifdef CONFIG_SMP 589#ifdef CONFIG_SMP
582 /* 590 /*
583 * percpu var? 591 * percpu var?
@@ -2789,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2789 hlock->references = references; 2797 hlock->references = references;
2790#ifdef CONFIG_LOCK_STAT 2798#ifdef CONFIG_LOCK_STAT
2791 hlock->waittime_stamp = 0; 2799 hlock->waittime_stamp = 0;
2792 hlock->holdtime_stamp = sched_clock(); 2800 hlock->holdtime_stamp = lockstat_clock();
2793#endif 2801#endif
2794 2802
2795 if (check == 2 && !mark_irqflags(curr, hlock)) 2803 if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3319,7 +3327,7 @@ found_it:
3319 if (hlock->instance != lock) 3327 if (hlock->instance != lock)
3320 return; 3328 return;
3321 3329
3322 hlock->waittime_stamp = sched_clock(); 3330 hlock->waittime_stamp = lockstat_clock();
3323 3331
3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3332 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3325 contending_point = lock_point(hlock_class(hlock)->contending_point, 3333 contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3342,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3342 struct held_lock *hlock, *prev_hlock; 3350 struct held_lock *hlock, *prev_hlock;
3343 struct lock_class_stats *stats; 3351 struct lock_class_stats *stats;
3344 unsigned int depth; 3352 unsigned int depth;
3345 u64 now; 3353 u64 now, waittime = 0;
3346 s64 waittime = 0;
3347 int i, cpu; 3354 int i, cpu;
3348 3355
3349 depth = curr->lockdep_depth; 3356 depth = curr->lockdep_depth;
@@ -3371,7 +3378,7 @@ found_it:
3371 3378
3372 cpu = smp_processor_id(); 3379 cpu = smp_processor_id();
3373 if (hlock->waittime_stamp) { 3380 if (hlock->waittime_stamp) {
3374 now = sched_clock(); 3381 now = lockstat_clock();
3375 waittime = now - hlock->waittime_stamp; 3382 waittime = now - hlock->waittime_stamp;
3376 hlock->holdtime_stamp = now; 3383 hlock->holdtime_stamp = now;
3377 } 3384 }
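The lockdep changes route all lock-statistics timestamps through a single lockstat_clock() helper (cpu_clock() on the current CPU) and switch the hold/wait deltas to u64, since the clock is monotonic per CPU and the differences cannot go negative. A userspace analogue of the same pattern using CLOCK_MONOTONIC; names mirror the diff but this is illustrative, not kernel code:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t lockstat_clock(void)    /* one monotonic source, as in the diff */
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

int main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        uint64_t holdtime_stamp, holdtime;

        pthread_mutex_lock(&lock);
        holdtime_stamp = lockstat_clock();      /* hlock->holdtime_stamp */
        /* ... critical section ... */
        holdtime = lockstat_clock() - holdtime_stamp;   /* always >= 0, fits u64 */
        pthread_mutex_unlock(&lock);

        printf("held for %llu ns\n", (unsigned long long)holdtime);
        return 0;
}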
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4b3dbc79fdb..d4aba4f3584c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
594 return 0; 594 return 0;
595} 595}
596 596
597static struct seq_operations lockstat_ops = { 597static const struct seq_operations lockstat_ops = {
598 .start = ls_start, 598 .start = ls_start,
599 .next = ls_next, 599 .next = ls_next,
600 .stop = ls_stop, 600 .stop = ls_stop,
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by module_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * rcu_read_lock_sched does two things : disabling preemption to make
108 * sure the teardown of the callbacks can be done correctly when they
109 * are in modules and they insure RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137 * we must insure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171 /* Must read the ptype before ptr. They are not data dependant,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176 * dependant, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191 * we must insure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
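
get_marker() hashes the name with jhash() and masks the result into a power-of-two bucket array before comparing names within the bucket. The following user-space sketch shows the same bucket-selection arithmetic with a stand-in string hash (the hash function and names here are illustrative only, not the kernel's jhash()):

#include <stdio.h>

#define HASH_BITS  4
#define TABLE_SIZE (1 << HASH_BITS)

static unsigned int str_hash(const char *s)
{
	unsigned int h = 5381;

	while (*s)
		h = h * 33 + (unsigned char)*s++;
	return h;
}

int main(void)
{
	const char *names[] = { "sched_switch", "irq_entry", "block_rq" };
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
		printf("%-12s -> bucket %u\n", names[i],
		       str_hash(names[i]) & (TABLE_SIZE - 1));
	return 0;
}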
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
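
add_marker() packs the entry, its name and its format string into a single kmalloc() block, with e->format pointing just past the name. A simplified user-space sketch of that variable-length layout (the struct and helper names are invented for illustration):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct entry {
	char *format;
	char name[];	/* name, then (optionally) format, in one block */
};

static struct entry *entry_alloc(const char *name, const char *format)
{
	size_t name_len = strlen(name) + 1;
	size_t format_len = format ? strlen(format) + 1 : 0;
	struct entry *e = malloc(sizeof(*e) + name_len + format_len);

	if (!e)
		return NULL;
	memcpy(e->name, name, name_len);
	if (format) {
		/* The format string lives right after the name. */
		e->format = &e->name[name_len];
		memcpy(e->format, format, format_len);
	} else {
		e->format = NULL;
	}
	return e;
}

int main(void)
{
	struct entry *e = entry_alloc("subsys_event", "value %d");

	if (e)
		printf("%s: %s\n", e->name, e->format);
	free(e);
	return 0;
}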
426
427/*
428 * Remove the marker from the marker hash table. Must be called with
429 * markers_mutex held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509	 * Sanity check:
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: waiting only for an RCU grace period after setting elem->call to the
573 * empty function ensures that the original callback is not used anymore. This
574 * is guaranteed by the rcu_read_lock_sched() around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * The internal callback is only changed before the first probe is connected
640 * to it. Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomic update of the callback and private data safe.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Side effect: marker_set_format() may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * The probe private data must be a valid allocated memory address, or NULL.
670 * Returns 0 on success, or an error value on failure.
671 * The probe address must at least be aligned on the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
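
For reference, a typical caller of the API above registers a probe from module init and unregisters it on exit. The sketch below assumes the usual marker_probe_func prototype, void (*)(void *probe_private, void *call_private, const char *fmt, va_list *args), and uses a made-up marker name and format string; treat it as an illustrative sketch rather than code from this tree.

#include <linux/module.h>
#include <linux/marker.h>

/*
 * Hypothetical probe; the prototype is assumed to match marker_probe_func
 * (probe_private, call_private, format string, va_list of arguments).
 */
static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	/* A real probe would consume *args here, e.g. with vsnprintf(). */
}

static int __init example_init(void)
{
	/* "subsys_event" and its format string are made-up example values. */
	return marker_probe_register("subsys_event", "value %d",
				     my_probe, NULL);
}

static void __exit example_exit(void)
{
	marker_probe_unregister("subsys_event", my_probe, NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");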
724
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns 0 on success, or -ENOENT if the marker is not found.
732 * We do not need to call synchronize_sched() to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which ensures that every preempt-disabled section
735 * has finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776	unsigned int i, j;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791				for (j = 0; closure[j].func; j++) {
792					if (closure[j].func == probe &&
793						closure[j].probe_private
794						== probe_private)
795						return entry;
796				}
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first matching marker found in the hash table.
810 * Returns 0 on success or an error value.
811 * We do not need to call synchronize_sched() to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which ensures that every preempt-disabled section
814 * has finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) matching the given
859 * probe, or ERR_PTR(-ENOENT) if no such match exists.
860 *
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 46580edff0cb..8b7d8805819d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
47#include <linux/rculist.h> 47#include <linux/rculist.h>
48#include <asm/uaccess.h> 48#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/mmu_context.h>
50#include <linux/license.h> 51#include <linux/license.h>
51#include <asm/sections.h> 52#include <asm/sections.h>
52#include <linux/tracepoint.h> 53#include <linux/tracepoint.h>
@@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module);
369 370
370#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
371 372
372#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
373 374
374static void *percpu_modalloc(unsigned long size, unsigned long align, 375static void *percpu_modalloc(unsigned long size, unsigned long align,
375 const char *name) 376 const char *name)
@@ -394,7 +395,7 @@ static void percpu_modfree(void *freeme)
394 free_percpu(freeme); 395 free_percpu(freeme);
395} 396}
396 397
397#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
398 399
399/* Number of blocks used and allocated. */ 400/* Number of blocks used and allocated. */
400static unsigned int pcpu_num_used, pcpu_num_allocated; 401static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -540,7 +541,7 @@ static int percpu_modinit(void)
540} 541}
541__initcall(percpu_modinit); 542__initcall(percpu_modinit);
542 543
543#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
544 545
545static unsigned int find_pcpusec(Elf_Ehdr *hdr, 546static unsigned int find_pcpusec(Elf_Ehdr *hdr,
546 Elf_Shdr *sechdrs, 547 Elf_Shdr *sechdrs,
@@ -1535,6 +1536,10 @@ static void free_module(struct module *mod)
1535 1536
1536 /* Finally, free the core (containing the module structure) */ 1537 /* Finally, free the core (containing the module structure) */
1537 module_free(mod, mod->module_core); 1538 module_free(mod, mod->module_core);
1539
1540#ifdef CONFIG_MPU
1541 update_protections(current->mm);
1542#endif
1538} 1543}
1539 1544
1540void *__symbol_get(const char *symbol) 1545void *__symbol_get(const char *symbol)
@@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1792 } 1797 }
1793} 1798}
1794 1799
1800static void free_modinfo(struct module *mod)
1801{
1802 struct module_attribute *attr;
1803 int i;
1804
1805 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1806 if (attr->free)
1807 attr->free(mod);
1808 }
1809}
1810
1795#ifdef CONFIG_KALLSYMS 1811#ifdef CONFIG_KALLSYMS
1796 1812
1797/* lookup symbol in given range of kernel_symbols */ 1813/* lookup symbol in given range of kernel_symbols */
@@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym,
1857 return '?'; 1873 return '?';
1858} 1874}
1859 1875
1876static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1877 unsigned int shnum)
1878{
1879 const Elf_Shdr *sec;
1880
1881 if (src->st_shndx == SHN_UNDEF
1882 || src->st_shndx >= shnum
1883 || !src->st_name)
1884 return false;
1885
1886 sec = sechdrs + src->st_shndx;
1887 if (!(sec->sh_flags & SHF_ALLOC)
1888#ifndef CONFIG_KALLSYMS_ALL
1889 || !(sec->sh_flags & SHF_EXECINSTR)
1890#endif
1891 || (sec->sh_entsize & INIT_OFFSET_MASK))
1892 return false;
1893
1894 return true;
1895}
1896
1897static unsigned long layout_symtab(struct module *mod,
1898 Elf_Shdr *sechdrs,
1899 unsigned int symindex,
1900 unsigned int strindex,
1901 const Elf_Ehdr *hdr,
1902 const char *secstrings,
1903 unsigned long *pstroffs,
1904 unsigned long *strmap)
1905{
1906 unsigned long symoffs;
1907 Elf_Shdr *symsect = sechdrs + symindex;
1908 Elf_Shdr *strsect = sechdrs + strindex;
1909 const Elf_Sym *src;
1910 const char *strtab;
1911 unsigned int i, nsrc, ndst;
1912
1913 /* Put symbol section at end of init part of module. */
1914 symsect->sh_flags |= SHF_ALLOC;
1915 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1916 symindex) | INIT_OFFSET_MASK;
1917 DEBUGP("\t%s\n", secstrings + symsect->sh_name);
1918
1919 src = (void *)hdr + symsect->sh_offset;
1920 nsrc = symsect->sh_size / sizeof(*src);
1921 strtab = (void *)hdr + strsect->sh_offset;
1922 for (ndst = i = 1; i < nsrc; ++i, ++src)
1923 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
1924 unsigned int j = src->st_name;
1925
1926 while(!__test_and_set_bit(j, strmap) && strtab[j])
1927 ++j;
1928 ++ndst;
1929 }
1930
1931 /* Append room for core symbols at end of core part. */
1932 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1933 mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
1934
1935 /* Put string table section at end of init part of module. */
1936 strsect->sh_flags |= SHF_ALLOC;
1937 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1938 strindex) | INIT_OFFSET_MASK;
1939 DEBUGP("\t%s\n", secstrings + strsect->sh_name);
1940
1941 /* Append room for core symbols' strings at end of core part. */
1942 *pstroffs = mod->core_size;
1943 __set_bit(0, strmap);
1944 mod->core_size += bitmap_weight(strmap, strsect->sh_size);
1945
1946 return symoffs;
1947}
1948
1860static void add_kallsyms(struct module *mod, 1949static void add_kallsyms(struct module *mod,
1861 Elf_Shdr *sechdrs, 1950 Elf_Shdr *sechdrs,
1951 unsigned int shnum,
1862 unsigned int symindex, 1952 unsigned int symindex,
1863 unsigned int strindex, 1953 unsigned int strindex,
1864 const char *secstrings) 1954 unsigned long symoffs,
1955 unsigned long stroffs,
1956 const char *secstrings,
1957 unsigned long *strmap)
1865{ 1958{
1866 unsigned int i; 1959 unsigned int i, ndst;
1960 const Elf_Sym *src;
1961 Elf_Sym *dst;
1962 char *s;
1867 1963
1868 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1964 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1869 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1965 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1873,13 +1969,46 @@ static void add_kallsyms(struct module *mod,
1873 for (i = 0; i < mod->num_symtab; i++) 1969 for (i = 0; i < mod->num_symtab; i++)
1874 mod->symtab[i].st_info 1970 mod->symtab[i].st_info
1875 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1971 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1972
1973 mod->core_symtab = dst = mod->module_core + symoffs;
1974 src = mod->symtab;
1975 *dst = *src;
1976 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1977 if (!is_core_symbol(src, sechdrs, shnum))
1978 continue;
1979 dst[ndst] = *src;
1980 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
1981 ++ndst;
1982 }
1983 mod->core_num_syms = ndst;
1984
1985 mod->core_strtab = s = mod->module_core + stroffs;
1986 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
1987 if (test_bit(i, strmap))
1988 *++s = mod->strtab[i];
1876} 1989}
1877#else 1990#else
1991static inline unsigned long layout_symtab(struct module *mod,
1992 Elf_Shdr *sechdrs,
1993 unsigned int symindex,
1994 unsigned int strindex,
1995 const Elf_Ehdr *hdr,
1996 const char *secstrings,
1997 unsigned long *pstroffs,
1998 unsigned long *strmap)
1999{
2000 return 0;
2001}
2002
1878static inline void add_kallsyms(struct module *mod, 2003static inline void add_kallsyms(struct module *mod,
1879 Elf_Shdr *sechdrs, 2004 Elf_Shdr *sechdrs,
2005 unsigned int shnum,
1880 unsigned int symindex, 2006 unsigned int symindex,
1881 unsigned int strindex, 2007 unsigned int strindex,
1882 const char *secstrings) 2008 unsigned long symoffs,
2009 unsigned long stroffs,
2010 const char *secstrings,
2011 const unsigned long *strmap)
1883{ 2012{
1884} 2013}
1885#endif /* CONFIG_KALLSYMS */ 2014#endif /* CONFIG_KALLSYMS */
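
layout_symtab() and add_kallsyms() above use the strmap bitmap to keep only the strings of core symbols: every byte of a kept string (including its NUL) is marked, and a symbol's new st_name is simply the number of marked bytes before its old offset, which is what bitmap_weight() computes. The user-space sketch below reproduces that idea with a plain bool array and invented data:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* New offset of a kept string = number of marked bytes before its old offset. */
static size_t new_offset(const bool *map, size_t old_off)
{
	size_t i, n = 0;

	for (i = 0; i < old_off; i++)
		n += map[i];
	return n;
}

int main(void)
{
	const char strtab[] = "\0alpha\0beta\0gamma";	/* old string table */
	bool map[sizeof(strtab)] = { true };		/* keep leading NUL */
	const size_t keep[] = { 1, 12 };		/* "alpha", "gamma" */
	char out[sizeof(strtab)];
	size_t i, j, pos = 0;

	/* Mark every byte (and the terminating NUL) of each kept string. */
	for (i = 0; i < sizeof(keep) / sizeof(keep[0]); i++) {
		for (j = keep[i]; ; j++) {
			map[j] = true;
			if (!strtab[j])
				break;
		}
	}

	/* Compact the table: copy only the marked bytes. */
	for (i = 0; i < sizeof(strtab); i++)
		if (map[i])
			out[pos++] = strtab[i];

	printf("\"%s\" moved from offset %zu to %zu\n",
	       &out[new_offset(map, keep[0])], keep[0], new_offset(map, keep[0]));
	printf("\"%s\" moved from offset %zu to %zu\n",
	       &out[new_offset(map, keep[1])], keep[1], new_offset(map, keep[1]));
	return 0;
}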
@@ -1954,6 +2083,8 @@ static noinline struct module *load_module(void __user *umod,
1954 struct module *mod; 2083 struct module *mod;
1955 long err = 0; 2084 long err = 0;
1956 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2085 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
2086 unsigned long symoffs, stroffs, *strmap;
2087
1957 mm_segment_t old_fs; 2088 mm_segment_t old_fs;
1958 2089
1959 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2090 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2035,11 +2166,6 @@ static noinline struct module *load_module(void __user *umod,
2035 /* Don't keep modinfo and version sections. */ 2166 /* Don't keep modinfo and version sections. */
2036 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2167 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2037 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2168 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2038#ifdef CONFIG_KALLSYMS
2039 /* Keep symbol and string tables for decoding later. */
2040 sechdrs[symindex].sh_flags |= SHF_ALLOC;
2041 sechdrs[strindex].sh_flags |= SHF_ALLOC;
2042#endif
2043 2169
2044 /* Check module struct version now, before we try to use module. */ 2170 /* Check module struct version now, before we try to use module. */
2045 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2171 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2075,6 +2201,13 @@ static noinline struct module *load_module(void __user *umod,
2075 goto free_hdr; 2201 goto free_hdr;
2076 } 2202 }
2077 2203
2204 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
2205 * sizeof(long), GFP_KERNEL);
2206 if (!strmap) {
2207 err = -ENOMEM;
2208 goto free_mod;
2209 }
2210
2078 if (find_module(mod->name)) { 2211 if (find_module(mod->name)) {
2079 err = -EEXIST; 2212 err = -EEXIST;
2080 goto free_mod; 2213 goto free_mod;
@@ -2104,6 +2237,8 @@ static noinline struct module *load_module(void __user *umod,
2104 this is done generically; there doesn't appear to be any 2237 this is done generically; there doesn't appear to be any
2105 special cases for the architectures. */ 2238 special cases for the architectures. */
2106 layout_sections(mod, hdr, sechdrs, secstrings); 2239 layout_sections(mod, hdr, sechdrs, secstrings);
2240 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
2241 secstrings, &stroffs, strmap);
2107 2242
2108 /* Do the allocs. */ 2243 /* Do the allocs. */
2109 ptr = module_alloc_update_bounds(mod->core_size); 2244 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2237,10 +2372,6 @@ static noinline struct module *load_module(void __user *umod,
2237 sizeof(*mod->ctors), &mod->num_ctors); 2372 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif 2373#endif
2239 2374
2240#ifdef CONFIG_MARKERS
2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2242 sizeof(*mod->markers), &mod->num_markers);
2243#endif
2244#ifdef CONFIG_TRACEPOINTS 2375#ifdef CONFIG_TRACEPOINTS
2245 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2376 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2246 "__tracepoints", 2377 "__tracepoints",
@@ -2312,7 +2443,10 @@ static noinline struct module *load_module(void __user *umod,
2312 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2443 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
2313 sechdrs[pcpuindex].sh_size); 2444 sechdrs[pcpuindex].sh_size);
2314 2445
2315 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2446 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2447 symoffs, stroffs, secstrings, strmap);
2448 kfree(strmap);
2449 strmap = NULL;
2316 2450
2317 if (!mod->taints) { 2451 if (!mod->taints) {
2318 struct _ddebug *debug; 2452 struct _ddebug *debug;
@@ -2384,13 +2518,14 @@ static noinline struct module *load_module(void __user *umod,
2384 synchronize_sched(); 2518 synchronize_sched();
2385 module_arch_cleanup(mod); 2519 module_arch_cleanup(mod);
2386 cleanup: 2520 cleanup:
2521 free_modinfo(mod);
2387 kobject_del(&mod->mkobj.kobj); 2522 kobject_del(&mod->mkobj.kobj);
2388 kobject_put(&mod->mkobj.kobj); 2523 kobject_put(&mod->mkobj.kobj);
2389 free_unload: 2524 free_unload:
2390 module_unload_free(mod); 2525 module_unload_free(mod);
2391#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2526#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2392 free_init:
2393 percpu_modfree(mod->refptr); 2527 percpu_modfree(mod->refptr);
2528 free_init:
2394#endif 2529#endif
2395 module_free(mod, mod->module_init); 2530 module_free(mod, mod->module_init);
2396 free_core: 2531 free_core:
@@ -2401,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod,
2401 percpu_modfree(percpu); 2536 percpu_modfree(percpu);
2402 free_mod: 2537 free_mod:
2403 kfree(args); 2538 kfree(args);
2539 kfree(strmap);
2404 free_hdr: 2540 free_hdr:
2405 vfree(hdr); 2541 vfree(hdr);
2406 return ERR_PTR(err); 2542 return ERR_PTR(err);
@@ -2490,6 +2626,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2490 /* Drop initial reference. */ 2626 /* Drop initial reference. */
2491 module_put(mod); 2627 module_put(mod);
2492 trim_init_extable(mod); 2628 trim_init_extable(mod);
2629#ifdef CONFIG_KALLSYMS
2630 mod->num_symtab = mod->core_num_syms;
2631 mod->symtab = mod->core_symtab;
2632 mod->strtab = mod->core_strtab;
2633#endif
2493 module_free(mod, mod->module_init); 2634 module_free(mod, mod->module_init);
2494 mod->module_init = NULL; 2635 mod->module_init = NULL;
2495 mod->init_size = 0; 2636 mod->init_size = 0;
@@ -2951,27 +3092,12 @@ void module_layout(struct module *mod,
2951 struct modversion_info *ver, 3092 struct modversion_info *ver,
2952 struct kernel_param *kp, 3093 struct kernel_param *kp,
2953 struct kernel_symbol *ks, 3094 struct kernel_symbol *ks,
2954 struct marker *marker,
2955 struct tracepoint *tp) 3095 struct tracepoint *tp)
2956{ 3096{
2957} 3097}
2958EXPORT_SYMBOL(module_layout); 3098EXPORT_SYMBOL(module_layout);
2959#endif 3099#endif
2960 3100
2961#ifdef CONFIG_MARKERS
2962void module_update_markers(void)
2963{
2964 struct module *mod;
2965
2966 mutex_lock(&module_mutex);
2967 list_for_each_entry(mod, &modules, list)
2968 if (!mod->taints)
2969 marker_update_probe_range(mod->markers,
2970 mod->markers + mod->num_markers);
2971 mutex_unlock(&module_mutex);
2972}
2973#endif
2974
2975#ifdef CONFIG_TRACEPOINTS 3101#ifdef CONFIG_TRACEPOINTS
2976void module_update_tracepoints(void) 3102void module_update_tracepoints(void)
2977{ 3103{
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 50d022e5a560..ec815a960b5d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
diff --git a/kernel/panic.c b/kernel/panic.c
index 512ab73b0ca3..96b45d0b4ba5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...)
90 90
91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
92 92
93 bust_spinlocks(0);
94
93 if (!panic_blink) 95 if (!panic_blink)
94 panic_blink = no_blink; 96 panic_blink = no_blink;
95 97
@@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...)
136 mdelay(1); 138 mdelay(1);
137 i++; 139 i++;
138 } 140 }
139 bust_spinlocks(0);
140} 141}
141 142
142EXPORT_SYMBOL(panic); 143EXPORT_SYMBOL(panic);
@@ -177,7 +178,7 @@ static const struct tnt tnts[] = {
177 * 'W' - Taint on warning. 178 * 'W' - Taint on warning.
178 * 'C' - modules from drivers/staging are loaded. 179 * 'C' - modules from drivers/staging are loaded.
179 * 180 *
180 * The string is overwritten by the next call to print_taint(). 181 * The string is overwritten by the next call to print_tainted().
181 */ 182 */
182const char *print_tainted(void) 183const char *print_tainted(void)
183{ 184{
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2ba..9da58eabdcb2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
87 } 88 }
88 89
89 for (i = 0; args[i]; i++) { 90 for (i = 0; args[i]; i++) {
90 if (args[i] == ' ' && !in_quote) 91 if (isspace(args[i]) && !in_quote)
91 break; 92 break;
92 if (equals == 0) { 93 if (equals == 0) {
93 if (args[i] == '=') 94 if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i; 122 next = args + i;
122 123
123 /* Chew up trailing spaces. */ 124 /* Chew up trailing spaces. */
124 while (*next == ' ') 125 while (isspace(*next))
125 next++; 126 next++;
126 return next; 127 return next;
127} 128}
@@ -138,7 +139,7 @@ int parse_args(const char *name,
138 DEBUGP("Parsing ARGS: %s\n", args); 139 DEBUGP("Parsing ARGS: %s\n", args);
139 140
140 /* Chew leading spaces */ 141 /* Chew leading spaces */
141 while (*args == ' ') 142 while (isspace(*args))
142 args++; 143 args++;
143 144
144 while (*args) { 145 while (*args) {
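
The change above replaces literal ' ' comparisons with isspace(), so tabs and newlines in the argument string are skipped as well. A tiny user-space sketch of the same leading-whitespace chew (illustrative only):

#include <ctype.h>
#include <stdio.h>

int main(void)
{
	const char *args = " \t\nfoo=bar baz";

	/* isspace() also skips tabs and newlines, which ' ' alone missed. */
	while (isspace((unsigned char)*args))
		args++;
	printf("%s\n", args);
	return 0;
}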
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index e0d91fdf0c3c..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,4962 +0,0 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * -1 - not paranoid at all
50 * 0 - disallow raw tracepoint access for unpriv
51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
53 */
54int sysctl_perf_counter_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
60
61static inline bool perf_paranoid_cpu(void)
62{
63 return sysctl_perf_counter_paranoid > 0;
64}
65
66static inline bool perf_paranoid_kernel(void)
67{
68 return sysctl_perf_counter_paranoid > 1;
69}
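
The three helpers above map a single sysctl value onto nested restriction levels using plain '>' comparisons. The stand-alone example below prints which capability each level restricts for unprivileged users; the table is an illustration, not kernel output.

#include <stdio.h>

int main(void)
{
	int level;

	for (level = -1; level <= 2; level++)
		printf("level %2d: raw tracepoints %s, cpu counters %s, "
		       "kernel profiling %s\n", level,
		       level > -1 ? "restricted" : "allowed",
		       level >  0 ? "restricted" : "allowed",
		       level >  1 ? "restricted" : "allowed");
	return 0;
}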
70
71int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
72
73/*
74 * max perf counter sample rate
75 */
76int sysctl_perf_counter_sample_rate __read_mostly = 100000;
77
78static atomic64_t perf_counter_id;
79
80/*
81 * Lock for (sysadmin-configurable) counter reservations:
82 */
83static DEFINE_SPINLOCK(perf_resource_lock);
84
85/*
86 * Architecture provided APIs - weak aliases:
87 */
88extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
89{
90 return NULL;
91}
92
93void __weak hw_perf_disable(void) { barrier(); }
94void __weak hw_perf_enable(void) { barrier(); }
95
96void __weak hw_perf_counter_setup(int cpu) { barrier(); }
97void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
98
99int __weak
100hw_perf_group_sched_in(struct perf_counter *group_leader,
101 struct perf_cpu_context *cpuctx,
102 struct perf_counter_context *ctx, int cpu)
103{
104 return 0;
105}
106
107void __weak perf_counter_print_debug(void) { }
108
109static DEFINE_PER_CPU(int, disable_count);
110
111void __perf_disable(void)
112{
113 __get_cpu_var(disable_count)++;
114}
115
116bool __perf_enable(void)
117{
118 return !--__get_cpu_var(disable_count);
119}
120
121void perf_disable(void)
122{
123 __perf_disable();
124 hw_perf_disable();
125}
126
127void perf_enable(void)
128{
129 if (__perf_enable())
130 hw_perf_enable();
131}
132
133static void get_ctx(struct perf_counter_context *ctx)
134{
135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
136}
137
138static void free_ctx(struct rcu_head *head)
139{
140 struct perf_counter_context *ctx;
141
142 ctx = container_of(head, struct perf_counter_context, rcu_head);
143 kfree(ctx);
144}
145
146static void put_ctx(struct perf_counter_context *ctx)
147{
148 if (atomic_dec_and_test(&ctx->refcount)) {
149 if (ctx->parent_ctx)
150 put_ctx(ctx->parent_ctx);
151 if (ctx->task)
152 put_task_struct(ctx->task);
153 call_rcu(&ctx->rcu_head, free_ctx);
154 }
155}
156
157static void unclone_ctx(struct perf_counter_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit counters we want to return the parent counter id
167 * to userspace.
168 */
169static u64 primary_counter_id(struct perf_counter *counter)
170{
171 u64 id = counter->id;
172
173 if (counter->parent)
174 id = counter->parent->id;
175
176 return id;
177}
178
179/*
180 * Get the perf_counter_context for a task and lock it.
181 * This has to cope with the fact that until it is locked,
182 * the context could get moved to another task.
183 */
184static struct perf_counter_context *
185perf_lock_task_context(struct task_struct *task, unsigned long *flags)
186{
187 struct perf_counter_context *ctx;
188
189 rcu_read_lock();
190 retry:
191 ctx = rcu_dereference(task->perf_counter_ctxp);
192 if (ctx) {
193 /*
194 * If this context is a clone of another, it might
195 * get swapped for another underneath us by
196 * perf_counter_task_sched_out, though the
197 * rcu_read_lock() protects us from any context
198 * getting freed. Lock the context and check if it
199 * got swapped before we could get the lock, and retry
200 * if so. If we locked the right context, then it
201 * can't get swapped on us any more.
202 */
203 spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry;
207 }
208
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
213 }
214 rcu_read_unlock();
215 return ctx;
216}
217
218/*
219 * Get the context for a task and increment its pin_count so it
220 * can't get swapped to another task. This also increments its
221 * reference count so that the context can't get freed.
222 */
223static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
224{
225 struct perf_counter_context *ctx;
226 unsigned long flags;
227
228 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) {
230 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags);
232 }
233 return ctx;
234}
235
236static void perf_unpin_context(struct perf_counter_context *ctx)
237{
238 unsigned long flags;
239
240 spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx);
244}
245
246/*
247 * Add a counter to the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held.
249 */
250static void
251list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
252{
253 struct perf_counter *group_leader = counter->group_leader;
254
255 /*
256 * Depending on whether it is a standalone or sibling counter,
257 * add it straight to the context's counter list, or to the group
258 * leader's sibling list:
259 */
260 if (group_leader == counter)
261 list_add_tail(&counter->list_entry, &ctx->counter_list);
262 else {
263 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
264 group_leader->nr_siblings++;
265 }
266
267 list_add_rcu(&counter->event_entry, &ctx->event_list);
268 ctx->nr_counters++;
269 if (counter->attr.inherit_stat)
270 ctx->nr_stat++;
271}
272
273/*
274 * Remove a counter from the lists for its context.
275 * Must be called with ctx->mutex and ctx->lock held.
276 */
277static void
278list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
279{
280 struct perf_counter *sibling, *tmp;
281
282 if (list_empty(&counter->list_entry))
283 return;
284 ctx->nr_counters--;
285 if (counter->attr.inherit_stat)
286 ctx->nr_stat--;
287
288 list_del_init(&counter->list_entry);
289 list_del_rcu(&counter->event_entry);
290
291 if (counter->group_leader != counter)
292 counter->group_leader->nr_siblings--;
293
294 /*
295 * If this was a group counter with sibling counters then
296 * upgrade the siblings to singleton counters by adding them
297 * to the context list directly:
298 */
299 list_for_each_entry_safe(sibling, tmp,
300 &counter->sibling_list, list_entry) {
301
302 list_move_tail(&sibling->list_entry, &ctx->counter_list);
303 sibling->group_leader = sibling;
304 }
305}
306
307static void
308counter_sched_out(struct perf_counter *counter,
309 struct perf_cpu_context *cpuctx,
310 struct perf_counter_context *ctx)
311{
312 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
313 return;
314
315 counter->state = PERF_COUNTER_STATE_INACTIVE;
316 if (counter->pending_disable) {
317 counter->pending_disable = 0;
318 counter->state = PERF_COUNTER_STATE_OFF;
319 }
320 counter->tstamp_stopped = ctx->time;
321 counter->pmu->disable(counter);
322 counter->oncpu = -1;
323
324 if (!is_software_counter(counter))
325 cpuctx->active_oncpu--;
326 ctx->nr_active--;
327 if (counter->attr.exclusive || !cpuctx->active_oncpu)
328 cpuctx->exclusive = 0;
329}
330
331static void
332group_sched_out(struct perf_counter *group_counter,
333 struct perf_cpu_context *cpuctx,
334 struct perf_counter_context *ctx)
335{
336 struct perf_counter *counter;
337
338 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
339 return;
340
341 counter_sched_out(group_counter, cpuctx, ctx);
342
343 /*
344 * Schedule out siblings (if any):
345 */
346 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
347 counter_sched_out(counter, cpuctx, ctx);
348
349 if (group_counter->attr.exclusive)
350 cpuctx->exclusive = 0;
351}
352
353/*
354 * Cross CPU call to remove a performance counter
355 *
356 * We disable the counter on the hardware level first. After that we
357 * remove it from the context list.
358 */
359static void __perf_counter_remove_from_context(void *info)
360{
361 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
362 struct perf_counter *counter = info;
363 struct perf_counter_context *ctx = counter->ctx;
364
365 /*
366 * If this is a task context, we need to check whether it is
367 * the current task context of this cpu. If not it has been
368 * scheduled out before the smp call arrived.
369 */
370 if (ctx->task && cpuctx->task_ctx != ctx)
371 return;
372
373 spin_lock(&ctx->lock);
374 /*
375 * Protect the list operation against NMI by disabling the
376 * counters on a global level.
377 */
378 perf_disable();
379
380 counter_sched_out(counter, cpuctx, ctx);
381
382 list_del_counter(counter, ctx);
383
384 if (!ctx->task) {
385 /*
386 * Allow more per task counters with respect to the
387 * reservation:
388 */
389 cpuctx->max_pertask =
390 min(perf_max_counters - ctx->nr_counters,
391 perf_max_counters - perf_reserved_percpu);
392 }
393
394 perf_enable();
395 spin_unlock(&ctx->lock);
396}
397
398
399/*
400 * Remove the counter from a task's (or a CPU's) list of counters.
401 *
402 * Must be called with ctx->mutex held.
403 *
404 * CPU counters are removed with a smp call. For task counters we only
405 * call when the task is on a CPU.
406 *
407 * If counter->ctx is a cloned context, callers must make sure that
408 * every task struct that counter->ctx->task could possibly point to
409 * remains valid. This is OK when called from perf_release since
410 * that only calls us on the top-level context, which can't be a clone.
411 * When called from perf_counter_exit_task, it's OK because the
412 * context has been detached from its task.
413 */
414static void perf_counter_remove_from_context(struct perf_counter *counter)
415{
416 struct perf_counter_context *ctx = counter->ctx;
417 struct task_struct *task = ctx->task;
418
419 if (!task) {
420 /*
421 * Per cpu counters are removed via an smp call and
422	 * the removal is always successful.
423 */
424 smp_call_function_single(counter->cpu,
425 __perf_counter_remove_from_context,
426 counter, 1);
427 return;
428 }
429
430retry:
431 task_oncpu_function_call(task, __perf_counter_remove_from_context,
432 counter);
433
434 spin_lock_irq(&ctx->lock);
435 /*
436 * If the context is active we need to retry the smp call.
437 */
438 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
439 spin_unlock_irq(&ctx->lock);
440 goto retry;
441 }
442
443 /*
444 * The lock prevents that this context is scheduled in so we
445 * can remove the counter safely, if the call above did not
446 * succeed.
447 */
448 if (!list_empty(&counter->list_entry)) {
449 list_del_counter(counter, ctx);
450 }
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_counter_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a counter.
472 */
473static void update_counter_times(struct perf_counter *counter)
474{
475 struct perf_counter_context *ctx = counter->ctx;
476 u64 run_end;
477
478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
480 return;
481
482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
483
484 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
485 run_end = counter->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 counter->total_time_running = run_end - counter->tstamp_running;
490}
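
update_counter_times() derives both totals from timestamps kept in context time: time enabled runs from tstamp_enabled to the current context time, while time running ends at tstamp_stopped for an INACTIVE counter. A small worked example with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long ctx_time = 1000;	  /* current context time */
	unsigned long long tstamp_enabled = 100;  /* when counter enabled */
	unsigned long long tstamp_running = 250;  /* when it last started */
	unsigned long long tstamp_stopped = 900;  /* when it was stopped  */

	/* INACTIVE counter: running time ends at tstamp_stopped. */
	printf("total_time_enabled = %llu\n", ctx_time - tstamp_enabled);
	printf("total_time_running = %llu\n", tstamp_stopped - tstamp_running);
	return 0;
}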
491
492/*
493 * Update total_time_enabled and total_time_running for all counters in a group.
494 */
495static void update_group_times(struct perf_counter *leader)
496{
497 struct perf_counter *counter;
498
499 update_counter_times(leader);
500 list_for_each_entry(counter, &leader->sibling_list, list_entry)
501 update_counter_times(counter);
502}
503
504/*
505 * Cross CPU call to disable a performance counter
506 */
507static void __perf_counter_disable(void *info)
508{
509 struct perf_counter *counter = info;
510 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
511 struct perf_counter_context *ctx = counter->ctx;
512
513 /*
514 * If this is a per-task counter, need to check whether this
515 * counter's task is the current task on this cpu.
516 */
517 if (ctx->task && cpuctx->task_ctx != ctx)
518 return;
519
520 spin_lock(&ctx->lock);
521
522 /*
523 * If the counter is on, turn it off.
524 * If it is in error state, leave it in error state.
525 */
526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
527 update_context_time(ctx);
528 update_group_times(counter);
529 if (counter == counter->group_leader)
530 group_sched_out(counter, cpuctx, ctx);
531 else
532 counter_sched_out(counter, cpuctx, ctx);
533 counter->state = PERF_COUNTER_STATE_OFF;
534 }
535
536 spin_unlock(&ctx->lock);
537}
538
539/*
540 * Disable a counter.
541 *
542 * If counter->ctx is a cloned context, callers must make sure that
543 * every task struct that counter->ctx->task could possibly point to
544 * remains valid. This condition is satisfied when called through
545 * perf_counter_for_each_child or perf_counter_for_each because they
546 * hold the top-level counter's child_mutex, so any descendant that
547 * goes to exit will block in sync_child_counter.
548 * When called from perf_pending_counter it's OK because counter->ctx
549 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_counter_task_sched_out for this context.
551 */
552static void perf_counter_disable(struct perf_counter *counter)
553{
554 struct perf_counter_context *ctx = counter->ctx;
555 struct task_struct *task = ctx->task;
556
557 if (!task) {
558 /*
559 * Disable the counter on the cpu that it's on
560 */
561 smp_call_function_single(counter->cpu, __perf_counter_disable,
562 counter, 1);
563 return;
564 }
565
566 retry:
567 task_oncpu_function_call(task, __perf_counter_disable, counter);
568
569 spin_lock_irq(&ctx->lock);
570 /*
571 * If the counter is still active, we need to retry the cross-call.
572 */
573 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock);
575 goto retry;
576 }
577
578 /*
579 * Since we have the lock this context can't be scheduled
580 * in, so we can change the state safely.
581 */
582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
583 update_group_times(counter);
584 counter->state = PERF_COUNTER_STATE_OFF;
585 }
586
587 spin_unlock_irq(&ctx->lock);
588}
589
590static int
591counter_sched_in(struct perf_counter *counter,
592 struct perf_cpu_context *cpuctx,
593 struct perf_counter_context *ctx,
594 int cpu)
595{
596 if (counter->state <= PERF_COUNTER_STATE_OFF)
597 return 0;
598
599 counter->state = PERF_COUNTER_STATE_ACTIVE;
600 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
601 /*
602 * The new state must be visible before we turn it on in the hardware:
603 */
604 smp_wmb();
605
606 if (counter->pmu->enable(counter)) {
607 counter->state = PERF_COUNTER_STATE_INACTIVE;
608 counter->oncpu = -1;
609 return -EAGAIN;
610 }
611
612 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
613
614 if (!is_software_counter(counter))
615 cpuctx->active_oncpu++;
616 ctx->nr_active++;
617
618 if (counter->attr.exclusive)
619 cpuctx->exclusive = 1;
620
621 return 0;
622}
623
624static int
625group_sched_in(struct perf_counter *group_counter,
626 struct perf_cpu_context *cpuctx,
627 struct perf_counter_context *ctx,
628 int cpu)
629{
630 struct perf_counter *counter, *partial_group;
631 int ret;
632
633 if (group_counter->state == PERF_COUNTER_STATE_OFF)
634 return 0;
635
636 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
637 if (ret)
638 return ret < 0 ? ret : 0;
639
640 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
641 return -EAGAIN;
642
643 /*
644 * Schedule in siblings as one group (if any):
645 */
646 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
647 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
648 partial_group = counter;
649 goto group_error;
650 }
651 }
652
653 return 0;
654
655group_error:
656 /*
657 * Groups can be scheduled in as one unit only, so undo any
658 * partial group before returning:
659 */
660 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
661 if (counter == partial_group)
662 break;
663 counter_sched_out(counter, cpuctx, ctx);
664 }
665 counter_sched_out(group_counter, cpuctx, ctx);
666
667 return -EAGAIN;
668}
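
group_sched_in() treats the group as all-or-nothing: if any sibling fails to schedule, every sibling already scheduled (and the leader) is backed out again. The user-space sketch below shows the same start-then-roll-back shape with made-up start_one()/stop_one() helpers standing in for counter_sched_in()/counter_sched_out():

#include <stdio.h>

static int start_one(int id)
{
	/* Pretend the third member cannot be scheduled. */
	if (id == 2)
		return -1;
	printf("started %d\n", id);
	return 0;
}

static void stop_one(int id)
{
	printf("stopped %d\n", id);
}

static int start_group(int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (start_one(i) < 0)
			goto undo;
	}
	return 0;

undo:
	/* Roll back only the members started so far. */
	while (i-- > 0)
		stop_one(i);
	return -1;
}

int main(void)
{
	return start_group(4) ? 1 : 0;
}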
669
670/*
671 * Return 1 for a group consisting entirely of software counters,
672 * 0 if the group contains any hardware counters.
673 */
674static int is_software_only_group(struct perf_counter *leader)
675{
676 struct perf_counter *counter;
677
678 if (!is_software_counter(leader))
679 return 0;
680
681 list_for_each_entry(counter, &leader->sibling_list, list_entry)
682 if (!is_software_counter(counter))
683 return 0;
684
685 return 1;
686}
687
688/*
689 * Work out whether we can put this counter group on the CPU now.
690 */
691static int group_can_go_on(struct perf_counter *counter,
692 struct perf_cpu_context *cpuctx,
693 int can_add_hw)
694{
695 /*
696 * Groups consisting entirely of software counters can always go on.
697 */
698 if (is_software_only_group(counter))
699 return 1;
700 /*
701 * If an exclusive group is already on, no other hardware
702 * counters can go on.
703 */
704 if (cpuctx->exclusive)
705 return 0;
706 /*
707 * If this group is exclusive and there are already
708 * counters on the CPU, it can't go on.
709 */
710 if (counter->attr.exclusive && cpuctx->active_oncpu)
711 return 0;
712 /*
713 * Otherwise, try to add it if all previous groups were able
714 * to go on.
715 */
716 return can_add_hw;
717}
718
719static void add_counter_to_ctx(struct perf_counter *counter,
720 struct perf_counter_context *ctx)
721{
722 list_add_counter(counter, ctx);
723 counter->tstamp_enabled = ctx->time;
724 counter->tstamp_running = ctx->time;
725 counter->tstamp_stopped = ctx->time;
726}
727
728/*
729 * Cross CPU call to install and enable a performance counter
730 *
731 * Must be called with ctx->mutex held
732 */
733static void __perf_install_in_context(void *info)
734{
735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
736 struct perf_counter *counter = info;
737 struct perf_counter_context *ctx = counter->ctx;
738 struct perf_counter *leader = counter->group_leader;
739 int cpu = smp_processor_id();
740 int err;
741
742 /*
743 * If this is a task context, we need to check whether it is
744 * the current task context of this cpu. If not it has been
745 * scheduled out before the smp call arrived.
746 * Or possibly this is the right context but it isn't
747 * on this cpu because it had no counters.
748 */
749 if (ctx->task && cpuctx->task_ctx != ctx) {
750 if (cpuctx->task_ctx || ctx->task != current)
751 return;
752 cpuctx->task_ctx = ctx;
753 }
754
755 spin_lock(&ctx->lock);
756 ctx->is_active = 1;
757 update_context_time(ctx);
758
759 /*
760 * Protect the list operation against NMI by disabling the
761	 * counters on a global level. NOP for non-NMI-based counters.
762 */
763 perf_disable();
764
765 add_counter_to_ctx(counter, ctx);
766
767 /*
768 * Don't put the counter on if it is disabled or if
769 * it is in a group and the group isn't on.
770 */
771 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
772 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
773 goto unlock;
774
775 /*
776 * An exclusive counter can't go on if there are already active
777 * hardware counters, and no hardware counter can go on if there
778 * is already an exclusive counter on.
779 */
780 if (!group_can_go_on(counter, cpuctx, 1))
781 err = -EEXIST;
782 else
783 err = counter_sched_in(counter, cpuctx, ctx, cpu);
784
785 if (err) {
786 /*
787 * This counter couldn't go on. If it is in a group
788 * then we have to pull the whole group off.
789 * If the counter group is pinned then put it in error state.
790 */
791 if (leader != counter)
792 group_sched_out(leader, cpuctx, ctx);
793 if (leader->attr.pinned) {
794 update_group_times(leader);
795 leader->state = PERF_COUNTER_STATE_ERROR;
796 }
797 }
798
799 if (!err && !ctx->task && cpuctx->max_pertask)
800 cpuctx->max_pertask--;
801
802 unlock:
803 perf_enable();
804
805 spin_unlock(&ctx->lock);
806}
807
808/*
809 * Attach a performance counter to a context
810 *
811 * First we add the counter to the list with the hardware enable bit
812 * in counter->hw_config cleared.
813 *
814 * If the counter is attached to a task which is on a CPU we use a smp
815 * call to enable it in the task context. The task might have been
816 * scheduled away, but we check this in the smp call again.
817 *
818 * Must be called with ctx->mutex held.
819 */
820static void
821perf_install_in_context(struct perf_counter_context *ctx,
822 struct perf_counter *counter,
823 int cpu)
824{
825 struct task_struct *task = ctx->task;
826
827 if (!task) {
828 /*
829		 * Per-cpu counters are installed via an smp call and
830		 * the install is always successful.
831 */
832 smp_call_function_single(cpu, __perf_install_in_context,
833 counter, 1);
834 return;
835 }
836
837retry:
838 task_oncpu_function_call(task, __perf_install_in_context,
839 counter);
840
841 spin_lock_irq(&ctx->lock);
842 /*
843	 * If the context is active but the counter is not on its list yet, we need to retry the smp call.
844 */
845 if (ctx->is_active && list_empty(&counter->list_entry)) {
846 spin_unlock_irq(&ctx->lock);
847 goto retry;
848 }
849
850 /*
851	 * The lock prevents this context from being scheduled in, so we
852	 * can add the counter safely if the call above did not
853	 * succeed.
854 */
855 if (list_empty(&counter->list_entry))
856 add_counter_to_ctx(counter, ctx);
857 spin_unlock_irq(&ctx->lock);
858}
859
860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
882 * Cross CPU call to enable a performance counter
883 */
884static void __perf_counter_enable(void *info)
885{
886 struct perf_counter *counter = info;
887 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
888 struct perf_counter_context *ctx = counter->ctx;
889 struct perf_counter *leader = counter->group_leader;
890 int err;
891
892 /*
893	 * If this is a per-task counter, we need to check whether this
894 * counter's task is the current task on this cpu.
895 */
896 if (ctx->task && cpuctx->task_ctx != ctx) {
897 if (cpuctx->task_ctx || ctx->task != current)
898 return;
899 cpuctx->task_ctx = ctx;
900 }
901
902 spin_lock(&ctx->lock);
903 ctx->is_active = 1;
904 update_context_time(ctx);
905
906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
907 goto unlock;
908 __perf_counter_mark_enabled(counter, ctx);
909
910 /*
911 * If the counter is in a group and isn't the group leader,
912 * then don't put it on unless the group is on.
913 */
914 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
915 goto unlock;
916
917 if (!group_can_go_on(counter, cpuctx, 1)) {
918 err = -EEXIST;
919 } else {
920 perf_disable();
921 if (counter == leader)
922 err = group_sched_in(counter, cpuctx, ctx,
923 smp_processor_id());
924 else
925 err = counter_sched_in(counter, cpuctx, ctx,
926 smp_processor_id());
927 perf_enable();
928 }
929
930 if (err) {
931 /*
932 * If this counter can't go on and it's part of a
933 * group, then the whole group has to come off.
934 */
935 if (leader != counter)
936 group_sched_out(leader, cpuctx, ctx);
937 if (leader->attr.pinned) {
938 update_group_times(leader);
939 leader->state = PERF_COUNTER_STATE_ERROR;
940 }
941 }
942
943 unlock:
944 spin_unlock(&ctx->lock);
945}
946
947/*
948 * Enable a counter.
949 *
950 * If counter->ctx is a cloned context, callers must make sure that
951 * every task struct that counter->ctx->task could possibly point to
952 * remains valid. This condition is satisfied when called through
953 * perf_counter_for_each_child or perf_counter_for_each as described
954 * for perf_counter_disable.
955 */
956static void perf_counter_enable(struct perf_counter *counter)
957{
958 struct perf_counter_context *ctx = counter->ctx;
959 struct task_struct *task = ctx->task;
960
961 if (!task) {
962 /*
963 * Enable the counter on the cpu that it's on
964 */
965 smp_call_function_single(counter->cpu, __perf_counter_enable,
966 counter, 1);
967 return;
968 }
969
970 spin_lock_irq(&ctx->lock);
971 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
972 goto out;
973
974 /*
975 * If the counter is in error state, clear that first.
976 * That way, if we see the counter in error state below, we
977 * know that it has gone back into error state, as distinct
978 * from the task having been scheduled away before the
979 * cross-call arrived.
980 */
981 if (counter->state == PERF_COUNTER_STATE_ERROR)
982 counter->state = PERF_COUNTER_STATE_OFF;
983
984 retry:
985 spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_counter_enable, counter);
987
988 spin_lock_irq(&ctx->lock);
989
990 /*
991 * If the context is active and the counter is still off,
992 * we need to retry the cross-call.
993 */
994 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
995 goto retry;
996
997 /*
998 * Since we have the lock this context can't be scheduled
999 * in, so we can change the state safely.
1000 */
1001 if (counter->state == PERF_COUNTER_STATE_OFF)
1002 __perf_counter_mark_enabled(counter, ctx);
1003
1004 out:
1005 spin_unlock_irq(&ctx->lock);
1006}
1007
1008static int perf_counter_refresh(struct perf_counter *counter, int refresh)
1009{
1010 /*
1011 * not supported on inherited counters
1012 */
1013 if (counter->attr.inherit)
1014 return -EINVAL;
1015
1016 atomic_add(refresh, &counter->event_limit);
1017 perf_counter_enable(counter);
1018
1019 return 0;
1020}
1021
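/*
 * Deschedule all active counters and groups in @ctx from the given
 * CPU context and mark the context inactive.
 */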
1022void __perf_counter_sched_out(struct perf_counter_context *ctx,
1023 struct perf_cpu_context *cpuctx)
1024{
1025 struct perf_counter *counter;
1026
1027 spin_lock(&ctx->lock);
1028 ctx->is_active = 0;
1029 if (likely(!ctx->nr_counters))
1030 goto out;
1031 update_context_time(ctx);
1032
1033 perf_disable();
1034 if (ctx->nr_active) {
1035 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1036 if (counter != counter->group_leader)
1037 counter_sched_out(counter, cpuctx, ctx);
1038 else
1039 group_sched_out(counter, cpuctx, ctx);
1040 }
1041 }
1042 perf_enable();
1043 out:
1044 spin_unlock(&ctx->lock);
1045}
1046
1047/*
1048 * Test whether two contexts are equivalent, i.e. whether they
1049 * have both been cloned from the same version of the same context
1050 * and they both have the same number of enabled counters.
1051 * If the number of enabled counters is the same, then the set
1052 * of enabled counters should be the same, because these are both
1053 * inherited contexts, therefore we can't access individual counters
1054 * in them directly with an fd; we can only enable/disable all
1055 * counters via prctl, or enable/disable all counters in a family
1056 * via ioctl, which will have the same effect on both contexts.
1057 */
1058static int context_equiv(struct perf_counter_context *ctx1,
1059 struct perf_counter_context *ctx2)
1060{
1061 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1062 && ctx1->parent_gen == ctx2->parent_gen
1063 && !ctx1->pin_count && !ctx2->pin_count;
1064}
1065
1066static void __perf_counter_read(void *counter);
1067
1068static void __perf_counter_sync_stat(struct perf_counter *counter,
1069 struct perf_counter *next_counter)
1070{
1071 u64 value;
1072
1073 if (!counter->attr.inherit_stat)
1074 return;
1075
1076 /*
1077	 * Update the counter value. We cannot use perf_counter_read()
1078	 * because we're in the middle of a context switch and have IRQs
1079	 * disabled, which upsets smp_call_function_single(); however,
1080	 * we know the counter must be on the current CPU, so we
1081	 * don't need it.
1082 */
1083 switch (counter->state) {
1084 case PERF_COUNTER_STATE_ACTIVE:
1085 __perf_counter_read(counter);
1086 break;
1087
1088 case PERF_COUNTER_STATE_INACTIVE:
1089 update_counter_times(counter);
1090 break;
1091
1092 default:
1093 break;
1094 }
1095
1096 /*
1097 * In order to keep per-task stats reliable we need to flip the counter
1098 * values when we flip the contexts.
1099 */
1100 value = atomic64_read(&next_counter->count);
1101 value = atomic64_xchg(&counter->count, value);
1102 atomic64_set(&next_counter->count, value);
1103
1104 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1105 swap(counter->total_time_running, next_counter->total_time_running);
1106
1107 /*
1108 * Since we swizzled the values, update the user visible data too.
1109 */
1110 perf_counter_update_userpage(counter);
1111 perf_counter_update_userpage(next_counter);
1112}
1113
1114#define list_next_entry(pos, member) \
1115 list_entry(pos->member.next, typeof(*pos), member)
1116
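/*
 * Walk the event lists of two equivalent (cloned) contexts in lockstep
 * and swap the values of counters with attr.inherit_stat set, so that
 * per-task statistics stay with the task across the context switch.
 */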
1117static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1118 struct perf_counter_context *next_ctx)
1119{
1120 struct perf_counter *counter, *next_counter;
1121
1122 if (!ctx->nr_stat)
1123 return;
1124
1125 counter = list_first_entry(&ctx->event_list,
1126 struct perf_counter, event_entry);
1127
1128 next_counter = list_first_entry(&next_ctx->event_list,
1129 struct perf_counter, event_entry);
1130
1131 while (&counter->event_entry != &ctx->event_list &&
1132 &next_counter->event_entry != &next_ctx->event_list) {
1133
1134 __perf_counter_sync_stat(counter, next_counter);
1135
1136 counter = list_next_entry(counter, event_entry);
1137 next_counter = list_next_entry(next_counter, event_entry);
1138 }
1139}
1140
1141/*
1142 * Called from scheduler to remove the counters of the current task,
1143 * with interrupts disabled.
1144 *
1145 * We stop each counter and update the counter value in counter->count.
1146 *
1147 * This does not protect us against NMI, but disable()
1148 * sets the disabled bit in the control field of counter _before_
1149 * accessing the counter control register. If an NMI hits, then it will
1150 * not restart the counter.
1151 */
1152void perf_counter_task_sched_out(struct task_struct *task,
1153 struct task_struct *next, int cpu)
1154{
1155 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1156 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1157 struct perf_counter_context *next_ctx;
1158 struct perf_counter_context *parent;
1159 struct pt_regs *regs;
1160 int do_switch = 1;
1161
1162 regs = task_pt_regs(task);
1163 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1164
1165 if (likely(!ctx || !cpuctx->task_ctx))
1166 return;
1167
1168 update_context_time(ctx);
1169
1170 rcu_read_lock();
1171 parent = rcu_dereference(ctx->parent_ctx);
1172 next_ctx = next->perf_counter_ctxp;
1173 if (parent && next_ctx &&
1174 rcu_dereference(next_ctx->parent_ctx) == parent) {
1175 /*
1176 * Looks like the two contexts are clones, so we might be
1177 * able to optimize the context switch. We lock both
1178 * contexts and check that they are clones under the
1179 * lock (including re-checking that neither has been
1180 * uncloned in the meantime). It doesn't matter which
1181 * order we take the locks because no other cpu could
1182 * be trying to lock both of these tasks.
1183 */
1184 spin_lock(&ctx->lock);
1185 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1186 if (context_equiv(ctx, next_ctx)) {
1187 /*
1188 * XXX do we need a memory barrier of sorts
1189			 * wrt rcu_dereference() of perf_counter_ctxp
1190 */
1191 task->perf_counter_ctxp = next_ctx;
1192 next->perf_counter_ctxp = ctx;
1193 ctx->task = next;
1194 next_ctx->task = task;
1195 do_switch = 0;
1196
1197 perf_counter_sync_stat(ctx, next_ctx);
1198 }
1199 spin_unlock(&next_ctx->lock);
1200 spin_unlock(&ctx->lock);
1201 }
1202 rcu_read_unlock();
1203
1204 if (do_switch) {
1205 __perf_counter_sched_out(ctx, cpuctx);
1206 cpuctx->task_ctx = NULL;
1207 }
1208}
1209
1210/*
1211 * Called with IRQs disabled
1212 */
1213static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1214{
1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1216
1217 if (!cpuctx->task_ctx)
1218 return;
1219
1220 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1221 return;
1222
1223 __perf_counter_sched_out(ctx, cpuctx);
1224 cpuctx->task_ctx = NULL;
1225}
1226
1227/*
1228 * Called with IRQs disabled
1229 */
1230static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1231{
1232 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1233}
1234
1235static void
1236__perf_counter_sched_in(struct perf_counter_context *ctx,
1237 struct perf_cpu_context *cpuctx, int cpu)
1238{
1239 struct perf_counter *counter;
1240 int can_add_hw = 1;
1241
1242 spin_lock(&ctx->lock);
1243 ctx->is_active = 1;
1244 if (likely(!ctx->nr_counters))
1245 goto out;
1246
1247 ctx->timestamp = perf_clock();
1248
1249 perf_disable();
1250
1251 /*
1252 * First go through the list and put on any pinned groups
1253 * in order to give them the best chance of going on.
1254 */
1255 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1256 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1257 !counter->attr.pinned)
1258 continue;
1259 if (counter->cpu != -1 && counter->cpu != cpu)
1260 continue;
1261
1262 if (counter != counter->group_leader)
1263 counter_sched_in(counter, cpuctx, ctx, cpu);
1264 else {
1265 if (group_can_go_on(counter, cpuctx, 1))
1266 group_sched_in(counter, cpuctx, ctx, cpu);
1267 }
1268
1269 /*
1270 * If this pinned group hasn't been scheduled,
1271 * put it in error state.
1272 */
1273 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1274 update_group_times(counter);
1275 counter->state = PERF_COUNTER_STATE_ERROR;
1276 }
1277 }
1278
1279 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1280 /*
1281 * Ignore counters in OFF or ERROR state, and
1282 * ignore pinned counters since we did them already.
1283 */
1284 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1285 counter->attr.pinned)
1286 continue;
1287
1288 /*
1289 * Listen to the 'cpu' scheduling filter constraint
1290 * of counters:
1291 */
1292 if (counter->cpu != -1 && counter->cpu != cpu)
1293 continue;
1294
1295 if (counter != counter->group_leader) {
1296 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1297 can_add_hw = 0;
1298 } else {
1299 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1300 if (group_sched_in(counter, cpuctx, ctx, cpu))
1301 can_add_hw = 0;
1302 }
1303 }
1304 }
1305 perf_enable();
1306 out:
1307 spin_unlock(&ctx->lock);
1308}
1309
1310/*
1311 * Called from scheduler to add the counters of the current task
1312 * with interrupts disabled.
1313 *
1314 * We restore the counter value and then enable it.
1315 *
1316 * This does not protect us against NMI, but enable()
1317 * sets the enabled bit in the control field of counter _before_
1318 * accessing the counter control register. If an NMI hits, then it will
1319 * keep the counter running.
1320 */
1321void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1322{
1323 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1324 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1325
1326 if (likely(!ctx))
1327 return;
1328 if (cpuctx->task_ctx == ctx)
1329 return;
1330 __perf_counter_sched_in(ctx, cpuctx, cpu);
1331 cpuctx->task_ctx = ctx;
1332}
1333
1334static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1335{
1336 struct perf_counter_context *ctx = &cpuctx->ctx;
1337
1338 __perf_counter_sched_in(ctx, cpuctx, cpu);
1339}
1340
1341#define MAX_INTERRUPTS (~0ULL)
1342
1343static void perf_log_throttle(struct perf_counter *counter, int enable);
1344
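/*
 * Nudge hwc->sample_period towards the period that would produce the
 * requested attr.sample_freq, low-pass filtering the adjustment so the
 * period does not jump around wildly.
 */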
1345static void perf_adjust_period(struct perf_counter *counter, u64 events)
1346{
1347 struct hw_perf_counter *hwc = &counter->hw;
1348 u64 period, sample_period;
1349 s64 delta;
1350
1351 events *= hwc->sample_period;
1352 period = div64_u64(events, counter->attr.sample_freq);
1353
1354 delta = (s64)(period - hwc->sample_period);
1355 delta = (delta + 7) / 8; /* low pass filter */
1356
1357 sample_period = hwc->sample_period + delta;
1358
1359 if (!sample_period)
1360 sample_period = 1;
1361
1362 hwc->sample_period = sample_period;
1363}
1364
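/*
 * Per-tick housekeeping for a context: unthrottle counters that hit
 * MAX_INTERRUPTS, re-adjust the period of freq-based counters from the
 * observed interrupt rate, and restart counters that saw no events at
 * all so an accidentally huge sample period cannot stall them.
 */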
1365static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1366{
1367 struct perf_counter *counter;
1368 struct hw_perf_counter *hwc;
1369 u64 interrupts, freq;
1370
1371 spin_lock(&ctx->lock);
1372 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1373 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1374 continue;
1375
1376 hwc = &counter->hw;
1377
1378 interrupts = hwc->interrupts;
1379 hwc->interrupts = 0;
1380
1381 /*
1382 * unthrottle counters on the tick
1383 */
1384 if (interrupts == MAX_INTERRUPTS) {
1385 perf_log_throttle(counter, 1);
1386 counter->pmu->unthrottle(counter);
1387 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1388 }
1389
1390 if (!counter->attr.freq || !counter->attr.sample_freq)
1391 continue;
1392
1393 /*
1394 * if the specified freq < HZ then we need to skip ticks
1395 */
1396 if (counter->attr.sample_freq < HZ) {
1397 freq = counter->attr.sample_freq;
1398
1399 hwc->freq_count += freq;
1400 hwc->freq_interrupts += interrupts;
1401
1402 if (hwc->freq_count < HZ)
1403 continue;
1404
1405 interrupts = hwc->freq_interrupts;
1406 hwc->freq_interrupts = 0;
1407 hwc->freq_count -= HZ;
1408 } else
1409 freq = HZ;
1410
1411 perf_adjust_period(counter, freq * interrupts);
1412
1413 /*
1414 * In order to avoid being stalled by an (accidental) huge
1415 * sample period, force reset the sample period if we didn't
1416 * get any events in this freq period.
1417 */
1418 if (!interrupts) {
1419 perf_disable();
1420 counter->pmu->disable(counter);
1421 atomic64_set(&hwc->period_left, 0);
1422 counter->pmu->enable(counter);
1423 perf_enable();
1424 }
1425 }
1426 spin_unlock(&ctx->lock);
1427}
1428
1429/*
1430 * Round-robin a context's counters:
1431 */
1432static void rotate_ctx(struct perf_counter_context *ctx)
1433{
1434 struct perf_counter *counter;
1435
1436 if (!ctx->nr_counters)
1437 return;
1438
1439 spin_lock(&ctx->lock);
1440 /*
1441 * Rotate the first entry last (works just fine for group counters too):
1442 */
1443 perf_disable();
1444 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1445 list_move_tail(&counter->list_entry, &ctx->counter_list);
1446 break;
1447 }
1448 perf_enable();
1449
1450 spin_unlock(&ctx->lock);
1451}
1452
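/*
 * Scheduler tick: adjust sampling frequencies, schedule the CPU and
 * task contexts out, rotate their counter lists so that every group
 * eventually gets PMU time, and schedule them back in.
 */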
1453void perf_counter_task_tick(struct task_struct *curr, int cpu)
1454{
1455 struct perf_cpu_context *cpuctx;
1456 struct perf_counter_context *ctx;
1457
1458 if (!atomic_read(&nr_counters))
1459 return;
1460
1461 cpuctx = &per_cpu(perf_cpu_context, cpu);
1462 ctx = curr->perf_counter_ctxp;
1463
1464 perf_ctx_adjust_freq(&cpuctx->ctx);
1465 if (ctx)
1466 perf_ctx_adjust_freq(ctx);
1467
1468 perf_counter_cpu_sched_out(cpuctx);
1469 if (ctx)
1470 __perf_counter_task_sched_out(ctx);
1471
1472 rotate_ctx(&cpuctx->ctx);
1473 if (ctx)
1474 rotate_ctx(ctx);
1475
1476 perf_counter_cpu_sched_in(cpuctx, cpu);
1477 if (ctx)
1478 perf_counter_task_sched_in(curr, cpu);
1479}
1480
1481/*
1482 * Enable all of a task's counters that have been marked enable-on-exec.
1483 * This expects task == current.
1484 */
1485static void perf_counter_enable_on_exec(struct task_struct *task)
1486{
1487 struct perf_counter_context *ctx;
1488 struct perf_counter *counter;
1489 unsigned long flags;
1490 int enabled = 0;
1491
1492 local_irq_save(flags);
1493 ctx = task->perf_counter_ctxp;
1494 if (!ctx || !ctx->nr_counters)
1495 goto out;
1496
1497 __perf_counter_task_sched_out(ctx);
1498
1499 spin_lock(&ctx->lock);
1500
1501 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1502 if (!counter->attr.enable_on_exec)
1503 continue;
1504 counter->attr.enable_on_exec = 0;
1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1506 continue;
1507 __perf_counter_mark_enabled(counter, ctx);
1508 enabled = 1;
1509 }
1510
1511 /*
1512 * Unclone this context if we enabled any counter.
1513 */
1514 if (enabled)
1515 unclone_ctx(ctx);
1516
1517 spin_unlock(&ctx->lock);
1518
1519 perf_counter_task_sched_in(task, smp_processor_id());
1520 out:
1521 local_irq_restore(flags);
1522}
1523
1524/*
1525 * Cross CPU call to read the hardware counter
1526 */
1527static void __perf_counter_read(void *info)
1528{
1529 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1530 struct perf_counter *counter = info;
1531 struct perf_counter_context *ctx = counter->ctx;
1532 unsigned long flags;
1533
1534 /*
1535 * If this is a task context, we need to check whether it is
1536 * the current task context of this cpu. If not it has been
1537 * scheduled out before the smp call arrived. In that case
1538 * counter->count would have been updated to a recent sample
1539 * when the counter was scheduled out.
1540 */
1541 if (ctx->task && cpuctx->task_ctx != ctx)
1542 return;
1543
1544 local_irq_save(flags);
1545 if (ctx->is_active)
1546 update_context_time(ctx);
1547 counter->pmu->read(counter);
1548 update_counter_times(counter);
1549 local_irq_restore(flags);
1550}
1551
1552static u64 perf_counter_read(struct perf_counter *counter)
1553{
1554 /*
1555 * If counter is enabled and currently active on a CPU, update the
1556 * value in the counter structure:
1557 */
1558 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1559 smp_call_function_single(counter->oncpu,
1560 __perf_counter_read, counter, 1);
1561 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1562 update_counter_times(counter);
1563 }
1564
1565 return atomic64_read(&counter->count);
1566}
1567
1568/*
1569 * Initialize the perf_counter context in a task_struct:
1570 */
1571static void
1572__perf_counter_init_context(struct perf_counter_context *ctx,
1573 struct task_struct *task)
1574{
1575 memset(ctx, 0, sizeof(*ctx));
1576 spin_lock_init(&ctx->lock);
1577 mutex_init(&ctx->mutex);
1578 INIT_LIST_HEAD(&ctx->counter_list);
1579 INIT_LIST_HEAD(&ctx->event_list);
1580 atomic_set(&ctx->refcount, 1);
1581 ctx->task = task;
1582}
1583
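/*
 * Find the counter context for a given cpu (cpu != -1) or task (by pid,
 * 0 meaning current), allocating and installing a task context if none
 * exists yet. Returns with a reference held on the context.
 */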
1584static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1585{
1586 struct perf_counter_context *ctx;
1587 struct perf_cpu_context *cpuctx;
1588 struct task_struct *task;
1589 unsigned long flags;
1590 int err;
1591
1592 /*
1593 * If cpu is not a wildcard then this is a percpu counter:
1594 */
1595 if (cpu != -1) {
1596 /* Must be root to operate on a CPU counter: */
1597 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1598 return ERR_PTR(-EACCES);
1599
1600 if (cpu < 0 || cpu > num_possible_cpus())
1601 return ERR_PTR(-EINVAL);
1602
1603 /*
1604		 * We could be clever and allow attaching a counter to an
1605 * offline CPU and activate it when the CPU comes up, but
1606 * that's for later.
1607 */
1608 if (!cpu_isset(cpu, cpu_online_map))
1609 return ERR_PTR(-ENODEV);
1610
1611 cpuctx = &per_cpu(perf_cpu_context, cpu);
1612 ctx = &cpuctx->ctx;
1613 get_ctx(ctx);
1614
1615 return ctx;
1616 }
1617
1618 rcu_read_lock();
1619 if (!pid)
1620 task = current;
1621 else
1622 task = find_task_by_vpid(pid);
1623 if (task)
1624 get_task_struct(task);
1625 rcu_read_unlock();
1626
1627 if (!task)
1628 return ERR_PTR(-ESRCH);
1629
1630 /*
1631 * Can't attach counters to a dying task.
1632 */
1633 err = -ESRCH;
1634 if (task->flags & PF_EXITING)
1635 goto errout;
1636
1637 /* Reuse ptrace permission checks for now. */
1638 err = -EACCES;
1639 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1640 goto errout;
1641
1642 retry:
1643 ctx = perf_lock_task_context(task, &flags);
1644 if (ctx) {
1645 unclone_ctx(ctx);
1646 spin_unlock_irqrestore(&ctx->lock, flags);
1647 }
1648
1649 if (!ctx) {
1650 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1651 err = -ENOMEM;
1652 if (!ctx)
1653 goto errout;
1654 __perf_counter_init_context(ctx, task);
1655 get_ctx(ctx);
1656 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1657 /*
1658 * We raced with some other task; use
1659 * the context they set.
1660 */
1661 kfree(ctx);
1662 goto retry;
1663 }
1664 get_task_struct(task);
1665 }
1666
1667 put_task_struct(task);
1668 return ctx;
1669
1670 errout:
1671 put_task_struct(task);
1672 return ERR_PTR(err);
1673}
1674
1675static void free_counter_rcu(struct rcu_head *head)
1676{
1677 struct perf_counter *counter;
1678
1679 counter = container_of(head, struct perf_counter, rcu_head);
1680 if (counter->ns)
1681 put_pid_ns(counter->ns);
1682 kfree(counter);
1683}
1684
1685static void perf_pending_sync(struct perf_counter *counter);
1686
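/*
 * Final teardown of a counter: wait for pending work, drop the global
 * accounting, release any output buffer and per-counter destroy hook,
 * then free the counter after an RCU grace period.
 */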
1687static void free_counter(struct perf_counter *counter)
1688{
1689 perf_pending_sync(counter);
1690
1691 if (!counter->parent) {
1692 atomic_dec(&nr_counters);
1693 if (counter->attr.mmap)
1694 atomic_dec(&nr_mmap_counters);
1695 if (counter->attr.comm)
1696 atomic_dec(&nr_comm_counters);
1697 if (counter->attr.task)
1698 atomic_dec(&nr_task_counters);
1699 }
1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1705
1706 if (counter->destroy)
1707 counter->destroy(counter);
1708
1709 put_ctx(counter->ctx);
1710 call_rcu(&counter->rcu_head, free_counter_rcu);
1711}
1712
1713/*
1714 * Called when the last reference to the file is gone.
1715 */
1716static int perf_release(struct inode *inode, struct file *file)
1717{
1718 struct perf_counter *counter = file->private_data;
1719 struct perf_counter_context *ctx = counter->ctx;
1720
1721 file->private_data = NULL;
1722
1723 WARN_ON_ONCE(ctx->parent_ctx);
1724 mutex_lock(&ctx->mutex);
1725 perf_counter_remove_from_context(counter);
1726 mutex_unlock(&ctx->mutex);
1727
1728 mutex_lock(&counter->owner->perf_counter_mutex);
1729 list_del_init(&counter->owner_entry);
1730 mutex_unlock(&counter->owner->perf_counter_mutex);
1731 put_task_struct(counter->owner);
1732
1733 free_counter(counter);
1734
1735 return 0;
1736}
1737
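/*
 * Work out how many bytes a read() of this counter will return, given
 * its read_format.
 */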
1738static int perf_counter_read_size(struct perf_counter *counter)
1739{
1740 int entry = sizeof(u64); /* value */
1741 int size = 0;
1742 int nr = 1;
1743
1744 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1745 size += sizeof(u64);
1746
1747 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1748 size += sizeof(u64);
1749
1750 if (counter->attr.read_format & PERF_FORMAT_ID)
1751 entry += sizeof(u64);
1752
1753 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1754 nr += counter->group_leader->nr_siblings;
1755 size += sizeof(u64);
1756 }
1757
1758 size += entry * nr;
1759
1760 return size;
1761}
1762
1763static u64 perf_counter_read_value(struct perf_counter *counter)
1764{
1765 struct perf_counter *child;
1766 u64 total = 0;
1767
1768 total += perf_counter_read(counter);
1769 list_for_each_entry(child, &counter->child_list, child_list)
1770 total += perf_counter_read(child);
1771
1772 return total;
1773}
1774
1775static int perf_counter_read_entry(struct perf_counter *counter,
1776 u64 read_format, char __user *buf)
1777{
1778 int n = 0, count = 0;
1779 u64 values[2];
1780
1781 values[n++] = perf_counter_read_value(counter);
1782 if (read_format & PERF_FORMAT_ID)
1783 values[n++] = primary_counter_id(counter);
1784
1785 count = n * sizeof(u64);
1786
1787 if (copy_to_user(buf, values, count))
1788 return -EFAULT;
1789
1790 return count;
1791}
1792
1793static int perf_counter_read_group(struct perf_counter *counter,
1794 u64 read_format, char __user *buf)
1795{
1796 struct perf_counter *leader = counter->group_leader, *sub;
1797 int n = 0, size = 0, err = -EFAULT;
1798 u64 values[3];
1799
1800 values[n++] = 1 + leader->nr_siblings;
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = leader->total_time_enabled +
1803 atomic64_read(&leader->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = leader->total_time_running +
1807 atomic64_read(&leader->child_total_time_running);
1808 }
1809
1810 size = n * sizeof(u64);
1811
1812 if (copy_to_user(buf, values, size))
1813 return -EFAULT;
1814
1815 err = perf_counter_read_entry(leader, read_format, buf + size);
1816 if (err < 0)
1817 return err;
1818
1819 size += err;
1820
1821 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1822 err = perf_counter_read_entry(sub, read_format,
1823 buf + size);
1824 if (err < 0)
1825 return err;
1826
1827 size += err;
1828 }
1829
1830 return size;
1831}
1832
1833static int perf_counter_read_one(struct perf_counter *counter,
1834 u64 read_format, char __user *buf)
1835{
1836 u64 values[4];
1837 int n = 0;
1838
1839 values[n++] = perf_counter_read_value(counter);
1840 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1841 values[n++] = counter->total_time_enabled +
1842 atomic64_read(&counter->child_total_time_enabled);
1843 }
1844 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1845 values[n++] = counter->total_time_running +
1846 atomic64_read(&counter->child_total_time_running);
1847 }
1848 if (read_format & PERF_FORMAT_ID)
1849 values[n++] = primary_counter_id(counter);
1850
1851 if (copy_to_user(buf, values, n * sizeof(u64)))
1852 return -EFAULT;
1853
1854 return n * sizeof(u64);
1855}
1856
1857/*
1858 * Read the performance counter - simple non blocking version for now
1859 */
1860static ssize_t
1861perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1862{
1863 u64 read_format = counter->attr.read_format;
1864 int ret;
1865
1866 /*
1867 * Return end-of-file for a read on a counter that is in
1868 * error state (i.e. because it was pinned but it couldn't be
1869 * scheduled on to the CPU at some point).
1870 */
1871 if (counter->state == PERF_COUNTER_STATE_ERROR)
1872 return 0;
1873
1874 if (count < perf_counter_read_size(counter))
1875 return -ENOSPC;
1876
1877 WARN_ON_ONCE(counter->ctx->parent_ctx);
1878 mutex_lock(&counter->child_mutex);
1879 if (read_format & PERF_FORMAT_GROUP)
1880 ret = perf_counter_read_group(counter, read_format, buf);
1881 else
1882 ret = perf_counter_read_one(counter, read_format, buf);
1883 mutex_unlock(&counter->child_mutex);
1884
1885 return ret;
1886}
1887
1888static ssize_t
1889perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1890{
1891 struct perf_counter *counter = file->private_data;
1892
1893 return perf_read_hw(counter, buf, count);
1894}
1895
1896static unsigned int perf_poll(struct file *file, poll_table *wait)
1897{
1898 struct perf_counter *counter = file->private_data;
1899 struct perf_mmap_data *data;
1900 unsigned int events = POLL_HUP;
1901
1902 rcu_read_lock();
1903 data = rcu_dereference(counter->data);
1904 if (data)
1905 events = atomic_xchg(&data->poll, 0);
1906 rcu_read_unlock();
1907
1908 poll_wait(file, &counter->waitq, wait);
1909
1910 return events;
1911}
1912
1913static void perf_counter_reset(struct perf_counter *counter)
1914{
1915 (void)perf_counter_read(counter);
1916 atomic64_set(&counter->count, 0);
1917 perf_counter_update_userpage(counter);
1918}
1919
1920/*
1921 * Holding the top-level counter's child_mutex means that any
1922 * descendant process that has inherited this counter will block
1923 * in sync_child_counter if it goes to exit, thus satisfying the
1924 * task existence requirements of perf_counter_enable/disable.
1925 */
1926static void perf_counter_for_each_child(struct perf_counter *counter,
1927 void (*func)(struct perf_counter *))
1928{
1929 struct perf_counter *child;
1930
1931 WARN_ON_ONCE(counter->ctx->parent_ctx);
1932 mutex_lock(&counter->child_mutex);
1933 func(counter);
1934 list_for_each_entry(child, &counter->child_list, child_list)
1935 func(child);
1936 mutex_unlock(&counter->child_mutex);
1937}
1938
1939static void perf_counter_for_each(struct perf_counter *counter,
1940 void (*func)(struct perf_counter *))
1941{
1942 struct perf_counter_context *ctx = counter->ctx;
1943 struct perf_counter *sibling;
1944
1945 WARN_ON_ONCE(ctx->parent_ctx);
1946 mutex_lock(&ctx->mutex);
1947 counter = counter->group_leader;
1948
1949 perf_counter_for_each_child(counter, func);
1950 func(counter);
1951 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1952 perf_counter_for_each_child(counter, func);
1953 mutex_unlock(&ctx->mutex);
1954}
1955
1956static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1957{
1958 struct perf_counter_context *ctx = counter->ctx;
1959 unsigned long size;
1960 int ret = 0;
1961 u64 value;
1962
1963 if (!counter->attr.sample_period)
1964 return -EINVAL;
1965
1966 size = copy_from_user(&value, arg, sizeof(value));
1967 if (size != sizeof(value))
1968 return -EFAULT;
1969
1970 if (!value)
1971 return -EINVAL;
1972
1973 spin_lock_irq(&ctx->lock);
1974 if (counter->attr.freq) {
1975 if (value > sysctl_perf_counter_sample_rate) {
1976 ret = -EINVAL;
1977 goto unlock;
1978 }
1979
1980 counter->attr.sample_freq = value;
1981 } else {
1982 counter->attr.sample_period = value;
1983 counter->hw.sample_period = value;
1984 }
1985unlock:
1986 spin_unlock_irq(&ctx->lock);
1987
1988 return ret;
1989}
1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1994{
1995 struct perf_counter *counter = file->private_data;
1996 void (*func)(struct perf_counter *);
1997 u32 flags = arg;
1998
1999 switch (cmd) {
2000 case PERF_COUNTER_IOC_ENABLE:
2001 func = perf_counter_enable;
2002 break;
2003 case PERF_COUNTER_IOC_DISABLE:
2004 func = perf_counter_disable;
2005 break;
2006 case PERF_COUNTER_IOC_RESET:
2007 func = perf_counter_reset;
2008 break;
2009
2010 case PERF_COUNTER_IOC_REFRESH:
2011 return perf_counter_refresh(counter, arg);
2012
2013 case PERF_COUNTER_IOC_PERIOD:
2014 return perf_counter_period(counter, (u64 __user *)arg);
2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
2019 default:
2020 return -ENOTTY;
2021 }
2022
2023 if (flags & PERF_IOC_FLAG_GROUP)
2024 perf_counter_for_each(counter, func);
2025 else
2026 perf_counter_for_each_child(counter, func);
2027
2028 return 0;
2029}
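
/*
 * Userspace usage sketch (illustrative only, not part of this file):
 * with an open counter fd the ioctls above can be driven roughly as
 *
 *	u64 period = 100000;
 *
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
 *	ioctl(fd, PERF_COUNTER_IOC_PERIOD, &period);
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *
 * where passing PERF_IOC_FLAG_GROUP applies the operation to the whole
 * counter group rather than to a single counter.
 */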
2030
2031int perf_counter_task_enable(void)
2032{
2033 struct perf_counter *counter;
2034
2035 mutex_lock(&current->perf_counter_mutex);
2036 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2037 perf_counter_for_each_child(counter, perf_counter_enable);
2038 mutex_unlock(&current->perf_counter_mutex);
2039
2040 return 0;
2041}
2042
2043int perf_counter_task_disable(void)
2044{
2045 struct perf_counter *counter;
2046
2047 mutex_lock(&current->perf_counter_mutex);
2048 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2049 perf_counter_for_each_child(counter, perf_counter_disable);
2050 mutex_unlock(&current->perf_counter_mutex);
2051
2052 return 0;
2053}
2054
2055#ifndef PERF_COUNTER_INDEX_OFFSET
2056# define PERF_COUNTER_INDEX_OFFSET 0
2057#endif
2058
2059static int perf_counter_index(struct perf_counter *counter)
2060{
2061 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2062 return 0;
2063
2064 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2065}
2066
2067/*
2068 * Callers need to ensure there can be no nesting of this function, otherwise
2069 * the seqlock logic goes bad. We cannot serialize this because the arch
2070 * code calls this from NMI context.
2071 */
2072void perf_counter_update_userpage(struct perf_counter *counter)
2073{
2074 struct perf_counter_mmap_page *userpg;
2075 struct perf_mmap_data *data;
2076
2077 rcu_read_lock();
2078 data = rcu_dereference(counter->data);
2079 if (!data)
2080 goto unlock;
2081
2082 userpg = data->user_page;
2083
2084 /*
2085 * Disable preemption so as to not let the corresponding user-space
2086 * spin too long if we get preempted.
2087 */
2088 preempt_disable();
2089 ++userpg->lock;
2090 barrier();
2091 userpg->index = perf_counter_index(counter);
2092 userpg->offset = atomic64_read(&counter->count);
2093 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2094 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2095
2096 userpg->time_enabled = counter->total_time_enabled +
2097 atomic64_read(&counter->child_total_time_enabled);
2098
2099 userpg->time_running = counter->total_time_running +
2100 atomic64_read(&counter->child_total_time_running);
2101
2102 barrier();
2103 ++userpg->lock;
2104 preempt_enable();
2105unlock:
2106 rcu_read_unlock();
2107}
2108
2109static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2110{
2111 struct perf_counter *counter = vma->vm_file->private_data;
2112 struct perf_mmap_data *data;
2113 int ret = VM_FAULT_SIGBUS;
2114
2115 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2116 if (vmf->pgoff == 0)
2117 ret = 0;
2118 return ret;
2119 }
2120
2121 rcu_read_lock();
2122 data = rcu_dereference(counter->data);
2123 if (!data)
2124 goto unlock;
2125
2126 if (vmf->pgoff == 0) {
2127 vmf->page = virt_to_page(data->user_page);
2128 } else {
2129 int nr = vmf->pgoff - 1;
2130
2131 if ((unsigned)nr > data->nr_pages)
2132 goto unlock;
2133
2134 if (vmf->flags & FAULT_FLAG_WRITE)
2135 goto unlock;
2136
2137 vmf->page = virt_to_page(data->data_pages[nr]);
2138 }
2139
2140 get_page(vmf->page);
2141 vmf->page->mapping = vma->vm_file->f_mapping;
2142 vmf->page->index = vmf->pgoff;
2143
2144 ret = 0;
2145unlock:
2146 rcu_read_unlock();
2147
2148 return ret;
2149}
2150
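/*
 * Allocate the mmap buffer: one zeroed page for the user-visible
 * control structure plus nr_pages zeroed data pages.
 */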
2151static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2152{
2153 struct perf_mmap_data *data;
2154 unsigned long size;
2155 int i;
2156
2157 WARN_ON(atomic_read(&counter->mmap_count));
2158
2159 size = sizeof(struct perf_mmap_data);
2160 size += nr_pages * sizeof(void *);
2161
2162 data = kzalloc(size, GFP_KERNEL);
2163 if (!data)
2164 goto fail;
2165
2166 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2167 if (!data->user_page)
2168 goto fail_user_page;
2169
2170 for (i = 0; i < nr_pages; i++) {
2171 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2172 if (!data->data_pages[i])
2173 goto fail_data_pages;
2174 }
2175
2176 data->nr_pages = nr_pages;
2177 atomic_set(&data->lock, -1);
2178
2179 rcu_assign_pointer(counter->data, data);
2180
2181 return 0;
2182
2183fail_data_pages:
2184 for (i--; i >= 0; i--)
2185 free_page((unsigned long)data->data_pages[i]);
2186
2187 free_page((unsigned long)data->user_page);
2188
2189fail_user_page:
2190 kfree(data);
2191
2192fail:
2193 return -ENOMEM;
2194}
2195
2196static void perf_mmap_free_page(unsigned long addr)
2197{
2198 struct page *page = virt_to_page((void *)addr);
2199
2200 page->mapping = NULL;
2201 __free_page(page);
2202}
2203
2204static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2205{
2206 struct perf_mmap_data *data;
2207 int i;
2208
2209 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2210
2211 perf_mmap_free_page((unsigned long)data->user_page);
2212 for (i = 0; i < data->nr_pages; i++)
2213 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2214
2215 kfree(data);
2216}
2217
2218static void perf_mmap_data_free(struct perf_counter *counter)
2219{
2220 struct perf_mmap_data *data = counter->data;
2221
2222 WARN_ON(atomic_read(&counter->mmap_count));
2223
2224 rcu_assign_pointer(counter->data, NULL);
2225 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2226}
2227
2228static void perf_mmap_open(struct vm_area_struct *vma)
2229{
2230 struct perf_counter *counter = vma->vm_file->private_data;
2231
2232 atomic_inc(&counter->mmap_count);
2233}
2234
2235static void perf_mmap_close(struct vm_area_struct *vma)
2236{
2237 struct perf_counter *counter = vma->vm_file->private_data;
2238
2239 WARN_ON_ONCE(counter->ctx->parent_ctx);
2240 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2241 struct user_struct *user = current_user();
2242
2243 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2244 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2245 perf_mmap_data_free(counter);
2246 mutex_unlock(&counter->mmap_mutex);
2247 }
2248}
2249
2250static struct vm_operations_struct perf_mmap_vmops = {
2251 .open = perf_mmap_open,
2252 .close = perf_mmap_close,
2253 .fault = perf_mmap_fault,
2254 .page_mkwrite = perf_mmap_fault,
2255};
2256
2257static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2258{
2259 struct perf_counter *counter = file->private_data;
2260 unsigned long user_locked, user_lock_limit;
2261 struct user_struct *user = current_user();
2262 unsigned long locked, lock_limit;
2263 unsigned long vma_size;
2264 unsigned long nr_pages;
2265 long user_extra, extra;
2266 int ret = 0;
2267
2268 if (!(vma->vm_flags & VM_SHARED))
2269 return -EINVAL;
2270
2271 vma_size = vma->vm_end - vma->vm_start;
2272 nr_pages = (vma_size / PAGE_SIZE) - 1;
2273
2274 /*
2275	 * If we have data pages, ensure they're a power-of-two number, so we
2276 * can do bitmasks instead of modulo.
2277 */
2278 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2279 return -EINVAL;
2280
2281 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2282 return -EINVAL;
2283
2284 if (vma->vm_pgoff != 0)
2285 return -EINVAL;
2286
2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2295 if (nr_pages != counter->data->nr_pages)
2296 ret = -EINVAL;
2297 goto unlock;
2298 }
2299
2300 user_extra = nr_pages + 1;
2301 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2302
2303 /*
2304 * Increase the limit linearly with more CPUs:
2305 */
2306 user_lock_limit *= num_online_cpus();
2307
2308 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2309
2310 extra = 0;
2311 if (user_locked > user_lock_limit)
2312 extra = user_locked - user_lock_limit;
2313
2314 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2315 lock_limit >>= PAGE_SHIFT;
2316 locked = vma->vm_mm->locked_vm + extra;
2317
2318 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2319 ret = -EPERM;
2320 goto unlock;
2321 }
2322
2323 WARN_ON(counter->data);
2324 ret = perf_mmap_data_alloc(counter, nr_pages);
2325 if (ret)
2326 goto unlock;
2327
2328 atomic_set(&counter->mmap_count, 1);
2329 atomic_long_add(user_extra, &user->locked_vm);
2330 vma->vm_mm->locked_vm += extra;
2331 counter->data->nr_locked = extra;
2332 if (vma->vm_flags & VM_WRITE)
2333 counter->data->writable = 1;
2334
2335unlock:
2336 mutex_unlock(&counter->mmap_mutex);
2337
2338 vma->vm_flags |= VM_RESERVED;
2339 vma->vm_ops = &perf_mmap_vmops;
2340
2341 return ret;
2342}
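
/*
 * Userspace mapping sketch (illustrative only): the buffer is mapped
 * shared, at offset 0, as one control page followed by a power-of-two
 * number of data pages, e.g.
 *
 *	len  = (1 + nr_pages) * page_size;
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Mapping with PROT_WRITE lets the reader update data_tail, which the
 * output code uses for flow control (see perf_output_space()).
 */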
2343
2344static int perf_fasync(int fd, struct file *filp, int on)
2345{
2346 struct inode *inode = filp->f_path.dentry->d_inode;
2347 struct perf_counter *counter = filp->private_data;
2348 int retval;
2349
2350 mutex_lock(&inode->i_mutex);
2351 retval = fasync_helper(fd, filp, on, &counter->fasync);
2352 mutex_unlock(&inode->i_mutex);
2353
2354 if (retval < 0)
2355 return retval;
2356
2357 return 0;
2358}
2359
2360static const struct file_operations perf_fops = {
2361 .release = perf_release,
2362 .read = perf_read,
2363 .poll = perf_poll,
2364 .unlocked_ioctl = perf_ioctl,
2365 .compat_ioctl = perf_ioctl,
2366 .mmap = perf_mmap,
2367 .fasync = perf_fasync,
2368};
2369
2370/*
2371 * Perf counter wakeup
2372 *
2373 * If there's data, ensure we set the poll() state and publish everything
2374 * to user-space before waking everybody up.
2375 */
2376
2377void perf_counter_wakeup(struct perf_counter *counter)
2378{
2379 wake_up_all(&counter->waitq);
2380
2381 if (counter->pending_kill) {
2382 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2383 counter->pending_kill = 0;
2384 }
2385}
2386
2387/*
2388 * Pending wakeups
2389 *
2390 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2391 *
2392 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2393 * single linked list and use cmpxchg() to add entries lockless.
2394 */
2395
2396static void perf_pending_counter(struct perf_pending_entry *entry)
2397{
2398 struct perf_counter *counter = container_of(entry,
2399 struct perf_counter, pending);
2400
2401 if (counter->pending_disable) {
2402 counter->pending_disable = 0;
2403 __perf_counter_disable(counter);
2404 }
2405
2406 if (counter->pending_wakeup) {
2407 counter->pending_wakeup = 0;
2408 perf_counter_wakeup(counter);
2409 }
2410}
2411
2412#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2413
2414static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2415 PENDING_TAIL,
2416};
2417
2418static void perf_pending_queue(struct perf_pending_entry *entry,
2419 void (*func)(struct perf_pending_entry *))
2420{
2421 struct perf_pending_entry **head;
2422
2423 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2424 return;
2425
2426 entry->func = func;
2427
2428 head = &get_cpu_var(perf_pending_head);
2429
2430 do {
2431 entry->next = *head;
2432 } while (cmpxchg(head, entry->next, entry) != entry->next);
2433
2434 set_perf_counter_pending();
2435
2436 put_cpu_var(perf_pending_head);
2437}
2438
2439static int __perf_pending_run(void)
2440{
2441 struct perf_pending_entry *list;
2442 int nr = 0;
2443
2444 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2445 while (list != PENDING_TAIL) {
2446 void (*func)(struct perf_pending_entry *);
2447 struct perf_pending_entry *entry = list;
2448
2449 list = list->next;
2450
2451 func = entry->func;
2452 entry->next = NULL;
2453 /*
2454 * Ensure we observe the unqueue before we issue the wakeup,
2455 * so that we won't be waiting forever.
2456 * -- see perf_not_pending().
2457 */
2458 smp_wmb();
2459
2460 func(entry);
2461 nr++;
2462 }
2463
2464 return nr;
2465}
2466
2467static inline int perf_not_pending(struct perf_counter *counter)
2468{
2469 /*
2470	 * If we flush on whatever CPU we run on, there is a chance we don't
2471 * need to wait.
2472 */
2473 get_cpu();
2474 __perf_pending_run();
2475 put_cpu();
2476
2477 /*
2478 * Ensure we see the proper queue state before going to sleep
2479 * so that we do not miss the wakeup. -- see perf_pending_handle()
2480 */
2481 smp_rmb();
2482 return counter->pending.next == NULL;
2483}
2484
2485static void perf_pending_sync(struct perf_counter *counter)
2486{
2487 wait_event(counter->waitq, perf_not_pending(counter));
2488}
2489
2490void perf_counter_do_pending(void)
2491{
2492 __perf_pending_run();
2493}
2494
2495/*
2496 * Callchain support -- arch specific
2497 */
2498
2499__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2500{
2501 return NULL;
2502}
2503
2504/*
2505 * Output
2506 */
2507
2508struct perf_output_handle {
2509 struct perf_counter *counter;
2510 struct perf_mmap_data *data;
2511 unsigned long head;
2512 unsigned long offset;
2513 int nmi;
2514 int sample;
2515 int locked;
2516 unsigned long flags;
2517};
2518
2519static bool perf_output_space(struct perf_mmap_data *data,
2520 unsigned int offset, unsigned int head)
2521{
2522 unsigned long tail;
2523 unsigned long mask;
2524
2525 if (!data->writable)
2526 return true;
2527
2528 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2529 /*
2530	 * Userspace could choose to issue a mb() before updating the tail
2531	 * pointer, so that all reads will be completed before the write is
2532	 * issued.
2533 */
2534 tail = ACCESS_ONCE(data->user_page->data_tail);
2535 smp_rmb();
2536
2537 offset = (offset - tail) & mask;
2538 head = (head - tail) & mask;
2539
2540 if ((int)(head - offset) < 0)
2541 return false;
2542
2543 return true;
2544}
2545
2546static void perf_output_wakeup(struct perf_output_handle *handle)
2547{
2548 atomic_set(&handle->data->poll, POLL_IN);
2549
2550 if (handle->nmi) {
2551 handle->counter->pending_wakeup = 1;
2552 perf_pending_queue(&handle->counter->pending,
2553 perf_pending_counter);
2554 } else
2555 perf_counter_wakeup(handle->counter);
2556}
2557
2558/*
2559 * Curious locking construct.
2560 *
2561 * We need to ensure a later event doesn't publish a head when a former
2563 * event isn't done writing. However, since we need to deal with NMIs we
2563 * cannot fully serialize things.
2564 *
2565 * What we do is serialize between CPUs so we only have to deal with NMI
2566 * nesting on a single CPU.
2567 *
2568 * We only publish the head (and generate a wakeup) when the outer-most
2569 * event completes.
2570 */
2571static void perf_output_lock(struct perf_output_handle *handle)
2572{
2573 struct perf_mmap_data *data = handle->data;
2574 int cpu;
2575
2576 handle->locked = 0;
2577
2578 local_irq_save(handle->flags);
2579 cpu = smp_processor_id();
2580
2581 if (in_nmi() && atomic_read(&data->lock) == cpu)
2582 return;
2583
2584 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2585 cpu_relax();
2586
2587 handle->locked = 1;
2588}
2589
2590static void perf_output_unlock(struct perf_output_handle *handle)
2591{
2592 struct perf_mmap_data *data = handle->data;
2593 unsigned long head;
2594 int cpu;
2595
2596 data->done_head = data->head;
2597
2598 if (!handle->locked)
2599 goto out;
2600
2601again:
2602 /*
2603 * The xchg implies a full barrier that ensures all writes are done
2604 * before we publish the new head, matched by a rmb() in userspace when
2605 * reading this position.
2606 */
2607 while ((head = atomic_long_xchg(&data->done_head, 0)))
2608 data->user_page->data_head = head;
2609
2610 /*
2611 * NMI can happen here, which means we can miss a done_head update.
2612 */
2613
2614 cpu = atomic_xchg(&data->lock, -1);
2615 WARN_ON_ONCE(cpu != smp_processor_id());
2616
2617 /*
2618	 * Therefore we have to check that we did not indeed miss one.
2619 */
2620 if (unlikely(atomic_long_read(&data->done_head))) {
2621 /*
2622 * Since we had it locked, we can lock it again.
2623 */
2624 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2625 cpu_relax();
2626
2627 goto again;
2628 }
2629
2630 if (atomic_xchg(&data->wakeup, 0))
2631 perf_output_wakeup(handle);
2632out:
2633 local_irq_restore(handle->flags);
2634}
2635
2636static void perf_output_copy(struct perf_output_handle *handle,
2637 const void *buf, unsigned int len)
2638{
2639 unsigned int pages_mask;
2640 unsigned int offset;
2641 unsigned int size;
2642 void **pages;
2643
2644 offset = handle->offset;
2645 pages_mask = handle->data->nr_pages - 1;
2646 pages = handle->data->data_pages;
2647
2648 do {
2649 unsigned int page_offset;
2650 int nr;
2651
2652 nr = (offset >> PAGE_SHIFT) & pages_mask;
2653 page_offset = offset & (PAGE_SIZE - 1);
2654 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2655
2656 memcpy(pages[nr] + page_offset, buf, size);
2657
2658 len -= size;
2659 buf += size;
2660 offset += size;
2661 } while (len);
2662
2663 handle->offset = offset;
2664
2665 /*
2666 * Check we didn't copy past our reservation window, taking the
2667 * possible unsigned int wrap into account.
2668 */
2669 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2670}
2671
2672#define perf_output_put(handle, x) \
2673 perf_output_copy((handle), &(x), sizeof(x))
2674
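/*
 * Reserve @size bytes of space in the counter's mmap buffer. If earlier
 * records were dropped, a PERF_EVENT_LOST record is emitted first.
 * Returns -ENOSPC when no buffer is mapped or there is no room left.
 */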
2675static int perf_output_begin(struct perf_output_handle *handle,
2676 struct perf_counter *counter, unsigned int size,
2677 int nmi, int sample)
2678{
2679 struct perf_counter *output_counter;
2680 struct perf_mmap_data *data;
2681 unsigned int offset, head;
2682 int have_lost;
2683 struct {
2684 struct perf_event_header header;
2685 u64 id;
2686 u64 lost;
2687 } lost_event;
2688
2689 rcu_read_lock();
2690 /*
2691 * For inherited counters we send all the output towards the parent.
2692 */
2693 if (counter->parent)
2694 counter = counter->parent;
2695
2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2700 data = rcu_dereference(counter->data);
2701 if (!data)
2702 goto out;
2703
2704 handle->data = data;
2705 handle->counter = counter;
2706 handle->nmi = nmi;
2707 handle->sample = sample;
2708
2709 if (!data->nr_pages)
2710 goto fail;
2711
2712 have_lost = atomic_read(&data->lost);
2713 if (have_lost)
2714 size += sizeof(lost_event);
2715
2716 perf_output_lock(handle);
2717
2718 do {
2719 offset = head = atomic_long_read(&data->head);
2720 head += size;
2721 if (unlikely(!perf_output_space(data, offset, head)))
2722 goto fail;
2723 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2724
2725 handle->offset = offset;
2726 handle->head = head;
2727
2728 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2729 atomic_set(&data->wakeup, 1);
2730
2731 if (have_lost) {
2732 lost_event.header.type = PERF_EVENT_LOST;
2733 lost_event.header.misc = 0;
2734 lost_event.header.size = sizeof(lost_event);
2735 lost_event.id = counter->id;
2736 lost_event.lost = atomic_xchg(&data->lost, 0);
2737
2738 perf_output_put(handle, lost_event);
2739 }
2740
2741 return 0;
2742
2743fail:
2744 atomic_inc(&data->lost);
2745 perf_output_unlock(handle);
2746out:
2747 rcu_read_unlock();
2748
2749 return -ENOSPC;
2750}
2751
2752static void perf_output_end(struct perf_output_handle *handle)
2753{
2754 struct perf_counter *counter = handle->counter;
2755 struct perf_mmap_data *data = handle->data;
2756
2757 int wakeup_events = counter->attr.wakeup_events;
2758
2759 if (handle->sample && wakeup_events) {
2760 int events = atomic_inc_return(&data->events);
2761 if (events >= wakeup_events) {
2762 atomic_sub(wakeup_events, &data->events);
2763 atomic_set(&data->wakeup, 1);
2764 }
2765 }
2766
2767 perf_output_unlock(handle);
2768 rcu_read_unlock();
2769}
2770
2771static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2772{
2773 /*
2774 * only top level counters have the pid namespace they were created in
2775 */
2776 if (counter->parent)
2777 counter = counter->parent;
2778
2779 return task_tgid_nr_ns(p, counter->ns);
2780}
2781
2782static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2783{
2784 /*
2785 * only top level counters have the pid namespace they were created in
2786 */
2787 if (counter->parent)
2788 counter = counter->parent;
2789
2790 return task_pid_nr_ns(p, counter->ns);
2791}
2792
2793static void perf_output_read_one(struct perf_output_handle *handle,
2794 struct perf_counter *counter)
2795{
2796 u64 read_format = counter->attr.read_format;
2797 u64 values[4];
2798 int n = 0;
2799
2800 values[n++] = atomic64_read(&counter->count);
2801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2802 values[n++] = counter->total_time_enabled +
2803 atomic64_read(&counter->child_total_time_enabled);
2804 }
2805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2806 values[n++] = counter->total_time_running +
2807 atomic64_read(&counter->child_total_time_running);
2808 }
2809 if (read_format & PERF_FORMAT_ID)
2810 values[n++] = primary_counter_id(counter);
2811
2812 perf_output_copy(handle, values, n * sizeof(u64));
2813}
2814
2815/*
2816 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2817 */
2818static void perf_output_read_group(struct perf_output_handle *handle,
2819 struct perf_counter *counter)
2820{
2821 struct perf_counter *leader = counter->group_leader, *sub;
2822 u64 read_format = counter->attr.read_format;
2823 u64 values[5];
2824 int n = 0;
2825
2826 values[n++] = 1 + leader->nr_siblings;
2827
2828 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2829 values[n++] = leader->total_time_enabled;
2830
2831 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2832 values[n++] = leader->total_time_running;
2833
2834 if (leader != counter)
2835 leader->pmu->read(leader);
2836
2837 values[n++] = atomic64_read(&leader->count);
2838 if (read_format & PERF_FORMAT_ID)
2839 values[n++] = primary_counter_id(leader);
2840
2841 perf_output_copy(handle, values, n * sizeof(u64));
2842
2843 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2844 n = 0;
2845
2846 if (sub != counter)
2847 sub->pmu->read(sub);
2848
2849 values[n++] = atomic64_read(&sub->count);
2850 if (read_format & PERF_FORMAT_ID)
2851 values[n++] = primary_counter_id(sub);
2852
2853 perf_output_copy(handle, values, n * sizeof(u64));
2854 }
2855}
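/*
 * For PERF_FORMAT_GROUP the read data emitted above is laid out as follows
 * (optional fields in brackets, depending on read_format):
 *
 *	u64 nr;				1 + leader->nr_siblings
 *	[ u64 time_enabled; ]		PERF_FORMAT_TOTAL_TIME_ENABLED
 *	[ u64 time_running; ]		PERF_FORMAT_TOTAL_TIME_RUNNING
 *	u64 value;			leader count
 *	[ u64 id; ]			PERF_FORMAT_ID
 *	{ u64 value; [ u64 id; ] }	once per sibling
 */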
2856
2857static void perf_output_read(struct perf_output_handle *handle,
2858 struct perf_counter *counter)
2859{
2860 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2861 perf_output_read_group(handle, counter);
2862 else
2863 perf_output_read_one(handle, counter);
2864}
2865
2866void perf_counter_output(struct perf_counter *counter, int nmi,
2867 struct perf_sample_data *data)
2868{
2869 int ret;
2870 u64 sample_type = counter->attr.sample_type;
2871 struct perf_output_handle handle;
2872 struct perf_event_header header;
2873 u64 ip;
2874 struct {
2875 u32 pid, tid;
2876 } tid_entry;
2877 struct perf_callchain_entry *callchain = NULL;
2878 int callchain_size = 0;
2879 u64 time;
2880 struct {
2881 u32 cpu, reserved;
2882 } cpu_entry;
2883
2884 header.type = PERF_EVENT_SAMPLE;
2885 header.size = sizeof(header);
2886
2887 header.misc = 0;
2888 header.misc |= perf_misc_flags(data->regs);
2889
2890 if (sample_type & PERF_SAMPLE_IP) {
2891 ip = perf_instruction_pointer(data->regs);
2892 header.size += sizeof(ip);
2893 }
2894
2895 if (sample_type & PERF_SAMPLE_TID) {
2896 /* namespace issues */
2897 tid_entry.pid = perf_counter_pid(counter, current);
2898 tid_entry.tid = perf_counter_tid(counter, current);
2899
2900 header.size += sizeof(tid_entry);
2901 }
2902
2903 if (sample_type & PERF_SAMPLE_TIME) {
2904 /*
2905 * Maybe do better on x86 and provide cpu_clock_nmi()
2906 */
2907 time = sched_clock();
2908
2909 header.size += sizeof(u64);
2910 }
2911
2912 if (sample_type & PERF_SAMPLE_ADDR)
2913 header.size += sizeof(u64);
2914
2915 if (sample_type & PERF_SAMPLE_ID)
2916 header.size += sizeof(u64);
2917
2918 if (sample_type & PERF_SAMPLE_STREAM_ID)
2919 header.size += sizeof(u64);
2920
2921 if (sample_type & PERF_SAMPLE_CPU) {
2922 header.size += sizeof(cpu_entry);
2923
2924 cpu_entry.cpu = raw_smp_processor_id();
2925 cpu_entry.reserved = 0;
2926 }
2927
2928 if (sample_type & PERF_SAMPLE_PERIOD)
2929 header.size += sizeof(u64);
2930
2931 if (sample_type & PERF_SAMPLE_READ)
2932 header.size += perf_counter_read_size(counter);
2933
2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2935 callchain = perf_callchain(data->regs);
2936
2937 if (callchain) {
2938 callchain_size = (1 + callchain->nr) * sizeof(u64);
2939 header.size += callchain_size;
2940 } else
2941 header.size += sizeof(u64);
2942 }
2943
2944 if (sample_type & PERF_SAMPLE_RAW) {
2945 int size = sizeof(u32);
2946
2947 if (data->raw)
2948 size += data->raw->size;
2949 else
2950 size += sizeof(u32);
2951
2952 WARN_ON_ONCE(size & (sizeof(u64)-1));
2953 header.size += size;
2954 }
2955
2956 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2957 if (ret)
2958 return;
2959
2960 perf_output_put(&handle, header);
2961
2962 if (sample_type & PERF_SAMPLE_IP)
2963 perf_output_put(&handle, ip);
2964
2965 if (sample_type & PERF_SAMPLE_TID)
2966 perf_output_put(&handle, tid_entry);
2967
2968 if (sample_type & PERF_SAMPLE_TIME)
2969 perf_output_put(&handle, time);
2970
2971 if (sample_type & PERF_SAMPLE_ADDR)
2972 perf_output_put(&handle, data->addr);
2973
2974 if (sample_type & PERF_SAMPLE_ID) {
2975 u64 id = primary_counter_id(counter);
2976
2977 perf_output_put(&handle, id);
2978 }
2979
2980 if (sample_type & PERF_SAMPLE_STREAM_ID)
2981 perf_output_put(&handle, counter->id);
2982
2983 if (sample_type & PERF_SAMPLE_CPU)
2984 perf_output_put(&handle, cpu_entry);
2985
2986 if (sample_type & PERF_SAMPLE_PERIOD)
2987 perf_output_put(&handle, data->period);
2988
2989 if (sample_type & PERF_SAMPLE_READ)
2990 perf_output_read(&handle, counter);
2991
2992 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2993 if (callchain)
2994 perf_output_copy(&handle, callchain, callchain_size);
2995 else {
2996 u64 nr = 0;
2997 perf_output_put(&handle, nr);
2998 }
2999 }
3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 if (data->raw) {
3003 perf_output_put(&handle, data->raw->size);
3004 perf_output_copy(&handle, data->raw->data, data->raw->size);
3005 } else {
3006 struct {
3007 u32 size;
3008 u32 data;
3009 } raw = {
3010 .size = sizeof(u32),
3011 .data = 0,
3012 };
3013 perf_output_put(&handle, raw);
3014 }
3015 }
3016
3017 perf_output_end(&handle);
3018}
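/*
 * The PERF_EVENT_SAMPLE record written above thus contains, in order and
 * only when the corresponding PERF_SAMPLE_* bit is set:
 *
 *	ip, { pid, tid }, time, addr, id, stream_id, { cpu, reserved },
 *	period, read data, callchain, raw data
 *
 * all following the perf_event_header whose 'size' field was computed up
 * front to match.
 */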
3019
3020/*
3021 * read event
3022 */
3023
3024struct perf_read_event {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 tid;
3029};
3030
3031static void
3032perf_counter_read_event(struct perf_counter *counter,
3033 struct task_struct *task)
3034{
3035 struct perf_output_handle handle;
3036 struct perf_read_event event = {
3037 .header = {
3038 .type = PERF_EVENT_READ,
3039 .misc = 0,
3040 .size = sizeof(event) + perf_counter_read_size(counter),
3041 },
3042 .pid = perf_counter_pid(counter, task),
3043 .tid = perf_counter_tid(counter, task),
3044 };
3045 int ret;
3046
3047 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3048 if (ret)
3049 return;
3050
3051 perf_output_put(&handle, event);
3052 perf_output_read(&handle, counter);
3053
3054 perf_output_end(&handle);
3055}
3056
3057/*
3058 * task tracking -- fork/exit
3059 *
3060 * enabled by: attr.comm | attr.mmap | attr.task
3061 */
3062
3063struct perf_task_event {
3064 struct task_struct *task;
3065 struct perf_counter_context *task_ctx;
3066
3067 struct {
3068 struct perf_event_header header;
3069
3070 u32 pid;
3071 u32 ppid;
3072 u32 tid;
3073 u32 ptid;
3074 } event;
3075};
3076
3077static void perf_counter_task_output(struct perf_counter *counter,
3078 struct perf_task_event *task_event)
3079{
3080 struct perf_output_handle handle;
3081 int size = task_event->event.header.size;
3082 struct task_struct *task = task_event->task;
3083 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3084
3085 if (ret)
3086 return;
3087
3088 task_event->event.pid = perf_counter_pid(counter, task);
3089 task_event->event.ppid = perf_counter_pid(counter, current);
3090
3091 task_event->event.tid = perf_counter_tid(counter, task);
3092 task_event->event.ptid = perf_counter_tid(counter, current);
3093
3094 perf_output_put(&handle, task_event->event);
3095 perf_output_end(&handle);
3096}
3097
3098static int perf_counter_task_match(struct perf_counter *counter)
3099{
3100 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3101 return 1;
3102
3103 return 0;
3104}
3105
3106static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3107 struct perf_task_event *task_event)
3108{
3109 struct perf_counter *counter;
3110
3111 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3112 return;
3113
3114 rcu_read_lock();
3115 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3116 if (perf_counter_task_match(counter))
3117 perf_counter_task_output(counter, task_event);
3118 }
3119 rcu_read_unlock();
3120}
3121
3122static void perf_counter_task_event(struct perf_task_event *task_event)
3123{
3124 struct perf_cpu_context *cpuctx;
3125 struct perf_counter_context *ctx = task_event->task_ctx;
3126
3127 cpuctx = &get_cpu_var(perf_cpu_context);
3128 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3129 put_cpu_var(perf_cpu_context);
3130
3131 rcu_read_lock();
3132 if (!ctx)
3133 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3134 if (ctx)
3135 perf_counter_task_ctx(ctx, task_event);
3136 rcu_read_unlock();
3137}
3138
3139static void perf_counter_task(struct task_struct *task,
3140 struct perf_counter_context *task_ctx,
3141 int new)
3142{
3143 struct perf_task_event task_event;
3144
3145 if (!atomic_read(&nr_comm_counters) &&
3146 !atomic_read(&nr_mmap_counters) &&
3147 !atomic_read(&nr_task_counters))
3148 return;
3149
3150 task_event = (struct perf_task_event){
3151 .task = task,
3152 .task_ctx = task_ctx,
3153 .event = {
3154 .header = {
3155 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3156 .misc = 0,
3157 .size = sizeof(task_event.event),
3158 },
3159 /* .pid */
3160 /* .ppid */
3161 /* .tid */
3162 /* .ptid */
3163 },
3164 };
3165
3166 perf_counter_task_event(&task_event);
3167}
3168
3169void perf_counter_fork(struct task_struct *task)
3170{
3171 perf_counter_task(task, NULL, 1);
3172}
3173
3174/*
3175 * comm tracking
3176 */
3177
3178struct perf_comm_event {
3179 struct task_struct *task;
3180 char *comm;
3181 int comm_size;
3182
3183 struct {
3184 struct perf_event_header header;
3185
3186 u32 pid;
3187 u32 tid;
3188 } event;
3189};
3190
3191static void perf_counter_comm_output(struct perf_counter *counter,
3192 struct perf_comm_event *comm_event)
3193{
3194 struct perf_output_handle handle;
3195 int size = comm_event->event.header.size;
3196 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3197
3198 if (ret)
3199 return;
3200
3201 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3202 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3203
3204 perf_output_put(&handle, comm_event->event);
3205 perf_output_copy(&handle, comm_event->comm,
3206 comm_event->comm_size);
3207 perf_output_end(&handle);
3208}
3209
3210static int perf_counter_comm_match(struct perf_counter *counter)
3211{
3212 if (counter->attr.comm)
3213 return 1;
3214
3215 return 0;
3216}
3217
3218static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3219 struct perf_comm_event *comm_event)
3220{
3221 struct perf_counter *counter;
3222
3223 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3224 return;
3225
3226 rcu_read_lock();
3227 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3228 if (perf_counter_comm_match(counter))
3229 perf_counter_comm_output(counter, comm_event);
3230 }
3231 rcu_read_unlock();
3232}
3233
3234static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3235{
3236 struct perf_cpu_context *cpuctx;
3237 struct perf_counter_context *ctx;
3238 unsigned int size;
3239 char comm[TASK_COMM_LEN];
3240
3241 memset(comm, 0, sizeof(comm));
3242 strncpy(comm, comm_event->task->comm, sizeof(comm));
3243 size = ALIGN(strlen(comm)+1, sizeof(u64));
3244
3245 comm_event->comm = comm;
3246 comm_event->comm_size = size;
3247
3248 comm_event->event.header.size = sizeof(comm_event->event) + size;
3249
3250 cpuctx = &get_cpu_var(perf_cpu_context);
3251 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3252 put_cpu_var(perf_cpu_context);
3253
3254 rcu_read_lock();
3255 /*
3256 * It doesn't really matter which of the child contexts the
3257 * event ends up in.
3258 */
3259 ctx = rcu_dereference(current->perf_counter_ctxp);
3260 if (ctx)
3261 perf_counter_comm_ctx(ctx, comm_event);
3262 rcu_read_unlock();
3263}
3264
3265void perf_counter_comm(struct task_struct *task)
3266{
3267 struct perf_comm_event comm_event;
3268
3269 if (task->perf_counter_ctxp)
3270 perf_counter_enable_on_exec(task);
3271
3272 if (!atomic_read(&nr_comm_counters))
3273 return;
3274
3275 comm_event = (struct perf_comm_event){
3276 .task = task,
3277 /* .comm */
3278 /* .comm_size */
3279 .event = {
3280 .header = {
3281 .type = PERF_EVENT_COMM,
3282 .misc = 0,
3283 /* .size */
3284 },
3285 /* .pid */
3286 /* .tid */
3287 },
3288 };
3289
3290 perf_counter_comm_event(&comm_event);
3291}
3292
3293/*
3294 * mmap tracking
3295 */
3296
3297struct perf_mmap_event {
3298 struct vm_area_struct *vma;
3299
3300 const char *file_name;
3301 int file_size;
3302
3303 struct {
3304 struct perf_event_header header;
3305
3306 u32 pid;
3307 u32 tid;
3308 u64 start;
3309 u64 len;
3310 u64 pgoff;
3311 } event;
3312};
3313
3314static void perf_counter_mmap_output(struct perf_counter *counter,
3315 struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_output_handle handle;
3318 int size = mmap_event->event.header.size;
3319 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3320
3321 if (ret)
3322 return;
3323
3324 mmap_event->event.pid = perf_counter_pid(counter, current);
3325 mmap_event->event.tid = perf_counter_tid(counter, current);
3326
3327 perf_output_put(&handle, mmap_event->event);
3328 perf_output_copy(&handle, mmap_event->file_name,
3329 mmap_event->file_size);
3330 perf_output_end(&handle);
3331}
3332
3333static int perf_counter_mmap_match(struct perf_counter *counter,
3334 struct perf_mmap_event *mmap_event)
3335{
3336 if (counter->attr.mmap)
3337 return 1;
3338
3339 return 0;
3340}
3341
3342static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3343 struct perf_mmap_event *mmap_event)
3344{
3345 struct perf_counter *counter;
3346
3347 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3348 return;
3349
3350 rcu_read_lock();
3351 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3352 if (perf_counter_mmap_match(counter, mmap_event))
3353 perf_counter_mmap_output(counter, mmap_event);
3354 }
3355 rcu_read_unlock();
3356}
3357
3358static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3359{
3360 struct perf_cpu_context *cpuctx;
3361 struct perf_counter_context *ctx;
3362 struct vm_area_struct *vma = mmap_event->vma;
3363 struct file *file = vma->vm_file;
3364 unsigned int size;
3365 char tmp[16];
3366 char *buf = NULL;
3367 const char *name;
3368
3369 memset(tmp, 0, sizeof(tmp));
3370
3371 if (file) {
3372 /*
3373 * d_path works from the end of the buffer backwards, so we
3374 * need to add enough zero bytes after the string to handle
3375 * the 64-bit alignment we do later.
3376 */
3377 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3378 if (!buf) {
3379 name = strncpy(tmp, "//enomem", sizeof(tmp));
3380 goto got_name;
3381 }
3382 name = d_path(&file->f_path, buf, PATH_MAX);
3383 if (IS_ERR(name)) {
3384 name = strncpy(tmp, "//toolong", sizeof(tmp));
3385 goto got_name;
3386 }
3387 } else {
3388 if (arch_vma_name(mmap_event->vma)) {
3389 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3390 sizeof(tmp));
3391 goto got_name;
3392 }
3393
3394 if (!vma->vm_mm) {
3395 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3396 goto got_name;
3397 }
3398
3399 name = strncpy(tmp, "//anon", sizeof(tmp));
3400 goto got_name;
3401 }
3402
3403got_name:
3404 size = ALIGN(strlen(name)+1, sizeof(u64));
3405
3406 mmap_event->file_name = name;
3407 mmap_event->file_size = size;
3408
3409 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3410
3411 cpuctx = &get_cpu_var(perf_cpu_context);
3412 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3413 put_cpu_var(perf_cpu_context);
3414
3415 rcu_read_lock();
3416 /*
3417 * doesn't really matter which of the child contexts the
3418 * events ends up in.
3419 */
3420 ctx = rcu_dereference(current->perf_counter_ctxp);
3421 if (ctx)
3422 perf_counter_mmap_ctx(ctx, mmap_event);
3423 rcu_read_unlock();
3424
3425 kfree(buf);
3426}
3427
3428void __perf_counter_mmap(struct vm_area_struct *vma)
3429{
3430 struct perf_mmap_event mmap_event;
3431
3432 if (!atomic_read(&nr_mmap_counters))
3433 return;
3434
3435 mmap_event = (struct perf_mmap_event){
3436 .vma = vma,
3437 /* .file_name */
3438 /* .file_size */
3439 .event = {
3440 .header = {
3441 .type = PERF_EVENT_MMAP,
3442 .misc = 0,
3443 /* .size */
3444 },
3445 /* .pid */
3446 /* .tid */
3447 .start = vma->vm_start,
3448 .len = vma->vm_end - vma->vm_start,
3449 .pgoff = vma->vm_pgoff,
3450 },
3451 };
3452
3453 perf_counter_mmap_event(&mmap_event);
3454}
3455
3456/*
3457 * IRQ throttle logging
3458 */
3459
3460static void perf_log_throttle(struct perf_counter *counter, int enable)
3461{
3462 struct perf_output_handle handle;
3463 int ret;
3464
3465 struct {
3466 struct perf_event_header header;
3467 u64 time;
3468 u64 id;
3469 u64 stream_id;
3470 } throttle_event = {
3471 .header = {
3472 .type = PERF_EVENT_THROTTLE,
3473 .misc = 0,
3474 .size = sizeof(throttle_event),
3475 },
3476 .time = sched_clock(),
3477 .id = primary_counter_id(counter),
3478 .stream_id = counter->id,
3479 };
3480
3481 if (enable)
3482 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3483
3484 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3485 if (ret)
3486 return;
3487
3488 perf_output_put(&handle, throttle_event);
3489 perf_output_end(&handle);
3490}
3491
3492/*
3493 * Generic counter overflow handling, sampling.
3494 */
3495
3496int perf_counter_overflow(struct perf_counter *counter, int nmi,
3497 struct perf_sample_data *data)
3498{
3499 int events = atomic_read(&counter->event_limit);
3500 int throttle = counter->pmu->unthrottle != NULL;
3501 struct hw_perf_counter *hwc = &counter->hw;
3502 int ret = 0;
3503
3504 if (!throttle) {
3505 hwc->interrupts++;
3506 } else {
3507 if (hwc->interrupts != MAX_INTERRUPTS) {
3508 hwc->interrupts++;
3509 if (HZ * hwc->interrupts >
3510 (u64)sysctl_perf_counter_sample_rate) {
3511 hwc->interrupts = MAX_INTERRUPTS;
3512 perf_log_throttle(counter, 0);
3513 ret = 1;
3514 }
3515 } else {
3516 /*
3517 * Keep re-disabling the counter even though we disabled
3518 * it on the previous pass - just in case we raced with a
3519 * sched-in and the counter got enabled again:
3520 */
3521 ret = 1;
3522 }
3523 }
3524
3525 if (counter->attr.freq) {
3526 u64 now = sched_clock();
3527 s64 delta = now - hwc->freq_stamp;
3528
3529 hwc->freq_stamp = now;
3530
3531 if (delta > 0 && delta < TICK_NSEC)
3532 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3533 }
3534
3535 /*
3536 * XXX event_limit might not quite work as expected on inherited
3537 * counters
3538 */
3539
3540 counter->pending_kill = POLL_IN;
3541 if (events && atomic_dec_and_test(&counter->event_limit)) {
3542 ret = 1;
3543 counter->pending_kill = POLL_HUP;
3544 if (nmi) {
3545 counter->pending_disable = 1;
3546 perf_pending_queue(&counter->pending,
3547 perf_pending_counter);
3548 } else
3549 perf_counter_disable(counter);
3550 }
3551
3552 perf_counter_output(counter, nmi, data);
3553 return ret;
3554}
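/*
 * A worked example of the throttle test above: with HZ == 1000 and
 * sysctl_perf_counter_sample_rate == 100000, HZ * interrupts exceeds the
 * sample rate once more than 100 overflow interrupts have been taken since
 * hwc->interrupts was last reset (by the tick/unthrottle path, not shown in
 * this hunk). At that point a PERF_EVENT_THROTTLE record is logged and the
 * counter is treated as throttled (hwc->interrupts == MAX_INTERRUPTS) until
 * it is unthrottled again.
 */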
3555
3556/*
3557 * Generic software counter infrastructure
3558 */
3559
3560/*
3561 * We directly increment counter->count and keep a second value in
3562 * counter->hw.period_left to count intervals. This period counter
3563 * is kept in the range [-sample_period, 0] so that we can use the
3564 * sign as trigger.
3565 */
3566
3567static u64 perf_swcounter_set_period(struct perf_counter *counter)
3568{
3569 struct hw_perf_counter *hwc = &counter->hw;
3570 u64 period = hwc->last_period;
3571 u64 nr, offset;
3572 s64 old, val;
3573
3574 hwc->last_period = hwc->sample_period;
3575
3576again:
3577 old = val = atomic64_read(&hwc->period_left);
3578 if (val < 0)
3579 return 0;
3580
3581 nr = div64_u64(period + val, period);
3582 offset = nr * period;
3583 val -= offset;
3584 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3585 goto again;
3586
3587 return nr;
3588}
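/*
 * Example of the arithmetic above, with sample_period == last_period == 100:
 * if events pushed period_left from -100 up to +30 (130 events since the
 * last reset), then nr = (100 + 30) / 100 = 1 period elapsed, and
 * period_left is rewound to 30 - 100 = -70, i.e. 70 more events until the
 * next overflow. A reading of +150 would yield nr = 2 and period_left = -50.
 */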
3589
3590static void perf_swcounter_overflow(struct perf_counter *counter,
3591 int nmi, struct perf_sample_data *data)
3592{
3593 struct hw_perf_counter *hwc = &counter->hw;
3594 u64 overflow;
3595
3596 data->period = counter->hw.last_period;
3597 overflow = perf_swcounter_set_period(counter);
3598
3599 if (hwc->interrupts == MAX_INTERRUPTS)
3600 return;
3601
3602 for (; overflow; overflow--) {
3603 if (perf_counter_overflow(counter, nmi, data)) {
3604 /*
3605 * We inhibit the overflow from happening when
3606 * hwc->interrupts == MAX_INTERRUPTS.
3607 */
3608 break;
3609 }
3610 }
3611}
3612
3613static void perf_swcounter_unthrottle(struct perf_counter *counter)
3614{
3615 /*
3616 * Nothing to do; we already reset hwc->interrupts.
3617 */
3618}
3619
3620static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3621 int nmi, struct perf_sample_data *data)
3622{
3623 struct hw_perf_counter *hwc = &counter->hw;
3624
3625 atomic64_add(nr, &counter->count);
3626
3627 if (!hwc->sample_period)
3628 return;
3629
3630 if (!data->regs)
3631 return;
3632
3633 if (!atomic64_add_negative(nr, &hwc->period_left))
3634 perf_swcounter_overflow(counter, nmi, data);
3635}
3636
3637static int perf_swcounter_is_counting(struct perf_counter *counter)
3638{
3639 /*
3640 * The counter is active, we're good!
3641 */
3642 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3643 return 1;
3644
3645 /*
3646 * The counter is off/error, not counting.
3647 */
3648 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3649 return 0;
3650
3651 /*
3652 * The counter is inactive; if the context is active then
3653 * we're part of a group that didn't make it onto the 'pmu',
3654 * so we're not counting.
3655 */
3656 if (counter->ctx->is_active)
3657 return 0;
3658
3659 /*
3660 * We're inactive and the context is too; this means the
3661 * task is scheduled out and we're counting events that happen
3662 * to us, like migration events.
3663 */
3664 return 1;
3665}
3666
3667static int perf_swcounter_match(struct perf_counter *counter,
3668 enum perf_type_id type,
3669 u32 event, struct pt_regs *regs)
3670{
3671 if (!perf_swcounter_is_counting(counter))
3672 return 0;
3673
3674 if (counter->attr.type != type)
3675 return 0;
3676 if (counter->attr.config != event)
3677 return 0;
3678
3679 if (regs) {
3680 if (counter->attr.exclude_user && user_mode(regs))
3681 return 0;
3682
3683 if (counter->attr.exclude_kernel && !user_mode(regs))
3684 return 0;
3685 }
3686
3687 return 1;
3688}
3689
3690static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3691 enum perf_type_id type,
3692 u32 event, u64 nr, int nmi,
3693 struct perf_sample_data *data)
3694{
3695 struct perf_counter *counter;
3696
3697 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3698 return;
3699
3700 rcu_read_lock();
3701 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3702 if (perf_swcounter_match(counter, type, event, data->regs))
3703 perf_swcounter_add(counter, nr, nmi, data);
3704 }
3705 rcu_read_unlock();
3706}
3707
3708static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3709{
3710 if (in_nmi())
3711 return &cpuctx->recursion[3];
3712
3713 if (in_irq())
3714 return &cpuctx->recursion[2];
3715
3716 if (in_softirq())
3717 return &cpuctx->recursion[1];
3718
3719 return &cpuctx->recursion[0];
3720}
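/*
 * The four recursion slots above map to the four execution contexts a
 * software event can fire in: [0] process, [1] softirq, [2] hardirq,
 * [3] NMI. A software event raised while we are already processing one in
 * the same context is simply dropped (see the check below), while an event
 * from a more deeply nested context - say an NMI interrupting softirq
 * processing - still gets through because it uses a different slot.
 */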
3721
3722static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3723 u64 nr, int nmi,
3724 struct perf_sample_data *data)
3725{
3726 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3727 int *recursion = perf_swcounter_recursion_context(cpuctx);
3728 struct perf_counter_context *ctx;
3729
3730 if (*recursion)
3731 goto out;
3732
3733 (*recursion)++;
3734 barrier();
3735
3736 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3737 nr, nmi, data);
3738 rcu_read_lock();
3739 /*
3740 * doesn't really matter which of the child contexts the
3741 * events ends up in.
3742 */
3743 ctx = rcu_dereference(current->perf_counter_ctxp);
3744 if (ctx)
3745 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3746 rcu_read_unlock();
3747
3748 barrier();
3749 (*recursion)--;
3750
3751out:
3752 put_cpu_var(perf_cpu_context);
3753}
3754
3755void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3756 struct pt_regs *regs, u64 addr)
3757{
3758 struct perf_sample_data data = {
3759 .regs = regs,
3760 .addr = addr,
3761 };
3762
3763 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3764}
3765
3766static void perf_swcounter_read(struct perf_counter *counter)
3767{
3768}
3769
3770static int perf_swcounter_enable(struct perf_counter *counter)
3771{
3772 struct hw_perf_counter *hwc = &counter->hw;
3773
3774 if (hwc->sample_period) {
3775 hwc->last_period = hwc->sample_period;
3776 perf_swcounter_set_period(counter);
3777 }
3778 return 0;
3779}
3780
3781static void perf_swcounter_disable(struct perf_counter *counter)
3782{
3783}
3784
3785static const struct pmu perf_ops_generic = {
3786 .enable = perf_swcounter_enable,
3787 .disable = perf_swcounter_disable,
3788 .read = perf_swcounter_read,
3789 .unthrottle = perf_swcounter_unthrottle,
3790};
3791
3792/*
3793 * hrtimer based swcounter callback
3794 */
3795
3796static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3797{
3798 enum hrtimer_restart ret = HRTIMER_RESTART;
3799 struct perf_sample_data data;
3800 struct perf_counter *counter;
3801 u64 period;
3802
3803 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3804 counter->pmu->read(counter);
3805
3806 data.addr = 0;
3807 data.regs = get_irq_regs();
3808 /*
3809 * In case we exclude kernel IPs or are somehow not in interrupt
3810 * context, provide the next best thing: the user IP.
3811 */
3812 if ((counter->attr.exclude_kernel || !data.regs) &&
3813 !counter->attr.exclude_user)
3814 data.regs = task_pt_regs(current);
3815
3816 if (data.regs) {
3817 if (perf_counter_overflow(counter, 0, &data))
3818 ret = HRTIMER_NORESTART;
3819 }
3820
3821 period = max_t(u64, 10000, counter->hw.sample_period);
3822 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3823
3824 return ret;
3825}
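/*
 * Note the clamp above: hrtimer-driven software counters never fire more
 * often than once per 10000ns (10us), no matter how small a sample_period
 * was requested. The same floor is applied when the timers are started in
 * the cpu-clock and task-clock enable paths below.
 */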
3826
3827/*
3828 * Software counter: cpu wall time clock
3829 */
3830
3831static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3832{
3833 int cpu = raw_smp_processor_id();
3834 s64 prev;
3835 u64 now;
3836
3837 now = cpu_clock(cpu);
3838 prev = atomic64_read(&counter->hw.prev_count);
3839 atomic64_set(&counter->hw.prev_count, now);
3840 atomic64_add(now - prev, &counter->count);
3841}
3842
3843static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3844{
3845 struct hw_perf_counter *hwc = &counter->hw;
3846 int cpu = raw_smp_processor_id();
3847
3848 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3849 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3850 hwc->hrtimer.function = perf_swcounter_hrtimer;
3851 if (hwc->sample_period) {
3852 u64 period = max_t(u64, 10000, hwc->sample_period);
3853 __hrtimer_start_range_ns(&hwc->hrtimer,
3854 ns_to_ktime(period), 0,
3855 HRTIMER_MODE_REL, 0);
3856 }
3857
3858 return 0;
3859}
3860
3861static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3862{
3863 if (counter->hw.sample_period)
3864 hrtimer_cancel(&counter->hw.hrtimer);
3865 cpu_clock_perf_counter_update(counter);
3866}
3867
3868static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3869{
3870 cpu_clock_perf_counter_update(counter);
3871}
3872
3873static const struct pmu perf_ops_cpu_clock = {
3874 .enable = cpu_clock_perf_counter_enable,
3875 .disable = cpu_clock_perf_counter_disable,
3876 .read = cpu_clock_perf_counter_read,
3877};
3878
3879/*
3880 * Software counter: task time clock
3881 */
3882
3883static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3884{
3885 u64 prev;
3886 s64 delta;
3887
3888 prev = atomic64_xchg(&counter->hw.prev_count, now);
3889 delta = now - prev;
3890 atomic64_add(delta, &counter->count);
3891}
3892
3893static int task_clock_perf_counter_enable(struct perf_counter *counter)
3894{
3895 struct hw_perf_counter *hwc = &counter->hw;
3896 u64 now;
3897
3898 now = counter->ctx->time;
3899
3900 atomic64_set(&hwc->prev_count, now);
3901 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3902 hwc->hrtimer.function = perf_swcounter_hrtimer;
3903 if (hwc->sample_period) {
3904 u64 period = max_t(u64, 10000, hwc->sample_period);
3905 __hrtimer_start_range_ns(&hwc->hrtimer,
3906 ns_to_ktime(period), 0,
3907 HRTIMER_MODE_REL, 0);
3908 }
3909
3910 return 0;
3911}
3912
3913static void task_clock_perf_counter_disable(struct perf_counter *counter)
3914{
3915 if (counter->hw.sample_period)
3916 hrtimer_cancel(&counter->hw.hrtimer);
3917 task_clock_perf_counter_update(counter, counter->ctx->time);
3918
3919}
3920
3921static void task_clock_perf_counter_read(struct perf_counter *counter)
3922{
3923 u64 time;
3924
3925 if (!in_nmi()) {
3926 update_context_time(counter->ctx);
3927 time = counter->ctx->time;
3928 } else {
3929 u64 now = perf_clock();
3930 u64 delta = now - counter->ctx->timestamp;
3931 time = counter->ctx->time + delta;
3932 }
3933
3934 task_clock_perf_counter_update(counter, time);
3935}
3936
3937static const struct pmu perf_ops_task_clock = {
3938 .enable = task_clock_perf_counter_enable,
3939 .disable = task_clock_perf_counter_disable,
3940 .read = task_clock_perf_counter_read,
3941};
3942
3943#ifdef CONFIG_EVENT_PROFILE
3944void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3945 int entry_size)
3946{
3947 struct perf_raw_record raw = {
3948 .size = entry_size,
3949 .data = record,
3950 };
3951
3952 struct perf_sample_data data = {
3953 .regs = get_irq_regs(),
3954 .addr = addr,
3955 .raw = &raw,
3956 };
3957
3958 if (!data.regs)
3959 data.regs = task_pt_regs(current);
3960
3961 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3962}
3963EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3964
3965extern int ftrace_profile_enable(int);
3966extern void ftrace_profile_disable(int);
3967
3968static void tp_perf_counter_destroy(struct perf_counter *counter)
3969{
3970 ftrace_profile_disable(counter->attr.config);
3971}
3972
3973static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3974{
3975 /*
3976 * Raw tracepoint data is a severe data leak; only allow root to
3977 * access it.
3978 */
3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3981 !capable(CAP_SYS_ADMIN))
3982 return ERR_PTR(-EPERM);
3983
3984 if (ftrace_profile_enable(counter->attr.config))
3985 return NULL;
3986
3987 counter->destroy = tp_perf_counter_destroy;
3988
3989 return &perf_ops_generic;
3990}
3991#else
3992static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3993{
3994 return NULL;
3995}
3996#endif
3997
3998atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3999
4000static void sw_perf_counter_destroy(struct perf_counter *counter)
4001{
4002 u64 event = counter->attr.config;
4003
4004 WARN_ON(counter->parent);
4005
4006 atomic_dec(&perf_swcounter_enabled[event]);
4007}
4008
4009static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
4010{
4011 const struct pmu *pmu = NULL;
4012 u64 event = counter->attr.config;
4013
4014 /*
4015 * Software counters (currently) can't in general distinguish
4016 * between user, kernel and hypervisor events.
4017 * However, context switches and cpu migrations are considered
4018 * to be kernel events, and page faults are never hypervisor
4019 * events.
4020 */
4021 switch (event) {
4022 case PERF_COUNT_SW_CPU_CLOCK:
4023 pmu = &perf_ops_cpu_clock;
4024
4025 break;
4026 case PERF_COUNT_SW_TASK_CLOCK:
4027 /*
4028 * If the user instantiates this as a per-cpu counter,
4029 * use the cpu_clock counter instead.
4030 */
4031 if (counter->ctx->task)
4032 pmu = &perf_ops_task_clock;
4033 else
4034 pmu = &perf_ops_cpu_clock;
4035
4036 break;
4037 case PERF_COUNT_SW_PAGE_FAULTS:
4038 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4039 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4040 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4041 case PERF_COUNT_SW_CPU_MIGRATIONS:
4042 if (!counter->parent) {
4043 atomic_inc(&perf_swcounter_enabled[event]);
4044 counter->destroy = sw_perf_counter_destroy;
4045 }
4046 pmu = &perf_ops_generic;
4047 break;
4048 }
4049
4050 return pmu;
4051}
4052
4053/*
4054 * Allocate and initialize a counter structure
4055 */
4056static struct perf_counter *
4057perf_counter_alloc(struct perf_counter_attr *attr,
4058 int cpu,
4059 struct perf_counter_context *ctx,
4060 struct perf_counter *group_leader,
4061 struct perf_counter *parent_counter,
4062 gfp_t gfpflags)
4063{
4064 const struct pmu *pmu;
4065 struct perf_counter *counter;
4066 struct hw_perf_counter *hwc;
4067 long err;
4068
4069 counter = kzalloc(sizeof(*counter), gfpflags);
4070 if (!counter)
4071 return ERR_PTR(-ENOMEM);
4072
4073 /*
4074 * Single counters are their own group leaders, with an
4075 * empty sibling list:
4076 */
4077 if (!group_leader)
4078 group_leader = counter;
4079
4080 mutex_init(&counter->child_mutex);
4081 INIT_LIST_HEAD(&counter->child_list);
4082
4083 INIT_LIST_HEAD(&counter->list_entry);
4084 INIT_LIST_HEAD(&counter->event_entry);
4085 INIT_LIST_HEAD(&counter->sibling_list);
4086 init_waitqueue_head(&counter->waitq);
4087
4088 mutex_init(&counter->mmap_mutex);
4089
4090 counter->cpu = cpu;
4091 counter->attr = *attr;
4092 counter->group_leader = group_leader;
4093 counter->pmu = NULL;
4094 counter->ctx = ctx;
4095 counter->oncpu = -1;
4096
4097 counter->parent = parent_counter;
4098
4099 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4100 counter->id = atomic64_inc_return(&perf_counter_id);
4101
4102 counter->state = PERF_COUNTER_STATE_INACTIVE;
4103
4104 if (attr->disabled)
4105 counter->state = PERF_COUNTER_STATE_OFF;
4106
4107 pmu = NULL;
4108
4109 hwc = &counter->hw;
4110 hwc->sample_period = attr->sample_period;
4111 if (attr->freq && attr->sample_freq)
4112 hwc->sample_period = 1;
4113 hwc->last_period = hwc->sample_period;
4114
4115 atomic64_set(&hwc->period_left, hwc->sample_period);
4116
4117 /*
4118 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4119 */
4120 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4121 goto done;
4122
4123 switch (attr->type) {
4124 case PERF_TYPE_RAW:
4125 case PERF_TYPE_HARDWARE:
4126 case PERF_TYPE_HW_CACHE:
4127 pmu = hw_perf_counter_init(counter);
4128 break;
4129
4130 case PERF_TYPE_SOFTWARE:
4131 pmu = sw_perf_counter_init(counter);
4132 break;
4133
4134 case PERF_TYPE_TRACEPOINT:
4135 pmu = tp_perf_counter_init(counter);
4136 break;
4137
4138 default:
4139 break;
4140 }
4141done:
4142 err = 0;
4143 if (!pmu)
4144 err = -EINVAL;
4145 else if (IS_ERR(pmu))
4146 err = PTR_ERR(pmu);
4147
4148 if (err) {
4149 if (counter->ns)
4150 put_pid_ns(counter->ns);
4151 kfree(counter);
4152 return ERR_PTR(err);
4153 }
4154
4155 counter->pmu = pmu;
4156
4157 if (!counter->parent) {
4158 atomic_inc(&nr_counters);
4159 if (counter->attr.mmap)
4160 atomic_inc(&nr_mmap_counters);
4161 if (counter->attr.comm)
4162 atomic_inc(&nr_comm_counters);
4163 if (counter->attr.task)
4164 atomic_inc(&nr_task_counters);
4165 }
4166
4167 return counter;
4168}
4169
4170static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4171 struct perf_counter_attr *attr)
4172{
4173 int ret;
4174 u32 size;
4175
4176 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4177 return -EFAULT;
4178
4179 /*
4180 * Zero the full structure, so that a short copy leaves the unused tail zeroed.
4181 */
4182 memset(attr, 0, sizeof(*attr));
4183
4184 ret = get_user(size, &uattr->size);
4185 if (ret)
4186 return ret;
4187
4188 if (size > PAGE_SIZE) /* silly large */
4189 goto err_size;
4190
4191 if (!size) /* abi compat */
4192 size = PERF_ATTR_SIZE_VER0;
4193
4194 if (size < PERF_ATTR_SIZE_VER0)
4195 goto err_size;
4196
4197 /*
4198 * If we're handed a bigger struct than we know of,
4199 * ensure all the unknown bits are 0.
4200 */
4201 if (size > sizeof(*attr)) {
4202 unsigned long val;
4203 unsigned long __user *addr;
4204 unsigned long __user *end;
4205
4206 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4207 sizeof(unsigned long));
4208 end = PTR_ALIGN((void __user *)uattr + size,
4209 sizeof(unsigned long));
4210
4211 for (; addr < end; addr += sizeof(unsigned long)) {
4212 ret = get_user(val, addr);
4213 if (ret)
4214 return ret;
4215 if (val)
4216 goto err_size;
4217 }
4218 }
4219
4220 ret = copy_from_user(attr, uattr, size);
4221 if (ret)
4222 return -EFAULT;
4223
4224 /*
4225 * If the type exists, the corresponding creation will verify
4226 * the attr->config.
4227 */
4228 if (attr->type >= PERF_TYPE_MAX)
4229 return -EINVAL;
4230
4231 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4232 return -EINVAL;
4233
4234 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4235 return -EINVAL;
4236
4237 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4238 return -EINVAL;
4239
4240out:
4241 return ret;
4242
4243err_size:
4244 put_user(sizeof(*attr), &uattr->size);
4245 ret = -E2BIG;
4246 goto out;
4247}
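/*
 * The net effect of perf_copy_attr(), by example:
 *
 *  - uattr->size == 0 is treated as PERF_ATTR_SIZE_VER0 for ABI compat;
 *  - an older, smaller uattr (PERF_ATTR_SIZE_VER0 <= size < sizeof(*attr))
 *    is copied in full and the remaining fields of *attr stay zero from
 *    the memset above;
 *  - a newer, larger uattr is accepted only if every byte beyond what this
 *    kernel knows about is zero; otherwise we write the size we do
 *    understand back to uattr->size and fail with -E2BIG.
 */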
4248
4249int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4250{
4251 struct perf_counter *output_counter = NULL;
4252 struct file *output_file = NULL;
4253 struct perf_counter *old_output;
4254 int fput_needed = 0;
4255 int ret = -EINVAL;
4256
4257 if (!output_fd)
4258 goto set;
4259
4260 output_file = fget_light(output_fd, &fput_needed);
4261 if (!output_file)
4262 return -EBADF;
4263
4264 if (output_file->f_op != &perf_fops)
4265 goto out;
4266
4267 output_counter = output_file->private_data;
4268
4269 /* Don't chain output fds */
4270 if (output_counter->output)
4271 goto out;
4272
4273 /* Don't set an output fd when we already have an output channel */
4274 if (counter->data)
4275 goto out;
4276
4277 atomic_long_inc(&output_file->f_count);
4278
4279set:
4280 mutex_lock(&counter->mmap_mutex);
4281 old_output = counter->output;
4282 rcu_assign_pointer(counter->output, output_counter);
4283 mutex_unlock(&counter->mmap_mutex);
4284
4285 if (old_output) {
4286 /*
4287 * we need to make sure no existing perf_output_*()
4288 * is still referencing this counter.
4289 */
4290 synchronize_rcu();
4291 fput(old_output->filp);
4292 }
4293
4294 ret = 0;
4295out:
4296 fput_light(output_file, fput_needed);
4297 return ret;
4298}
4299
4300/**
4301 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4302 *
4303 * @attr_uptr: event type attributes for monitoring/sampling
4304 * @pid: target pid
4305 * @cpu: target cpu
4306 * @group_fd: group leader counter fd
4307 */
4308SYSCALL_DEFINE5(perf_counter_open,
4309 struct perf_counter_attr __user *, attr_uptr,
4310 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4311{
4312 struct perf_counter *counter, *group_leader;
4313 struct perf_counter_attr attr;
4314 struct perf_counter_context *ctx;
4315 struct file *counter_file = NULL;
4316 struct file *group_file = NULL;
4317 int fput_needed = 0;
4318 int fput_needed2 = 0;
4319 int err;
4320
4321 /* for future expandability... */
4322 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4323 return -EINVAL;
4324
4325 err = perf_copy_attr(attr_uptr, &attr);
4326 if (err)
4327 return err;
4328
4329 if (!attr.exclude_kernel) {
4330 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4331 return -EACCES;
4332 }
4333
4334 if (attr.freq) {
4335 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4336 return -EINVAL;
4337 }
4338
4339 /*
4340 * Get the target context (task or percpu):
4341 */
4342 ctx = find_get_context(pid, cpu);
4343 if (IS_ERR(ctx))
4344 return PTR_ERR(ctx);
4345
4346 /*
4347 * Look up the group leader (we will attach this counter to it):
4348 */
4349 group_leader = NULL;
4350 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4351 err = -EINVAL;
4352 group_file = fget_light(group_fd, &fput_needed);
4353 if (!group_file)
4354 goto err_put_context;
4355 if (group_file->f_op != &perf_fops)
4356 goto err_put_context;
4357
4358 group_leader = group_file->private_data;
4359 /*
4360 * Do not allow a recursive hierarchy (this new sibling
4361 * becoming part of another group-sibling):
4362 */
4363 if (group_leader->group_leader != group_leader)
4364 goto err_put_context;
4365 /*
4366 * Do not allow to attach to a group in a different
4367 * task or CPU context:
4368 */
4369 if (group_leader->ctx != ctx)
4370 goto err_put_context;
4371 /*
4372 * Only a group leader can be exclusive or pinned
4373 */
4374 if (attr.exclusive || attr.pinned)
4375 goto err_put_context;
4376 }
4377
4378 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4379 NULL, GFP_KERNEL);
4380 err = PTR_ERR(counter);
4381 if (IS_ERR(counter))
4382 goto err_put_context;
4383
4384 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4385 if (err < 0)
4386 goto err_free_put_context;
4387
4388 counter_file = fget_light(err, &fput_needed2);
4389 if (!counter_file)
4390 goto err_free_put_context;
4391
4392 if (flags & PERF_FLAG_FD_OUTPUT) {
4393 err = perf_counter_set_output(counter, group_fd);
4394 if (err)
4395 goto err_fput_free_put_context;
4396 }
4397
4398 counter->filp = counter_file;
4399 WARN_ON_ONCE(ctx->parent_ctx);
4400 mutex_lock(&ctx->mutex);
4401 perf_install_in_context(ctx, counter, cpu);
4402 ++ctx->generation;
4403 mutex_unlock(&ctx->mutex);
4404
4405 counter->owner = current;
4406 get_task_struct(current);
4407 mutex_lock(&current->perf_counter_mutex);
4408 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4409 mutex_unlock(&current->perf_counter_mutex);
4410
4411err_fput_free_put_context:
4412 fput_light(counter_file, fput_needed2);
4413
4414err_free_put_context:
4415 if (err < 0)
4416 kfree(counter);
4417
4418err_put_context:
4419 if (err < 0)
4420 put_ctx(ctx);
4421
4422 fput_light(group_file, fput_needed);
4423
4424 return err;
4425}
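/*
 * Illustrative user-space usage (a sketch, not part of this file; the
 * syscall number and any wrapper are architecture/libc specific):
 *
 *	struct perf_counter_attr attr = {
 *		.type	= PERF_TYPE_SOFTWARE,
 *		.config	= PERF_COUNT_SW_TASK_CLOCK,
 *		.size	= sizeof(attr),
 *	};
 *	int fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
 *
 * i.e. pid 0 (current task), cpu -1 (any cpu), group_fd -1 (no group
 * leader), flags 0. The returned fd can be read() for the counter value
 * and mmap()ed to obtain the sample buffer driven by the output code above.
 */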
4426
4427/*
4428 * inherit a counter from parent task to child task:
4429 */
4430static struct perf_counter *
4431inherit_counter(struct perf_counter *parent_counter,
4432 struct task_struct *parent,
4433 struct perf_counter_context *parent_ctx,
4434 struct task_struct *child,
4435 struct perf_counter *group_leader,
4436 struct perf_counter_context *child_ctx)
4437{
4438 struct perf_counter *child_counter;
4439
4440 /*
4441 * Instead of creating recursive hierarchies of counters,
4442 * we link inherited counters back to the original parent,
4443 * which is guaranteed to have a filp that we use as the
4444 * reference count:
4445 */
4446 if (parent_counter->parent)
4447 parent_counter = parent_counter->parent;
4448
4449 child_counter = perf_counter_alloc(&parent_counter->attr,
4450 parent_counter->cpu, child_ctx,
4451 group_leader, parent_counter,
4452 GFP_KERNEL);
4453 if (IS_ERR(child_counter))
4454 return child_counter;
4455 get_ctx(child_ctx);
4456
4457 /*
4458 * Make the child state follow the state of the parent counter,
4459 * not its attr.disabled bit. We hold the parent's mutex,
4460 * so we won't race with perf_counter_{en, dis}able_family.
4461 */
4462 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4463 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4464 else
4465 child_counter->state = PERF_COUNTER_STATE_OFF;
4466
4467 if (parent_counter->attr.freq)
4468 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4469
4470 /*
4471 * Link it up in the child's context:
4472 */
4473 add_counter_to_ctx(child_counter, child_ctx);
4474
4475 /*
4476 * Get a reference to the parent filp - we will fput it
4477 * when the child counter exits. This is safe to do because
4478 * we are in the parent and we know that the filp still
4479 * exists and has a nonzero count:
4480 */
4481 atomic_long_inc(&parent_counter->filp->f_count);
4482
4483 /*
4484 * Link this into the parent counter's child list
4485 */
4486 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4487 mutex_lock(&parent_counter->child_mutex);
4488 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4489 mutex_unlock(&parent_counter->child_mutex);
4490
4491 return child_counter;
4492}
4493
4494static int inherit_group(struct perf_counter *parent_counter,
4495 struct task_struct *parent,
4496 struct perf_counter_context *parent_ctx,
4497 struct task_struct *child,
4498 struct perf_counter_context *child_ctx)
4499{
4500 struct perf_counter *leader;
4501 struct perf_counter *sub;
4502 struct perf_counter *child_ctr;
4503
4504 leader = inherit_counter(parent_counter, parent, parent_ctx,
4505 child, NULL, child_ctx);
4506 if (IS_ERR(leader))
4507 return PTR_ERR(leader);
4508 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4509 child_ctr = inherit_counter(sub, parent, parent_ctx,
4510 child, leader, child_ctx);
4511 if (IS_ERR(child_ctr))
4512 return PTR_ERR(child_ctr);
4513 }
4514 return 0;
4515}
4516
4517static void sync_child_counter(struct perf_counter *child_counter,
4518 struct task_struct *child)
4519{
4520 struct perf_counter *parent_counter = child_counter->parent;
4521 u64 child_val;
4522
4523 if (child_counter->attr.inherit_stat)
4524 perf_counter_read_event(child_counter, child);
4525
4526 child_val = atomic64_read(&child_counter->count);
4527
4528 /*
4529 * Add back the child's count to the parent's count:
4530 */
4531 atomic64_add(child_val, &parent_counter->count);
4532 atomic64_add(child_counter->total_time_enabled,
4533 &parent_counter->child_total_time_enabled);
4534 atomic64_add(child_counter->total_time_running,
4535 &parent_counter->child_total_time_running);
4536
4537 /*
4538 * Remove this counter from the parent's list
4539 */
4540 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4541 mutex_lock(&parent_counter->child_mutex);
4542 list_del_init(&child_counter->child_list);
4543 mutex_unlock(&parent_counter->child_mutex);
4544
4545 /*
4546 * Release the parent counter, if this was the last
4547 * reference to it.
4548 */
4549 fput(parent_counter->filp);
4550}
4551
4552static void
4553__perf_counter_exit_task(struct perf_counter *child_counter,
4554 struct perf_counter_context *child_ctx,
4555 struct task_struct *child)
4556{
4557 struct perf_counter *parent_counter;
4558
4559 update_counter_times(child_counter);
4560 perf_counter_remove_from_context(child_counter);
4561
4562 parent_counter = child_counter->parent;
4563 /*
4564 * It can happen that the parent exits first, and has counters
4565 * that are still around due to the child reference. These
4566 * counters need to be zapped - but otherwise linger.
4567 */
4568 if (parent_counter) {
4569 sync_child_counter(child_counter, child);
4570 free_counter(child_counter);
4571 }
4572}
4573
4574/*
4575 * When a child task exits, feed back counter values to parent counters.
4576 */
4577void perf_counter_exit_task(struct task_struct *child)
4578{
4579 struct perf_counter *child_counter, *tmp;
4580 struct perf_counter_context *child_ctx;
4581 unsigned long flags;
4582
4583 if (likely(!child->perf_counter_ctxp)) {
4584 perf_counter_task(child, NULL, 0);
4585 return;
4586 }
4587
4588 local_irq_save(flags);
4589 /*
4590 * We can't reschedule here because interrupts are disabled,
4591 * and either child is current or it is a task that can't be
4592 * scheduled, so we are now safe from rescheduling changing
4593 * our context.
4594 */
4595 child_ctx = child->perf_counter_ctxp;
4596 __perf_counter_task_sched_out(child_ctx);
4597
4598 /*
4599 * Take the context lock here so that if find_get_context is
4600 * reading child->perf_counter_ctxp, we wait until it has
4601 * incremented the context's refcount before we do put_ctx below.
4602 */
4603 spin_lock(&child_ctx->lock);
4604 child->perf_counter_ctxp = NULL;
4605 /*
4606 * If this context is a clone, unclone it so it can't get
4607 * swapped to another process while we're removing all
4608 * the counters from it.
4609 */
4610 unclone_ctx(child_ctx);
4611 spin_unlock_irqrestore(&child_ctx->lock, flags);
4612
4613 /*
4614 * Report the task dead after unscheduling the counters so that we
4615 * won't get any samples after PERF_EVENT_EXIT. We can however still
4616 * get a few PERF_EVENT_READ events.
4617 */
4618 perf_counter_task(child, child_ctx, 0);
4619
4620 /*
4621 * We can recurse on the same lock type through:
4622 *
4623 * __perf_counter_exit_task()
4624 * sync_child_counter()
4625 * fput(parent_counter->filp)
4626 * perf_release()
4627 * mutex_lock(&ctx->mutex)
4628 *
4629 * But since it's the parent context it won't be the same instance.
4630 */
4631 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4632
4633again:
4634 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4635 list_entry)
4636 __perf_counter_exit_task(child_counter, child_ctx, child);
4637
4638 /*
4639 * If the last counter was a group counter, it will have appended all
4640 * its siblings to the list, but we obtained 'tmp' before that, so it
4641 * will still point to the list head, terminating the iteration.
4642 */
4643 if (!list_empty(&child_ctx->counter_list))
4644 goto again;
4645
4646 mutex_unlock(&child_ctx->mutex);
4647
4648 put_ctx(child_ctx);
4649}
4650
4651/*
4652 * Free an unexposed, unused context, as created by inheritance by
4653 * init_task below; used by fork() in case of failure.
4654 */
4655void perf_counter_free_task(struct task_struct *task)
4656{
4657 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4658 struct perf_counter *counter, *tmp;
4659
4660 if (!ctx)
4661 return;
4662
4663 mutex_lock(&ctx->mutex);
4664again:
4665 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4666 struct perf_counter *parent = counter->parent;
4667
4668 if (WARN_ON_ONCE(!parent))
4669 continue;
4670
4671 mutex_lock(&parent->child_mutex);
4672 list_del_init(&counter->child_list);
4673 mutex_unlock(&parent->child_mutex);
4674
4675 fput(parent->filp);
4676
4677 list_del_counter(counter, ctx);
4678 free_counter(counter);
4679 }
4680
4681 if (!list_empty(&ctx->counter_list))
4682 goto again;
4683
4684 mutex_unlock(&ctx->mutex);
4685
4686 put_ctx(ctx);
4687}
4688
4689/*
4690 * Initialize the perf_counter context in task_struct
4691 */
4692int perf_counter_init_task(struct task_struct *child)
4693{
4694 struct perf_counter_context *child_ctx, *parent_ctx;
4695 struct perf_counter_context *cloned_ctx;
4696 struct perf_counter *counter;
4697 struct task_struct *parent = current;
4698 int inherited_all = 1;
4699 int ret = 0;
4700
4701 child->perf_counter_ctxp = NULL;
4702
4703 mutex_init(&child->perf_counter_mutex);
4704 INIT_LIST_HEAD(&child->perf_counter_list);
4705
4706 if (likely(!parent->perf_counter_ctxp))
4707 return 0;
4708
4709 /*
4710 * This is executed from the parent task context, so inherit
4711 * counters that have been marked for cloning.
4712 * First allocate and initialize a context for the child.
4713 */
4714
4715 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4716 if (!child_ctx)
4717 return -ENOMEM;
4718
4719 __perf_counter_init_context(child_ctx, child);
4720 child->perf_counter_ctxp = child_ctx;
4721 get_task_struct(child);
4722
4723 /*
4724 * If the parent's context is a clone, pin it so it won't get
4725 * swapped under us.
4726 */
4727 parent_ctx = perf_pin_task_context(parent);
4728
4729 /*
4730 * No need to check if parent_ctx != NULL here; since we saw
4731 * it non-NULL earlier, the only reason for it to become NULL
4732 * is if we exit, and since we're currently in the middle of
4733 * a fork we can't be exiting at the same time.
4734 */
4735
4736 /*
4737 * Lock the parent list. No need to lock the child - not PID
4738 * hashed yet and not running, so nobody can access it.
4739 */
4740 mutex_lock(&parent_ctx->mutex);
4741
4742 /*
4743 * We don't have to disable NMIs - we are only looking at
4744 * the list, not manipulating it:
4745 */
4746 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4747 if (counter != counter->group_leader)
4748 continue;
4749
4750 if (!counter->attr.inherit) {
4751 inherited_all = 0;
4752 continue;
4753 }
4754
4755 ret = inherit_group(counter, parent, parent_ctx,
4756 child, child_ctx);
4757 if (ret) {
4758 inherited_all = 0;
4759 break;
4760 }
4761 }
4762
4763 if (inherited_all) {
4764 /*
4765 * Mark the child context as a clone of the parent
4766 * context, or of whatever the parent is a clone of.
4767 * Note that if the parent is a clone, it could get
4768 * uncloned at any point, but that doesn't matter
4769 * because the list of counters and the generation
4770 * count can't have changed since we took the mutex.
4771 */
4772 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4773 if (cloned_ctx) {
4774 child_ctx->parent_ctx = cloned_ctx;
4775 child_ctx->parent_gen = parent_ctx->parent_gen;
4776 } else {
4777 child_ctx->parent_ctx = parent_ctx;
4778 child_ctx->parent_gen = parent_ctx->generation;
4779 }
4780 get_ctx(child_ctx->parent_ctx);
4781 }
4782
4783 mutex_unlock(&parent_ctx->mutex);
4784
4785 perf_unpin_context(parent_ctx);
4786
4787 return ret;
4788}
4789
4790static void __cpuinit perf_counter_init_cpu(int cpu)
4791{
4792 struct perf_cpu_context *cpuctx;
4793
4794 cpuctx = &per_cpu(perf_cpu_context, cpu);
4795 __perf_counter_init_context(&cpuctx->ctx, NULL);
4796
4797 spin_lock(&perf_resource_lock);
4798 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4799 spin_unlock(&perf_resource_lock);
4800
4801 hw_perf_counter_setup(cpu);
4802}
4803
4804#ifdef CONFIG_HOTPLUG_CPU
4805static void __perf_counter_exit_cpu(void *info)
4806{
4807 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4808 struct perf_counter_context *ctx = &cpuctx->ctx;
4809 struct perf_counter *counter, *tmp;
4810
4811 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4812 __perf_counter_remove_from_context(counter);
4813}
4814static void perf_counter_exit_cpu(int cpu)
4815{
4816 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4817 struct perf_counter_context *ctx = &cpuctx->ctx;
4818
4819 mutex_lock(&ctx->mutex);
4820 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4821 mutex_unlock(&ctx->mutex);
4822}
4823#else
4824static inline void perf_counter_exit_cpu(int cpu) { }
4825#endif
4826
4827static int __cpuinit
4828perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4829{
4830 unsigned int cpu = (long)hcpu;
4831
4832 switch (action) {
4833
4834 case CPU_UP_PREPARE:
4835 case CPU_UP_PREPARE_FROZEN:
4836 perf_counter_init_cpu(cpu);
4837 break;
4838
4839 case CPU_ONLINE:
4840 case CPU_ONLINE_FROZEN:
4841 hw_perf_counter_setup_online(cpu);
4842 break;
4843
4844 case CPU_DOWN_PREPARE:
4845 case CPU_DOWN_PREPARE_FROZEN:
4846 perf_counter_exit_cpu(cpu);
4847 break;
4848
4849 default:
4850 break;
4851 }
4852
4853 return NOTIFY_OK;
4854}
4855
4856/*
4857 * This has to have a higher priority than migration_notifier in sched.c.
4858 */
4859static struct notifier_block __cpuinitdata perf_cpu_nb = {
4860 .notifier_call = perf_cpu_notify,
4861 .priority = 20,
4862};
4863
4864void __init perf_counter_init(void)
4865{
4866 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4867 (void *)(long)smp_processor_id());
4868 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4869 (void *)(long)smp_processor_id());
4870 register_cpu_notifier(&perf_cpu_nb);
4871}
4872
4873static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4874{
4875 return sprintf(buf, "%d\n", perf_reserved_percpu);
4876}
4877
4878static ssize_t
4879perf_set_reserve_percpu(struct sysdev_class *class,
4880 const char *buf,
4881 size_t count)
4882{
4883 struct perf_cpu_context *cpuctx;
4884 unsigned long val;
4885 int err, cpu, mpt;
4886
4887 err = strict_strtoul(buf, 10, &val);
4888 if (err)
4889 return err;
4890 if (val > perf_max_counters)
4891 return -EINVAL;
4892
4893 spin_lock(&perf_resource_lock);
4894 perf_reserved_percpu = val;
4895 for_each_online_cpu(cpu) {
4896 cpuctx = &per_cpu(perf_cpu_context, cpu);
4897 spin_lock_irq(&cpuctx->ctx.lock);
4898 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4899 perf_max_counters - perf_reserved_percpu);
4900 cpuctx->max_pertask = mpt;
4901 spin_unlock_irq(&cpuctx->ctx.lock);
4902 }
4903 spin_unlock(&perf_resource_lock);
4904
4905 return count;
4906}
4907
4908static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4909{
4910 return sprintf(buf, "%d\n", perf_overcommit);
4911}
4912
4913static ssize_t
4914perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4915{
4916 unsigned long val;
4917 int err;
4918
4919 err = strict_strtoul(buf, 10, &val);
4920 if (err)
4921 return err;
4922 if (val > 1)
4923 return -EINVAL;
4924
4925 spin_lock(&perf_resource_lock);
4926 perf_overcommit = val;
4927 spin_unlock(&perf_resource_lock);
4928
4929 return count;
4930}
4931
4932static SYSDEV_CLASS_ATTR(
4933 reserve_percpu,
4934 0644,
4935 perf_show_reserve_percpu,
4936 perf_set_reserve_percpu
4937 );
4938
4939static SYSDEV_CLASS_ATTR(
4940 overcommit,
4941 0644,
4942 perf_show_overcommit,
4943 perf_set_overcommit
4944 );
4945
4946static struct attribute *perfclass_attrs[] = {
4947 &attr_reserve_percpu.attr,
4948 &attr_overcommit.attr,
4949 NULL
4950};
4951
4952static struct attribute_group perfclass_attr_group = {
4953 .attrs = perfclass_attrs,
4954 .name = "perf_counters",
4955};
4956
4957static int __init perf_counter_sysfs_init(void)
4958{
4959 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4960 &perfclass_attr_group);
4961}
4962device_initcall(perf_counter_sysfs_init);
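
The two class attributes registered above hang off the cpu sysdev class, so with the usual sysfs mount they should surface as /sys/devices/system/cpu/perf_counters/reserve_percpu and .../overcommit (the exact path is an assumption inferred from the "perf_counters" group name, not something this patch states). A minimal userspace sketch of driving the reservation knob; set_reserve_percpu() is a hypothetical helper:

    /* Hypothetical admin-side helper: write a new value into the sysfs file
     * parsed by perf_set_reserve_percpu() above. The path is assumed. */
    #include <stdio.h>

    static int set_reserve_percpu(int counters)
    {
            FILE *f = fopen("/sys/devices/system/cpu/perf_counters/reserve_percpu", "w");

            if (!f)
                    return -1;
            fprintf(f, "%d\n", counters);
            return fclose(f);
    }

    int main(void)
    {
            return set_reserve_percpu(2) ? 1 : 0;   /* reserve 2 counters per CPU */
    }
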
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..12b5ec39bf97
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5174 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
24#include <linux/hardirq.h>
25#include <linux/rculist.h>
26#include <linux/uaccess.h>
27#include <linux/syscalls.h>
28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h>
31#include <linux/ftrace_event.h>
32
33#include <asm/irq_regs.h>
34
35/*
36 * Each CPU has a list of per CPU events:
37 */
38DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
39
40int perf_max_events __read_mostly = 1;
41static int perf_reserved_percpu __read_mostly;
42static int perf_overcommit __read_mostly = 1;
43
44static atomic_t nr_events __read_mostly;
45static atomic_t nr_mmap_events __read_mostly;
46static atomic_t nr_comm_events __read_mostly;
47static atomic_t nr_task_events __read_mostly;
48
49/*
50 * perf event paranoia level:
51 * -1 - not paranoid at all
52 * 0 - disallow raw tracepoint access for unpriv
53 * 1 - disallow cpu events for unpriv
54 * 2 - disallow kernel profiling for unpriv
55 */
56int sysctl_perf_event_paranoid __read_mostly = 1;
57
58static inline bool perf_paranoid_tracepoint_raw(void)
59{
60 return sysctl_perf_event_paranoid > -1;
61}
62
63static inline bool perf_paranoid_cpu(void)
64{
65 return sysctl_perf_event_paranoid > 0;
66}
67
68static inline bool perf_paranoid_kernel(void)
69{
70 return sysctl_perf_event_paranoid > 1;
71}
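
These three helpers are how the perf_event_paranoid sysctl above is consulted elsewhere in this file (find_get_context() further down uses perf_paranoid_cpu(), for instance). A minimal sketch of the usual gate, in the same in-file kernel context; my_cpu_event_allowed() is a hypothetical wrapper, not an API added by this patch:

    /* Sketch only: gate an unprivileged request on the paranoia level. */
    static int my_cpu_event_allowed(void)
    {
            /* level >= 1: CPU-wide events require CAP_SYS_ADMIN */
            if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
                    return -EACCES;
            return 0;
    }
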
72
73int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
74
75/*
76 * max perf event sample rate
77 */
78int sysctl_perf_event_sample_rate __read_mostly = 100000;
79
80static atomic64_t perf_event_id;
81
82/*
83 * Lock for (sysadmin-configurable) event reservations:
84 */
85static DEFINE_SPINLOCK(perf_resource_lock);
86
87/*
88 * Architecture provided APIs - weak aliases:
89 */
90extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
91{
92 return NULL;
93}
94
95void __weak hw_perf_disable(void) { barrier(); }
96void __weak hw_perf_enable(void) { barrier(); }
97
98void __weak hw_perf_event_setup(int cpu) { barrier(); }
99void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
100
101int __weak
102hw_perf_group_sched_in(struct perf_event *group_leader,
103 struct perf_cpu_context *cpuctx,
104 struct perf_event_context *ctx, int cpu)
105{
106 return 0;
107}
108
109void __weak perf_event_print_debug(void) { }
110
111static DEFINE_PER_CPU(int, perf_disable_count);
112
113void __perf_disable(void)
114{
115 __get_cpu_var(perf_disable_count)++;
116}
117
118bool __perf_enable(void)
119{
120 return !--__get_cpu_var(perf_disable_count);
121}
122
123void perf_disable(void)
124{
125 __perf_disable();
126 hw_perf_disable();
127}
128
129void perf_enable(void)
130{
131 if (__perf_enable())
132 hw_perf_enable();
133}
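
perf_disable()/perf_enable() nest through the per-CPU perf_disable_count, so only the outermost enable reaches hw_perf_enable(). A short sketch of the intended calling pattern; my_update_ctx_lists() is hypothetical and stands in for the real callers further down (e.g. __perf_event_remove_from_context()):

    /* Sketch of the nesting contract around PMU-off sections. */
    static void my_update_ctx_lists(void)
    {
            perf_disable();         /* count 0 -> 1: hw_perf_disable() runs   */
            perf_disable();         /* count 1 -> 2: no extra hardware access */

            /* ... list manipulation that must not race with the PMU ... */

            perf_enable();          /* count 2 -> 1: PMU stays disabled       */
            perf_enable();          /* count 1 -> 0: hw_perf_enable() runs    */
    }
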
134
135static void get_ctx(struct perf_event_context *ctx)
136{
137 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
138}
139
140static void free_ctx(struct rcu_head *head)
141{
142 struct perf_event_context *ctx;
143
144 ctx = container_of(head, struct perf_event_context, rcu_head);
145 kfree(ctx);
146}
147
148static void put_ctx(struct perf_event_context *ctx)
149{
150 if (atomic_dec_and_test(&ctx->refcount)) {
151 if (ctx->parent_ctx)
152 put_ctx(ctx->parent_ctx);
153 if (ctx->task)
154 put_task_struct(ctx->task);
155 call_rcu(&ctx->rcu_head, free_ctx);
156 }
157}
158
159static void unclone_ctx(struct perf_event_context *ctx)
160{
161 if (ctx->parent_ctx) {
162 put_ctx(ctx->parent_ctx);
163 ctx->parent_ctx = NULL;
164 }
165}
166
167/*
168 * If we inherit events we want to return the parent event id
169 * to userspace.
170 */
171static u64 primary_event_id(struct perf_event *event)
172{
173 u64 id = event->id;
174
175 if (event->parent)
176 id = event->parent->id;
177
178 return id;
179}
180
181/*
182 * Get the perf_event_context for a task and lock it.
183 * This has to cope with the fact that until it is locked,
184 * the context could get moved to another task.
185 */
186static struct perf_event_context *
187perf_lock_task_context(struct task_struct *task, unsigned long *flags)
188{
189 struct perf_event_context *ctx;
190
191 rcu_read_lock();
192 retry:
193 ctx = rcu_dereference(task->perf_event_ctxp);
194 if (ctx) {
195 /*
196 * If this context is a clone of another, it might
197 * get swapped for another underneath us by
198 * perf_event_task_sched_out, though the
199 * rcu_read_lock() protects us from any context
200 * getting freed. Lock the context and check if it
201 * got swapped before we could get the lock, and retry
202 * if so. If we locked the right context, then it
203 * can't get swapped on us any more.
204 */
205 spin_lock_irqsave(&ctx->lock, *flags);
206 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
207 spin_unlock_irqrestore(&ctx->lock, *flags);
208 goto retry;
209 }
210
211 if (!atomic_inc_not_zero(&ctx->refcount)) {
212 spin_unlock_irqrestore(&ctx->lock, *flags);
213 ctx = NULL;
214 }
215 }
216 rcu_read_unlock();
217 return ctx;
218}
219
220/*
221 * Get the context for a task and increment its pin_count so it
222 * can't get swapped to another task. This also increments its
223 * reference count so that the context can't get freed.
224 */
225static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
226{
227 struct perf_event_context *ctx;
228 unsigned long flags;
229
230 ctx = perf_lock_task_context(task, &flags);
231 if (ctx) {
232 ++ctx->pin_count;
233 spin_unlock_irqrestore(&ctx->lock, flags);
234 }
235 return ctx;
236}
237
238static void perf_unpin_context(struct perf_event_context *ctx)
239{
240 unsigned long flags;
241
242 spin_lock_irqsave(&ctx->lock, flags);
243 --ctx->pin_count;
244 spin_unlock_irqrestore(&ctx->lock, flags);
245 put_ctx(ctx);
246}
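
perf_pin_task_context()/perf_unpin_context() bracket code that must see a stable task/context binding (the inheritance paths later in the file rely on this). A rough sketch of the pairing, in this file's context; my_walk_task_events() and the work in the middle are hypothetical:

    /* Sketch: hold a task's context stable while walking it. */
    static int my_walk_task_events(struct task_struct *task)
    {
            struct perf_event_context *ctx;

            ctx = perf_pin_task_context(task);      /* takes a ref and bumps pin_count */
            if (!ctx)
                    return -ESRCH;

            mutex_lock(&ctx->mutex);
            /* ... ctx->group_list cannot be swapped or freed here ... */
            mutex_unlock(&ctx->mutex);

            perf_unpin_context(ctx);                /* drops pin_count and the ref */
            return 0;
    }
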
247
248/*
249 * Add an event to the lists for its context.
250 * Must be called with ctx->mutex and ctx->lock held.
251 */
252static void
253list_add_event(struct perf_event *event, struct perf_event_context *ctx)
254{
255 struct perf_event *group_leader = event->group_leader;
256
257 /*
258 * Depending on whether it is a standalone or sibling event,
259 * add it straight to the context's event list, or to the group
260 * leader's sibling list:
261 */
262 if (group_leader == event)
263 list_add_tail(&event->group_entry, &ctx->group_list);
264 else {
265 list_add_tail(&event->group_entry, &group_leader->sibling_list);
266 group_leader->nr_siblings++;
267 }
268
269 list_add_rcu(&event->event_entry, &ctx->event_list);
270 ctx->nr_events++;
271 if (event->attr.inherit_stat)
272 ctx->nr_stat++;
273}
274
275/*
276 * Remove an event from the lists for its context.
277 * Must be called with ctx->mutex and ctx->lock held.
278 */
279static void
280list_del_event(struct perf_event *event, struct perf_event_context *ctx)
281{
282 struct perf_event *sibling, *tmp;
283
284 if (list_empty(&event->group_entry))
285 return;
286 ctx->nr_events--;
287 if (event->attr.inherit_stat)
288 ctx->nr_stat--;
289
290 list_del_init(&event->group_entry);
291 list_del_rcu(&event->event_entry);
292
293 if (event->group_leader != event)
294 event->group_leader->nr_siblings--;
295
296 /*
297 * If this was a group event with sibling events then
298 * upgrade the siblings to singleton events by adding them
299 * to the context list directly:
300 */
301 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
302
303 list_move_tail(&sibling->group_entry, &ctx->group_list);
304 sibling->group_leader = sibling;
305 }
306}
307
308static void
309event_sched_out(struct perf_event *event,
310 struct perf_cpu_context *cpuctx,
311 struct perf_event_context *ctx)
312{
313 if (event->state != PERF_EVENT_STATE_ACTIVE)
314 return;
315
316 event->state = PERF_EVENT_STATE_INACTIVE;
317 if (event->pending_disable) {
318 event->pending_disable = 0;
319 event->state = PERF_EVENT_STATE_OFF;
320 }
321 event->tstamp_stopped = ctx->time;
322 event->pmu->disable(event);
323 event->oncpu = -1;
324
325 if (!is_software_event(event))
326 cpuctx->active_oncpu--;
327 ctx->nr_active--;
328 if (event->attr.exclusive || !cpuctx->active_oncpu)
329 cpuctx->exclusive = 0;
330}
331
332static void
333group_sched_out(struct perf_event *group_event,
334 struct perf_cpu_context *cpuctx,
335 struct perf_event_context *ctx)
336{
337 struct perf_event *event;
338
339 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
340 return;
341
342 event_sched_out(group_event, cpuctx, ctx);
343
344 /*
345 * Schedule out siblings (if any):
346 */
347 list_for_each_entry(event, &group_event->sibling_list, group_entry)
348 event_sched_out(event, cpuctx, ctx);
349
350 if (group_event->attr.exclusive)
351 cpuctx->exclusive = 0;
352}
353
354/*
355 * Cross CPU call to remove a performance event
356 *
357 * We disable the event on the hardware level first. After that we
358 * remove it from the context list.
359 */
360static void __perf_event_remove_from_context(void *info)
361{
362 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
363 struct perf_event *event = info;
364 struct perf_event_context *ctx = event->ctx;
365
366 /*
367 * If this is a task context, we need to check whether it is
368 * the current task context of this cpu. If not it has been
369 * scheduled out before the smp call arrived.
370 */
371 if (ctx->task && cpuctx->task_ctx != ctx)
372 return;
373
374 spin_lock(&ctx->lock);
375 /*
376 * Protect the list operation against NMI by disabling the
377 * events on a global level.
378 */
379 perf_disable();
380
381 event_sched_out(event, cpuctx, ctx);
382
383 list_del_event(event, ctx);
384
385 if (!ctx->task) {
386 /*
387 * Allow more per task events with respect to the
388 * reservation:
389 */
390 cpuctx->max_pertask =
391 min(perf_max_events - ctx->nr_events,
392 perf_max_events - perf_reserved_percpu);
393 }
394
395 perf_enable();
396 spin_unlock(&ctx->lock);
397}
398
399
400/*
401 * Remove the event from a task's (or a CPU's) list of events.
402 *
403 * Must be called with ctx->mutex held.
404 *
405 * CPU events are removed with an smp call. For task events we only
406 * call when the task is on a CPU.
407 *
408 * If event->ctx is a cloned context, callers must make sure that
409 * every task struct that event->ctx->task could possibly point to
410 * remains valid. This is OK when called from perf_release since
411 * that only calls us on the top-level context, which can't be a clone.
412 * When called from perf_event_exit_task, it's OK because the
413 * context has been detached from its task.
414 */
415static void perf_event_remove_from_context(struct perf_event *event)
416{
417 struct perf_event_context *ctx = event->ctx;
418 struct task_struct *task = ctx->task;
419
420 if (!task) {
421 /*
422 * Per cpu events are removed via an smp call and
423 * the removal is always successful.
424 */
425 smp_call_function_single(event->cpu,
426 __perf_event_remove_from_context,
427 event, 1);
428 return;
429 }
430
431retry:
432 task_oncpu_function_call(task, __perf_event_remove_from_context,
433 event);
434
435 spin_lock_irq(&ctx->lock);
436 /*
437 * If the context is active we need to retry the smp call.
438 */
439 if (ctx->nr_active && !list_empty(&event->group_entry)) {
440 spin_unlock_irq(&ctx->lock);
441 goto retry;
442 }
443
444 /*
445 * The lock prevents this context from being scheduled in, so we
446 * can remove the event safely, if the call above did not
447 * succeed.
448 */
449 if (!list_empty(&event->group_entry)) {
450 list_del_event(event, ctx);
451 }
452 spin_unlock_irq(&ctx->lock);
453}
454
455static inline u64 perf_clock(void)
456{
457 return cpu_clock(smp_processor_id());
458}
459
460/*
461 * Update the record of the current time in a context.
462 */
463static void update_context_time(struct perf_event_context *ctx)
464{
465 u64 now = perf_clock();
466
467 ctx->time += now - ctx->timestamp;
468 ctx->timestamp = now;
469}
470
471/*
472 * Update the total_time_enabled and total_time_running fields for an event.
473 */
474static void update_event_times(struct perf_event *event)
475{
476 struct perf_event_context *ctx = event->ctx;
477 u64 run_end;
478
479 if (event->state < PERF_EVENT_STATE_INACTIVE ||
480 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
481 return;
482
483 event->total_time_enabled = ctx->time - event->tstamp_enabled;
484
485 if (event->state == PERF_EVENT_STATE_INACTIVE)
486 run_end = event->tstamp_stopped;
487 else
488 run_end = ctx->time;
489
490 event->total_time_running = run_end - event->tstamp_running;
491}
492
493/*
494 * Update total_time_enabled and total_time_running for all events in a group.
495 */
496static void update_group_times(struct perf_event *leader)
497{
498 struct perf_event *event;
499
500 update_event_times(leader);
501 list_for_each_entry(event, &leader->sibling_list, group_entry)
502 update_event_times(event);
503}
504
505/*
506 * Cross CPU call to disable a performance event
507 */
508static void __perf_event_disable(void *info)
509{
510 struct perf_event *event = info;
511 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
512 struct perf_event_context *ctx = event->ctx;
513
514 /*
515 * If this is a per-task event, need to check whether this
516 * event's task is the current task on this cpu.
517 */
518 if (ctx->task && cpuctx->task_ctx != ctx)
519 return;
520
521 spin_lock(&ctx->lock);
522
523 /*
524 * If the event is on, turn it off.
525 * If it is in error state, leave it in error state.
526 */
527 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
528 update_context_time(ctx);
529 update_group_times(event);
530 if (event == event->group_leader)
531 group_sched_out(event, cpuctx, ctx);
532 else
533 event_sched_out(event, cpuctx, ctx);
534 event->state = PERF_EVENT_STATE_OFF;
535 }
536
537 spin_unlock(&ctx->lock);
538}
539
540/*
541 * Disable an event.
542 *
543 * If event->ctx is a cloned context, callers must make sure that
544 * every task struct that event->ctx->task could possibly point to
545 * remains valid. This condition is satisfied when called through
546 * perf_event_for_each_child or perf_event_for_each because they
547 * hold the top-level event's child_mutex, so any descendant that
548 * goes to exit will block in sync_child_event.
549 * When called from perf_pending_event it's OK because event->ctx
550 * is the current context on this CPU and preemption is disabled,
551 * hence we can't get into perf_event_task_sched_out for this context.
552 */
553static void perf_event_disable(struct perf_event *event)
554{
555 struct perf_event_context *ctx = event->ctx;
556 struct task_struct *task = ctx->task;
557
558 if (!task) {
559 /*
560 * Disable the event on the cpu that it's on
561 */
562 smp_call_function_single(event->cpu, __perf_event_disable,
563 event, 1);
564 return;
565 }
566
567 retry:
568 task_oncpu_function_call(task, __perf_event_disable, event);
569
570 spin_lock_irq(&ctx->lock);
571 /*
572 * If the event is still active, we need to retry the cross-call.
573 */
574 if (event->state == PERF_EVENT_STATE_ACTIVE) {
575 spin_unlock_irq(&ctx->lock);
576 goto retry;
577 }
578
579 /*
580 * Since we have the lock this context can't be scheduled
581 * in, so we can change the state safely.
582 */
583 if (event->state == PERF_EVENT_STATE_INACTIVE) {
584 update_group_times(event);
585 event->state = PERF_EVENT_STATE_OFF;
586 }
587
588 spin_unlock_irq(&ctx->lock);
589}
590
591static int
592event_sched_in(struct perf_event *event,
593 struct perf_cpu_context *cpuctx,
594 struct perf_event_context *ctx,
595 int cpu)
596{
597 if (event->state <= PERF_EVENT_STATE_OFF)
598 return 0;
599
600 event->state = PERF_EVENT_STATE_ACTIVE;
601 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
602 /*
603 * The new state must be visible before we turn it on in the hardware:
604 */
605 smp_wmb();
606
607 if (event->pmu->enable(event)) {
608 event->state = PERF_EVENT_STATE_INACTIVE;
609 event->oncpu = -1;
610 return -EAGAIN;
611 }
612
613 event->tstamp_running += ctx->time - event->tstamp_stopped;
614
615 if (!is_software_event(event))
616 cpuctx->active_oncpu++;
617 ctx->nr_active++;
618
619 if (event->attr.exclusive)
620 cpuctx->exclusive = 1;
621
622 return 0;
623}
624
625static int
626group_sched_in(struct perf_event *group_event,
627 struct perf_cpu_context *cpuctx,
628 struct perf_event_context *ctx,
629 int cpu)
630{
631 struct perf_event *event, *partial_group;
632 int ret;
633
634 if (group_event->state == PERF_EVENT_STATE_OFF)
635 return 0;
636
637 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
638 if (ret)
639 return ret < 0 ? ret : 0;
640
641 if (event_sched_in(group_event, cpuctx, ctx, cpu))
642 return -EAGAIN;
643
644 /*
645 * Schedule in siblings as one group (if any):
646 */
647 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
648 if (event_sched_in(event, cpuctx, ctx, cpu)) {
649 partial_group = event;
650 goto group_error;
651 }
652 }
653
654 return 0;
655
656group_error:
657 /*
658 * Groups can be scheduled in as one unit only, so undo any
659 * partial group before returning:
660 */
661 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
662 if (event == partial_group)
663 break;
664 event_sched_out(event, cpuctx, ctx);
665 }
666 event_sched_out(group_event, cpuctx, ctx);
667
668 return -EAGAIN;
669}
670
671/*
672 * Return 1 for a group consisting entirely of software events,
673 * 0 if the group contains any hardware events.
674 */
675static int is_software_only_group(struct perf_event *leader)
676{
677 struct perf_event *event;
678
679 if (!is_software_event(leader))
680 return 0;
681
682 list_for_each_entry(event, &leader->sibling_list, group_entry)
683 if (!is_software_event(event))
684 return 0;
685
686 return 1;
687}
688
689/*
690 * Work out whether we can put this event group on the CPU now.
691 */
692static int group_can_go_on(struct perf_event *event,
693 struct perf_cpu_context *cpuctx,
694 int can_add_hw)
695{
696 /*
697 * Groups consisting entirely of software events can always go on.
698 */
699 if (is_software_only_group(event))
700 return 1;
701 /*
702 * If an exclusive group is already on, no other hardware
703 * events can go on.
704 */
705 if (cpuctx->exclusive)
706 return 0;
707 /*
708 * If this group is exclusive and there are already
709 * events on the CPU, it can't go on.
710 */
711 if (event->attr.exclusive && cpuctx->active_oncpu)
712 return 0;
713 /*
714 * Otherwise, try to add it if all previous groups were able
715 * to go on.
716 */
717 return can_add_hw;
718}
719
720static void add_event_to_ctx(struct perf_event *event,
721 struct perf_event_context *ctx)
722{
723 list_add_event(event, ctx);
724 event->tstamp_enabled = ctx->time;
725 event->tstamp_running = ctx->time;
726 event->tstamp_stopped = ctx->time;
727}
728
729/*
730 * Cross CPU call to install and enable a performance event
731 *
732 * Must be called with ctx->mutex held
733 */
734static void __perf_install_in_context(void *info)
735{
736 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
737 struct perf_event *event = info;
738 struct perf_event_context *ctx = event->ctx;
739 struct perf_event *leader = event->group_leader;
740 int cpu = smp_processor_id();
741 int err;
742
743 /*
744 * If this is a task context, we need to check whether it is
745 * the current task context of this cpu. If not it has been
746 * scheduled out before the smp call arrived.
747 * Or possibly this is the right context but it isn't
748 * on this cpu because it had no events.
749 */
750 if (ctx->task && cpuctx->task_ctx != ctx) {
751 if (cpuctx->task_ctx || ctx->task != current)
752 return;
753 cpuctx->task_ctx = ctx;
754 }
755
756 spin_lock(&ctx->lock);
757 ctx->is_active = 1;
758 update_context_time(ctx);
759
760 /*
761 * Protect the list operation against NMI by disabling the
762 * events on a global level. NOP for non NMI based events.
763 */
764 perf_disable();
765
766 add_event_to_ctx(event, ctx);
767
768 /*
769 * Don't put the event on if it is disabled or if
770 * it is in a group and the group isn't on.
771 */
772 if (event->state != PERF_EVENT_STATE_INACTIVE ||
773 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
774 goto unlock;
775
776 /*
777 * An exclusive event can't go on if there are already active
778 * hardware events, and no hardware event can go on if there
779 * is already an exclusive event on.
780 */
781 if (!group_can_go_on(event, cpuctx, 1))
782 err = -EEXIST;
783 else
784 err = event_sched_in(event, cpuctx, ctx, cpu);
785
786 if (err) {
787 /*
788 * This event couldn't go on. If it is in a group
789 * then we have to pull the whole group off.
790 * If the event group is pinned then put it in error state.
791 */
792 if (leader != event)
793 group_sched_out(leader, cpuctx, ctx);
794 if (leader->attr.pinned) {
795 update_group_times(leader);
796 leader->state = PERF_EVENT_STATE_ERROR;
797 }
798 }
799
800 if (!err && !ctx->task && cpuctx->max_pertask)
801 cpuctx->max_pertask--;
802
803 unlock:
804 perf_enable();
805
806 spin_unlock(&ctx->lock);
807}
808
809/*
810 * Attach a performance event to a context
811 *
812 * First we add the event to the list with the hardware enable bit
813 * in event->hw_config cleared.
814 *
815 * If the event is attached to a task which is on a CPU we use an smp
816 * call to enable it in the task context. The task might have been
817 * scheduled away, but we check this in the smp call again.
818 *
819 * Must be called with ctx->mutex held.
820 */
821static void
822perf_install_in_context(struct perf_event_context *ctx,
823 struct perf_event *event,
824 int cpu)
825{
826 struct task_struct *task = ctx->task;
827
828 if (!task) {
829 /*
830 * Per cpu events are installed via an smp call and
831 * the install is always successful.
832 */
833 smp_call_function_single(cpu, __perf_install_in_context,
834 event, 1);
835 return;
836 }
837
838retry:
839 task_oncpu_function_call(task, __perf_install_in_context,
840 event);
841
842 spin_lock_irq(&ctx->lock);
843 /*
844 * If the context is active and the event is not yet on it, we need to retry the smp call.
845 */
846 if (ctx->is_active && list_empty(&event->group_entry)) {
847 spin_unlock_irq(&ctx->lock);
848 goto retry;
849 }
850
851 /*
852 * The lock prevents this context from being scheduled in, so we
853 * can add the event safely, if the call above did not
854 * succeed.
855 */
856 if (list_empty(&event->group_entry))
857 add_event_to_ctx(event, ctx);
858 spin_unlock_irq(&ctx->lock);
859}
860
861/*
862 * Put an event into inactive state and update time fields.
863 * Enabling the leader of a group effectively enables all
864 * the group members that aren't explicitly disabled, so we
865 * have to update their ->tstamp_enabled also.
866 * Note: this works for group members as well as group leaders
867 * since the non-leader members' sibling_lists will be empty.
868 */
869static void __perf_event_mark_enabled(struct perf_event *event,
870 struct perf_event_context *ctx)
871{
872 struct perf_event *sub;
873
874 event->state = PERF_EVENT_STATE_INACTIVE;
875 event->tstamp_enabled = ctx->time - event->total_time_enabled;
876 list_for_each_entry(sub, &event->sibling_list, group_entry)
877 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
878 sub->tstamp_enabled =
879 ctx->time - sub->total_time_enabled;
880}
881
882/*
883 * Cross CPU call to enable a performance event
884 */
885static void __perf_event_enable(void *info)
886{
887 struct perf_event *event = info;
888 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
889 struct perf_event_context *ctx = event->ctx;
890 struct perf_event *leader = event->group_leader;
891 int err;
892
893 /*
894 * If this is a per-task event, need to check whether this
895 * event's task is the current task on this cpu.
896 */
897 if (ctx->task && cpuctx->task_ctx != ctx) {
898 if (cpuctx->task_ctx || ctx->task != current)
899 return;
900 cpuctx->task_ctx = ctx;
901 }
902
903 spin_lock(&ctx->lock);
904 ctx->is_active = 1;
905 update_context_time(ctx);
906
907 if (event->state >= PERF_EVENT_STATE_INACTIVE)
908 goto unlock;
909 __perf_event_mark_enabled(event, ctx);
910
911 /*
912 * If the event is in a group and isn't the group leader,
913 * then don't put it on unless the group is on.
914 */
915 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
916 goto unlock;
917
918 if (!group_can_go_on(event, cpuctx, 1)) {
919 err = -EEXIST;
920 } else {
921 perf_disable();
922 if (event == leader)
923 err = group_sched_in(event, cpuctx, ctx,
924 smp_processor_id());
925 else
926 err = event_sched_in(event, cpuctx, ctx,
927 smp_processor_id());
928 perf_enable();
929 }
930
931 if (err) {
932 /*
933 * If this event can't go on and it's part of a
934 * group, then the whole group has to come off.
935 */
936 if (leader != event)
937 group_sched_out(leader, cpuctx, ctx);
938 if (leader->attr.pinned) {
939 update_group_times(leader);
940 leader->state = PERF_EVENT_STATE_ERROR;
941 }
942 }
943
944 unlock:
945 spin_unlock(&ctx->lock);
946}
947
948/*
949 * Enable an event.
950 *
951 * If event->ctx is a cloned context, callers must make sure that
952 * every task struct that event->ctx->task could possibly point to
953 * remains valid. This condition is satisfied when called through
954 * perf_event_for_each_child or perf_event_for_each as described
955 * for perf_event_disable.
956 */
957static void perf_event_enable(struct perf_event *event)
958{
959 struct perf_event_context *ctx = event->ctx;
960 struct task_struct *task = ctx->task;
961
962 if (!task) {
963 /*
964 * Enable the event on the cpu that it's on
965 */
966 smp_call_function_single(event->cpu, __perf_event_enable,
967 event, 1);
968 return;
969 }
970
971 spin_lock_irq(&ctx->lock);
972 if (event->state >= PERF_EVENT_STATE_INACTIVE)
973 goto out;
974
975 /*
976 * If the event is in error state, clear that first.
977 * That way, if we see the event in error state below, we
978 * know that it has gone back into error state, as distinct
979 * from the task having been scheduled away before the
980 * cross-call arrived.
981 */
982 if (event->state == PERF_EVENT_STATE_ERROR)
983 event->state = PERF_EVENT_STATE_OFF;
984
985 retry:
986 spin_unlock_irq(&ctx->lock);
987 task_oncpu_function_call(task, __perf_event_enable, event);
988
989 spin_lock_irq(&ctx->lock);
990
991 /*
992 * If the context is active and the event is still off,
993 * we need to retry the cross-call.
994 */
995 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
996 goto retry;
997
998 /*
999 * Since we have the lock this context can't be scheduled
1000 * in, so we can change the state safely.
1001 */
1002 if (event->state == PERF_EVENT_STATE_OFF)
1003 __perf_event_mark_enabled(event, ctx);
1004
1005 out:
1006 spin_unlock_irq(&ctx->lock);
1007}
1008
1009static int perf_event_refresh(struct perf_event *event, int refresh)
1010{
1011 /*
1012 * not supported on inherited events
1013 */
1014 if (event->attr.inherit)
1015 return -EINVAL;
1016
1017 atomic_add(refresh, &event->event_limit);
1018 perf_event_enable(event);
1019
1020 return 0;
1021}
1022
1023void __perf_event_sched_out(struct perf_event_context *ctx,
1024 struct perf_cpu_context *cpuctx)
1025{
1026 struct perf_event *event;
1027
1028 spin_lock(&ctx->lock);
1029 ctx->is_active = 0;
1030 if (likely(!ctx->nr_events))
1031 goto out;
1032 update_context_time(ctx);
1033
1034 perf_disable();
1035 if (ctx->nr_active)
1036 list_for_each_entry(event, &ctx->group_list, group_entry)
1037 group_sched_out(event, cpuctx, ctx);
1038
1039 perf_enable();
1040 out:
1041 spin_unlock(&ctx->lock);
1042}
1043
1044/*
1045 * Test whether two contexts are equivalent, i.e. whether they
1046 * have both been cloned from the same version of the same context
1047 * and they both have the same number of enabled events.
1048 * If the number of enabled events is the same, then the set
1049 * of enabled events should be the same, because these are both
1050 * inherited contexts, therefore we can't access individual events
1051 * in them directly with an fd; we can only enable/disable all
1052 * events via prctl, or enable/disable all events in a family
1053 * via ioctl, which will have the same effect on both contexts.
1054 */
1055static int context_equiv(struct perf_event_context *ctx1,
1056 struct perf_event_context *ctx2)
1057{
1058 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1059 && ctx1->parent_gen == ctx2->parent_gen
1060 && !ctx1->pin_count && !ctx2->pin_count;
1061}
1062
1063static void __perf_event_read(void *event);
1064
1065static void __perf_event_sync_stat(struct perf_event *event,
1066 struct perf_event *next_event)
1067{
1068 u64 value;
1069
1070 if (!event->attr.inherit_stat)
1071 return;
1072
1073 /*
1074 * Update the event value; we cannot use perf_event_read()
1075 * because we're in the middle of a context switch and have IRQs
1076 * disabled, which upsets smp_call_function_single(), however
1077 * we know the event must be on the current CPU, therefore we
1078 * don't need to use it.
1079 */
1080 switch (event->state) {
1081 case PERF_EVENT_STATE_ACTIVE:
1082 __perf_event_read(event);
1083 break;
1084
1085 case PERF_EVENT_STATE_INACTIVE:
1086 update_event_times(event);
1087 break;
1088
1089 default:
1090 break;
1091 }
1092
1093 /*
1094 * In order to keep per-task stats reliable we need to flip the event
1095 * values when we flip the contexts.
1096 */
1097 value = atomic64_read(&next_event->count);
1098 value = atomic64_xchg(&event->count, value);
1099 atomic64_set(&next_event->count, value);
1100
1101 swap(event->total_time_enabled, next_event->total_time_enabled);
1102 swap(event->total_time_running, next_event->total_time_running);
1103
1104 /*
1105 * Since we swizzled the values, update the user visible data too.
1106 */
1107 perf_event_update_userpage(event);
1108 perf_event_update_userpage(next_event);
1109}
1110
1111#define list_next_entry(pos, member) \
1112 list_entry(pos->member.next, typeof(*pos), member)
1113
1114static void perf_event_sync_stat(struct perf_event_context *ctx,
1115 struct perf_event_context *next_ctx)
1116{
1117 struct perf_event *event, *next_event;
1118
1119 if (!ctx->nr_stat)
1120 return;
1121
1122 event = list_first_entry(&ctx->event_list,
1123 struct perf_event, event_entry);
1124
1125 next_event = list_first_entry(&next_ctx->event_list,
1126 struct perf_event, event_entry);
1127
1128 while (&event->event_entry != &ctx->event_list &&
1129 &next_event->event_entry != &next_ctx->event_list) {
1130
1131 __perf_event_sync_stat(event, next_event);
1132
1133 event = list_next_entry(event, event_entry);
1134 next_event = list_next_entry(next_event, event_entry);
1135 }
1136}
1137
1138/*
1139 * Called from scheduler to remove the events of the current task,
1140 * with interrupts disabled.
1141 *
1142 * We stop each event and update the event value in event->count.
1143 *
1144 * This does not protect us against NMI, but disable()
1145 * sets the disabled bit in the control field of event _before_
1146 * accessing the event control register. If an NMI hits, then it will
1147 * not restart the event.
1148 */
1149void perf_event_task_sched_out(struct task_struct *task,
1150 struct task_struct *next, int cpu)
1151{
1152 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1153 struct perf_event_context *ctx = task->perf_event_ctxp;
1154 struct perf_event_context *next_ctx;
1155 struct perf_event_context *parent;
1156 struct pt_regs *regs;
1157 int do_switch = 1;
1158
1159 regs = task_pt_regs(task);
1160 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1161
1162 if (likely(!ctx || !cpuctx->task_ctx))
1163 return;
1164
1165 update_context_time(ctx);
1166
1167 rcu_read_lock();
1168 parent = rcu_dereference(ctx->parent_ctx);
1169 next_ctx = next->perf_event_ctxp;
1170 if (parent && next_ctx &&
1171 rcu_dereference(next_ctx->parent_ctx) == parent) {
1172 /*
1173 * Looks like the two contexts are clones, so we might be
1174 * able to optimize the context switch. We lock both
1175 * contexts and check that they are clones under the
1176 * lock (including re-checking that neither has been
1177 * uncloned in the meantime). It doesn't matter which
1178 * order we take the locks because no other cpu could
1179 * be trying to lock both of these tasks.
1180 */
1181 spin_lock(&ctx->lock);
1182 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1183 if (context_equiv(ctx, next_ctx)) {
1184 /*
1185 * XXX do we need a memory barrier of sorts
1186 * wrt rcu_dereference() of perf_event_ctxp
1187 */
1188 task->perf_event_ctxp = next_ctx;
1189 next->perf_event_ctxp = ctx;
1190 ctx->task = next;
1191 next_ctx->task = task;
1192 do_switch = 0;
1193
1194 perf_event_sync_stat(ctx, next_ctx);
1195 }
1196 spin_unlock(&next_ctx->lock);
1197 spin_unlock(&ctx->lock);
1198 }
1199 rcu_read_unlock();
1200
1201 if (do_switch) {
1202 __perf_event_sched_out(ctx, cpuctx);
1203 cpuctx->task_ctx = NULL;
1204 }
1205}
1206
1207/*
1208 * Called with IRQs disabled
1209 */
1210static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1211{
1212 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1213
1214 if (!cpuctx->task_ctx)
1215 return;
1216
1217 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1218 return;
1219
1220 __perf_event_sched_out(ctx, cpuctx);
1221 cpuctx->task_ctx = NULL;
1222}
1223
1224/*
1225 * Called with IRQs disabled
1226 */
1227static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1228{
1229 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1230}
1231
1232static void
1233__perf_event_sched_in(struct perf_event_context *ctx,
1234 struct perf_cpu_context *cpuctx, int cpu)
1235{
1236 struct perf_event *event;
1237 int can_add_hw = 1;
1238
1239 spin_lock(&ctx->lock);
1240 ctx->is_active = 1;
1241 if (likely(!ctx->nr_events))
1242 goto out;
1243
1244 ctx->timestamp = perf_clock();
1245
1246 perf_disable();
1247
1248 /*
1249 * First go through the list and put on any pinned groups
1250 * in order to give them the best chance of going on.
1251 */
1252 list_for_each_entry(event, &ctx->group_list, group_entry) {
1253 if (event->state <= PERF_EVENT_STATE_OFF ||
1254 !event->attr.pinned)
1255 continue;
1256 if (event->cpu != -1 && event->cpu != cpu)
1257 continue;
1258
1259 if (group_can_go_on(event, cpuctx, 1))
1260 group_sched_in(event, cpuctx, ctx, cpu);
1261
1262 /*
1263 * If this pinned group hasn't been scheduled,
1264 * put it in error state.
1265 */
1266 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1267 update_group_times(event);
1268 event->state = PERF_EVENT_STATE_ERROR;
1269 }
1270 }
1271
1272 list_for_each_entry(event, &ctx->group_list, group_entry) {
1273 /*
1274 * Ignore events in OFF or ERROR state, and
1275 * ignore pinned events since we did them already.
1276 */
1277 if (event->state <= PERF_EVENT_STATE_OFF ||
1278 event->attr.pinned)
1279 continue;
1280
1281 /*
1282 * Listen to the 'cpu' scheduling filter constraint
1283 * of events:
1284 */
1285 if (event->cpu != -1 && event->cpu != cpu)
1286 continue;
1287
1288 if (group_can_go_on(event, cpuctx, can_add_hw))
1289 if (group_sched_in(event, cpuctx, ctx, cpu))
1290 can_add_hw = 0;
1291 }
1292 perf_enable();
1293 out:
1294 spin_unlock(&ctx->lock);
1295}
1296
1297/*
1298 * Called from scheduler to add the events of the current task
1299 * with interrupts disabled.
1300 *
1301 * We restore the event value and then enable it.
1302 *
1303 * This does not protect us against NMI, but enable()
1304 * sets the enabled bit in the control field of event _before_
1305 * accessing the event control register. If an NMI hits, then it will
1306 * keep the event running.
1307 */
1308void perf_event_task_sched_in(struct task_struct *task, int cpu)
1309{
1310 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1311 struct perf_event_context *ctx = task->perf_event_ctxp;
1312
1313 if (likely(!ctx))
1314 return;
1315 if (cpuctx->task_ctx == ctx)
1316 return;
1317 __perf_event_sched_in(ctx, cpuctx, cpu);
1318 cpuctx->task_ctx = ctx;
1319}
1320
1321static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1322{
1323 struct perf_event_context *ctx = &cpuctx->ctx;
1324
1325 __perf_event_sched_in(ctx, cpuctx, cpu);
1326}
1327
1328#define MAX_INTERRUPTS (~0ULL)
1329
1330static void perf_log_throttle(struct perf_event *event, int enable);
1331
1332static void perf_adjust_period(struct perf_event *event, u64 events)
1333{
1334 struct hw_perf_event *hwc = &event->hw;
1335 u64 period, sample_period;
1336 s64 delta;
1337
1338 events *= hwc->sample_period;
1339 period = div64_u64(events, event->attr.sample_freq);
1340
1341 delta = (s64)(period - hwc->sample_period);
1342 delta = (delta + 7) / 8; /* low pass filter */
1343
1344 sample_period = hwc->sample_period + delta;
1345
1346 if (!sample_period)
1347 sample_period = 1;
1348
1349 hwc->sample_period = sample_period;
1350}
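
perf_adjust_period() moves the sample period one eighth of the way toward the period that would have produced the requested frequency. As a worked example with made-up numbers (HZ of 1000, two overflow interrupts in the last tick, current period 10000, sample_freq 1000 Hz), the standalone sketch below reproduces the arithmetic and prints the new period, 11250:

    /* Illustration of the low-pass adjustment above; the numbers are invented. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t sample_period = 10000, sample_freq = 1000;
            uint64_t events = 1000 * 2;             /* freq * interrupts          */
            uint64_t period;
            int64_t delta;

            events *= sample_period;                /* estimated events/second    */
            period = events / sample_freq;          /* period matching the freq   */
            delta = (int64_t)(period - sample_period);
            delta = (delta + 7) / 8;                /* low pass filter            */

            printf("%llu\n", (unsigned long long)(sample_period + delta)); /* 11250 */
            return 0;
    }
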
1351
1352static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1353{
1354 struct perf_event *event;
1355 struct hw_perf_event *hwc;
1356 u64 interrupts, freq;
1357
1358 spin_lock(&ctx->lock);
1359 list_for_each_entry(event, &ctx->group_list, group_entry) {
1360 if (event->state != PERF_EVENT_STATE_ACTIVE)
1361 continue;
1362
1363 hwc = &event->hw;
1364
1365 interrupts = hwc->interrupts;
1366 hwc->interrupts = 0;
1367
1368 /*
1369 * unthrottle events on the tick
1370 */
1371 if (interrupts == MAX_INTERRUPTS) {
1372 perf_log_throttle(event, 1);
1373 event->pmu->unthrottle(event);
1374 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1375 }
1376
1377 if (!event->attr.freq || !event->attr.sample_freq)
1378 continue;
1379
1380 /*
1381 * if the specified freq < HZ then we need to skip ticks
1382 */
1383 if (event->attr.sample_freq < HZ) {
1384 freq = event->attr.sample_freq;
1385
1386 hwc->freq_count += freq;
1387 hwc->freq_interrupts += interrupts;
1388
1389 if (hwc->freq_count < HZ)
1390 continue;
1391
1392 interrupts = hwc->freq_interrupts;
1393 hwc->freq_interrupts = 0;
1394 hwc->freq_count -= HZ;
1395 } else
1396 freq = HZ;
1397
1398 perf_adjust_period(event, freq * interrupts);
1399
1400 /*
1401 * In order to avoid being stalled by an (accidental) huge
1402 * sample period, force reset the sample period if we didn't
1403 * get any events in this freq period.
1404 */
1405 if (!interrupts) {
1406 perf_disable();
1407 event->pmu->disable(event);
1408 atomic64_set(&hwc->period_left, 0);
1409 event->pmu->enable(event);
1410 perf_enable();
1411 }
1412 }
1413 spin_unlock(&ctx->lock);
1414}
1415
1416/*
1417 * Round-robin a context's events:
1418 */
1419static void rotate_ctx(struct perf_event_context *ctx)
1420{
1421 struct perf_event *event;
1422
1423 if (!ctx->nr_events)
1424 return;
1425
1426 spin_lock(&ctx->lock);
1427 /*
1428 * Rotate the first entry last (works just fine for group events too):
1429 */
1430 perf_disable();
1431 list_for_each_entry(event, &ctx->group_list, group_entry) {
1432 list_move_tail(&event->group_entry, &ctx->group_list);
1433 break;
1434 }
1435 perf_enable();
1436
1437 spin_unlock(&ctx->lock);
1438}
1439
1440void perf_event_task_tick(struct task_struct *curr, int cpu)
1441{
1442 struct perf_cpu_context *cpuctx;
1443 struct perf_event_context *ctx;
1444
1445 if (!atomic_read(&nr_events))
1446 return;
1447
1448 cpuctx = &per_cpu(perf_cpu_context, cpu);
1449 ctx = curr->perf_event_ctxp;
1450
1451 perf_ctx_adjust_freq(&cpuctx->ctx);
1452 if (ctx)
1453 perf_ctx_adjust_freq(ctx);
1454
1455 perf_event_cpu_sched_out(cpuctx);
1456 if (ctx)
1457 __perf_event_task_sched_out(ctx);
1458
1459 rotate_ctx(&cpuctx->ctx);
1460 if (ctx)
1461 rotate_ctx(ctx);
1462
1463 perf_event_cpu_sched_in(cpuctx, cpu);
1464 if (ctx)
1465 perf_event_task_sched_in(curr, cpu);
1466}
1467
1468/*
1469 * Enable all of a task's events that have been marked enable-on-exec.
1470 * This expects task == current.
1471 */
1472static void perf_event_enable_on_exec(struct task_struct *task)
1473{
1474 struct perf_event_context *ctx;
1475 struct perf_event *event;
1476 unsigned long flags;
1477 int enabled = 0;
1478
1479 local_irq_save(flags);
1480 ctx = task->perf_event_ctxp;
1481 if (!ctx || !ctx->nr_events)
1482 goto out;
1483
1484 __perf_event_task_sched_out(ctx);
1485
1486 spin_lock(&ctx->lock);
1487
1488 list_for_each_entry(event, &ctx->group_list, group_entry) {
1489 if (!event->attr.enable_on_exec)
1490 continue;
1491 event->attr.enable_on_exec = 0;
1492 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1493 continue;
1494 __perf_event_mark_enabled(event, ctx);
1495 enabled = 1;
1496 }
1497
1498 /*
1499 * Unclone this context if we enabled any event.
1500 */
1501 if (enabled)
1502 unclone_ctx(ctx);
1503
1504 spin_unlock(&ctx->lock);
1505
1506 perf_event_task_sched_in(task, smp_processor_id());
1507 out:
1508 local_irq_restore(flags);
1509}
1510
1511/*
1512 * Cross CPU call to read the hardware event
1513 */
1514static void __perf_event_read(void *info)
1515{
1516 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1517 struct perf_event *event = info;
1518 struct perf_event_context *ctx = event->ctx;
1519 unsigned long flags;
1520
1521 /*
1522 * If this is a task context, we need to check whether it is
1523 * the current task context of this cpu. If not it has been
1524 * scheduled out before the smp call arrived. In that case
1525 * event->count would have been updated to a recent sample
1526 * when the event was scheduled out.
1527 */
1528 if (ctx->task && cpuctx->task_ctx != ctx)
1529 return;
1530
1531 local_irq_save(flags);
1532 if (ctx->is_active)
1533 update_context_time(ctx);
1534 event->pmu->read(event);
1535 update_event_times(event);
1536 local_irq_restore(flags);
1537}
1538
1539static u64 perf_event_read(struct perf_event *event)
1540{
1541 /*
1542 * If event is enabled and currently active on a CPU, update the
1543 * value in the event structure:
1544 */
1545 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1546 smp_call_function_single(event->oncpu,
1547 __perf_event_read, event, 1);
1548 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1549 update_event_times(event);
1550 }
1551
1552 return atomic64_read(&event->count);
1553}
1554
1555/*
1556 * Initialize the perf_event context in a task_struct:
1557 */
1558static void
1559__perf_event_init_context(struct perf_event_context *ctx,
1560 struct task_struct *task)
1561{
1562 memset(ctx, 0, sizeof(*ctx));
1563 spin_lock_init(&ctx->lock);
1564 mutex_init(&ctx->mutex);
1565 INIT_LIST_HEAD(&ctx->group_list);
1566 INIT_LIST_HEAD(&ctx->event_list);
1567 atomic_set(&ctx->refcount, 1);
1568 ctx->task = task;
1569}
1570
1571static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1572{
1573 struct perf_event_context *ctx;
1574 struct perf_cpu_context *cpuctx;
1575 struct task_struct *task;
1576 unsigned long flags;
1577 int err;
1578
1579 /*
1580 * If cpu is not a wildcard then this is a percpu event:
1581 */
1582 if (cpu != -1) {
1583 /* Must be root to operate on a CPU event: */
1584 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1585 return ERR_PTR(-EACCES);
1586
1587 if (cpu < 0 || cpu > num_possible_cpus())
1588 return ERR_PTR(-EINVAL);
1589
1590 /*
1591 * We could be clever and allow attaching an event to an
1592 * offline CPU and activate it when the CPU comes up, but
1593 * that's for later.
1594 */
1595 if (!cpu_isset(cpu, cpu_online_map))
1596 return ERR_PTR(-ENODEV);
1597
1598 cpuctx = &per_cpu(perf_cpu_context, cpu);
1599 ctx = &cpuctx->ctx;
1600 get_ctx(ctx);
1601
1602 return ctx;
1603 }
1604
1605 rcu_read_lock();
1606 if (!pid)
1607 task = current;
1608 else
1609 task = find_task_by_vpid(pid);
1610 if (task)
1611 get_task_struct(task);
1612 rcu_read_unlock();
1613
1614 if (!task)
1615 return ERR_PTR(-ESRCH);
1616
1617 /*
1618 * Can't attach events to a dying task.
1619 */
1620 err = -ESRCH;
1621 if (task->flags & PF_EXITING)
1622 goto errout;
1623
1624 /* Reuse ptrace permission checks for now. */
1625 err = -EACCES;
1626 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1627 goto errout;
1628
1629 retry:
1630 ctx = perf_lock_task_context(task, &flags);
1631 if (ctx) {
1632 unclone_ctx(ctx);
1633 spin_unlock_irqrestore(&ctx->lock, flags);
1634 }
1635
1636 if (!ctx) {
1637 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1638 err = -ENOMEM;
1639 if (!ctx)
1640 goto errout;
1641 __perf_event_init_context(ctx, task);
1642 get_ctx(ctx);
1643 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1644 /*
1645 * We raced with some other task; use
1646 * the context they set.
1647 */
1648 kfree(ctx);
1649 goto retry;
1650 }
1651 get_task_struct(task);
1652 }
1653
1654 put_task_struct(task);
1655 return ctx;
1656
1657 errout:
1658 put_task_struct(task);
1659 return ERR_PTR(err);
1660}
1661
1662static void perf_event_free_filter(struct perf_event *event);
1663
1664static void free_event_rcu(struct rcu_head *head)
1665{
1666 struct perf_event *event;
1667
1668 event = container_of(head, struct perf_event, rcu_head);
1669 if (event->ns)
1670 put_pid_ns(event->ns);
1671 perf_event_free_filter(event);
1672 kfree(event);
1673}
1674
1675static void perf_pending_sync(struct perf_event *event);
1676
1677static void free_event(struct perf_event *event)
1678{
1679 perf_pending_sync(event);
1680
1681 if (!event->parent) {
1682 atomic_dec(&nr_events);
1683 if (event->attr.mmap)
1684 atomic_dec(&nr_mmap_events);
1685 if (event->attr.comm)
1686 atomic_dec(&nr_comm_events);
1687 if (event->attr.task)
1688 atomic_dec(&nr_task_events);
1689 }
1690
1691 if (event->output) {
1692 fput(event->output->filp);
1693 event->output = NULL;
1694 }
1695
1696 if (event->destroy)
1697 event->destroy(event);
1698
1699 put_ctx(event->ctx);
1700 call_rcu(&event->rcu_head, free_event_rcu);
1701}
1702
1703/*
1704 * Called when the last reference to the file is gone.
1705 */
1706static int perf_release(struct inode *inode, struct file *file)
1707{
1708 struct perf_event *event = file->private_data;
1709 struct perf_event_context *ctx = event->ctx;
1710
1711 file->private_data = NULL;
1712
1713 WARN_ON_ONCE(ctx->parent_ctx);
1714 mutex_lock(&ctx->mutex);
1715 perf_event_remove_from_context(event);
1716 mutex_unlock(&ctx->mutex);
1717
1718 mutex_lock(&event->owner->perf_event_mutex);
1719 list_del_init(&event->owner_entry);
1720 mutex_unlock(&event->owner->perf_event_mutex);
1721 put_task_struct(event->owner);
1722
1723 free_event(event);
1724
1725 return 0;
1726}
1727
1728static int perf_event_read_size(struct perf_event *event)
1729{
1730 int entry = sizeof(u64); /* value */
1731 int size = 0;
1732 int nr = 1;
1733
1734 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1735 size += sizeof(u64);
1736
1737 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1738 size += sizeof(u64);
1739
1740 if (event->attr.read_format & PERF_FORMAT_ID)
1741 entry += sizeof(u64);
1742
1743 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1744 nr += event->group_leader->nr_siblings;
1745 size += sizeof(u64);
1746 }
1747
1748 size += entry * nr;
1749
1750 return size;
1751}
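
perf_event_read_size() sizes the buffer for the layouts emitted by perf_event_read_group()/perf_event_read_one() below. For example, PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_ID | PERF_FORMAT_GROUP on a leader with two siblings gives nr = 3, entry = 16 and a total of 8 + 8 + 3*16 = 64 bytes. A sketch of the matching byte layout as userspace would see it; the struct and its field names are illustrative only, not part of the ABI headers:

    /* Illustrative only: 64-byte read() result for the read_format above. */
    #include <linux/types.h>

    struct my_group_read {
            __u64 nr;                       /* 3: leader plus two siblings     */
            __u64 time_enabled;             /* PERF_FORMAT_TOTAL_TIME_ENABLED  */
            struct {
                    __u64 value;
                    __u64 id;               /* PERF_FORMAT_ID                  */
            } counter[3];                   /* leader first, then the siblings */
    };
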
1752
1753static u64 perf_event_read_value(struct perf_event *event)
1754{
1755 struct perf_event *child;
1756 u64 total = 0;
1757
1758 total += perf_event_read(event);
1759 list_for_each_entry(child, &event->child_list, child_list)
1760 total += perf_event_read(child);
1761
1762 return total;
1763}
1764
1765static int perf_event_read_entry(struct perf_event *event,
1766 u64 read_format, char __user *buf)
1767{
1768 int n = 0, count = 0;
1769 u64 values[2];
1770
1771 values[n++] = perf_event_read_value(event);
1772 if (read_format & PERF_FORMAT_ID)
1773 values[n++] = primary_event_id(event);
1774
1775 count = n * sizeof(u64);
1776
1777 if (copy_to_user(buf, values, count))
1778 return -EFAULT;
1779
1780 return count;
1781}
1782
1783static int perf_event_read_group(struct perf_event *event,
1784 u64 read_format, char __user *buf)
1785{
1786 struct perf_event *leader = event->group_leader, *sub;
1787 int n = 0, size = 0, err = -EFAULT;
1788 u64 values[3];
1789
1790 values[n++] = 1 + leader->nr_siblings;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1792 values[n++] = leader->total_time_enabled +
1793 atomic64_read(&leader->child_total_time_enabled);
1794 }
1795 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1796 values[n++] = leader->total_time_running +
1797 atomic64_read(&leader->child_total_time_running);
1798 }
1799
1800 size = n * sizeof(u64);
1801
1802 if (copy_to_user(buf, values, size))
1803 return -EFAULT;
1804
1805 err = perf_event_read_entry(leader, read_format, buf + size);
1806 if (err < 0)
1807 return err;
1808
1809 size += err;
1810
1811 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1812 err = perf_event_read_entry(sub, read_format,
1813 buf + size);
1814 if (err < 0)
1815 return err;
1816
1817 size += err;
1818 }
1819
1820 return size;
1821}
1822
1823static int perf_event_read_one(struct perf_event *event,
1824 u64 read_format, char __user *buf)
1825{
1826 u64 values[4];
1827 int n = 0;
1828
1829 values[n++] = perf_event_read_value(event);
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1831 values[n++] = event->total_time_enabled +
1832 atomic64_read(&event->child_total_time_enabled);
1833 }
1834 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1835 values[n++] = event->total_time_running +
1836 atomic64_read(&event->child_total_time_running);
1837 }
1838 if (read_format & PERF_FORMAT_ID)
1839 values[n++] = primary_event_id(event);
1840
1841 if (copy_to_user(buf, values, n * sizeof(u64)))
1842 return -EFAULT;
1843
1844 return n * sizeof(u64);
1845}
1846
1847/*
1848 * Read the performance event - simple non-blocking version for now
1849 */
1850static ssize_t
1851perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1852{
1853 u64 read_format = event->attr.read_format;
1854 int ret;
1855
1856 /*
1857 * Return end-of-file for a read on an event that is in
1858 * error state (i.e. because it was pinned but it couldn't be
1859 * scheduled on to the CPU at some point).
1860 */
1861 if (event->state == PERF_EVENT_STATE_ERROR)
1862 return 0;
1863
1864 if (count < perf_event_read_size(event))
1865 return -ENOSPC;
1866
1867 WARN_ON_ONCE(event->ctx->parent_ctx);
1868 mutex_lock(&event->child_mutex);
1869 if (read_format & PERF_FORMAT_GROUP)
1870 ret = perf_event_read_group(event, read_format, buf);
1871 else
1872 ret = perf_event_read_one(event, read_format, buf);
1873 mutex_unlock(&event->child_mutex);
1874
1875 return ret;
1876}
1877
1878static ssize_t
1879perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1880{
1881 struct perf_event *event = file->private_data;
1882
1883 return perf_read_hw(event, buf, count);
1884}
1885
1886static unsigned int perf_poll(struct file *file, poll_table *wait)
1887{
1888 struct perf_event *event = file->private_data;
1889 struct perf_mmap_data *data;
1890 unsigned int events = POLLHUP;
1891
1892 rcu_read_lock();
1893 data = rcu_dereference(event->data);
1894 if (data)
1895 events = atomic_xchg(&data->poll, 0);
1896 rcu_read_unlock();
1897
1898 poll_wait(file, &event->waitq, wait);
1899
1900 return events;
1901}
1902
1903static void perf_event_reset(struct perf_event *event)
1904{
1905 (void)perf_event_read(event);
1906 atomic64_set(&event->count, 0);
1907 perf_event_update_userpage(event);
1908}
1909
1910/*
1911 * Holding the top-level event's child_mutex means that any
1912 * descendant process that has inherited this event will block
1913 * in sync_child_event if it goes to exit, thus satisfying the
1914 * task existence requirements of perf_event_enable/disable.
1915 */
1916static void perf_event_for_each_child(struct perf_event *event,
1917 void (*func)(struct perf_event *))
1918{
1919 struct perf_event *child;
1920
1921 WARN_ON_ONCE(event->ctx->parent_ctx);
1922 mutex_lock(&event->child_mutex);
1923 func(event);
1924 list_for_each_entry(child, &event->child_list, child_list)
1925 func(child);
1926 mutex_unlock(&event->child_mutex);
1927}
1928
1929static void perf_event_for_each(struct perf_event *event,
1930 void (*func)(struct perf_event *))
1931{
1932 struct perf_event_context *ctx = event->ctx;
1933 struct perf_event *sibling;
1934
1935 WARN_ON_ONCE(ctx->parent_ctx);
1936 mutex_lock(&ctx->mutex);
1937 event = event->group_leader;
1938
1939 perf_event_for_each_child(event, func);
1940 func(event);
1941 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1942 perf_event_for_each_child(sibling, func);
1943 mutex_unlock(&ctx->mutex);
1944}
1945
1946static int perf_event_period(struct perf_event *event, u64 __user *arg)
1947{
1948 struct perf_event_context *ctx = event->ctx;
1949 unsigned long size;
1950 int ret = 0;
1951 u64 value;
1952
1953 if (!event->attr.sample_period)
1954 return -EINVAL;
1955
1956 size = copy_from_user(&value, arg, sizeof(value));
1957 if (size != sizeof(value))
1958 return -EFAULT;
1959
1960 if (!value)
1961 return -EINVAL;
1962
1963 spin_lock_irq(&ctx->lock);
1964 if (event->attr.freq) {
1965 if (value > sysctl_perf_event_sample_rate) {
1966 ret = -EINVAL;
1967 goto unlock;
1968 }
1969
1970 event->attr.sample_freq = value;
1971 } else {
1972 event->attr.sample_period = value;
1973 event->hw.sample_period = value;
1974 }
1975unlock:
1976 spin_unlock_irq(&ctx->lock);
1977
1978 return ret;
1979}
1980
1981static int perf_event_set_output(struct perf_event *event, int output_fd);
1982static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1983
1984static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1985{
1986 struct perf_event *event = file->private_data;
1987 void (*func)(struct perf_event *);
1988 u32 flags = arg;
1989
1990 switch (cmd) {
1991 case PERF_EVENT_IOC_ENABLE:
1992 func = perf_event_enable;
1993 break;
1994 case PERF_EVENT_IOC_DISABLE:
1995 func = perf_event_disable;
1996 break;
1997 case PERF_EVENT_IOC_RESET:
1998 func = perf_event_reset;
1999 break;
2000
2001 case PERF_EVENT_IOC_REFRESH:
2002 return perf_event_refresh(event, arg);
2003
2004 case PERF_EVENT_IOC_PERIOD:
2005 return perf_event_period(event, (u64 __user *)arg);
2006
2007 case PERF_EVENT_IOC_SET_OUTPUT:
2008 return perf_event_set_output(event, arg);
2009
2010 case PERF_EVENT_IOC_SET_FILTER:
2011 return perf_event_set_filter(event, (void __user *)arg);
2012
2013 default:
2014 return -ENOTTY;
2015 }
2016
2017 if (flags & PERF_IOC_FLAG_GROUP)
2018 perf_event_for_each(event, func);
2019 else
2020 perf_event_for_each_child(event, func);
2021
2022 return 0;
2023}
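
Userspace reaches this switch through ioctl() on the fd returned by the perf_event_open() syscall. A minimal sketch of restarting a whole group; the PERF_EVENT_IOC_* and PERF_IOC_FLAG_GROUP constants are the ones handled above, and error handling is omitted:

    /* Userspace sketch: reset and re-enable every event in a group. */
    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    static void restart_group(int group_fd)
    {
            ioctl(group_fd, PERF_EVENT_IOC_RESET,  PERF_IOC_FLAG_GROUP);
            ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }
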
2024
2025int perf_event_task_enable(void)
2026{
2027 struct perf_event *event;
2028
2029 mutex_lock(&current->perf_event_mutex);
2030 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2031 perf_event_for_each_child(event, perf_event_enable);
2032 mutex_unlock(&current->perf_event_mutex);
2033
2034 return 0;
2035}
2036
2037int perf_event_task_disable(void)
2038{
2039 struct perf_event *event;
2040
2041 mutex_lock(&current->perf_event_mutex);
2042 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2043 perf_event_for_each_child(event, perf_event_disable);
2044 mutex_unlock(&current->perf_event_mutex);
2045
2046 return 0;
2047}
2048
2049#ifndef PERF_EVENT_INDEX_OFFSET
2050# define PERF_EVENT_INDEX_OFFSET 0
2051#endif
2052
2053static int perf_event_index(struct perf_event *event)
2054{
2055 if (event->state != PERF_EVENT_STATE_ACTIVE)
2056 return 0;
2057
2058 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2059}
2060
2061/*
2062 * Callers need to ensure there can be no nesting of this function, otherwise
2063 * the seqlock logic goes bad. We cannot serialize this because the arch
2064 * code calls this from NMI context.
2065 */
2066void perf_event_update_userpage(struct perf_event *event)
2067{
2068 struct perf_event_mmap_page *userpg;
2069 struct perf_mmap_data *data;
2070
2071 rcu_read_lock();
2072 data = rcu_dereference(event->data);
2073 if (!data)
2074 goto unlock;
2075
2076 userpg = data->user_page;
2077
2078 /*
2079 * Disable preemption so as not to let the corresponding user-space
2080 * spin too long if we get preempted.
2081 */
2082 preempt_disable();
2083 ++userpg->lock;
2084 barrier();
2085 userpg->index = perf_event_index(event);
2086 userpg->offset = atomic64_read(&event->count);
2087 if (event->state == PERF_EVENT_STATE_ACTIVE)
2088 userpg->offset -= atomic64_read(&event->hw.prev_count);
2089
2090 userpg->time_enabled = event->total_time_enabled +
2091 atomic64_read(&event->child_total_time_enabled);
2092
2093 userpg->time_running = event->total_time_running +
2094 atomic64_read(&event->child_total_time_running);
2095
2096 barrier();
2097 ++userpg->lock;
2098 preempt_enable();
2099unlock:
2100 rcu_read_unlock();
2101}
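
/*
 * Editor's note: an illustrative user-space sketch, not part of this file.
 * The ++lock / barrier() / ++lock sequence above is a seqcount: the lock
 * word is odd while the kernel updates the page, so a reader retries until
 * it sees the same even value before and after reading.  "pc" is assumed to
 * point at the first (control) page of the event's mmap()ed buffer.
 */
#include <linux/perf_event.h>

struct userpg_snapshot {
	__u32 index;
	__s64 offset;
	__u64 time_enabled;
	__u64 time_running;
};

static void read_user_page(volatile struct perf_event_mmap_page *pc,
			   struct userpg_snapshot *snap)
{
	__u32 seq;

	do {
		seq = pc->lock;
		__sync_synchronize();		/* pairs with barrier() above */

		snap->index        = pc->index;
		snap->offset       = pc->offset;
		snap->time_enabled = pc->time_enabled;
		snap->time_running = pc->time_running;

		__sync_synchronize();
	} while (pc->lock != seq || (seq & 1));	/* retry torn snapshots */
}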
2102
2103static unsigned long perf_data_size(struct perf_mmap_data *data)
2104{
2105 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2106}
2107
2108#ifndef CONFIG_PERF_USE_VMALLOC
2109
2110/*
2111 * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
2112 */
2113
2114static struct page *
2115perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2116{
2117 if (pgoff > data->nr_pages)
2118 return NULL;
2119
2120 if (pgoff == 0)
2121 return virt_to_page(data->user_page);
2122
2123 return virt_to_page(data->data_pages[pgoff - 1]);
2124}
2125
2126static struct perf_mmap_data *
2127perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2128{
2129 struct perf_mmap_data *data;
2130 unsigned long size;
2131 int i;
2132
2133 WARN_ON(atomic_read(&event->mmap_count));
2134
2135 size = sizeof(struct perf_mmap_data);
2136 size += nr_pages * sizeof(void *);
2137
2138 data = kzalloc(size, GFP_KERNEL);
2139 if (!data)
2140 goto fail;
2141
2142 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2143 if (!data->user_page)
2144 goto fail_user_page;
2145
2146 for (i = 0; i < nr_pages; i++) {
2147 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2148 if (!data->data_pages[i])
2149 goto fail_data_pages;
2150 }
2151
2152 data->data_order = 0;
2153 data->nr_pages = nr_pages;
2154
2155 return data;
2156
2157fail_data_pages:
2158 for (i--; i >= 0; i--)
2159 free_page((unsigned long)data->data_pages[i]);
2160
2161 free_page((unsigned long)data->user_page);
2162
2163fail_user_page:
2164 kfree(data);
2165
2166fail:
2167 return NULL;
2168}
2169
2170static void perf_mmap_free_page(unsigned long addr)
2171{
2172 struct page *page = virt_to_page((void *)addr);
2173
2174 page->mapping = NULL;
2175 __free_page(page);
2176}
2177
2178static void perf_mmap_data_free(struct perf_mmap_data *data)
2179{
2180 int i;
2181
2182 perf_mmap_free_page((unsigned long)data->user_page);
2183 for (i = 0; i < data->nr_pages; i++)
2184 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2185}
2186
2187#else
2188
2189/*
2190 * Back perf_mmap() with vmalloc memory.
2191 *
2192 * Required for architectures that have d-cache aliasing issues.
2193 */
2194
2195static struct page *
2196perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2197{
2198 if (pgoff > (1UL << data->data_order))
2199 return NULL;
2200
2201 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2202}
2203
2204static void perf_mmap_unmark_page(void *addr)
2205{
2206 struct page *page = vmalloc_to_page(addr);
2207
2208 page->mapping = NULL;
2209}
2210
2211static void perf_mmap_data_free_work(struct work_struct *work)
2212{
2213 struct perf_mmap_data *data;
2214 void *base;
2215 int i, nr;
2216
2217 data = container_of(work, struct perf_mmap_data, work);
2218 nr = 1 << data->data_order;
2219
2220 base = data->user_page;
2221 for (i = 0; i < nr + 1; i++)
2222 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2223
2224 vfree(base);
2225}
2226
2227static void perf_mmap_data_free(struct perf_mmap_data *data)
2228{
2229 schedule_work(&data->work);
2230}
2231
2232static struct perf_mmap_data *
2233perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2234{
2235 struct perf_mmap_data *data;
2236 unsigned long size;
2237 void *all_buf;
2238
2239 WARN_ON(atomic_read(&event->mmap_count));
2240
2241 size = sizeof(struct perf_mmap_data);
2242 size += sizeof(void *);
2243
2244 data = kzalloc(size, GFP_KERNEL);
2245 if (!data)
2246 goto fail;
2247
2248 INIT_WORK(&data->work, perf_mmap_data_free_work);
2249
2250 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2251 if (!all_buf)
2252 goto fail_all_buf;
2253
2254 data->user_page = all_buf;
2255 data->data_pages[0] = all_buf + PAGE_SIZE;
2256 data->data_order = ilog2(nr_pages);
2257 data->nr_pages = 1;
2258
2259 return data;
2260
2261fail_all_buf:
2262 kfree(data);
2263
2264fail:
2265 return NULL;
2266}
2267
2268#endif
2269
2270static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2271{
2272 struct perf_event *event = vma->vm_file->private_data;
2273 struct perf_mmap_data *data;
2274 int ret = VM_FAULT_SIGBUS;
2275
2276 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2277 if (vmf->pgoff == 0)
2278 ret = 0;
2279 return ret;
2280 }
2281
2282 rcu_read_lock();
2283 data = rcu_dereference(event->data);
2284 if (!data)
2285 goto unlock;
2286
2287 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2288 goto unlock;
2289
2290 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2291 if (!vmf->page)
2292 goto unlock;
2293
2294 get_page(vmf->page);
2295 vmf->page->mapping = vma->vm_file->f_mapping;
2296 vmf->page->index = vmf->pgoff;
2297
2298 ret = 0;
2299unlock:
2300 rcu_read_unlock();
2301
2302 return ret;
2303}
2304
2305static void
2306perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307{
2308 long max_size = perf_data_size(data);
2309
2310 atomic_set(&data->lock, -1);
2311
2312 if (event->attr.watermark) {
2313 data->watermark = min_t(long, max_size,
2314 event->attr.wakeup_watermark);
2315 }
2316
2317 if (!data->watermark)
2318 data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2319
2320
2321 rcu_assign_pointer(event->data, data);
2322}
2323
2324static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2325{
2326 struct perf_mmap_data *data;
2327
2328 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2329 perf_mmap_data_free(data);
2330 kfree(data);
2331}
2332
2333static void perf_mmap_data_release(struct perf_event *event)
2334{
2335 struct perf_mmap_data *data = event->data;
2336
2337 WARN_ON(atomic_read(&event->mmap_count));
2338
2339 rcu_assign_pointer(event->data, NULL);
2340 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2341}
2342
2343static void perf_mmap_open(struct vm_area_struct *vma)
2344{
2345 struct perf_event *event = vma->vm_file->private_data;
2346
2347 atomic_inc(&event->mmap_count);
2348}
2349
2350static void perf_mmap_close(struct vm_area_struct *vma)
2351{
2352 struct perf_event *event = vma->vm_file->private_data;
2353
2354 WARN_ON_ONCE(event->ctx->parent_ctx);
2355 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2356 unsigned long size = perf_data_size(event->data);
2357 struct user_struct *user = current_user();
2358
2359 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2360 vma->vm_mm->locked_vm -= event->data->nr_locked;
2361 perf_mmap_data_release(event);
2362 mutex_unlock(&event->mmap_mutex);
2363 }
2364}
2365
2366static const struct vm_operations_struct perf_mmap_vmops = {
2367 .open = perf_mmap_open,
2368 .close = perf_mmap_close,
2369 .fault = perf_mmap_fault,
2370 .page_mkwrite = perf_mmap_fault,
2371};
2372
2373static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2374{
2375 struct perf_event *event = file->private_data;
2376 unsigned long user_locked, user_lock_limit;
2377 struct user_struct *user = current_user();
2378 unsigned long locked, lock_limit;
2379 struct perf_mmap_data *data;
2380 unsigned long vma_size;
2381 unsigned long nr_pages;
2382 long user_extra, extra;
2383 int ret = 0;
2384
2385 if (!(vma->vm_flags & VM_SHARED))
2386 return -EINVAL;
2387
2388 vma_size = vma->vm_end - vma->vm_start;
2389 nr_pages = (vma_size / PAGE_SIZE) - 1;
2390
2391 /*
2392 * If we have data pages, ensure there is a power-of-two number of them, so we
2393 * can do bitmasks instead of modulo.
2394 */
2395 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2396 return -EINVAL;
2397
2398 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2399 return -EINVAL;
2400
2401 if (vma->vm_pgoff != 0)
2402 return -EINVAL;
2403
2404 WARN_ON_ONCE(event->ctx->parent_ctx);
2405 mutex_lock(&event->mmap_mutex);
2406 if (event->output) {
2407 ret = -EINVAL;
2408 goto unlock;
2409 }
2410
2411 if (atomic_inc_not_zero(&event->mmap_count)) {
2412 if (nr_pages != event->data->nr_pages)
2413 ret = -EINVAL;
2414 goto unlock;
2415 }
2416
2417 user_extra = nr_pages + 1;
2418 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2419
2420 /*
2421 * Increase the limit linearly with more CPUs:
2422 */
2423 user_lock_limit *= num_online_cpus();
2424
2425 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2426
2427 extra = 0;
2428 if (user_locked > user_lock_limit)
2429 extra = user_locked - user_lock_limit;
2430
2431 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2432 lock_limit >>= PAGE_SHIFT;
2433 locked = vma->vm_mm->locked_vm + extra;
2434
2435 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2436 !capable(CAP_IPC_LOCK)) {
2437 ret = -EPERM;
2438 goto unlock;
2439 }
2440
2441 WARN_ON(event->data);
2442
2443 data = perf_mmap_data_alloc(event, nr_pages);
2444 ret = -ENOMEM;
2445 if (!data)
2446 goto unlock;
2447
2448 ret = 0;
2449 perf_mmap_data_init(event, data);
2450
2451 atomic_set(&event->mmap_count, 1);
2452 atomic_long_add(user_extra, &user->locked_vm);
2453 vma->vm_mm->locked_vm += extra;
2454 event->data->nr_locked = extra;
2455 if (vma->vm_flags & VM_WRITE)
2456 event->data->writable = 1;
2457
2458unlock:
2459 mutex_unlock(&event->mmap_mutex);
2460
2461 vma->vm_flags |= VM_RESERVED;
2462 vma->vm_ops = &perf_mmap_vmops;
2463
2464 return ret;
2465}
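
/*
 * Editor's note: an illustrative user-space sketch, not part of this file.
 * perf_mmap() above only accepts a MAP_SHARED mapping, at offset 0, of
 * exactly 1 + 2^n pages: one control page followed by a power-of-two number
 * of data pages.  A minimal caller therefore looks like this ("fd" is an
 * event file descriptor; mapping PROT_WRITE also sets data->writable so the
 * kernel honours data_tail instead of silently overwriting unread data).
 */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <unistd.h>

static struct perf_event_mmap_page *
map_perf_buffer(int fd, unsigned int data_pages /* must be a power of two */)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	size_t len = (1 + data_pages) * page_size;	/* control + data */
	void *base;

	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;

	/* the control page comes first; sample data starts one page in */
	return base;
}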
2466
2467static int perf_fasync(int fd, struct file *filp, int on)
2468{
2469 struct inode *inode = filp->f_path.dentry->d_inode;
2470 struct perf_event *event = filp->private_data;
2471 int retval;
2472
2473 mutex_lock(&inode->i_mutex);
2474 retval = fasync_helper(fd, filp, on, &event->fasync);
2475 mutex_unlock(&inode->i_mutex);
2476
2477 if (retval < 0)
2478 return retval;
2479
2480 return 0;
2481}
2482
2483static const struct file_operations perf_fops = {
2484 .release = perf_release,
2485 .read = perf_read,
2486 .poll = perf_poll,
2487 .unlocked_ioctl = perf_ioctl,
2488 .compat_ioctl = perf_ioctl,
2489 .mmap = perf_mmap,
2490 .fasync = perf_fasync,
2491};
2492
2493/*
2494 * Perf event wakeup
2495 *
2496 * If there's data, ensure we set the poll() state and publish everything
2497 * to user-space before waking everybody up.
2498 */
2499
2500void perf_event_wakeup(struct perf_event *event)
2501{
2502 wake_up_all(&event->waitq);
2503
2504 if (event->pending_kill) {
2505 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2506 event->pending_kill = 0;
2507 }
2508}
2509
2510/*
2511 * Pending wakeups
2512 *
2513 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2514 *
2515 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2516 * singly-linked list and use cmpxchg() to add entries locklessly.
2517 */
2518
2519static void perf_pending_event(struct perf_pending_entry *entry)
2520{
2521 struct perf_event *event = container_of(entry,
2522 struct perf_event, pending);
2523
2524 if (event->pending_disable) {
2525 event->pending_disable = 0;
2526 __perf_event_disable(event);
2527 }
2528
2529 if (event->pending_wakeup) {
2530 event->pending_wakeup = 0;
2531 perf_event_wakeup(event);
2532 }
2533}
2534
2535#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2536
2537static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2538 PENDING_TAIL,
2539};
2540
2541static void perf_pending_queue(struct perf_pending_entry *entry,
2542 void (*func)(struct perf_pending_entry *))
2543{
2544 struct perf_pending_entry **head;
2545
2546 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2547 return;
2548
2549 entry->func = func;
2550
2551 head = &get_cpu_var(perf_pending_head);
2552
2553 do {
2554 entry->next = *head;
2555 } while (cmpxchg(head, entry->next, entry) != entry->next);
2556
2557 set_perf_event_pending();
2558
2559 put_cpu_var(perf_pending_head);
2560}
2561
2562static int __perf_pending_run(void)
2563{
2564 struct perf_pending_entry *list;
2565 int nr = 0;
2566
2567 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2568 while (list != PENDING_TAIL) {
2569 void (*func)(struct perf_pending_entry *);
2570 struct perf_pending_entry *entry = list;
2571
2572 list = list->next;
2573
2574 func = entry->func;
2575 entry->next = NULL;
2576 /*
2577 * Ensure we observe the unqueue before we issue the wakeup,
2578 * so that we won't be waiting forever.
2579 * -- see perf_not_pending().
2580 */
2581 smp_wmb();
2582
2583 func(entry);
2584 nr++;
2585 }
2586
2587 return nr;
2588}
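
/*
 * Editor's note: a standalone sketch, not part of this file.  It restates
 * the lock-free list used by perf_pending_queue()/__perf_pending_run() with
 * GCC atomic builtins, minus the per-CPU and NMI details, to make the
 * algorithm easier to follow: each entry is claimed once via cmpxchg() on
 * its next pointer, pushed onto a sentinel-terminated stack, and the
 * consumer detaches the whole stack at once with an atomic exchange.
 */
#include <stddef.h>

struct pending_entry {
	struct pending_entry *next;
	void (*func)(struct pending_entry *);
};

#define TAIL ((struct pending_entry *)-1UL)

static struct pending_entry *pending_head = TAIL;

static void pending_push(struct pending_entry *entry,
			 void (*func)(struct pending_entry *))
{
	struct pending_entry *old;

	/* a non-NULL next means the entry is already queued */
	if (!__sync_bool_compare_and_swap(&entry->next, NULL, TAIL))
		return;

	entry->func = func;

	do {
		old = pending_head;
		entry->next = old;
	} while (!__sync_bool_compare_and_swap(&pending_head, old, entry));
}

static int pending_run(void)
{
	struct pending_entry *list;
	int nr = 0;

	/* detach everything queued so far, leaving the sentinel behind */
	list = __sync_lock_test_and_set(&pending_head, TAIL);
	while (list != TAIL) {
		struct pending_entry *entry = list;

		list = list->next;
		entry->next = NULL;		/* mark as unqueued */
		__sync_synchronize();		/* mirrors the smp_wmb() above */
		entry->func(entry);
		nr++;
	}
	return nr;
}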
2589
2590static inline int perf_not_pending(struct perf_event *event)
2591{
2592 /*
2593 * If we flush on whatever cpu we run, there is a chance we don't
2594 * need to wait.
2595 */
2596 get_cpu();
2597 __perf_pending_run();
2598 put_cpu();
2599
2600 /*
2601 * Ensure we see the proper queue state before going to sleep
2602 * so that we do not miss the wakeup. -- see __perf_pending_run()
2603 */
2604 smp_rmb();
2605 return event->pending.next == NULL;
2606}
2607
2608static void perf_pending_sync(struct perf_event *event)
2609{
2610 wait_event(event->waitq, perf_not_pending(event));
2611}
2612
2613void perf_event_do_pending(void)
2614{
2615 __perf_pending_run();
2616}
2617
2618/*
2619 * Callchain support -- arch specific
2620 */
2621
2622__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2623{
2624 return NULL;
2625}
2626
2627/*
2628 * Output
2629 */
2630static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2631 unsigned long offset, unsigned long head)
2632{
2633 unsigned long mask;
2634
2635 if (!data->writable)
2636 return true;
2637
2638 mask = perf_data_size(data) - 1;
2639
2640 offset = (offset - tail) & mask;
2641 head = (head - tail) & mask;
2642
2643 if ((int)(head - offset) < 0)
2644 return false;
2645
2646 return true;
2647}
2648
2649static void perf_output_wakeup(struct perf_output_handle *handle)
2650{
2651 atomic_set(&handle->data->poll, POLL_IN);
2652
2653 if (handle->nmi) {
2654 handle->event->pending_wakeup = 1;
2655 perf_pending_queue(&handle->event->pending,
2656 perf_pending_event);
2657 } else
2658 perf_event_wakeup(handle->event);
2659}
2660
2661/*
2662 * Curious locking construct.
2663 *
2664 * We need to ensure a later event_id doesn't publish a head when a former
2665 * event_id isn't done writing. However since we need to deal with NMIs we
2666 * cannot fully serialize things.
2667 *
2668 * What we do is serialize between CPUs so we only have to deal with NMI
2669 * nesting on a single CPU.
2670 *
2671 * We only publish the head (and generate a wakeup) when the outer-most
2672 * event_id completes.
2673 */
2674static void perf_output_lock(struct perf_output_handle *handle)
2675{
2676 struct perf_mmap_data *data = handle->data;
2677 int cpu;
2678
2679 handle->locked = 0;
2680
2681 local_irq_save(handle->flags);
2682 cpu = smp_processor_id();
2683
2684 if (in_nmi() && atomic_read(&data->lock) == cpu)
2685 return;
2686
2687 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2688 cpu_relax();
2689
2690 handle->locked = 1;
2691}
2692
2693static void perf_output_unlock(struct perf_output_handle *handle)
2694{
2695 struct perf_mmap_data *data = handle->data;
2696 unsigned long head;
2697 int cpu;
2698
2699 data->done_head = data->head;
2700
2701 if (!handle->locked)
2702 goto out;
2703
2704again:
2705 /*
2706 * The xchg implies a full barrier that ensures all writes are done
2707 * before we publish the new head, matched by a rmb() in userspace when
2708 * reading this position.
2709 */
2710 while ((head = atomic_long_xchg(&data->done_head, 0)))
2711 data->user_page->data_head = head;
2712
2713 /*
2714 * NMI can happen here, which means we can miss a done_head update.
2715 */
2716
2717 cpu = atomic_xchg(&data->lock, -1);
2718 WARN_ON_ONCE(cpu != smp_processor_id());
2719
2720 /*
2721 * Therefore we have to check that we did not in fact miss one.
2722 */
2723 if (unlikely(atomic_long_read(&data->done_head))) {
2724 /*
2725 * Since we had it locked, we can lock it again.
2726 */
2727 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2728 cpu_relax();
2729
2730 goto again;
2731 }
2732
2733 if (atomic_xchg(&data->wakeup, 0))
2734 perf_output_wakeup(handle);
2735out:
2736 local_irq_restore(handle->flags);
2737}
2738
2739void perf_output_copy(struct perf_output_handle *handle,
2740 const void *buf, unsigned int len)
2741{
2742 unsigned int pages_mask;
2743 unsigned long offset;
2744 unsigned int size;
2745 void **pages;
2746
2747 offset = handle->offset;
2748 pages_mask = handle->data->nr_pages - 1;
2749 pages = handle->data->data_pages;
2750
2751 do {
2752 unsigned long page_offset;
2753 unsigned long page_size;
2754 int nr;
2755
2756 nr = (offset >> PAGE_SHIFT) & pages_mask;
2757 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2758 page_offset = offset & (page_size - 1);
2759 size = min_t(unsigned int, page_size - page_offset, len);
2760
2761 memcpy(pages[nr] + page_offset, buf, size);
2762
2763 len -= size;
2764 buf += size;
2765 offset += size;
2766 } while (len);
2767
2768 handle->offset = offset;
2769
2770 /*
2771 * Check we didn't copy past our reservation window, taking the
2772 * possible unsigned int wrap into account.
2773 */
2774 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2775}
2776
2777int perf_output_begin(struct perf_output_handle *handle,
2778 struct perf_event *event, unsigned int size,
2779 int nmi, int sample)
2780{
2781 struct perf_event *output_event;
2782 struct perf_mmap_data *data;
2783 unsigned long tail, offset, head;
2784 int have_lost;
2785 struct {
2786 struct perf_event_header header;
2787 u64 id;
2788 u64 lost;
2789 } lost_event;
2790
2791 rcu_read_lock();
2792 /*
2793 * For inherited events we send all the output towards the parent.
2794 */
2795 if (event->parent)
2796 event = event->parent;
2797
2798 output_event = rcu_dereference(event->output);
2799 if (output_event)
2800 event = output_event;
2801
2802 data = rcu_dereference(event->data);
2803 if (!data)
2804 goto out;
2805
2806 handle->data = data;
2807 handle->event = event;
2808 handle->nmi = nmi;
2809 handle->sample = sample;
2810
2811 if (!data->nr_pages)
2812 goto fail;
2813
2814 have_lost = atomic_read(&data->lost);
2815 if (have_lost)
2816 size += sizeof(lost_event);
2817
2818 perf_output_lock(handle);
2819
2820 do {
2821 /*
2822 * Userspace could choose to issue a mb() before updating the
2823 * tail pointer, so that all reads are completed before the
2824 * write is issued.
2825 */
2826 tail = ACCESS_ONCE(data->user_page->data_tail);
2827 smp_rmb();
2828 offset = head = atomic_long_read(&data->head);
2829 head += size;
2830 if (unlikely(!perf_output_space(data, tail, offset, head)))
2831 goto fail;
2832 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2833
2834 handle->offset = offset;
2835 handle->head = head;
2836
2837 if (head - tail > data->watermark)
2838 atomic_set(&data->wakeup, 1);
2839
2840 if (have_lost) {
2841 lost_event.header.type = PERF_RECORD_LOST;
2842 lost_event.header.misc = 0;
2843 lost_event.header.size = sizeof(lost_event);
2844 lost_event.id = event->id;
2845 lost_event.lost = atomic_xchg(&data->lost, 0);
2846
2847 perf_output_put(handle, lost_event);
2848 }
2849
2850 return 0;
2851
2852fail:
2853 atomic_inc(&data->lost);
2854 perf_output_unlock(handle);
2855out:
2856 rcu_read_unlock();
2857
2858 return -ENOSPC;
2859}
2860
2861void perf_output_end(struct perf_output_handle *handle)
2862{
2863 struct perf_event *event = handle->event;
2864 struct perf_mmap_data *data = handle->data;
2865
2866 int wakeup_events = event->attr.wakeup_events;
2867
2868 if (handle->sample && wakeup_events) {
2869 int events = atomic_inc_return(&data->events);
2870 if (events >= wakeup_events) {
2871 atomic_sub(wakeup_events, &data->events);
2872 atomic_set(&data->wakeup, 1);
2873 }
2874 }
2875
2876 perf_output_unlock(handle);
2877 rcu_read_unlock();
2878}
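
/*
 * Editor's note: an illustrative user-space sketch, not part of this file.
 * It shows the consumer side of the protocol perf_output_begin() relies on:
 * read data_head, issue a read barrier, consume complete records, then
 * publish the new data_tail so perf_output_space() knows the space may be
 * reused.  "base" is the mapping from the earlier mmap() sketch and
 * process_record() is a hypothetical callback; records that wrap past the
 * end of the buffer are not handled here, for brevity.
 */
#include <linux/perf_event.h>
#include <unistd.h>

static void drain_ring(void *base, unsigned int data_pages,
		       void (*process_record)(struct perf_event_header *))
{
	struct perf_event_mmap_page *pc = base;
	size_t page_size = sysconf(_SC_PAGESIZE);
	unsigned char *data = (unsigned char *)base + page_size;
	__u64 mask = (__u64)data_pages * page_size - 1;
	__u64 head, tail;

	head = pc->data_head;
	__sync_synchronize();			/* rmb(), see the comment above */

	for (tail = pc->data_tail; tail < head; ) {
		struct perf_event_header *hdr =
			(struct perf_event_header *)(data + (tail & mask));

		process_record(hdr);
		tail += hdr->size;
	}

	__sync_synchronize();			/* finish reading before freeing space */
	pc->data_tail = tail;
}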
2879
2880static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2881{
2882 /*
2883 * only top level events have the pid namespace they were created in
2884 */
2885 if (event->parent)
2886 event = event->parent;
2887
2888 return task_tgid_nr_ns(p, event->ns);
2889}
2890
2891static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2892{
2893 /*
2894 * only top level events have the pid namespace they were created in
2895 */
2896 if (event->parent)
2897 event = event->parent;
2898
2899 return task_pid_nr_ns(p, event->ns);
2900}
2901
2902static void perf_output_read_one(struct perf_output_handle *handle,
2903 struct perf_event *event)
2904{
2905 u64 read_format = event->attr.read_format;
2906 u64 values[4];
2907 int n = 0;
2908
2909 values[n++] = atomic64_read(&event->count);
2910 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2911 values[n++] = event->total_time_enabled +
2912 atomic64_read(&event->child_total_time_enabled);
2913 }
2914 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2915 values[n++] = event->total_time_running +
2916 atomic64_read(&event->child_total_time_running);
2917 }
2918 if (read_format & PERF_FORMAT_ID)
2919 values[n++] = primary_event_id(event);
2920
2921 perf_output_copy(handle, values, n * sizeof(u64));
2922}
2923
2924/*
2925 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2926 */
2927static void perf_output_read_group(struct perf_output_handle *handle,
2928 struct perf_event *event)
2929{
2930 struct perf_event *leader = event->group_leader, *sub;
2931 u64 read_format = event->attr.read_format;
2932 u64 values[5];
2933 int n = 0;
2934
2935 values[n++] = 1 + leader->nr_siblings;
2936
2937 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2938 values[n++] = leader->total_time_enabled;
2939
2940 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2941 values[n++] = leader->total_time_running;
2942
2943 if (leader != event)
2944 leader->pmu->read(leader);
2945
2946 values[n++] = atomic64_read(&leader->count);
2947 if (read_format & PERF_FORMAT_ID)
2948 values[n++] = primary_event_id(leader);
2949
2950 perf_output_copy(handle, values, n * sizeof(u64));
2951
2952 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2953 n = 0;
2954
2955 if (sub != event)
2956 sub->pmu->read(sub);
2957
2958 values[n++] = atomic64_read(&sub->count);
2959 if (read_format & PERF_FORMAT_ID)
2960 values[n++] = primary_event_id(sub);
2961
2962 perf_output_copy(handle, values, n * sizeof(u64));
2963 }
2964}
2965
2966static void perf_output_read(struct perf_output_handle *handle,
2967 struct perf_event *event)
2968{
2969 if (event->attr.read_format & PERF_FORMAT_GROUP)
2970 perf_output_read_group(handle, event);
2971 else
2972 perf_output_read_one(handle, event);
2973}
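
/*
 * Editor's note: an illustrative sketch, not part of this file.  The two
 * helpers above emit the read_format layout that user space also gets from
 * read() on the event fd: a flat array of u64 words whose meaning depends
 * on attr.read_format.  A hypothetical decoder for the PERF_FORMAT_GROUP
 * case could look like this (the time fields are skipped, not returned).
 */
#include <linux/perf_event.h>

struct decoded_value {
	__u64 value;
	__u64 id;		/* 0 when PERF_FORMAT_ID is not set */
};

/* returns the number of group members decoded into "out" */
static int decode_group_read(const __u64 *buf, __u64 read_format,
			     struct decoded_value *out, int max)
{
	int i, n = 0;
	__u64 nr = buf[n++];	/* 1 + nr_siblings, as written above */

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		n++;
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		n++;

	for (i = 0; i < (int)nr && i < max; i++) {
		out[i].value = buf[n++];
		out[i].id = (read_format & PERF_FORMAT_ID) ? buf[n++] : 0;
	}
	return i;
}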
2974
2975void perf_output_sample(struct perf_output_handle *handle,
2976 struct perf_event_header *header,
2977 struct perf_sample_data *data,
2978 struct perf_event *event)
2979{
2980 u64 sample_type = data->type;
2981
2982 perf_output_put(handle, *header);
2983
2984 if (sample_type & PERF_SAMPLE_IP)
2985 perf_output_put(handle, data->ip);
2986
2987 if (sample_type & PERF_SAMPLE_TID)
2988 perf_output_put(handle, data->tid_entry);
2989
2990 if (sample_type & PERF_SAMPLE_TIME)
2991 perf_output_put(handle, data->time);
2992
2993 if (sample_type & PERF_SAMPLE_ADDR)
2994 perf_output_put(handle, data->addr);
2995
2996 if (sample_type & PERF_SAMPLE_ID)
2997 perf_output_put(handle, data->id);
2998
2999 if (sample_type & PERF_SAMPLE_STREAM_ID)
3000 perf_output_put(handle, data->stream_id);
3001
3002 if (sample_type & PERF_SAMPLE_CPU)
3003 perf_output_put(handle, data->cpu_entry);
3004
3005 if (sample_type & PERF_SAMPLE_PERIOD)
3006 perf_output_put(handle, data->period);
3007
3008 if (sample_type & PERF_SAMPLE_READ)
3009 perf_output_read(handle, event);
3010
3011 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3012 if (data->callchain) {
3013 int size = 1;
3014
3015 if (data->callchain)
3016 size += data->callchain->nr;
3017
3018 size *= sizeof(u64);
3019
3020 perf_output_copy(handle, data->callchain, size);
3021 } else {
3022 u64 nr = 0;
3023 perf_output_put(handle, nr);
3024 }
3025 }
3026
3027 if (sample_type & PERF_SAMPLE_RAW) {
3028 if (data->raw) {
3029 perf_output_put(handle, data->raw->size);
3030 perf_output_copy(handle, data->raw->data,
3031 data->raw->size);
3032 } else {
3033 struct {
3034 u32 size;
3035 u32 data;
3036 } raw = {
3037 .size = sizeof(u32),
3038 .data = 0,
3039 };
3040 perf_output_put(handle, raw);
3041 }
3042 }
3043}
3044
3045void perf_prepare_sample(struct perf_event_header *header,
3046 struct perf_sample_data *data,
3047 struct perf_event *event,
3048 struct pt_regs *regs)
3049{
3050 u64 sample_type = event->attr.sample_type;
3051
3052 data->type = sample_type;
3053
3054 header->type = PERF_RECORD_SAMPLE;
3055 header->size = sizeof(*header);
3056
3057 header->misc = 0;
3058 header->misc |= perf_misc_flags(regs);
3059
3060 if (sample_type & PERF_SAMPLE_IP) {
3061 data->ip = perf_instruction_pointer(regs);
3062
3063 header->size += sizeof(data->ip);
3064 }
3065
3066 if (sample_type & PERF_SAMPLE_TID) {
3067 /* namespace issues */
3068 data->tid_entry.pid = perf_event_pid(event, current);
3069 data->tid_entry.tid = perf_event_tid(event, current);
3070
3071 header->size += sizeof(data->tid_entry);
3072 }
3073
3074 if (sample_type & PERF_SAMPLE_TIME) {
3075 data->time = perf_clock();
3076
3077 header->size += sizeof(data->time);
3078 }
3079
3080 if (sample_type & PERF_SAMPLE_ADDR)
3081 header->size += sizeof(data->addr);
3082
3083 if (sample_type & PERF_SAMPLE_ID) {
3084 data->id = primary_event_id(event);
3085
3086 header->size += sizeof(data->id);
3087 }
3088
3089 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3090 data->stream_id = event->id;
3091
3092 header->size += sizeof(data->stream_id);
3093 }
3094
3095 if (sample_type & PERF_SAMPLE_CPU) {
3096 data->cpu_entry.cpu = raw_smp_processor_id();
3097 data->cpu_entry.reserved = 0;
3098
3099 header->size += sizeof(data->cpu_entry);
3100 }
3101
3102 if (sample_type & PERF_SAMPLE_PERIOD)
3103 header->size += sizeof(data->period);
3104
3105 if (sample_type & PERF_SAMPLE_READ)
3106 header->size += perf_event_read_size(event);
3107
3108 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3109 int size = 1;
3110
3111 data->callchain = perf_callchain(regs);
3112
3113 if (data->callchain)
3114 size += data->callchain->nr;
3115
3116 header->size += size * sizeof(u64);
3117 }
3118
3119 if (sample_type & PERF_SAMPLE_RAW) {
3120 int size = sizeof(u32);
3121
3122 if (data->raw)
3123 size += data->raw->size;
3124 else
3125 size += sizeof(u32);
3126
3127 WARN_ON_ONCE(size & (sizeof(u64)-1));
3128 header->size += size;
3129 }
3130}
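
/*
 * Editor's note: an illustrative sketch, not part of this file.  Together,
 * perf_prepare_sample() and perf_output_sample() fix the layout of a
 * PERF_RECORD_SAMPLE: the fields appear in exactly the order tested above,
 * gated by attr.sample_type.  A hypothetical decoder that only wants
 * ip/tid/time can walk the record like this; READ, CALLCHAIN and RAW are
 * variable-sized and need their own handling.
 */
#include <linux/perf_event.h>

struct simple_sample {
	__u64 ip;
	__u32 pid, tid;
	__u64 time;
};

static void decode_sample(const struct perf_event_header *hdr,
			  __u64 sample_type, struct simple_sample *out)
{
	const __u64 *p = (const __u64 *)(hdr + 1);

	if (sample_type & PERF_SAMPLE_IP)
		out->ip = *p++;

	if (sample_type & PERF_SAMPLE_TID) {
		const __u32 *t = (const __u32 *)p;

		out->pid = t[0];
		out->tid = t[1];
		p++;				/* pid and tid share one u64 */
	}

	if (sample_type & PERF_SAMPLE_TIME)
		out->time = *p++;

	/*
	 * ADDR, ID, STREAM_ID, CPU and PERIOD are one u64 each and would be
	 * skipped the same way, mirroring perf_output_sample() above.
	 */
}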
3131
3132static void perf_event_output(struct perf_event *event, int nmi,
3133 struct perf_sample_data *data,
3134 struct pt_regs *regs)
3135{
3136 struct perf_output_handle handle;
3137 struct perf_event_header header;
3138
3139 perf_prepare_sample(&header, data, event, regs);
3140
3141 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3142 return;
3143
3144 perf_output_sample(&handle, &header, data, event);
3145
3146 perf_output_end(&handle);
3147}
3148
3149/*
3150 * read event_id
3151 */
3152
3153struct perf_read_event {
3154 struct perf_event_header header;
3155
3156 u32 pid;
3157 u32 tid;
3158};
3159
3160static void
3161perf_event_read_event(struct perf_event *event,
3162 struct task_struct *task)
3163{
3164 struct perf_output_handle handle;
3165 struct perf_read_event read_event = {
3166 .header = {
3167 .type = PERF_RECORD_READ,
3168 .misc = 0,
3169 .size = sizeof(read_event) + perf_event_read_size(event),
3170 },
3171 .pid = perf_event_pid(event, task),
3172 .tid = perf_event_tid(event, task),
3173 };
3174 int ret;
3175
3176 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3177 if (ret)
3178 return;
3179
3180 perf_output_put(&handle, read_event);
3181 perf_output_read(&handle, event);
3182
3183 perf_output_end(&handle);
3184}
3185
3186/*
3187 * task tracking -- fork/exit
3188 *
3189 * enabled by: attr.comm | attr.mmap | attr.task
3190 */
3191
3192struct perf_task_event {
3193 struct task_struct *task;
3194 struct perf_event_context *task_ctx;
3195
3196 struct {
3197 struct perf_event_header header;
3198
3199 u32 pid;
3200 u32 ppid;
3201 u32 tid;
3202 u32 ptid;
3203 u64 time;
3204 } event_id;
3205};
3206
3207static void perf_event_task_output(struct perf_event *event,
3208 struct perf_task_event *task_event)
3209{
3210 struct perf_output_handle handle;
3211 int size;
3212 struct task_struct *task = task_event->task;
3213 int ret;
3214
3215 size = task_event->event_id.header.size;
3216 ret = perf_output_begin(&handle, event, size, 0, 0);
3217
3218 if (ret)
3219 return;
3220
3221 task_event->event_id.pid = perf_event_pid(event, task);
3222 task_event->event_id.ppid = perf_event_pid(event, current);
3223
3224 task_event->event_id.tid = perf_event_tid(event, task);
3225 task_event->event_id.ptid = perf_event_tid(event, current);
3226
3227 task_event->event_id.time = perf_clock();
3228
3229 perf_output_put(&handle, task_event->event_id);
3230
3231 perf_output_end(&handle);
3232}
3233
3234static int perf_event_task_match(struct perf_event *event)
3235{
3236 if (event->attr.comm || event->attr.mmap || event->attr.task)
3237 return 1;
3238
3239 return 0;
3240}
3241
3242static void perf_event_task_ctx(struct perf_event_context *ctx,
3243 struct perf_task_event *task_event)
3244{
3245 struct perf_event *event;
3246
3247 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3248 return;
3249
3250 rcu_read_lock();
3251 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3252 if (perf_event_task_match(event))
3253 perf_event_task_output(event, task_event);
3254 }
3255 rcu_read_unlock();
3256}
3257
3258static void perf_event_task_event(struct perf_task_event *task_event)
3259{
3260 struct perf_cpu_context *cpuctx;
3261 struct perf_event_context *ctx = task_event->task_ctx;
3262
3263 cpuctx = &get_cpu_var(perf_cpu_context);
3264 perf_event_task_ctx(&cpuctx->ctx, task_event);
3265 put_cpu_var(perf_cpu_context);
3266
3267 rcu_read_lock();
3268 if (!ctx)
3269 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3270 if (ctx)
3271 perf_event_task_ctx(ctx, task_event);
3272 rcu_read_unlock();
3273}
3274
3275static void perf_event_task(struct task_struct *task,
3276 struct perf_event_context *task_ctx,
3277 int new)
3278{
3279 struct perf_task_event task_event;
3280
3281 if (!atomic_read(&nr_comm_events) &&
3282 !atomic_read(&nr_mmap_events) &&
3283 !atomic_read(&nr_task_events))
3284 return;
3285
3286 task_event = (struct perf_task_event){
3287 .task = task,
3288 .task_ctx = task_ctx,
3289 .event_id = {
3290 .header = {
3291 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3292 .misc = 0,
3293 .size = sizeof(task_event.event_id),
3294 },
3295 /* .pid */
3296 /* .ppid */
3297 /* .tid */
3298 /* .ptid */
3299 },
3300 };
3301
3302 perf_event_task_event(&task_event);
3303}
3304
3305void perf_event_fork(struct task_struct *task)
3306{
3307 perf_event_task(task, NULL, 1);
3308}
3309
3310/*
3311 * comm tracking
3312 */
3313
3314struct perf_comm_event {
3315 struct task_struct *task;
3316 char *comm;
3317 int comm_size;
3318
3319 struct {
3320 struct perf_event_header header;
3321
3322 u32 pid;
3323 u32 tid;
3324 } event_id;
3325};
3326
3327static void perf_event_comm_output(struct perf_event *event,
3328 struct perf_comm_event *comm_event)
3329{
3330 struct perf_output_handle handle;
3331 int size = comm_event->event_id.header.size;
3332 int ret = perf_output_begin(&handle, event, size, 0, 0);
3333
3334 if (ret)
3335 return;
3336
3337 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3338 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3339
3340 perf_output_put(&handle, comm_event->event_id);
3341 perf_output_copy(&handle, comm_event->comm,
3342 comm_event->comm_size);
3343 perf_output_end(&handle);
3344}
3345
3346static int perf_event_comm_match(struct perf_event *event)
3347{
3348 if (event->attr.comm)
3349 return 1;
3350
3351 return 0;
3352}
3353
3354static void perf_event_comm_ctx(struct perf_event_context *ctx,
3355 struct perf_comm_event *comm_event)
3356{
3357 struct perf_event *event;
3358
3359 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3360 return;
3361
3362 rcu_read_lock();
3363 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3364 if (perf_event_comm_match(event))
3365 perf_event_comm_output(event, comm_event);
3366 }
3367 rcu_read_unlock();
3368}
3369
3370static void perf_event_comm_event(struct perf_comm_event *comm_event)
3371{
3372 struct perf_cpu_context *cpuctx;
3373 struct perf_event_context *ctx;
3374 unsigned int size;
3375 char comm[TASK_COMM_LEN];
3376
3377 memset(comm, 0, sizeof(comm));
3378 strncpy(comm, comm_event->task->comm, sizeof(comm));
3379 size = ALIGN(strlen(comm)+1, sizeof(u64));
3380
3381 comm_event->comm = comm;
3382 comm_event->comm_size = size;
3383
3384 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3385
3386 cpuctx = &get_cpu_var(perf_cpu_context);
3387 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3388 put_cpu_var(perf_cpu_context);
3389
3390 rcu_read_lock();
3391 /*
3392 * It doesn't really matter which of the child contexts the
3393 * event ends up in.
3394 */
3395 ctx = rcu_dereference(current->perf_event_ctxp);
3396 if (ctx)
3397 perf_event_comm_ctx(ctx, comm_event);
3398 rcu_read_unlock();
3399}
3400
3401void perf_event_comm(struct task_struct *task)
3402{
3403 struct perf_comm_event comm_event;
3404
3405 if (task->perf_event_ctxp)
3406 perf_event_enable_on_exec(task);
3407
3408 if (!atomic_read(&nr_comm_events))
3409 return;
3410
3411 comm_event = (struct perf_comm_event){
3412 .task = task,
3413 /* .comm */
3414 /* .comm_size */
3415 .event_id = {
3416 .header = {
3417 .type = PERF_RECORD_COMM,
3418 .misc = 0,
3419 /* .size */
3420 },
3421 /* .pid */
3422 /* .tid */
3423 },
3424 };
3425
3426 perf_event_comm_event(&comm_event);
3427}
3428
3429/*
3430 * mmap tracking
3431 */
3432
3433struct perf_mmap_event {
3434 struct vm_area_struct *vma;
3435
3436 const char *file_name;
3437 int file_size;
3438
3439 struct {
3440 struct perf_event_header header;
3441
3442 u32 pid;
3443 u32 tid;
3444 u64 start;
3445 u64 len;
3446 u64 pgoff;
3447 } event_id;
3448};
3449
3450static void perf_event_mmap_output(struct perf_event *event,
3451 struct perf_mmap_event *mmap_event)
3452{
3453 struct perf_output_handle handle;
3454 int size = mmap_event->event_id.header.size;
3455 int ret = perf_output_begin(&handle, event, size, 0, 0);
3456
3457 if (ret)
3458 return;
3459
3460 mmap_event->event_id.pid = perf_event_pid(event, current);
3461 mmap_event->event_id.tid = perf_event_tid(event, current);
3462
3463 perf_output_put(&handle, mmap_event->event_id);
3464 perf_output_copy(&handle, mmap_event->file_name,
3465 mmap_event->file_size);
3466 perf_output_end(&handle);
3467}
3468
3469static int perf_event_mmap_match(struct perf_event *event,
3470 struct perf_mmap_event *mmap_event)
3471{
3472 if (event->attr.mmap)
3473 return 1;
3474
3475 return 0;
3476}
3477
3478static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3479 struct perf_mmap_event *mmap_event)
3480{
3481 struct perf_event *event;
3482
3483 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3484 return;
3485
3486 rcu_read_lock();
3487 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3488 if (perf_event_mmap_match(event, mmap_event))
3489 perf_event_mmap_output(event, mmap_event);
3490 }
3491 rcu_read_unlock();
3492}
3493
3494static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3495{
3496 struct perf_cpu_context *cpuctx;
3497 struct perf_event_context *ctx;
3498 struct vm_area_struct *vma = mmap_event->vma;
3499 struct file *file = vma->vm_file;
3500 unsigned int size;
3501 char tmp[16];
3502 char *buf = NULL;
3503 const char *name;
3504
3505 memset(tmp, 0, sizeof(tmp));
3506
3507 if (file) {
3508 /*
3509 * d_path works from the end of the buffer backwards, so we
3510 * need to add enough zero bytes after the string to handle
3511 * the 64bit alignment we do later.
3512 */
3513 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3514 if (!buf) {
3515 name = strncpy(tmp, "//enomem", sizeof(tmp));
3516 goto got_name;
3517 }
3518 name = d_path(&file->f_path, buf, PATH_MAX);
3519 if (IS_ERR(name)) {
3520 name = strncpy(tmp, "//toolong", sizeof(tmp));
3521 goto got_name;
3522 }
3523 } else {
3524 if (arch_vma_name(mmap_event->vma)) {
3525 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3526 sizeof(tmp));
3527 goto got_name;
3528 }
3529
3530 if (!vma->vm_mm) {
3531 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3532 goto got_name;
3533 }
3534
3535 name = strncpy(tmp, "//anon", sizeof(tmp));
3536 goto got_name;
3537 }
3538
3539got_name:
3540 size = ALIGN(strlen(name)+1, sizeof(u64));
3541
3542 mmap_event->file_name = name;
3543 mmap_event->file_size = size;
3544
3545 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3546
3547 cpuctx = &get_cpu_var(perf_cpu_context);
3548 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3549 put_cpu_var(perf_cpu_context);
3550
3551 rcu_read_lock();
3552 /*
3553 * It doesn't really matter which of the child contexts the
3554 * event ends up in.
3555 */
3556 ctx = rcu_dereference(current->perf_event_ctxp);
3557 if (ctx)
3558 perf_event_mmap_ctx(ctx, mmap_event);
3559 rcu_read_unlock();
3560
3561 kfree(buf);
3562}
3563
3564void __perf_event_mmap(struct vm_area_struct *vma)
3565{
3566 struct perf_mmap_event mmap_event;
3567
3568 if (!atomic_read(&nr_mmap_events))
3569 return;
3570
3571 mmap_event = (struct perf_mmap_event){
3572 .vma = vma,
3573 /* .file_name */
3574 /* .file_size */
3575 .event_id = {
3576 .header = {
3577 .type = PERF_RECORD_MMAP,
3578 .misc = 0,
3579 /* .size */
3580 },
3581 /* .pid */
3582 /* .tid */
3583 .start = vma->vm_start,
3584 .len = vma->vm_end - vma->vm_start,
3585 .pgoff = vma->vm_pgoff,
3586 },
3587 };
3588
3589 perf_event_mmap_event(&mmap_event);
3590}
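
/*
 * Editor's note: an illustrative sketch, not part of this file.  The
 * PERF_RECORD_MMAP record built above consists of the event_id struct
 * followed by the NUL-terminated, u64-padded file name; a user-space
 * consumer usually mirrors it with a struct like this (the struct and
 * helper names are hypothetical).
 */
#include <linux/perf_event.h>
#include <stddef.h>

struct mmap_record {
	struct perf_event_header header;
	__u32 pid;
	__u32 tid;
	__u64 start;
	__u64 len;
	__u64 pgoff;
	char  filename[];	/* padded to a multiple of 8 bytes */
};

static const char *mmap_record_name(const struct perf_event_header *hdr)
{
	const struct mmap_record *rec = (const struct mmap_record *)hdr;

	return (hdr->type == PERF_RECORD_MMAP) ? rec->filename : NULL;
}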
3591
3592/*
3593 * IRQ throttle logging
3594 */
3595
3596static void perf_log_throttle(struct perf_event *event, int enable)
3597{
3598 struct perf_output_handle handle;
3599 int ret;
3600
3601 struct {
3602 struct perf_event_header header;
3603 u64 time;
3604 u64 id;
3605 u64 stream_id;
3606 } throttle_event = {
3607 .header = {
3608 .type = PERF_RECORD_THROTTLE,
3609 .misc = 0,
3610 .size = sizeof(throttle_event),
3611 },
3612 .time = perf_clock(),
3613 .id = primary_event_id(event),
3614 .stream_id = event->id,
3615 };
3616
3617 if (enable)
3618 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3619
3620 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3621 if (ret)
3622 return;
3623
3624 perf_output_put(&handle, throttle_event);
3625 perf_output_end(&handle);
3626}
3627
3628/*
3629 * Generic event overflow handling, sampling.
3630 */
3631
3632static int __perf_event_overflow(struct perf_event *event, int nmi,
3633 int throttle, struct perf_sample_data *data,
3634 struct pt_regs *regs)
3635{
3636 int events = atomic_read(&event->event_limit);
3637 struct hw_perf_event *hwc = &event->hw;
3638 int ret = 0;
3639
3640 throttle = (throttle && event->pmu->unthrottle != NULL);
3641
3642 if (!throttle) {
3643 hwc->interrupts++;
3644 } else {
3645 if (hwc->interrupts != MAX_INTERRUPTS) {
3646 hwc->interrupts++;
3647 if (HZ * hwc->interrupts >
3648 (u64)sysctl_perf_event_sample_rate) {
3649 hwc->interrupts = MAX_INTERRUPTS;
3650 perf_log_throttle(event, 0);
3651 ret = 1;
3652 }
3653 } else {
3654 /*
3655 * Keep re-disabling the event even though we disabled it on the
3656 * previous pass - just in case we raced with a
3657 * sched-in and the event got enabled again:
3658 */
3659 ret = 1;
3660 }
3661 }
3662
3663 if (event->attr.freq) {
3664 u64 now = perf_clock();
3665 s64 delta = now - hwc->freq_stamp;
3666
3667 hwc->freq_stamp = now;
3668
3669 if (delta > 0 && delta < TICK_NSEC)
3670 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3671 }
3672
3673 /*
3674 * XXX event_limit might not quite work as expected on inherited
3675 * events
3676 */
3677
3678 event->pending_kill = POLL_IN;
3679 if (events && atomic_dec_and_test(&event->event_limit)) {
3680 ret = 1;
3681 event->pending_kill = POLL_HUP;
3682 if (nmi) {
3683 event->pending_disable = 1;
3684 perf_pending_queue(&event->pending,
3685 perf_pending_event);
3686 } else
3687 perf_event_disable(event);
3688 }
3689
3690 perf_event_output(event, nmi, data, regs);
3691 return ret;
3692}
3693
3694int perf_event_overflow(struct perf_event *event, int nmi,
3695 struct perf_sample_data *data,
3696 struct pt_regs *regs)
3697{
3698 return __perf_event_overflow(event, nmi, 1, data, regs);
3699}
3700
3701/*
3702 * Generic software event infrastructure
3703 */
3704
3705/*
3706 * We directly increment event->count and keep a second value in
3707 * event->hw.period_left to count intervals. This period value
3708 * is kept in the range [-sample_period, 0] so that we can use the
3709 * sign as trigger.
3710 */
3711
3712static u64 perf_swevent_set_period(struct perf_event *event)
3713{
3714 struct hw_perf_event *hwc = &event->hw;
3715 u64 period = hwc->last_period;
3716 u64 nr, offset;
3717 s64 old, val;
3718
3719 hwc->last_period = hwc->sample_period;
3720
3721again:
3722 old = val = atomic64_read(&hwc->period_left);
3723 if (val < 0)
3724 return 0;
3725
3726 nr = div64_u64(period + val, period);
3727 offset = nr * period;
3728 val -= offset;
3729 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3730 goto again;
3731
3732 return nr;
3733}
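
/*
 * Editor's note: a standalone sketch, not part of this file.  It spells out
 * the arithmetic in perf_swevent_set_period(): period_left is kept in
 * [-sample_period, 0], counts are added to it, and once it is no longer
 * negative the number of whole periods that elapsed is peeled off and the
 * remainder pushed back.  This is a single-threaded stand-in for the
 * cmpxchg loop above.
 */
#include <stdio.h>

static long long period_left;			/* stands in for hwc->period_left */

static unsigned long long count_overflows(unsigned long long period)
{
	long long val = period_left;
	unsigned long long nr;

	if (val < 0)
		return 0;			/* the period has not elapsed yet */

	nr = (period + val) / period;		/* whole periods consumed */
	period_left = val - (long long)(nr * period);	/* back into [-period, 0) */
	return nr;
}

int main(void)
{
	unsigned long long period = 100, nr;

	period_left = -(long long)period;	/* as set when the event is enabled */
	period_left += 250;			/* e.g. perf_swevent_add(..., 250, ...) */

	nr = count_overflows(period);
	printf("overflows: %llu, period_left: %lld\n", nr, period_left);
	/* prints "overflows: 2, period_left: -50" */
	return 0;
}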
3734
3735static void perf_swevent_overflow(struct perf_event *event,
3736 int nmi, struct perf_sample_data *data,
3737 struct pt_regs *regs)
3738{
3739 struct hw_perf_event *hwc = &event->hw;
3740 int throttle = 0;
3741 u64 overflow;
3742
3743 data->period = event->hw.last_period;
3744 overflow = perf_swevent_set_period(event);
3745
3746 if (hwc->interrupts == MAX_INTERRUPTS)
3747 return;
3748
3749 for (; overflow; overflow--) {
3750 if (__perf_event_overflow(event, nmi, throttle,
3751 data, regs)) {
3752 /*
3753 * We inhibit the overflow from happening when
3754 * hwc->interrupts == MAX_INTERRUPTS.
3755 */
3756 break;
3757 }
3758 throttle = 1;
3759 }
3760}
3761
3762static void perf_swevent_unthrottle(struct perf_event *event)
3763{
3764 /*
3765 * Nothing to do, we already reset hwc->interrupts.
3766 */
3767}
3768
3769static void perf_swevent_add(struct perf_event *event, u64 nr,
3770 int nmi, struct perf_sample_data *data,
3771 struct pt_regs *regs)
3772{
3773 struct hw_perf_event *hwc = &event->hw;
3774
3775 atomic64_add(nr, &event->count);
3776
3777 if (!hwc->sample_period)
3778 return;
3779
3780 if (!regs)
3781 return;
3782
3783 if (!atomic64_add_negative(nr, &hwc->period_left))
3784 perf_swevent_overflow(event, nmi, data, regs);
3785}
3786
3787static int perf_swevent_is_counting(struct perf_event *event)
3788{
3789 /*
3790 * The event is active, we're good!
3791 */
3792 if (event->state == PERF_EVENT_STATE_ACTIVE)
3793 return 1;
3794
3795 /*
3796 * The event is off/error, not counting.
3797 */
3798 if (event->state != PERF_EVENT_STATE_INACTIVE)
3799 return 0;
3800
3801 /*
3802 * The event is inactive; if the context is active,
3803 * we're part of a group that didn't make it onto the 'pmu',
3804 * so we're not counting.
3805 */
3806 if (event->ctx->is_active)
3807 return 0;
3808
3809 /*
3810 * We're inactive and the context is too; this means the
3811 * task is scheduled out and we're counting events that happen
3812 * to us, like migration events.
3813 */
3814 return 1;
3815}
3816
3817static int perf_tp_event_match(struct perf_event *event,
3818 struct perf_sample_data *data);
3819
3820static int perf_swevent_match(struct perf_event *event,
3821 enum perf_type_id type,
3822 u32 event_id,
3823 struct perf_sample_data *data,
3824 struct pt_regs *regs)
3825{
3826 if (!perf_swevent_is_counting(event))
3827 return 0;
3828
3829 if (event->attr.type != type)
3830 return 0;
3831 if (event->attr.config != event_id)
3832 return 0;
3833
3834 if (regs) {
3835 if (event->attr.exclude_user && user_mode(regs))
3836 return 0;
3837
3838 if (event->attr.exclude_kernel && !user_mode(regs))
3839 return 0;
3840 }
3841
3842 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3843 !perf_tp_event_match(event, data))
3844 return 0;
3845
3846 return 1;
3847}
3848
3849static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3850 enum perf_type_id type,
3851 u32 event_id, u64 nr, int nmi,
3852 struct perf_sample_data *data,
3853 struct pt_regs *regs)
3854{
3855 struct perf_event *event;
3856
3857 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3858 return;
3859
3860 rcu_read_lock();
3861 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3862 if (perf_swevent_match(event, type, event_id, data, regs))
3863 perf_swevent_add(event, nr, nmi, data, regs);
3864 }
3865 rcu_read_unlock();
3866}
3867
3868static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3869{
3870 if (in_nmi())
3871 return &cpuctx->recursion[3];
3872
3873 if (in_irq())
3874 return &cpuctx->recursion[2];
3875
3876 if (in_softirq())
3877 return &cpuctx->recursion[1];
3878
3879 return &cpuctx->recursion[0];
3880}
3881
3882static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3883 u64 nr, int nmi,
3884 struct perf_sample_data *data,
3885 struct pt_regs *regs)
3886{
3887 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3888 int *recursion = perf_swevent_recursion_context(cpuctx);
3889 struct perf_event_context *ctx;
3890
3891 if (*recursion)
3892 goto out;
3893
3894 (*recursion)++;
3895 barrier();
3896
3897 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3898 nr, nmi, data, regs);
3899 rcu_read_lock();
3900 /*
3901 * It doesn't really matter which of the child contexts the
3902 * event ends up in.
3903 */
3904 ctx = rcu_dereference(current->perf_event_ctxp);
3905 if (ctx)
3906 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3907 rcu_read_unlock();
3908
3909 barrier();
3910 (*recursion)--;
3911
3912out:
3913 put_cpu_var(perf_cpu_context);
3914}
3915
3916void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3917 struct pt_regs *regs, u64 addr)
3918{
3919 struct perf_sample_data data = {
3920 .addr = addr,
3921 };
3922
3923 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3924 &data, regs);
3925}
3926
3927static void perf_swevent_read(struct perf_event *event)
3928{
3929}
3930
3931static int perf_swevent_enable(struct perf_event *event)
3932{
3933 struct hw_perf_event *hwc = &event->hw;
3934
3935 if (hwc->sample_period) {
3936 hwc->last_period = hwc->sample_period;
3937 perf_swevent_set_period(event);
3938 }
3939 return 0;
3940}
3941
3942static void perf_swevent_disable(struct perf_event *event)
3943{
3944}
3945
3946static const struct pmu perf_ops_generic = {
3947 .enable = perf_swevent_enable,
3948 .disable = perf_swevent_disable,
3949 .read = perf_swevent_read,
3950 .unthrottle = perf_swevent_unthrottle,
3951};
3952
3953/*
3954 * hrtimer based swevent callback
3955 */
3956
3957static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3958{
3959 enum hrtimer_restart ret = HRTIMER_RESTART;
3960 struct perf_sample_data data;
3961 struct pt_regs *regs;
3962 struct perf_event *event;
3963 u64 period;
3964
3965 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3966 event->pmu->read(event);
3967
3968 data.addr = 0;
3969 regs = get_irq_regs();
3970 /*
3971 * In case we exclude kernel IPs or are somehow not in interrupt
3972 * context, provide the next best thing, the user IP.
3973 */
3974 if ((event->attr.exclude_kernel || !regs) &&
3975 !event->attr.exclude_user)
3976 regs = task_pt_regs(current);
3977
3978 if (regs) {
3979 if (perf_event_overflow(event, 0, &data, regs))
3980 ret = HRTIMER_NORESTART;
3981 }
3982
3983 period = max_t(u64, 10000, event->hw.sample_period);
3984 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3985
3986 return ret;
3987}
3988
3989/*
3990 * Software event: cpu wall time clock
3991 */
3992
3993static void cpu_clock_perf_event_update(struct perf_event *event)
3994{
3995 int cpu = raw_smp_processor_id();
3996 s64 prev;
3997 u64 now;
3998
3999 now = cpu_clock(cpu);
4000 prev = atomic64_read(&event->hw.prev_count);
4001 atomic64_set(&event->hw.prev_count, now);
4002 atomic64_add(now - prev, &event->count);
4003}
4004
4005static int cpu_clock_perf_event_enable(struct perf_event *event)
4006{
4007 struct hw_perf_event *hwc = &event->hw;
4008 int cpu = raw_smp_processor_id();
4009
4010 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
4011 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4012 hwc->hrtimer.function = perf_swevent_hrtimer;
4013 if (hwc->sample_period) {
4014 u64 period = max_t(u64, 10000, hwc->sample_period);
4015 __hrtimer_start_range_ns(&hwc->hrtimer,
4016 ns_to_ktime(period), 0,
4017 HRTIMER_MODE_REL, 0);
4018 }
4019
4020 return 0;
4021}
4022
4023static void cpu_clock_perf_event_disable(struct perf_event *event)
4024{
4025 if (event->hw.sample_period)
4026 hrtimer_cancel(&event->hw.hrtimer);
4027 cpu_clock_perf_event_update(event);
4028}
4029
4030static void cpu_clock_perf_event_read(struct perf_event *event)
4031{
4032 cpu_clock_perf_event_update(event);
4033}
4034
4035static const struct pmu perf_ops_cpu_clock = {
4036 .enable = cpu_clock_perf_event_enable,
4037 .disable = cpu_clock_perf_event_disable,
4038 .read = cpu_clock_perf_event_read,
4039};
4040
4041/*
4042 * Software event: task time clock
4043 */
4044
4045static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4046{
4047 u64 prev;
4048 s64 delta;
4049
4050 prev = atomic64_xchg(&event->hw.prev_count, now);
4051 delta = now - prev;
4052 atomic64_add(delta, &event->count);
4053}
4054
4055static int task_clock_perf_event_enable(struct perf_event *event)
4056{
4057 struct hw_perf_event *hwc = &event->hw;
4058 u64 now;
4059
4060 now = event->ctx->time;
4061
4062 atomic64_set(&hwc->prev_count, now);
4063 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4064 hwc->hrtimer.function = perf_swevent_hrtimer;
4065 if (hwc->sample_period) {
4066 u64 period = max_t(u64, 10000, hwc->sample_period);
4067 __hrtimer_start_range_ns(&hwc->hrtimer,
4068 ns_to_ktime(period), 0,
4069 HRTIMER_MODE_REL, 0);
4070 }
4071
4072 return 0;
4073}
4074
4075static void task_clock_perf_event_disable(struct perf_event *event)
4076{
4077 if (event->hw.sample_period)
4078 hrtimer_cancel(&event->hw.hrtimer);
4079 task_clock_perf_event_update(event, event->ctx->time);
4080
4081}
4082
4083static void task_clock_perf_event_read(struct perf_event *event)
4084{
4085 u64 time;
4086
4087 if (!in_nmi()) {
4088 update_context_time(event->ctx);
4089 time = event->ctx->time;
4090 } else {
4091 u64 now = perf_clock();
4092 u64 delta = now - event->ctx->timestamp;
4093 time = event->ctx->time + delta;
4094 }
4095
4096 task_clock_perf_event_update(event, time);
4097}
4098
4099static const struct pmu perf_ops_task_clock = {
4100 .enable = task_clock_perf_event_enable,
4101 .disable = task_clock_perf_event_disable,
4102 .read = task_clock_perf_event_read,
4103};
4104
4105#ifdef CONFIG_EVENT_PROFILE
4106
4107void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4108 int entry_size)
4109{
4110 struct perf_raw_record raw = {
4111 .size = entry_size,
4112 .data = record,
4113 };
4114
4115 struct perf_sample_data data = {
4116 .addr = addr,
4117 .raw = &raw,
4118 };
4119
4120 struct pt_regs *regs = get_irq_regs();
4121
4122 if (!regs)
4123 regs = task_pt_regs(current);
4124
4125 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4126 &data, regs);
4127}
4128EXPORT_SYMBOL_GPL(perf_tp_event);
4129
4130static int perf_tp_event_match(struct perf_event *event,
4131 struct perf_sample_data *data)
4132{
4133 void *record = data->raw->data;
4134
4135 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4136 return 1;
4137 return 0;
4138}
4139
4140static void tp_perf_event_destroy(struct perf_event *event)
4141{
4142 ftrace_profile_disable(event->attr.config);
4143}
4144
4145static const struct pmu *tp_perf_event_init(struct perf_event *event)
4146{
4147 /*
4148 * Raw tracepoint data is a severe data leak; only allow root to
4149 * have these.
4150 */
4151 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4152 perf_paranoid_tracepoint_raw() &&
4153 !capable(CAP_SYS_ADMIN))
4154 return ERR_PTR(-EPERM);
4155
4156 if (ftrace_profile_enable(event->attr.config))
4157 return NULL;
4158
4159 event->destroy = tp_perf_event_destroy;
4160
4161 return &perf_ops_generic;
4162}
4163
4164static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4165{
4166 char *filter_str;
4167 int ret;
4168
4169 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4170 return -EINVAL;
4171
4172 filter_str = strndup_user(arg, PAGE_SIZE);
4173 if (IS_ERR(filter_str))
4174 return PTR_ERR(filter_str);
4175
4176 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4177
4178 kfree(filter_str);
4179 return ret;
4180}
4181
4182static void perf_event_free_filter(struct perf_event *event)
4183{
4184 ftrace_profile_free_filter(event);
4185}
4186
4187#else
4188
4189static int perf_tp_event_match(struct perf_event *event,
4190 struct perf_sample_data *data)
4191{
4192 return 1;
4193}
4194
4195static const struct pmu *tp_perf_event_init(struct perf_event *event)
4196{
4197 return NULL;
4198}
4199
4200static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4201{
4202 return -ENOENT;
4203}
4204
4205static void perf_event_free_filter(struct perf_event *event)
4206{
4207}
4208
4209#endif /* CONFIG_EVENT_PROFILE */
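
/*
 * Editor's note: an illustrative user-space sketch, not part of this file.
 * PERF_EVENT_IOC_SET_FILTER, handled above when CONFIG_EVENT_PROFILE is set,
 * attaches an ftrace filter expression to a tracepoint event.  The tracepoint
 * id is normally read from debugfs (e.g. .../events/sched/sched_switch/id);
 * the id, the filter string and the raw syscall follow the earlier ioctl
 * sketch and are example values, not fixed interfaces.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_filtered_tracepoint(__u64 tracepoint_id, const char *filter)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.config = tracepoint_id;
	attr.sample_period = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return fd;

	/* e.g. filter = "prev_pid == 1" for sched:sched_switch */
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}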
4210
4211atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4212
4213static void sw_perf_event_destroy(struct perf_event *event)
4214{
4215 u64 event_id = event->attr.config;
4216
4217 WARN_ON(event->parent);
4218
4219 atomic_dec(&perf_swevent_enabled[event_id]);
4220}
4221
4222static const struct pmu *sw_perf_event_init(struct perf_event *event)
4223{
4224 const struct pmu *pmu = NULL;
4225 u64 event_id = event->attr.config;
4226
4227 /*
4228 * Software events (currently) can't in general distinguish
4229 * between user, kernel and hypervisor events.
4230 * However, context switches and cpu migrations are considered
4231 * to be kernel events, and page faults are never hypervisor
4232 * events.
4233 */
4234 switch (event_id) {
4235 case PERF_COUNT_SW_CPU_CLOCK:
4236 pmu = &perf_ops_cpu_clock;
4237
4238 break;
4239 case PERF_COUNT_SW_TASK_CLOCK:
4240 /*
4241 * If the user instantiates this as a per-cpu event,
4242 * use the cpu_clock event instead.
4243 */
4244 if (event->ctx->task)
4245 pmu = &perf_ops_task_clock;
4246 else
4247 pmu = &perf_ops_cpu_clock;
4248
4249 break;
4250 case PERF_COUNT_SW_PAGE_FAULTS:
4251 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4252 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4253 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4254 case PERF_COUNT_SW_CPU_MIGRATIONS:
4255 if (!event->parent) {
4256 atomic_inc(&perf_swevent_enabled[event_id]);
4257 event->destroy = sw_perf_event_destroy;
4258 }
4259 pmu = &perf_ops_generic;
4260 break;
4261 }
4262
4263 return pmu;
4264}
4265
4266/*
4267 * Allocate and initialize an event structure
4268 */
4269static struct perf_event *
4270perf_event_alloc(struct perf_event_attr *attr,
4271 int cpu,
4272 struct perf_event_context *ctx,
4273 struct perf_event *group_leader,
4274 struct perf_event *parent_event,
4275 gfp_t gfpflags)
4276{
4277 const struct pmu *pmu;
4278 struct perf_event *event;
4279 struct hw_perf_event *hwc;
4280 long err;
4281
4282 event = kzalloc(sizeof(*event), gfpflags);
4283 if (!event)
4284 return ERR_PTR(-ENOMEM);
4285
4286 /*
4287 * Single events are their own group leaders, with an
4288 * empty sibling list:
4289 */
4290 if (!group_leader)
4291 group_leader = event;
4292
4293 mutex_init(&event->child_mutex);
4294 INIT_LIST_HEAD(&event->child_list);
4295
4296 INIT_LIST_HEAD(&event->group_entry);
4297 INIT_LIST_HEAD(&event->event_entry);
4298 INIT_LIST_HEAD(&event->sibling_list);
4299 init_waitqueue_head(&event->waitq);
4300
4301 mutex_init(&event->mmap_mutex);
4302
4303 event->cpu = cpu;
4304 event->attr = *attr;
4305 event->group_leader = group_leader;
4306 event->pmu = NULL;
4307 event->ctx = ctx;
4308 event->oncpu = -1;
4309
4310 event->parent = parent_event;
4311
4312 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4313 event->id = atomic64_inc_return(&perf_event_id);
4314
4315 event->state = PERF_EVENT_STATE_INACTIVE;
4316
4317 if (attr->disabled)
4318 event->state = PERF_EVENT_STATE_OFF;
4319
4320 pmu = NULL;
4321
4322 hwc = &event->hw;
4323 hwc->sample_period = attr->sample_period;
4324 if (attr->freq && attr->sample_freq)
4325 hwc->sample_period = 1;
4326 hwc->last_period = hwc->sample_period;
4327
4328 atomic64_set(&hwc->period_left, hwc->sample_period);
4329
4330 /*
4331 * we currently do not support PERF_FORMAT_GROUP on inherited events
4332 */
4333 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4334 goto done;
4335
4336 switch (attr->type) {
4337 case PERF_TYPE_RAW:
4338 case PERF_TYPE_HARDWARE:
4339 case PERF_TYPE_HW_CACHE:
4340 pmu = hw_perf_event_init(event);
4341 break;
4342
4343 case PERF_TYPE_SOFTWARE:
4344 pmu = sw_perf_event_init(event);
4345 break;
4346
4347 case PERF_TYPE_TRACEPOINT:
4348 pmu = tp_perf_event_init(event);
4349 break;
4350
4351 default:
4352 break;
4353 }
4354done:
4355 err = 0;
4356 if (!pmu)
4357 err = -EINVAL;
4358 else if (IS_ERR(pmu))
4359 err = PTR_ERR(pmu);
4360
4361 if (err) {
4362 if (event->ns)
4363 put_pid_ns(event->ns);
4364 kfree(event);
4365 return ERR_PTR(err);
4366 }
4367
4368 event->pmu = pmu;
4369
4370 if (!event->parent) {
4371 atomic_inc(&nr_events);
4372 if (event->attr.mmap)
4373 atomic_inc(&nr_mmap_events);
4374 if (event->attr.comm)
4375 atomic_inc(&nr_comm_events);
4376 if (event->attr.task)
4377 atomic_inc(&nr_task_events);
4378 }
4379
4380 return event;
4381}
4382
4383static int perf_copy_attr(struct perf_event_attr __user *uattr,
4384 struct perf_event_attr *attr)
4385{
4386 u32 size;
4387 int ret;
4388
4389 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4390 return -EFAULT;
4391
4392 /*
4393	 * zero the full structure, so that a short copy leaves the remaining fields zeroed.
4394 */
4395 memset(attr, 0, sizeof(*attr));
4396
4397 ret = get_user(size, &uattr->size);
4398 if (ret)
4399 return ret;
4400
4401 if (size > PAGE_SIZE) /* silly large */
4402 goto err_size;
4403
4404 if (!size) /* abi compat */
4405 size = PERF_ATTR_SIZE_VER0;
4406
4407 if (size < PERF_ATTR_SIZE_VER0)
4408 goto err_size;
4409
4410 /*
4411 * If we're handed a bigger struct than we know of,
4412 * ensure all the unknown bits are 0 - i.e. new
4413 * user-space does not rely on any kernel feature
4414	 * extensions we don't know about yet.
4415 */
4416 if (size > sizeof(*attr)) {
4417 unsigned char __user *addr;
4418 unsigned char __user *end;
4419 unsigned char val;
4420
4421 addr = (void __user *)uattr + sizeof(*attr);
4422 end = (void __user *)uattr + size;
4423
4424 for (; addr < end; addr++) {
4425 ret = get_user(val, addr);
4426 if (ret)
4427 return ret;
4428 if (val)
4429 goto err_size;
4430 }
4431 size = sizeof(*attr);
4432 }
4433
4434 ret = copy_from_user(attr, uattr, size);
4435 if (ret)
4436 return -EFAULT;
4437
4438 /*
4439 * If the type exists, the corresponding creation will verify
4440 * the attr->config.
4441 */
4442 if (attr->type >= PERF_TYPE_MAX)
4443 return -EINVAL;
4444
4445 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4446 return -EINVAL;
4447
4448 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4449 return -EINVAL;
4450
4451 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4452 return -EINVAL;
4453
4454out:
4455 return ret;
4456
4457err_size:
4458 put_user(sizeof(*attr), &uattr->size);
4459 ret = -E2BIG;
4460 goto out;
4461}
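perf_copy_attr() above is the kernel half of a sizing handshake: a caller passes sizeof() of the perf_event_attr it was built against, unknown tail bytes must be zero, and on rejection the kernel writes the size it understands back into uattr->size before returning E2BIG. As a hedged sketch of the userspace half (the helper name and retry policy are illustrative, not from this commit):

/*
 * Sketch only: open an event with the newest attr layout the caller
 * knows, falling back to the size the kernel reported if it rejected
 * unknown fields. Retrying simply accepts that the newer fields are
 * ignored by this kernel.
 */
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/perf_event.h>

static int open_event_compat(struct perf_event_attr *attr, pid_t pid, int cpu)
{
	int fd;

	attr->size = sizeof(*attr);
	fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0UL);
	if (fd < 0 && errno == E2BIG && attr->size < sizeof(*attr)) {
		/* kernel wrote back how much it understands; retry with that */
		fd = syscall(__NR_perf_event_open, attr, pid, cpu, -1, 0UL);
	}
	return fd;
}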
4462
4463static int perf_event_set_output(struct perf_event *event, int output_fd)
4464{
4465 struct perf_event *output_event = NULL;
4466 struct file *output_file = NULL;
4467 struct perf_event *old_output;
4468 int fput_needed = 0;
4469 int ret = -EINVAL;
4470
4471 if (!output_fd)
4472 goto set;
4473
4474 output_file = fget_light(output_fd, &fput_needed);
4475 if (!output_file)
4476 return -EBADF;
4477
4478 if (output_file->f_op != &perf_fops)
4479 goto out;
4480
4481 output_event = output_file->private_data;
4482
4483 /* Don't chain output fds */
4484 if (output_event->output)
4485 goto out;
4486
4487 /* Don't set an output fd when we already have an output channel */
4488 if (event->data)
4489 goto out;
4490
4491 atomic_long_inc(&output_file->f_count);
4492
4493set:
4494 mutex_lock(&event->mmap_mutex);
4495 old_output = event->output;
4496 rcu_assign_pointer(event->output, output_event);
4497 mutex_unlock(&event->mmap_mutex);
4498
4499 if (old_output) {
4500 /*
4501 * we need to make sure no existing perf_output_*()
4502 * is still referencing this event.
4503 */
4504 synchronize_rcu();
4505 fput(old_output->filp);
4506 }
4507
4508 ret = 0;
4509out:
4510 fput_light(output_file, fput_needed);
4511 return ret;
4512}
4513
4514/**
4515 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4516 *
4517 * @attr_uptr: event_id type attributes for monitoring/sampling
4518 * @pid: target pid
4519 * @cpu: target cpu
4520 * @group_fd: group leader event fd
4521 */
4522SYSCALL_DEFINE5(perf_event_open,
4523 struct perf_event_attr __user *, attr_uptr,
4524 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4525{
4526 struct perf_event *event, *group_leader;
4527 struct perf_event_attr attr;
4528 struct perf_event_context *ctx;
4529 struct file *event_file = NULL;
4530 struct file *group_file = NULL;
4531 int fput_needed = 0;
4532 int fput_needed2 = 0;
4533 int err;
4534
4535 /* for future expandability... */
4536 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4537 return -EINVAL;
4538
4539 err = perf_copy_attr(attr_uptr, &attr);
4540 if (err)
4541 return err;
4542
4543 if (!attr.exclude_kernel) {
4544 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4545 return -EACCES;
4546 }
4547
4548 if (attr.freq) {
4549 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4550 return -EINVAL;
4551 }
4552
4553 /*
4554 * Get the target context (task or percpu):
4555 */
4556 ctx = find_get_context(pid, cpu);
4557 if (IS_ERR(ctx))
4558 return PTR_ERR(ctx);
4559
4560 /*
4561 * Look up the group leader (we will attach this event to it):
4562 */
4563 group_leader = NULL;
4564 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4565 err = -EINVAL;
4566 group_file = fget_light(group_fd, &fput_needed);
4567 if (!group_file)
4568 goto err_put_context;
4569 if (group_file->f_op != &perf_fops)
4570 goto err_put_context;
4571
4572 group_leader = group_file->private_data;
4573 /*
4574 * Do not allow a recursive hierarchy (this new sibling
4575 * becoming part of another group-sibling):
4576 */
4577 if (group_leader->group_leader != group_leader)
4578 goto err_put_context;
4579 /*
4580 * Do not allow to attach to a group in a different
4581 * task or CPU context:
4582 */
4583 if (group_leader->ctx != ctx)
4584 goto err_put_context;
4585 /*
4586 * Only a group leader can be exclusive or pinned
4587 */
4588 if (attr.exclusive || attr.pinned)
4589 goto err_put_context;
4590 }
4591
4592 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4593 NULL, GFP_KERNEL);
4594 err = PTR_ERR(event);
4595 if (IS_ERR(event))
4596 goto err_put_context;
4597
4598 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4599 if (err < 0)
4600 goto err_free_put_context;
4601
4602 event_file = fget_light(err, &fput_needed2);
4603 if (!event_file)
4604 goto err_free_put_context;
4605
4606 if (flags & PERF_FLAG_FD_OUTPUT) {
4607 err = perf_event_set_output(event, group_fd);
4608 if (err)
4609 goto err_fput_free_put_context;
4610 }
4611
4612 event->filp = event_file;
4613 WARN_ON_ONCE(ctx->parent_ctx);
4614 mutex_lock(&ctx->mutex);
4615 perf_install_in_context(ctx, event, cpu);
4616 ++ctx->generation;
4617 mutex_unlock(&ctx->mutex);
4618
4619 event->owner = current;
4620 get_task_struct(current);
4621 mutex_lock(&current->perf_event_mutex);
4622 list_add_tail(&event->owner_entry, &current->perf_event_list);
4623 mutex_unlock(&current->perf_event_mutex);
4624
4625err_fput_free_put_context:
4626 fput_light(event_file, fput_needed2);
4627
4628err_free_put_context:
4629 if (err < 0)
4630 kfree(event);
4631
4632err_put_context:
4633 if (err < 0)
4634 put_ctx(ctx);
4635
4636 fput_light(group_file, fput_needed);
4637
4638 return err;
4639}
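A minimal userspace sketch of the syscall documented above (the wrapper name is local to the example; the attr fields and the @group_fd argument are the ones this function handles): two hardware events are placed in one group so they are scheduled onto the PMU together, and exclude_kernel is set so no perf_paranoid_kernel() privileges are needed.

/*
 * Sketch: group a cycles and an instructions counter on the current
 * task and read both counts after a small busy loop.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int open_hw(uint64_t config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = config;
	attr.exclude_kernel = 1;	/* no CAP_SYS_ADMIN needed */

	return perf_event_open(&attr, 0, -1, group_fd, 0);
}

int main(void)
{
	uint64_t cycles, instructions;
	int leader, sibling;

	leader = open_hw(PERF_COUNT_HW_CPU_CYCLES, -1);		/* group leader */
	sibling = open_hw(PERF_COUNT_HW_INSTRUCTIONS, leader);	/* joins via group_fd */
	if (leader < 0 || sibling < 0) {
		perror("perf_event_open");
		return 1;
	}

	for (volatile long i = 0; i < 10000000; i++)
		;	/* something to count */

	if (read(leader, &cycles, sizeof(cycles)) != sizeof(cycles) ||
	    read(sibling, &instructions, sizeof(instructions)) != sizeof(instructions))
		return 1;

	printf("cycles=%llu instructions=%llu\n",
	       (unsigned long long)cycles, (unsigned long long)instructions);
	return 0;
}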
4640
4641/*
4642 * inherit an event from parent task to child task:
4643 */
4644static struct perf_event *
4645inherit_event(struct perf_event *parent_event,
4646 struct task_struct *parent,
4647 struct perf_event_context *parent_ctx,
4648 struct task_struct *child,
4649 struct perf_event *group_leader,
4650 struct perf_event_context *child_ctx)
4651{
4652 struct perf_event *child_event;
4653
4654 /*
4655 * Instead of creating recursive hierarchies of events,
4656 * we link inherited events back to the original parent,
4657 * which has a filp for sure, which we use as the reference
4658 * count:
4659 */
4660 if (parent_event->parent)
4661 parent_event = parent_event->parent;
4662
4663 child_event = perf_event_alloc(&parent_event->attr,
4664 parent_event->cpu, child_ctx,
4665 group_leader, parent_event,
4666 GFP_KERNEL);
4667 if (IS_ERR(child_event))
4668 return child_event;
4669 get_ctx(child_ctx);
4670
4671 /*
4672 * Make the child state follow the state of the parent event,
4673 * not its attr.disabled bit. We hold the parent's mutex,
4674 * so we won't race with perf_event_{en, dis}able_family.
4675 */
4676 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4677 child_event->state = PERF_EVENT_STATE_INACTIVE;
4678 else
4679 child_event->state = PERF_EVENT_STATE_OFF;
4680
4681 if (parent_event->attr.freq)
4682 child_event->hw.sample_period = parent_event->hw.sample_period;
4683
4684 /*
4685 * Link it up in the child's context:
4686 */
4687 add_event_to_ctx(child_event, child_ctx);
4688
4689 /*
4690 * Get a reference to the parent filp - we will fput it
4691 * when the child event exits. This is safe to do because
4692 * we are in the parent and we know that the filp still
4693 * exists and has a nonzero count:
4694 */
4695 atomic_long_inc(&parent_event->filp->f_count);
4696
4697 /*
4698 * Link this into the parent event's child list
4699 */
4700 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4701 mutex_lock(&parent_event->child_mutex);
4702 list_add_tail(&child_event->child_list, &parent_event->child_list);
4703 mutex_unlock(&parent_event->child_mutex);
4704
4705 return child_event;
4706}
4707
4708static int inherit_group(struct perf_event *parent_event,
4709 struct task_struct *parent,
4710 struct perf_event_context *parent_ctx,
4711 struct task_struct *child,
4712 struct perf_event_context *child_ctx)
4713{
4714 struct perf_event *leader;
4715 struct perf_event *sub;
4716 struct perf_event *child_ctr;
4717
4718 leader = inherit_event(parent_event, parent, parent_ctx,
4719 child, NULL, child_ctx);
4720 if (IS_ERR(leader))
4721 return PTR_ERR(leader);
4722 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4723 child_ctr = inherit_event(sub, parent, parent_ctx,
4724 child, leader, child_ctx);
4725 if (IS_ERR(child_ctr))
4726 return PTR_ERR(child_ctr);
4727 }
4728 return 0;
4729}
4730
4731static void sync_child_event(struct perf_event *child_event,
4732 struct task_struct *child)
4733{
4734 struct perf_event *parent_event = child_event->parent;
4735 u64 child_val;
4736
4737 if (child_event->attr.inherit_stat)
4738 perf_event_read_event(child_event, child);
4739
4740 child_val = atomic64_read(&child_event->count);
4741
4742 /*
4743 * Add back the child's count to the parent's count:
4744 */
4745 atomic64_add(child_val, &parent_event->count);
4746 atomic64_add(child_event->total_time_enabled,
4747 &parent_event->child_total_time_enabled);
4748 atomic64_add(child_event->total_time_running,
4749 &parent_event->child_total_time_running);
4750
4751 /*
4752 * Remove this event from the parent's list
4753 */
4754 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4755 mutex_lock(&parent_event->child_mutex);
4756 list_del_init(&child_event->child_list);
4757 mutex_unlock(&parent_event->child_mutex);
4758
4759 /*
4760 * Release the parent event, if this was the last
4761 * reference to it.
4762 */
4763 fput(parent_event->filp);
4764}
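sync_child_event() above is what makes attr.inherit useful from userspace: when an inherited child event is torn down, its count is folded back into the parent event, so one read() on the parent fd covers the parent plus its exited children. A minimal sketch, assuming the usual raw-syscall invocation (not part of this commit):

/*
 * Sketch: count task clock across fork() with attr.inherit set; the
 * child's time is merged back into the parent's fd on exit.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.inherit = 1;		/* clone the event into children */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0UL);
	if (fd < 0)
		return 1;

	if (fork() == 0) {
		for (volatile long i = 0; i < 10000000; i++)
			;		/* child burns some CPU */
		_exit(0);		/* child's count merges back here */
	}
	wait(NULL);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("parent + child task clock: %llu\n",
		       (unsigned long long)count);
	return 0;
}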
4765
4766static void
4767__perf_event_exit_task(struct perf_event *child_event,
4768 struct perf_event_context *child_ctx,
4769 struct task_struct *child)
4770{
4771 struct perf_event *parent_event;
4772
4773 update_event_times(child_event);
4774 perf_event_remove_from_context(child_event);
4775
4776 parent_event = child_event->parent;
4777 /*
4778 * It can happen that parent exits first, and has events
4779 * that are still around due to the child reference. These
4780 * events need to be zapped - but otherwise linger.
4781 */
4782 if (parent_event) {
4783 sync_child_event(child_event, child);
4784 free_event(child_event);
4785 }
4786}
4787
4788/*
4789 * When a child task exits, feed back event values to parent events.
4790 */
4791void perf_event_exit_task(struct task_struct *child)
4792{
4793 struct perf_event *child_event, *tmp;
4794 struct perf_event_context *child_ctx;
4795 unsigned long flags;
4796
4797 if (likely(!child->perf_event_ctxp)) {
4798 perf_event_task(child, NULL, 0);
4799 return;
4800 }
4801
4802 local_irq_save(flags);
4803 /*
4804 * We can't reschedule here because interrupts are disabled,
4805 * and either child is current or it is a task that can't be
4806 * scheduled, so we are now safe from rescheduling changing
4807 * our context.
4808 */
4809 child_ctx = child->perf_event_ctxp;
4810 __perf_event_task_sched_out(child_ctx);
4811
4812 /*
4813 * Take the context lock here so that if find_get_context is
4814 * reading child->perf_event_ctxp, we wait until it has
4815 * incremented the context's refcount before we do put_ctx below.
4816 */
4817 spin_lock(&child_ctx->lock);
4818 child->perf_event_ctxp = NULL;
4819 /*
4820	 * If this context is a clone, unclone it so it can't get
4821 * swapped to another process while we're removing all
4822 * the events from it.
4823 */
4824 unclone_ctx(child_ctx);
4825 spin_unlock_irqrestore(&child_ctx->lock, flags);
4826
4827 /*
4828 * Report the task dead after unscheduling the events so that we
4829 * won't get any samples after PERF_RECORD_EXIT. We can however still
4830 * get a few PERF_RECORD_READ events.
4831 */
4832 perf_event_task(child, child_ctx, 0);
4833
4834 /*
4835 * We can recurse on the same lock type through:
4836 *
4837 * __perf_event_exit_task()
4838 * sync_child_event()
4839 * fput(parent_event->filp)
4840 * perf_release()
4841 * mutex_lock(&ctx->mutex)
4842 *
4843	 * But since it's the parent context it won't be the same instance.
4844 */
4845 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4846
4847again:
4848 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4849 group_entry)
4850 __perf_event_exit_task(child_event, child_ctx, child);
4851
4852 /*
4853 * If the last event was a group event, it will have appended all
4854 * its siblings to the list, but we obtained 'tmp' before that which
4855 * will still point to the list head terminating the iteration.
4856 */
4857 if (!list_empty(&child_ctx->group_list))
4858 goto again;
4859
4860 mutex_unlock(&child_ctx->mutex);
4861
4862 put_ctx(child_ctx);
4863}
4864
4865/*
4866 * free an unexposed, unused context as created by inheritance in
4867 * perf_event_init_task() below, used by fork() in case of failure.
4868 */
4869void perf_event_free_task(struct task_struct *task)
4870{
4871 struct perf_event_context *ctx = task->perf_event_ctxp;
4872 struct perf_event *event, *tmp;
4873
4874 if (!ctx)
4875 return;
4876
4877 mutex_lock(&ctx->mutex);
4878again:
4879 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4880 struct perf_event *parent = event->parent;
4881
4882 if (WARN_ON_ONCE(!parent))
4883 continue;
4884
4885 mutex_lock(&parent->child_mutex);
4886 list_del_init(&event->child_list);
4887 mutex_unlock(&parent->child_mutex);
4888
4889 fput(parent->filp);
4890
4891 list_del_event(event, ctx);
4892 free_event(event);
4893 }
4894
4895 if (!list_empty(&ctx->group_list))
4896 goto again;
4897
4898 mutex_unlock(&ctx->mutex);
4899
4900 put_ctx(ctx);
4901}
4902
4903/*
4904 * Initialize the perf_event context in task_struct
4905 */
4906int perf_event_init_task(struct task_struct *child)
4907{
4908 struct perf_event_context *child_ctx, *parent_ctx;
4909 struct perf_event_context *cloned_ctx;
4910 struct perf_event *event;
4911 struct task_struct *parent = current;
4912 int inherited_all = 1;
4913 int ret = 0;
4914
4915 child->perf_event_ctxp = NULL;
4916
4917 mutex_init(&child->perf_event_mutex);
4918 INIT_LIST_HEAD(&child->perf_event_list);
4919
4920 if (likely(!parent->perf_event_ctxp))
4921 return 0;
4922
4923 /*
4924 * This is executed from the parent task context, so inherit
4925 * events that have been marked for cloning.
4926 * First allocate and initialize a context for the child.
4927 */
4928
4929 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4930 if (!child_ctx)
4931 return -ENOMEM;
4932
4933 __perf_event_init_context(child_ctx, child);
4934 child->perf_event_ctxp = child_ctx;
4935 get_task_struct(child);
4936
4937 /*
4938 * If the parent's context is a clone, pin it so it won't get
4939 * swapped under us.
4940 */
4941 parent_ctx = perf_pin_task_context(parent);
4942
4943 /*
4944 * No need to check if parent_ctx != NULL here; since we saw
4945 * it non-NULL earlier, the only reason for it to become NULL
4946 * is if we exit, and since we're currently in the middle of
4947 * a fork we can't be exiting at the same time.
4948 */
4949
4950 /*
4951 * Lock the parent list. No need to lock the child - not PID
4952 * hashed yet and not running, so nobody can access it.
4953 */
4954 mutex_lock(&parent_ctx->mutex);
4955
4956 /*
4957	 * We don't have to disable NMIs - we are only looking at
4958 * the list, not manipulating it:
4959 */
4960 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4961
4962 if (!event->attr.inherit) {
4963 inherited_all = 0;
4964 continue;
4965 }
4966
4967 ret = inherit_group(event, parent, parent_ctx,
4968 child, child_ctx);
4969 if (ret) {
4970 inherited_all = 0;
4971 break;
4972 }
4973 }
4974
4975 if (inherited_all) {
4976 /*
4977 * Mark the child context as a clone of the parent
4978 * context, or of whatever the parent is a clone of.
4979 * Note that if the parent is a clone, it could get
4980 * uncloned at any point, but that doesn't matter
4981 * because the list of events and the generation
4982 * count can't have changed since we took the mutex.
4983 */
4984 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4985 if (cloned_ctx) {
4986 child_ctx->parent_ctx = cloned_ctx;
4987 child_ctx->parent_gen = parent_ctx->parent_gen;
4988 } else {
4989 child_ctx->parent_ctx = parent_ctx;
4990 child_ctx->parent_gen = parent_ctx->generation;
4991 }
4992 get_ctx(child_ctx->parent_ctx);
4993 }
4994
4995 mutex_unlock(&parent_ctx->mutex);
4996
4997 perf_unpin_context(parent_ctx);
4998
4999 return ret;
5000}
5001
5002static void __cpuinit perf_event_init_cpu(int cpu)
5003{
5004 struct perf_cpu_context *cpuctx;
5005
5006 cpuctx = &per_cpu(perf_cpu_context, cpu);
5007 __perf_event_init_context(&cpuctx->ctx, NULL);
5008
5009 spin_lock(&perf_resource_lock);
5010 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5011 spin_unlock(&perf_resource_lock);
5012
5013 hw_perf_event_setup(cpu);
5014}
5015
5016#ifdef CONFIG_HOTPLUG_CPU
5017static void __perf_event_exit_cpu(void *info)
5018{
5019 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
5020 struct perf_event_context *ctx = &cpuctx->ctx;
5021 struct perf_event *event, *tmp;
5022
5023 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
5024 __perf_event_remove_from_context(event);
5025}
5026static void perf_event_exit_cpu(int cpu)
5027{
5028 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5029 struct perf_event_context *ctx = &cpuctx->ctx;
5030
5031 mutex_lock(&ctx->mutex);
5032 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5033 mutex_unlock(&ctx->mutex);
5034}
5035#else
5036static inline void perf_event_exit_cpu(int cpu) { }
5037#endif
5038
5039static int __cpuinit
5040perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5041{
5042 unsigned int cpu = (long)hcpu;
5043
5044 switch (action) {
5045
5046 case CPU_UP_PREPARE:
5047 case CPU_UP_PREPARE_FROZEN:
5048 perf_event_init_cpu(cpu);
5049 break;
5050
5051 case CPU_ONLINE:
5052 case CPU_ONLINE_FROZEN:
5053 hw_perf_event_setup_online(cpu);
5054 break;
5055
5056 case CPU_DOWN_PREPARE:
5057 case CPU_DOWN_PREPARE_FROZEN:
5058 perf_event_exit_cpu(cpu);
5059 break;
5060
5061 default:
5062 break;
5063 }
5064
5065 return NOTIFY_OK;
5066}
5067
5068/*
5069 * This has to have a higher priority than migration_notifier in sched.c.
5070 */
5071static struct notifier_block __cpuinitdata perf_cpu_nb = {
5072 .notifier_call = perf_cpu_notify,
5073 .priority = 20,
5074};
5075
5076void __init perf_event_init(void)
5077{
5078 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5079 (void *)(long)smp_processor_id());
5080 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5081 (void *)(long)smp_processor_id());
5082 register_cpu_notifier(&perf_cpu_nb);
5083}
5084
5085static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5086{
5087 return sprintf(buf, "%d\n", perf_reserved_percpu);
5088}
5089
5090static ssize_t
5091perf_set_reserve_percpu(struct sysdev_class *class,
5092 const char *buf,
5093 size_t count)
5094{
5095 struct perf_cpu_context *cpuctx;
5096 unsigned long val;
5097 int err, cpu, mpt;
5098
5099 err = strict_strtoul(buf, 10, &val);
5100 if (err)
5101 return err;
5102 if (val > perf_max_events)
5103 return -EINVAL;
5104
5105 spin_lock(&perf_resource_lock);
5106 perf_reserved_percpu = val;
5107 for_each_online_cpu(cpu) {
5108 cpuctx = &per_cpu(perf_cpu_context, cpu);
5109 spin_lock_irq(&cpuctx->ctx.lock);
5110 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5111 perf_max_events - perf_reserved_percpu);
5112 cpuctx->max_pertask = mpt;
5113 spin_unlock_irq(&cpuctx->ctx.lock);
5114 }
5115 spin_unlock(&perf_resource_lock);
5116
5117 return count;
5118}
5119
5120static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5121{
5122 return sprintf(buf, "%d\n", perf_overcommit);
5123}
5124
5125static ssize_t
5126perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5127{
5128 unsigned long val;
5129 int err;
5130
5131 err = strict_strtoul(buf, 10, &val);
5132 if (err)
5133 return err;
5134 if (val > 1)
5135 return -EINVAL;
5136
5137 spin_lock(&perf_resource_lock);
5138 perf_overcommit = val;
5139 spin_unlock(&perf_resource_lock);
5140
5141 return count;
5142}
5143
5144static SYSDEV_CLASS_ATTR(
5145 reserve_percpu,
5146 0644,
5147 perf_show_reserve_percpu,
5148 perf_set_reserve_percpu
5149 );
5150
5151static SYSDEV_CLASS_ATTR(
5152 overcommit,
5153 0644,
5154 perf_show_overcommit,
5155 perf_set_overcommit
5156 );
5157
5158static struct attribute *perfclass_attrs[] = {
5159 &attr_reserve_percpu.attr,
5160 &attr_overcommit.attr,
5161 NULL
5162};
5163
5164static struct attribute_group perfclass_attr_group = {
5165 .attrs = perfclass_attrs,
5166 .name = "perf_events",
5167};
5168
5169static int __init perf_event_sysfs_init(void)
5170{
5171 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5172 &perfclass_attr_group);
5173}
5174device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
 static struct hlist_head *pid_hash;
-static int pidhash_shift;
+static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 void __init pidhash_init(void)
 {
 	int i, pidhash_size;
-	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
 
-	pidhash_shift = max(4, fls(megabytes * 4));
-	pidhash_shift = min(12, pidhash_shift);
+	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
+					   HASH_EARLY | HASH_SMALL,
+					   &pidhash_shift, NULL, 4096);
 	pidhash_size = 1 << pidhash_shift;
 
-	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
-		pidhash_size, pidhash_shift,
-		pidhash_size * sizeof(struct hlist_head));
-
-	pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
-	if (!pid_hash)
-		panic("Could not alloc pidhash!\n");
 	for (i = 0; i < pidhash_size; i++)
 		INIT_HLIST_HEAD(&pid_hash[i]);
 }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
-	if (flags & CLONE_THREAD)
+	if (flags & (CLONE_THREAD|CLONE_PARENT))
 		return ERR_PTR(-EINVAL);
 	return create_pid_namespace(old_ns);
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e33a21cb9407..5c9dc228747b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,17 +8,18 @@
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
11 12
12/* 13/*
13 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
14 */ 15 */
15void update_rlimit_cpu(unsigned long rlim_new) 16void update_rlimit_cpu(unsigned long rlim_new)
16{ 17{
17 cputime_t cputime; 18 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
18 20
19 cputime = secs_to_cputime(rlim_new); 21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
542 now); 543 now);
543} 544}
544 545
546static inline int expires_gt(cputime_t expires, cputime_t new_exp)
547{
548 return cputime_eq(expires, cputime_zero) ||
549 cputime_gt(expires, new_exp);
550}
551
552static inline int expires_le(cputime_t expires, cputime_t new_exp)
553{
554 return !cputime_eq(expires, cputime_zero) &&
555 cputime_le(expires, new_exp);
556}
545/* 557/*
546 * Insert the timer on the appropriate list before any timers that 558 * Insert the timer on the appropriate list before any timers that
547 * expire later. This must be called with the tasklist_lock held 559 * expire later. This must be called with the tasklist_lock held
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
586 */ 598 */
587 599
588 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 600 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
601 union cpu_time_count *exp = &nt->expires;
602
589 switch (CPUCLOCK_WHICH(timer->it_clock)) { 603 switch (CPUCLOCK_WHICH(timer->it_clock)) {
590 default: 604 default:
591 BUG(); 605 BUG();
592 case CPUCLOCK_PROF: 606 case CPUCLOCK_PROF:
593 if (cputime_eq(p->cputime_expires.prof_exp, 607 if (expires_gt(p->cputime_expires.prof_exp,
594 cputime_zero) || 608 exp->cpu))
595 cputime_gt(p->cputime_expires.prof_exp, 609 p->cputime_expires.prof_exp = exp->cpu;
596 nt->expires.cpu))
597 p->cputime_expires.prof_exp =
598 nt->expires.cpu;
599 break; 610 break;
600 case CPUCLOCK_VIRT: 611 case CPUCLOCK_VIRT:
601 if (cputime_eq(p->cputime_expires.virt_exp, 612 if (expires_gt(p->cputime_expires.virt_exp,
602 cputime_zero) || 613 exp->cpu))
603 cputime_gt(p->cputime_expires.virt_exp, 614 p->cputime_expires.virt_exp = exp->cpu;
604 nt->expires.cpu))
605 p->cputime_expires.virt_exp =
606 nt->expires.cpu;
607 break; 615 break;
608 case CPUCLOCK_SCHED: 616 case CPUCLOCK_SCHED:
609 if (p->cputime_expires.sched_exp == 0 || 617 if (p->cputime_expires.sched_exp == 0 ||
610 p->cputime_expires.sched_exp > 618 p->cputime_expires.sched_exp > exp->sched)
611 nt->expires.sched)
612 p->cputime_expires.sched_exp = 619 p->cputime_expires.sched_exp =
613 nt->expires.sched; 620 exp->sched;
614 break; 621 break;
615 } 622 }
616 } else { 623 } else {
624 struct signal_struct *const sig = p->signal;
625 union cpu_time_count *exp = &timer->it.cpu.expires;
626
617 /* 627 /*
618 * For a process timer, set the cached expiration time. 628 * For a process timer, set the cached expiration time.
619 */ 629 */
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
621 default: 631 default:
622 BUG(); 632 BUG();
623 case CPUCLOCK_VIRT: 633 case CPUCLOCK_VIRT:
624 if (!cputime_eq(p->signal->it_virt_expires, 634 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
625 cputime_zero) && 635 exp->cpu))
626 cputime_lt(p->signal->it_virt_expires,
627 timer->it.cpu.expires.cpu))
628 break; 636 break;
629 p->signal->cputime_expires.virt_exp = 637 sig->cputime_expires.virt_exp = exp->cpu;
630 timer->it.cpu.expires.cpu;
631 break; 638 break;
632 case CPUCLOCK_PROF: 639 case CPUCLOCK_PROF:
633 if (!cputime_eq(p->signal->it_prof_expires, 640 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
634 cputime_zero) && 641 exp->cpu))
635 cputime_lt(p->signal->it_prof_expires,
636 timer->it.cpu.expires.cpu))
637 break; 642 break;
638 i = p->signal->rlim[RLIMIT_CPU].rlim_cur; 643 i = sig->rlim[RLIMIT_CPU].rlim_cur;
639 if (i != RLIM_INFINITY && 644 if (i != RLIM_INFINITY &&
640 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 645 i <= cputime_to_secs(exp->cpu))
641 break; 646 break;
642 p->signal->cputime_expires.prof_exp = 647 sig->cputime_expires.prof_exp = exp->cpu;
643 timer->it.cpu.expires.cpu;
644 break; 648 break;
645 case CPUCLOCK_SCHED: 649 case CPUCLOCK_SCHED:
646 p->signal->cputime_expires.sched_exp = 650 sig->cputime_expires.sched_exp = exp->sched;
647 timer->it.cpu.expires.sched;
648 break; 651 break;
649 } 652 }
650 } 653 }
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1072} 1075}
1073 1076
1077static u32 onecputick;
1078
1079static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1080 cputime_t *expires, cputime_t cur_time, int signo)
1081{
1082 if (cputime_eq(it->expires, cputime_zero))
1083 return;
1084
1085 if (cputime_ge(cur_time, it->expires)) {
1086 if (!cputime_eq(it->incr, cputime_zero)) {
1087 it->expires = cputime_add(it->expires, it->incr);
1088 it->error += it->incr_error;
1089 if (it->error >= onecputick) {
1090 it->expires = cputime_sub(it->expires,
1091 cputime_one_jiffy);
1092 it->error -= onecputick;
1093 }
1094 } else {
1095 it->expires = cputime_zero;
1096 }
1097
1098 trace_itimer_expire(signo == SIGPROF ?
1099 ITIMER_PROF : ITIMER_VIRTUAL,
1100 tsk->signal->leader_pid, cur_time);
1101 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1102 }
1103
1104 if (!cputime_eq(it->expires, cputime_zero) &&
1105 (cputime_eq(*expires, cputime_zero) ||
1106 cputime_lt(it->expires, *expires))) {
1107 *expires = it->expires;
1108 }
1109}
1110
1074/* 1111/*
1075 * Check for any per-thread CPU timers that have fired and move them 1112 * Check for any per-thread CPU timers that have fired and move them
1076 * off the tsk->*_timers list onto the firing list. Per-thread timers 1113 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
1090 * Don't sample the current process CPU clocks if there are no timers. 1127 * Don't sample the current process CPU clocks if there are no timers.
1091 */ 1128 */
1092 if (list_empty(&timers[CPUCLOCK_PROF]) && 1129 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1093 cputime_eq(sig->it_prof_expires, cputime_zero) && 1130 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1094 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1131 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1095 list_empty(&timers[CPUCLOCK_VIRT]) && 1132 list_empty(&timers[CPUCLOCK_VIRT]) &&
1096 cputime_eq(sig->it_virt_expires, cputime_zero) && 1133 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1097 list_empty(&timers[CPUCLOCK_SCHED])) { 1134 list_empty(&timers[CPUCLOCK_SCHED])) {
1098 stop_process_timers(tsk); 1135 stop_process_timers(tsk);
1099 return; 1136 return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
1153 /* 1190 /*
1154 * Check for the special case process timers. 1191 * Check for the special case process timers.
1155 */ 1192 */
1156 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1193 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1157 if (cputime_ge(ptime, sig->it_prof_expires)) { 1194 SIGPROF);
1158 /* ITIMER_PROF fires and reloads. */ 1195 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1159 sig->it_prof_expires = sig->it_prof_incr; 1196 SIGVTALRM);
1160 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1197
1161 sig->it_prof_expires = cputime_add(
1162 sig->it_prof_expires, ptime);
1163 }
1164 __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
1165 }
1166 if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
1167 (cputime_eq(prof_expires, cputime_zero) ||
1168 cputime_lt(sig->it_prof_expires, prof_expires))) {
1169 prof_expires = sig->it_prof_expires;
1170 }
1171 }
1172 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1173 if (cputime_ge(utime, sig->it_virt_expires)) {
1174 /* ITIMER_VIRTUAL fires and reloads. */
1175 sig->it_virt_expires = sig->it_virt_incr;
1176 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1177 sig->it_virt_expires = cputime_add(
1178 sig->it_virt_expires, utime);
1179 }
1180 __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
1181 }
1182 if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
1183 (cputime_eq(virt_expires, cputime_zero) ||
1184 cputime_lt(sig->it_virt_expires, virt_expires))) {
1185 virt_expires = sig->it_virt_expires;
1186 }
1187 }
1188 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1198 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1189 unsigned long psecs = cputime_to_secs(ptime); 1199 unsigned long psecs = cputime_to_secs(ptime);
1190 cputime_t x; 1200 cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 if (!cputime_eq(*oldval, cputime_zero)) { 1467 if (!cputime_eq(*oldval, cputime_zero)) {
1458 if (cputime_le(*oldval, now.cpu)) { 1468 if (cputime_le(*oldval, now.cpu)) {
1459 /* Just about to fire. */ 1469 /* Just about to fire. */
1460 *oldval = jiffies_to_cputime(1); 1470 *oldval = cputime_one_jiffy;
1461 } else { 1471 } else {
1462 *oldval = cputime_sub(*oldval, now.cpu); 1472 *oldval = cputime_sub(*oldval, now.cpu);
1463 } 1473 }
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
1703 .nsleep = thread_cpu_nsleep, 1713 .nsleep = thread_cpu_nsleep,
1704 .nsleep_restart = thread_cpu_nsleep_restart, 1714 .nsleep_restart = thread_cpu_nsleep_restart,
1705 }; 1715 };
1716 struct timespec ts;
1706 1717
1707 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1718 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1708 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1719 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1709 1720
1721 cputime_to_timespec(cputime_one_jiffy, &ts);
1722 onecputick = ts.tv_nsec;
1723 WARN_ON(ts.tv_sec != 0);
1724
1710 return 0; 1725 return 0;
1711} 1726}
1712__initcall(init_posix_cpu_timers); 1727__initcall(init_posix_cpu_timers);
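The check_cpu_itimer() helper introduced above is the consolidated rearm/notify path for the process-wide ITIMER_PROF and ITIMER_VIRTUAL timers. A minimal userspace sketch of what feeds it, using the long-standing setitimer() API (nothing here is specific to this commit):

/*
 * Sketch: arm a one-shot ITIMER_PROF and spin until the process-wide
 * CPU timer delivers SIGPROF. ITIMER_PROF counts CPU time consumed by
 * the whole thread group, not wall-clock time.
 */
#include <signal.h>
#include <stdio.h>
#include <sys/time.h>

static volatile sig_atomic_t fired;

static void on_sigprof(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	struct itimerval it = {
		.it_value    = { .tv_sec = 0, .tv_usec = 100 * 1000 },	/* ~100 ms of CPU */
		.it_interval = { 0, 0 },				/* one-shot */
	};

	signal(SIGPROF, on_sigprof);
	setitimer(ITIMER_PROF, &it, NULL);

	while (!fired)
		;	/* busy loop so CPU time actually accrues */

	printf("SIGPROF delivered after ~100 ms of CPU time\n");
	return 0;
}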
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+	*tp = current_kernel_time();
+	return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+						struct timespec *tp)
+{
+	*tp = get_monotonic_coarse();
+	return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+	*tp = ktime_to_timespec(KTIME_LOW_RES);
+	return 0;
+}
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
 		.timer_create = no_timer_create,
 		.nsleep = no_nsleep,
 	};
+	struct k_clock clock_realtime_coarse = {
+		.clock_getres = posix_get_coarse_res,
+		.clock_get = posix_get_realtime_coarse,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = no_timer_create,
+		.nsleep = no_nsleep,
+	};
+	struct k_clock clock_monotonic_coarse = {
+		.clock_getres = posix_get_coarse_res,
+		.clock_get = posix_get_monotonic_coarse,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = no_timer_create,
+		.nsleep = no_nsleep,
+	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+	register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+	register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 					sizeof (struct k_itimer), 0, SLAB_PANIC,
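From userspace the two new clock ids are consumed through the ordinary clock_gettime()/clock_getres() interface; the coarse clocks return the last tick value without touching the clocksource, and their resolution is the (coarser) tick length. A small sketch follows; note that older userspace headers may not define the CLOCK_*_COARSE constants yet (they can be taken from <linux/time.h>) and that clock_gettime() historically needs -lrt.

/*
 * Sketch: compare the coarse clock values and their reported resolution.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts, res;

	clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
	clock_getres(CLOCK_MONOTONIC_COARSE, &res);
	printf("monotonic_coarse: %ld.%09ld (res %ld ns)\n",
	       (long)ts.tv_sec, ts.tv_nsec, res.tv_nsec);

	clock_gettime(CLOCK_REALTIME_COARSE, &ts);
	clock_getres(CLOCK_REALTIME_COARSE, &res);
	printf("realtime_coarse:  %ld.%09ld (res %ld ns)\n",
	       (long)ts.tv_sec, ts.tv_nsec, res.tv_nsec);

	return 0;
}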
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
 #undef DEBUG
 
 #include <linux/interrupt.h>
+#include <linux/oom.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 97955b0e44f4..36cb168e4330 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
 		BUG_ON(!region);
 	} else
 		/* This allocation cannot fail */
-		region = alloc_bootmem_low(sizeof(struct nosave_region));
+		region = alloc_bootmem(sizeof(struct nosave_region));
 	region->start_pfn = start_pfn;
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..b101cdc4df3f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
 
 #include <linux/module.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
 #include <linux/genhd.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 602033acd6c7..f38b07f78a4e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup);
206#ifdef CONFIG_BOOT_PRINTK_DELAY 206#ifdef CONFIG_BOOT_PRINTK_DELAY
207 207
208static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 208static unsigned int boot_delay; /* msecs delay after each printk during bootup */
209static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ 209static unsigned long long loops_per_msec; /* based on boot_delay */
210 210
211static int __init boot_delay_setup(char *str) 211static int __init boot_delay_setup(char *str)
212{ 212{
213 unsigned long lpj; 213 unsigned long lpj;
214 unsigned long long loops_per_msec;
215 214
216 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ 215 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
217 loops_per_msec = (unsigned long long)lpj / 1000 * HZ; 216 loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str)
220 if (boot_delay > 10 * 1000) 219 if (boot_delay > 10 * 1000)
221 boot_delay = 0; 220 boot_delay = 0;
222 221
223 printk_delay_msec = loops_per_msec; 222 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
224 printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 223 "HZ: %d, loops_per_msec: %llu\n",
225 "HZ: %d, printk_delay_msec: %llu\n", 224 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
226 boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
227 return 1; 225 return 1;
228} 226}
229__setup("boot_delay=", boot_delay_setup); 227__setup("boot_delay=", boot_delay_setup);
@@ -236,7 +234,7 @@ static void boot_delay_msec(void)
236 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 234 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
237 return; 235 return;
238 236
239 k = (unsigned long long)printk_delay_msec * boot_delay; 237 k = (unsigned long long)loops_per_msec * boot_delay;
240 238
241 timeout = jiffies + msecs_to_jiffies(boot_delay); 239 timeout = jiffies + msecs_to_jiffies(boot_delay);
242 while (k) { 240 while (k) {
@@ -655,6 +653,20 @@ static int recursion_bug;
655static int new_text_line = 1; 653static int new_text_line = 1;
656static char printk_buf[1024]; 654static char printk_buf[1024];
657 655
656int printk_delay_msec __read_mostly;
657
658static inline void printk_delay(void)
659{
660 if (unlikely(printk_delay_msec)) {
661 int m = printk_delay_msec;
662
663 while (m--) {
664 mdelay(1);
665 touch_nmi_watchdog();
666 }
667 }
668}
669
658asmlinkage int vprintk(const char *fmt, va_list args) 670asmlinkage int vprintk(const char *fmt, va_list args)
659{ 671{
660 int printed_len = 0; 672 int printed_len = 0;
@@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
664 char *p; 676 char *p;
665 677
666 boot_delay_msec(); 678 boot_delay_msec();
679 printk_delay();
667 680
668 preempt_disable(); 681 preempt_disable();
669 /* This stops the holder of console_sem just where we want him */ 682 /* This stops the holder of console_sem just where we want him */
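The new printk_delay() helper above is driven by printk_delay_msec, which is expected to be exported as a sysctl elsewhere in this merge (kernel/sysctl.c is also touched); the /proc/sys/kernel/printk_delay path below is therefore an assumption of this sketch, not something shown in this hunk. When set, every vprintk() call sleeps that many milliseconds, which can make messages readable on slow consoles.

/*
 * Sketch (assumed sysctl path): slow printk output down to ~50 ms per
 * message. Run as root; write 0 to restore normal behaviour.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/printk_delay", "w");	/* assumed path */

	if (!f) {
		perror("printk_delay");
		return 1;
	}
	fprintf(f, "%d\n", 50);
	fclose(f);
	return 0;
}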
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
442 442
443#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
444#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
445#include <asm/uaccess.h> 446#include <asm/uaccess.h>
446 447
447static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
448 int count, int *eof, void *data)
449{ 449{
450 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
451 if (count - len < 2) 451 seq_putc(m, '\n');
452 return -EINVAL; 452 return 0;
453 len += sprintf(page + len, "\n");
454 return len;
455} 453}
456 454
457static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
458 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
459{ 462{
460 struct cpumask *mask = data;
461 unsigned long full_count = count, err;
462 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
463 465
464 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
465 return -ENOMEM; 467 return -ENOMEM;
466 468
467 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
468 if (!err) { 470 if (!err) {
469 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
470 err = full_count; 472 err = count;
471 } 473 }
472 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
473 return err; 475 return err;
474} 476}
475 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
476void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
477{ 487{
478 struct proc_dir_entry *entry;
479
480 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
481 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
482 if (!entry)
483 return;
484 entry->data = prof_cpu_mask;
485 entry->read_proc = prof_cpu_mask_read_proc;
486 entry->write_proc = prof_cpu_mask_write_proc;
487} 490}
488 491
489/* 492/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
  * or self-reaping. Do notification now if it would have happened earlier.
  * If it should reap itself, return true.
  *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
+ * If it's our own child, there is no notification to do. But if our normal
+ * children self-reap, then this child was prevented by ptrace and we must
+ * reap it now, in that case we must also wake up sub-threads sleeping in
+ * do_wait().
  */
 static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 {
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 	if (!task_detached(p) && thread_group_empty(p)) {
 		if (!same_thread_group(p->real_parent, tracer))
 			do_notify_parent(p, p->exit_signal);
-		else if (ignoring_children(tracer->sighand))
+		else if (ignoring_children(tracer->sighand)) {
+			__wake_up_parent(p, tracer);
 			p->exit_signal = -1;
+		}
 	}
 	if (task_detached(p)) {
 		/* Mark it as in the process of being reaped. */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index bd5d5c8e5140..400183346ad2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * 22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers: 25 * Papers:
@@ -27,7 +27,7 @@
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 * 28 *
29 * For detailed explanation of Read-Copy Update mechanism see - 29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html 30 * http://lse.sourceforge.net/locking/rcupdate.html
31 * 31 *
32 */ 32 */
33#include <linux/types.h> 33#include <linux/types.h>
@@ -46,22 +46,15 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h> 47#include <linux/kernel_stat.h>
48 48
49enum rcu_barrier { 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 RCU_BARRIER_STD, 50static struct lock_class_key rcu_lock_key;
51 RCU_BARRIER_BH, 51struct lockdep_map rcu_lock_map =
52 RCU_BARRIER_SCHED, 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53}; 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
54 55
55static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
56static atomic_t rcu_barrier_cpu_count;
57static DEFINE_MUTEX(rcu_barrier_mutex);
58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly; 56int rcu_scheduler_active __read_mostly;
60 57
61static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
62static struct rcu_head rcu_migrate_head[3];
63static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
64
65/* 58/*
66 * Awaken the corresponding synchronize_rcu() instance now that a 59 * Awaken the corresponding synchronize_rcu() instance now that a
67 * grace period has elapsed. 60 * grace period has elapsed.
@@ -74,6 +67,8 @@ void wakeme_after_rcu(struct rcu_head *head)
74 complete(&rcu->completion); 67 complete(&rcu->completion);
75} 68}
76 69
70#ifdef CONFIG_TREE_PREEMPT_RCU
71
77/** 72/**
78 * synchronize_rcu - wait until a grace period has elapsed. 73 * synchronize_rcu - wait until a grace period has elapsed.
79 * 74 *
@@ -87,7 +82,7 @@ void synchronize_rcu(void)
87{ 82{
88 struct rcu_synchronize rcu; 83 struct rcu_synchronize rcu;
89 84
90 if (rcu_blocking_is_gp()) 85 if (!rcu_scheduler_active)
91 return; 86 return;
92 87
93 init_completion(&rcu.completion); 88 init_completion(&rcu.completion);
@@ -98,6 +93,46 @@ void synchronize_rcu(void)
98} 93}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 94EXPORT_SYMBOL_GPL(synchronize_rcu);
100 95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97
98/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 *
109 * This means that all preempt_disable code sequences, including NMI and
110 * hardware-interrupt handlers, in progress on entry will have completed
111 * before this primitive returns. However, this does not guarantee that
112 * softirq handlers will have completed, since in some kernels, these
113 * handlers can run in process context, and can block.
114 *
115 * This primitive provides the guarantees made by the (now removed)
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */
121void synchronize_sched(void)
122{
123 struct rcu_synchronize rcu;
124
125 if (rcu_blocking_is_gp())
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133}
134EXPORT_SYMBOL_GPL(synchronize_sched);
135
101/** 136/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 * 138 *
@@ -122,129 +157,10 @@ void synchronize_rcu_bh(void)
122} 157}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 158EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
124 159
125static void rcu_barrier_callback(struct rcu_head *notused)
126{
127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
128 complete(&rcu_barrier_completion);
129}
130
131/*
132 * Called with preemption disabled, and from cross-cpu IRQ context.
133 */
134static void rcu_barrier_func(void *type)
135{
136 int cpu = smp_processor_id();
137 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
138
139 atomic_inc(&rcu_barrier_cpu_count);
140 switch ((enum rcu_barrier)type) {
141 case RCU_BARRIER_STD:
142 call_rcu(head, rcu_barrier_callback);
143 break;
144 case RCU_BARRIER_BH:
145 call_rcu_bh(head, rcu_barrier_callback);
146 break;
147 case RCU_BARRIER_SCHED:
148 call_rcu_sched(head, rcu_barrier_callback);
149 break;
150 }
151}
152
153static inline void wait_migrated_callbacks(void)
154{
155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
157}
158
159/*
160 * Orchestrate the specified type of RCU barrier, waiting for all
161 * RCU callbacks of the specified type to complete.
162 */
163static void _rcu_barrier(enum rcu_barrier type)
164{
165 BUG_ON(in_interrupt());
166 /* Take cpucontrol mutex to protect against CPU hotplug */
167 mutex_lock(&rcu_barrier_mutex);
168 init_completion(&rcu_barrier_completion);
169 /*
170 * Initialize rcu_barrier_cpu_count to 1, then invoke
171 * rcu_barrier_func() on each CPU, so that each CPU also has
172 * incremented rcu_barrier_cpu_count. Only then is it safe to
173 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
174 * might complete its grace period before all of the other CPUs
175 * did their increment, causing this function to return too
176 * early.
177 */
178 atomic_set(&rcu_barrier_cpu_count, 1);
179 on_each_cpu(rcu_barrier_func, (void *)type, 1);
180 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
181 complete(&rcu_barrier_completion);
182 wait_for_completion(&rcu_barrier_completion);
183 mutex_unlock(&rcu_barrier_mutex);
184 wait_migrated_callbacks();
185}
186
187/**
188 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
189 */
190void rcu_barrier(void)
191{
192 _rcu_barrier(RCU_BARRIER_STD);
193}
194EXPORT_SYMBOL_GPL(rcu_barrier);
195
196/**
197 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
198 */
199void rcu_barrier_bh(void)
200{
201 _rcu_barrier(RCU_BARRIER_BH);
202}
203EXPORT_SYMBOL_GPL(rcu_barrier_bh);
204
205/**
206 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
207 */
208void rcu_barrier_sched(void)
209{
210 _rcu_barrier(RCU_BARRIER_SCHED);
211}
212EXPORT_SYMBOL_GPL(rcu_barrier_sched);
213
214static void rcu_migrate_callback(struct rcu_head *notused)
215{
216 if (atomic_dec_and_test(&rcu_migrate_type_count))
217 wake_up(&rcu_migrate_wq);
218}
219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
224 unsigned long action, void *hcpu) 161 unsigned long action, void *hcpu)
225{ 162{
226 rcu_cpu_notify(self, action, hcpu); 163 return rcu_cpu_notify(self, action, hcpu);
227 if (action == CPU_DYING) {
228 /*
229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
230 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
231 * returns, all online cpus have queued rcu_barrier_func(),
 232 * and the dead cpu (if it exists) queues rcu_migrate_callback()s.
233 *
234 * These callbacks ensure _rcu_barrier() waits for all
235 * RCU callbacks of the specified type to complete.
236 */
237 atomic_set(&rcu_migrate_type_count, 3);
238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
244 wait_migrated_callbacks();
245 }
246
247 return NOTIFY_OK;
248} 164}
249 165
250void __init rcu_init(void) 166void __init rcu_init(void)
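
The rcu_barrier()/rcu_barrier_bh()/rcu_barrier_sched() family deleted here (and reintroduced in kernel/rcutree.c further down) matters mostly to module unload: a module that queues call_rcu() callbacks must wait for them before its callback code disappears. A hedged sketch of that pattern with hypothetical demo_* names, not code from this commit:

    #include <linux/module.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct demo_item {
            struct rcu_head rcu;
            int value;
    };

    static void demo_free_rcu(struct rcu_head *head)
    {
            kfree(container_of(head, struct demo_item, rcu));
    }

    /* Retire an item: readers may still hold it, so defer the kfree(). */
    static void demo_retire(struct demo_item *item)
    {
            call_rcu(&item->rcu, demo_free_rcu);
    }

    static void __exit demo_exit(void)
    {
            /*
             * Deferred frees queued by demo_retire() may still be pending;
             * wait for them so demo_free_rcu() is never invoked after the
             * module text has been unloaded.
             */
            rcu_barrier();
    }
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
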
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b33db539a8ad..697c0a0229d4 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -50,7 +50,7 @@
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
110}; 110};
111 111
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version;
115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116static DEFINE_SPINLOCK(rcu_torture_lock); 116static DEFINE_SPINLOCK(rcu_torture_lock);
117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
124static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
125static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask; 129static cpumask_var_t shuffle_tmp_mask;
130 130
131static int stutter_pause_test = 0; 131static int stutter_pause_test;
132 132
133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134#define RCUTORTURE_RUNNABLE_INIT 1 134#define RCUTORTURE_RUNNABLE_INIT 1
@@ -267,7 +267,8 @@ struct rcu_torture_ops {
267 int irq_capable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270
271static struct rcu_torture_ops *cur_ops;
271 272
272/* 273/*
273 * Definitions for rcu torture testing. 274 * Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
281 282
282static void rcu_read_delay(struct rcu_random_state *rrsp) 283static void rcu_read_delay(struct rcu_random_state *rrsp)
283{ 284{
284 long delay; 285 const unsigned long shortdelay_us = 200;
285 const long longdelay = 200; 286 const unsigned long longdelay_ms = 50;
286 287
287 /* We want there to be long-running readers, but not all the time. */ 288 /* We want a short delay sometimes to make a reader delay the grace
289 * period, and we want a long delay occasionally to trigger
290 * force_quiescent_state. */
288 291
289 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); 292 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
290 if (!delay) 293 mdelay(longdelay_ms);
291 udelay(longdelay); 294 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
295 udelay(shortdelay_us);
292} 296}
293 297
294static void rcu_torture_read_unlock(int idx) __releases(RCU) 298static void rcu_torture_read_unlock(int idx) __releases(RCU)
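
To put illustrative numbers on the reworked rcu_read_delay() (assuming nrealreaders = 4, a value not specified by the patch): a reader hits the 50 ms mdelay() on roughly 1 out of every 4 * 2000 * 50 = 400,000 read-side critical sections, and the 200 us udelay() on roughly 1 out of every 4 * 2 * 200 = 1,600, so long delays stay rare enough not to starve the test while short delays still regularly stretch a reader across grace-period boundaries.
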
@@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = {
339 .sync = synchronize_rcu, 343 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 344 .cb_barrier = rcu_barrier,
341 .stats = NULL, 345 .stats = NULL,
342 .irq_capable = 1, 346 .irq_capable = 1,
343 .name = "rcu" 347 .name = "rcu"
344}; 348};
345 349
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 350static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -602,8 +606,6 @@ static struct rcu_torture_ops sched_ops_sync = {
602 .name = "sched_sync" 606 .name = "sched_sync"
603}; 607};
604 608
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = { 609static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init, 610 .init = rcu_sync_torture_init,
609 .cleanup = NULL, 611 .cleanup = NULL,
@@ -638,14 +640,15 @@ rcu_torture_writer(void *arg)
638 640
639 do { 641 do {
640 schedule_timeout_uninterruptible(1); 642 schedule_timeout_uninterruptible(1);
641 if ((rp = rcu_torture_alloc()) == NULL) 643 rp = rcu_torture_alloc();
644 if (rp == NULL)
642 continue; 645 continue;
643 rp->rtort_pipe_count = 0; 646 rp->rtort_pipe_count = 0;
644 udelay(rcu_random(&rand) & 0x3ff); 647 udelay(rcu_random(&rand) & 0x3ff);
645 old_rp = rcu_torture_current; 648 old_rp = rcu_torture_current;
646 rp->rtort_mbtest = 1; 649 rp->rtort_mbtest = 1;
647 rcu_assign_pointer(rcu_torture_current, rp); 650 rcu_assign_pointer(rcu_torture_current, rp);
648 smp_wmb(); 651 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
649 if (old_rp) { 652 if (old_rp) {
650 i = old_rp->rtort_pipe_count; 653 i = old_rp->rtort_pipe_count;
651 if (i > RCU_TORTURE_PIPE_LEN) 654 if (i > RCU_TORTURE_PIPE_LEN)
@@ -1110,7 +1113,7 @@ rcu_torture_init(void)
1110 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1111 torture_type); 1114 torture_type);
1112 mutex_unlock(&fullstop_mutex); 1115 mutex_unlock(&fullstop_mutex);
1113 return (-EINVAL); 1116 return -EINVAL;
1114 } 1117 }
1115 if (cur_ops->init) 1118 if (cur_ops->init)
1116 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1119 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1161,7 +1164,7 @@ rcu_torture_init(void)
1161 goto unwind; 1164 goto unwind;
1162 } 1165 }
1163 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1166 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1164 GFP_KERNEL); 1167 GFP_KERNEL);
1165 if (fakewriter_tasks == NULL) { 1168 if (fakewriter_tasks == NULL) {
1166 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1169 VERBOSE_PRINTK_ERRSTRING("out of memory");
1167 firsterr = -ENOMEM; 1170 firsterr = -ENOMEM;
@@ -1170,7 +1173,7 @@ rcu_torture_init(void)
1170 for (i = 0; i < nfakewriters; i++) { 1173 for (i = 0; i < nfakewriters; i++) {
1171 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1174 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1172 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1175 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1173 "rcu_torture_fakewriter"); 1176 "rcu_torture_fakewriter");
1174 if (IS_ERR(fakewriter_tasks[i])) { 1177 if (IS_ERR(fakewriter_tasks[i])) {
1175 firsterr = PTR_ERR(fakewriter_tasks[i]); 1178 firsterr = PTR_ERR(fakewriter_tasks[i]);
1176 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1179 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b11b07cfe7f..705f02ac7433 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -49,13 +49,6 @@
49 49
50#include "rcutree.h" 50#include "rcutree.h"
51 51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Data structures. */ 52/* Data structures. */
60 53
61#define RCU_STATE_INITIALIZER(name) { \ 54#define RCU_STATE_INITIALIZER(name) { \
@@ -70,6 +63,9 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
70 .gpnum = -300, \ 63 .gpnum = -300, \
71 .completed = -300, \ 64 .completed = -300, \
72 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 65 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
66 .orphan_cbs_list = NULL, \
67 .orphan_cbs_tail = &name.orphan_cbs_list, \
68 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 69 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 70 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 71 .n_force_qs_ngp = 0, \
@@ -81,24 +77,16 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 79
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100 80
101#include "rcutree_plugin.h" 81/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
83 * permit this function to be invoked without holding the root rcu_node
84 * structure's ->lock, but of course results can be subject to change.
85 */
86static int rcu_gp_in_progress(struct rcu_state *rsp)
87{
88 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
89}
102 90
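
A worked sequence for the new rcu_gp_in_progress() helper, using the initializer values from this file as illustration: at boot ->gpnum == ->completed == -300, so no grace period is in progress; rcu_start_gp() bumps ->gpnum to -299 while ->completed stays -300, so the helper returns true; when the grace period ends, cpu_quiet_msk_finish() copies ->gpnum into ->completed (-299 == -299) and the helper returns false again. The ACCESS_ONCE() wrappers only guarantee a consistent single read of each field, not that the answer stays current after the call returns.
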
103/* 91/*
104 * Note a quiescent state. Because we do not need to know 92 * Note a quiescent state. Because we do not need to know
@@ -107,27 +95,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
107 */ 95 */
108void rcu_sched_qs(int cpu) 96void rcu_sched_qs(int cpu)
109{ 97{
110 unsigned long flags;
111 struct rcu_data *rdp; 98 struct rcu_data *rdp;
112 99
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu); 100 rdp = &per_cpu(rcu_sched_data, cpu);
115 rdp->passed_quiesc = 1;
116 rdp->passed_quiesc_completed = rdp->completed; 101 rdp->passed_quiesc_completed = rdp->completed;
117 rcu_preempt_qs(cpu); 102 barrier();
118 local_irq_restore(flags); 103 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu);
119} 105}
120 106
121void rcu_bh_qs(int cpu) 107void rcu_bh_qs(int cpu)
122{ 108{
123 unsigned long flags;
124 struct rcu_data *rdp; 109 struct rcu_data *rdp;
125 110
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu); 111 rdp = &per_cpu(rcu_bh_data, cpu);
128 rdp->passed_quiesc = 1;
129 rdp->passed_quiesc_completed = rdp->completed; 112 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags); 113 barrier();
114 rdp->passed_quiesc = 1;
131} 115}
132 116
133#ifdef CONFIG_NO_HZ 117#ifdef CONFIG_NO_HZ
@@ -141,6 +125,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */
141static int qhimark = 10000; /* If this many pending, ignore blimit. */ 125static int qhimark = 10000; /* If this many pending, ignore blimit. */
142static int qlowmark = 100; /* Once only this many pending, use blimit. */ 126static int qlowmark = 100; /* Once only this many pending, use blimit. */
143 127
128module_param(blimit, int, 0);
129module_param(qhimark, int, 0);
130module_param(qlowmark, int, 0);
131
144static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 132static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu); 133static int rcu_pending(int cpu);
146 134
@@ -177,9 +165,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
177static int 165static int
178cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 166cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
179{ 167{
180 /* ACCESS_ONCE() because we are accessing outside of lock. */ 168 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
181 return *rdp->nxttail[RCU_DONE_TAIL] &&
182 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
183} 169}
184 170
185/* 171/*
@@ -373,7 +359,7 @@ static long dyntick_recall_completed(struct rcu_state *rsp)
373/* 359/*
374 * Snapshot the specified CPU's dynticks counter so that we can later 360 * Snapshot the specified CPU's dynticks counter so that we can later
375 * credit them with an implicit quiescent state. Return 1 if this CPU 361 * credit them with an implicit quiescent state. Return 1 if this CPU
376 * is already in a quiescent state courtesy of dynticks idle mode. 362 * is in dynticks idle mode, which is an extended quiescent state.
377 */ 363 */
378static int dyntick_save_progress_counter(struct rcu_data *rdp) 364static int dyntick_save_progress_counter(struct rcu_data *rdp)
379{ 365{
@@ -479,30 +465,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
479 long delta; 465 long delta;
480 unsigned long flags; 466 unsigned long flags;
481 struct rcu_node *rnp = rcu_get_root(rsp); 467 struct rcu_node *rnp = rcu_get_root(rsp);
482 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
483 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
484 468
485 /* Only let one CPU complain about others per time interval. */ 469 /* Only let one CPU complain about others per time interval. */
486 470
487 spin_lock_irqsave(&rnp->lock, flags); 471 spin_lock_irqsave(&rnp->lock, flags);
488 delta = jiffies - rsp->jiffies_stall; 472 delta = jiffies - rsp->jiffies_stall;
489 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { 473 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
490 spin_unlock_irqrestore(&rnp->lock, flags); 474 spin_unlock_irqrestore(&rnp->lock, flags);
491 return; 475 return;
492 } 476 }
493 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 477 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
478
479 /*
480 * Now rat on any tasks that got kicked up to the root rcu_node
481 * due to CPU offlining.
482 */
483 rcu_print_task_stall(rnp);
494 spin_unlock_irqrestore(&rnp->lock, flags); 484 spin_unlock_irqrestore(&rnp->lock, flags);
495 485
496 /* OK, time to rat on our buddy... */ 486 /* OK, time to rat on our buddy... */
497 487
498 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 488 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
499 for (; rnp_cur < rnp_end; rnp_cur++) { 489 rcu_for_each_leaf_node(rsp, rnp) {
500 rcu_print_task_stall(rnp); 490 rcu_print_task_stall(rnp);
501 if (rnp_cur->qsmask == 0) 491 if (rnp->qsmask == 0)
502 continue; 492 continue;
503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 493 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
504 if (rnp_cur->qsmask & (1UL << cpu)) 494 if (rnp->qsmask & (1UL << cpu))
505 printk(" %d", rnp_cur->grplo + cpu); 495 printk(" %d", rnp->grplo + cpu);
506 } 496 }
507 printk(" (detected by %d, t=%ld jiffies)\n", 497 printk(" (detected by %d, t=%ld jiffies)\n",
508 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 498 smp_processor_id(), (long)(jiffies - rsp->gp_start));
@@ -541,8 +531,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
541 /* We haven't checked in, so go dump stack. */ 531 /* We haven't checked in, so go dump stack. */
542 print_cpu_stall(rsp); 532 print_cpu_stall(rsp);
543 533
544 } else if (rsp->gpnum != rsp->completed && 534 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
545 delta >= RCU_STALL_RAT_DELAY) {
546 535
547 /* They had two time units to dump stack, so complain. */ 536 /* They had two time units to dump stack, so complain. */
548 print_other_cpu_stall(rsp); 537 print_other_cpu_stall(rsp);
@@ -605,8 +594,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
605{ 594{
606 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 595 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
607 struct rcu_node *rnp = rcu_get_root(rsp); 596 struct rcu_node *rnp = rcu_get_root(rsp);
608 struct rcu_node *rnp_cur;
609 struct rcu_node *rnp_end;
610 597
611 if (!cpu_needs_another_gp(rsp, rdp)) { 598 if (!cpu_needs_another_gp(rsp, rdp)) {
612 spin_unlock_irqrestore(&rnp->lock, flags); 599 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -615,6 +602,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
615 602
616 /* Advance to a new grace period and initialize state. */ 603 /* Advance to a new grace period and initialize state. */
617 rsp->gpnum++; 604 rsp->gpnum++;
605 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
618 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
619 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
620 record_gp_stall_check_time(rsp); 608 record_gp_stall_check_time(rsp);
@@ -622,16 +610,24 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
622 note_new_gpnum(rsp, rdp); 610 note_new_gpnum(rsp, rdp);
623 611
624 /* 612 /*
625 * Because we are first, we know that all our callbacks will 613 * Because this CPU just now started the new grace period, we know
626 * be covered by this upcoming grace period, even the ones 614 * that all of its callbacks will be covered by this upcoming grace
627 * that were registered arbitrarily recently. 615 * period, even the ones that were registered arbitrarily recently.
616 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
617 *
618 * Other CPUs cannot be sure exactly when the grace period started.
619 * Therefore, their recently registered callbacks must pass through
620 * an additional RCU_NEXT_READY stage, so that they will be handled
621 * by the next RCU grace period.
628 */ 622 */
629 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
630 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
631 625
632 /* Special-case the common single-level case. */ 626 /* Special-case the common single-level case. */
633 if (NUM_RCU_NODES == 1) { 627 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp);
634 rnp->qsmask = rnp->qsmaskinit; 629 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum;
635 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
636 spin_unlock_irqrestore(&rnp->lock, flags); 632 spin_unlock_irqrestore(&rnp->lock, flags);
637 return; 633 return;
@@ -644,42 +640,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
644 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 640 spin_lock(&rsp->onofflock); /* irqs already disabled. */
645 641
646 /* 642 /*
647 * Set the quiescent-state-needed bits in all the non-leaf RCU 643 * Set the quiescent-state-needed bits in all the rcu_node
648 * nodes for all currently online CPUs. This operation relies 644 * structures for all currently online CPUs in breadth-first
649 * on the layout of the hierarchy within the rsp->node[] array. 645 * order, starting from the root rcu_node structure. This
650 * Note that other CPUs will access only the leaves of the 646 * operation relies on the layout of the hierarchy within the
651 * hierarchy, which still indicate that no grace period is in 647 * rsp->node[] array. Note that other CPUs will access only
652 * progress. In addition, we have excluded CPU-hotplug operations. 648 * the leaves of the hierarchy, which still indicate that no
653 * 649 * grace period is in progress, at least until the corresponding
654 * We therefore do not need to hold any locks. Any required 650 * leaf node has been initialized. In addition, we have excluded
655 * memory barriers will be supplied by the locks guarding the 651 * CPU-hotplug operations.
656 * leaf rcu_nodes in the hierarchy.
657 */
658
659 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
660 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
661 rnp_cur->qsmask = rnp_cur->qsmaskinit;
662
663 /*
664 * Now set up the leaf nodes. Here we must be careful. First,
665 * we need to hold the lock in order to exclude other CPUs, which
666 * might be contending for the leaf nodes' locks. Second, as
667 * soon as we initialize a given leaf node, its CPUs might run
668 * up the rest of the hierarchy. We must therefore acquire locks
669 * for each node that we touch during this stage. (But we still
670 * are excluding CPU-hotplug operations.)
671 * 652 *
672 * Note that the grace period cannot complete until we finish 653 * Note that the grace period cannot complete until we finish
673 * the initialization process, as there will be at least one 654 * the initialization process, as there will be at least one
674 * qsmask bit set in the root node until that time, namely the 655 * qsmask bit set in the root node until that time, namely the
675 * one corresponding to this CPU. 656 * one corresponding to this CPU, due to the fact that we have
657 * irqs disabled.
676 */ 658 */
677 rnp_end = &rsp->node[NUM_RCU_NODES]; 659 rcu_for_each_node_breadth_first(rsp, rnp) {
678 rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 660 spin_lock(&rnp->lock); /* irqs already disabled. */
679 for (; rnp_cur < rnp_end; rnp_cur++) { 661 rcu_preempt_check_blocked_tasks(rnp);
680 spin_lock(&rnp_cur->lock); /* irqs already disabled. */ 662 rnp->qsmask = rnp->qsmaskinit;
681 rnp_cur->qsmask = rnp_cur->qsmaskinit; 663 rnp->gpnum = rsp->gpnum;
682 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ 664 spin_unlock(&rnp->lock); /* irqs already disabled. */
683 } 665 }
684 666
685 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 667 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
@@ -720,8 +702,9 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it. 702 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */ 703 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) 704static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock) 705 __releases(rcu_get_root(rsp)->lock)
724{ 706{
707 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
725 rsp->completed = rsp->gpnum; 708 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); 709 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 710 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -739,6 +722,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
739 unsigned long flags) 722 unsigned long flags)
740 __releases(rnp->lock) 723 __releases(rnp->lock)
741{ 724{
725 struct rcu_node *rnp_c;
726
742 /* Walk up the rcu_node hierarchy. */ 727 /* Walk up the rcu_node hierarchy. */
743 for (;;) { 728 for (;;) {
744 if (!(rnp->qsmask & mask)) { 729 if (!(rnp->qsmask & mask)) {
@@ -762,8 +747,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
762 break; 747 break;
763 } 748 }
764 spin_unlock_irqrestore(&rnp->lock, flags); 749 spin_unlock_irqrestore(&rnp->lock, flags);
750 rnp_c = rnp;
765 rnp = rnp->parent; 751 rnp = rnp->parent;
766 spin_lock_irqsave(&rnp->lock, flags); 752 spin_lock_irqsave(&rnp->lock, flags);
753 WARN_ON_ONCE(rnp_c->qsmask);
767 } 754 }
768 755
769 /* 756 /*
@@ -776,10 +763,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
776 763
777/* 764/*
778 * Record a quiescent state for the specified CPU, which must either be 765 * Record a quiescent state for the specified CPU, which must either be
779 * the current CPU or an offline CPU. The lastcomp argument is used to 766 * the current CPU. The lastcomp argument is used to make sure we are
780 * make sure we are still in the grace period of interest. We don't want 767 * still in the grace period of interest. We don't want to end the current
781 * to end the current grace period based on quiescent states detected in 768 * grace period based on quiescent states detected in an earlier grace
782 * an earlier grace period! 769 * period!
783 */ 770 */
784static void 771static void
785cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 772cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -814,7 +801,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
814 * This GP can't end until cpu checks in, so all of our 801 * This GP can't end until cpu checks in, so all of our
815 * callbacks can be processed during the next GP. 802 * callbacks can be processed during the next GP.
816 */ 803 */
817 rdp = rsp->rda[smp_processor_id()];
818 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 804 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
819 805
820 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 806 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -855,24 +841,70 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
855#ifdef CONFIG_HOTPLUG_CPU 841#ifdef CONFIG_HOTPLUG_CPU
856 842
857/* 843/*
844 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
845 * specified flavor of RCU. The callbacks will be adopted by the next
846 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
847 * comes first. Because this is invoked from the CPU_DYING notifier,
848 * irqs are already disabled.
849 */
850static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
851{
852 int i;
853 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
854
855 if (rdp->nxtlist == NULL)
856 return; /* irqs disabled, so comparison is stable. */
857 spin_lock(&rsp->onofflock); /* irqs already disabled. */
858 *rsp->orphan_cbs_tail = rdp->nxtlist;
859 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
860 rdp->nxtlist = NULL;
861 for (i = 0; i < RCU_NEXT_SIZE; i++)
862 rdp->nxttail[i] = &rdp->nxtlist;
863 rsp->orphan_qlen += rdp->qlen;
864 rdp->qlen = 0;
865 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
866}
867
868/*
869 * Adopt previously orphaned RCU callbacks.
870 */
871static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
872{
873 unsigned long flags;
874 struct rcu_data *rdp;
875
876 spin_lock_irqsave(&rsp->onofflock, flags);
877 rdp = rsp->rda[smp_processor_id()];
878 if (rsp->orphan_cbs_list == NULL) {
879 spin_unlock_irqrestore(&rsp->onofflock, flags);
880 return;
881 }
882 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
883 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
884 rdp->qlen += rsp->orphan_qlen;
885 rsp->orphan_cbs_list = NULL;
886 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
887 rsp->orphan_qlen = 0;
888 spin_unlock_irqrestore(&rsp->onofflock, flags);
889}
890
891/*
858 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 892 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
859 * and move all callbacks from the outgoing CPU to the current one. 893 * and move all callbacks from the outgoing CPU to the current one.
860 */ 894 */
861static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 895static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
862{ 896{
863 int i;
864 unsigned long flags; 897 unsigned long flags;
865 long lastcomp; 898 long lastcomp;
866 unsigned long mask; 899 unsigned long mask;
867 struct rcu_data *rdp = rsp->rda[cpu]; 900 struct rcu_data *rdp = rsp->rda[cpu];
868 struct rcu_data *rdp_me;
869 struct rcu_node *rnp; 901 struct rcu_node *rnp;
870 902
871 /* Exclude any attempts to start a new grace period. */ 903 /* Exclude any attempts to start a new grace period. */
872 spin_lock_irqsave(&rsp->onofflock, flags); 904 spin_lock_irqsave(&rsp->onofflock, flags);
873 905
874 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 906 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
875 rnp = rdp->mynode; 907 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
876 mask = rdp->grpmask; /* rnp->grplo is constant. */ 908 mask = rdp->grpmask; /* rnp->grplo is constant. */
877 do { 909 do {
878 spin_lock(&rnp->lock); /* irqs already disabled. */ 910 spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -881,42 +913,16 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
881 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 913 spin_unlock(&rnp->lock); /* irqs remain disabled. */
882 break; 914 break;
883 } 915 }
884 rcu_preempt_offline_tasks(rsp, rnp); 916 rcu_preempt_offline_tasks(rsp, rnp, rdp);
885 mask = rnp->grpmask; 917 mask = rnp->grpmask;
886 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 918 spin_unlock(&rnp->lock); /* irqs remain disabled. */
887 rnp = rnp->parent; 919 rnp = rnp->parent;
888 } while (rnp != NULL); 920 } while (rnp != NULL);
889 lastcomp = rsp->completed; 921 lastcomp = rsp->completed;
890 922
891 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 923 spin_unlock_irqrestore(&rsp->onofflock, flags);
892
893 /* Being offline is a quiescent state, so go record it. */
894 cpu_quiet(cpu, rsp, rdp, lastcomp);
895 924
896 /* 925 rcu_adopt_orphan_cbs(rsp);
897 * Move callbacks from the outgoing CPU to the running CPU.
898 * Note that the outgoing CPU is now quiscent, so it is now
899 * (uncharacteristically) safe to access its rcu_data structure.
900 * Note also that we must carefully retain the order of the
901 * outgoing CPU's callbacks in order for rcu_barrier() to work
902 * correctly. Finally, note that we start all the callbacks
903 * afresh, even those that have passed through a grace period
904 * and are therefore ready to invoke. The theory is that hotplug
905 * events are rare, and that if they are frequent enough to
906 * indefinitely delay callbacks, you have far worse things to
907 * be worrying about.
908 */
909 rdp_me = rsp->rda[smp_processor_id()];
910 if (rdp->nxtlist != NULL) {
911 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
912 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
913 rdp->nxtlist = NULL;
914 for (i = 0; i < RCU_NEXT_SIZE; i++)
915 rdp->nxttail[i] = &rdp->nxtlist;
916 rdp_me->qlen += rdp->qlen;
917 rdp->qlen = 0;
918 }
919 local_irq_restore(flags);
920} 926}
921 927
922/* 928/*
@@ -934,6 +940,14 @@ static void rcu_offline_cpu(int cpu)
934 940
935#else /* #ifdef CONFIG_HOTPLUG_CPU */ 941#else /* #ifdef CONFIG_HOTPLUG_CPU */
936 942
943static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
944{
945}
946
947static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
948{
949}
950
937static void rcu_offline_cpu(int cpu) 951static void rcu_offline_cpu(int cpu)
938{ 952{
939} 953}
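
The orphan-callback code added above (rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs()) leans on the tail-pointer idiom used throughout this file: tracking a pointer to the list's terminating ->next field lets a whole singly linked callback list be appended to another in O(1) while preserving order. A stand-alone sketch of just that idiom, with made-up cb_list names:

    #include <stddef.h>

    struct cb {
            struct cb *next;
    };

    struct cb_list {
            struct cb *head;        /* first element, or NULL if empty */
            struct cb **tail;       /* points at the NULL-terminating ->next (or at head) */
    };

    static void cb_list_init(struct cb_list *l)
    {
            l->head = NULL;
            l->tail = &l->head;
    }

    /* Append one element: write through the tail pointer, then advance it. */
    static void cb_list_add(struct cb_list *l, struct cb *c)
    {
            c->next = NULL;
            *l->tail = c;
            l->tail = &c->next;
    }

    /* Splice all of @from onto the end of @to in O(1), leaving @from empty. */
    static void cb_list_splice(struct cb_list *to, struct cb_list *from)
    {
            if (from->head == NULL)
                    return;
            *to->tail = from->head;
            to->tail = from->tail;
            cb_list_init(from);
    }
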
@@ -1066,33 +1080,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1066 int cpu; 1080 int cpu;
1067 unsigned long flags; 1081 unsigned long flags;
1068 unsigned long mask; 1082 unsigned long mask;
1069 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 1083 struct rcu_node *rnp;
1070 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1071 1084
1072 for (; rnp_cur < rnp_end; rnp_cur++) { 1085 rcu_for_each_leaf_node(rsp, rnp) {
1073 mask = 0; 1086 mask = 0;
1074 spin_lock_irqsave(&rnp_cur->lock, flags); 1087 spin_lock_irqsave(&rnp->lock, flags);
1075 if (rsp->completed != lastcomp) { 1088 if (rsp->completed != lastcomp) {
1076 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1089 spin_unlock_irqrestore(&rnp->lock, flags);
1077 return 1; 1090 return 1;
1078 } 1091 }
1079 if (rnp_cur->qsmask == 0) { 1092 if (rnp->qsmask == 0) {
1080 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1093 spin_unlock_irqrestore(&rnp->lock, flags);
1081 continue; 1094 continue;
1082 } 1095 }
1083 cpu = rnp_cur->grplo; 1096 cpu = rnp->grplo;
1084 bit = 1; 1097 bit = 1;
1085 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { 1098 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1086 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1099 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1087 mask |= bit; 1100 mask |= bit;
1088 } 1101 }
1089 if (mask != 0 && rsp->completed == lastcomp) { 1102 if (mask != 0 && rsp->completed == lastcomp) {
1090 1103
1091 /* cpu_quiet_msk() releases rnp_cur->lock. */ 1104 /* cpu_quiet_msk() releases rnp->lock. */
1092 cpu_quiet_msk(mask, rsp, rnp_cur, flags); 1105 cpu_quiet_msk(mask, rsp, rnp, flags);
1093 continue; 1106 continue;
1094 } 1107 }
1095 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1108 spin_unlock_irqrestore(&rnp->lock, flags);
1096 } 1109 }
1097 return 0; 1110 return 0;
1098} 1111}
@@ -1108,7 +1121,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1108 struct rcu_node *rnp = rcu_get_root(rsp); 1121 struct rcu_node *rnp = rcu_get_root(rsp);
1109 u8 signaled; 1122 u8 signaled;
1110 1123
1111 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) 1124 if (!rcu_gp_in_progress(rsp))
1112 return; /* No grace period in progress, nothing to force. */ 1125 return; /* No grace period in progress, nothing to force. */
1113 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1126 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1114 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1127 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
@@ -1267,7 +1280,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1267 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1280 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1268 1281
1269 /* Start a new grace period if one not already started. */ 1282 /* Start a new grace period if one not already started. */
1270 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { 1283 if (!rcu_gp_in_progress(rsp)) {
1271 unsigned long nestflag; 1284 unsigned long nestflag;
1272 struct rcu_node *rnp_root = rcu_get_root(rsp); 1285 struct rcu_node *rnp_root = rcu_get_root(rsp);
1273 1286
@@ -1347,7 +1360,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1347 } 1360 }
1348 1361
1349 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1362 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1350 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1363 if (rcu_gp_in_progress(rsp) &&
1351 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1364 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1352 rdp->n_rp_need_fqs++; 1365 rdp->n_rp_need_fqs++;
1353 return 1; 1366 return 1;
@@ -1384,6 +1397,82 @@ int rcu_needs_cpu(int cpu)
1384 rcu_preempt_needs_cpu(cpu); 1397 rcu_preempt_needs_cpu(cpu);
1385} 1398}
1386 1399
1400static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1401static atomic_t rcu_barrier_cpu_count;
1402static DEFINE_MUTEX(rcu_barrier_mutex);
1403static struct completion rcu_barrier_completion;
1404
1405static void rcu_barrier_callback(struct rcu_head *notused)
1406{
1407 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1408 complete(&rcu_barrier_completion);
1409}
1410
1411/*
1412 * Called with preemption disabled, and from cross-cpu IRQ context.
1413 */
1414static void rcu_barrier_func(void *type)
1415{
1416 int cpu = smp_processor_id();
1417 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1418 void (*call_rcu_func)(struct rcu_head *head,
1419 void (*func)(struct rcu_head *head));
1420
1421 atomic_inc(&rcu_barrier_cpu_count);
1422 call_rcu_func = type;
1423 call_rcu_func(head, rcu_barrier_callback);
1424}
1425
1426/*
1427 * Orchestrate the specified type of RCU barrier, waiting for all
1428 * RCU callbacks of the specified type to complete.
1429 */
1430static void _rcu_barrier(struct rcu_state *rsp,
1431 void (*call_rcu_func)(struct rcu_head *head,
1432 void (*func)(struct rcu_head *head)))
1433{
1434 BUG_ON(in_interrupt());
1435 /* Take mutex to serialize concurrent rcu_barrier() requests. */
1436 mutex_lock(&rcu_barrier_mutex);
1437 init_completion(&rcu_barrier_completion);
1438 /*
1439 * Initialize rcu_barrier_cpu_count to 1, then invoke
1440 * rcu_barrier_func() on each CPU, so that each CPU also has
1441 * incremented rcu_barrier_cpu_count. Only then is it safe to
1442 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1443 * might complete its grace period before all of the other CPUs
1444 * did their increment, causing this function to return too
1445 * early.
1446 */
1447 atomic_set(&rcu_barrier_cpu_count, 1);
1448 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1449 rcu_adopt_orphan_cbs(rsp);
1450 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1451 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1452 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1453 complete(&rcu_barrier_completion);
1454 wait_for_completion(&rcu_barrier_completion);
1455 mutex_unlock(&rcu_barrier_mutex);
1456}
1457
1458/**
1459 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
1460 */
1461void rcu_barrier_bh(void)
1462{
1463 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
1464}
1465EXPORT_SYMBOL_GPL(rcu_barrier_bh);
1466
1467/**
1468 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
1469 */
1470void rcu_barrier_sched(void)
1471{
1472 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
1473}
1474EXPORT_SYMBOL_GPL(rcu_barrier_sched);
1475
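
The long comment in the new _rcu_barrier() above describes a reusable trick: hold one extra count while work items are still being posted, and have the poster drop that extra count itself, so an early-finishing item can never drive the counter to zero and signal completion prematurely. A kernel-style sketch of the bare pattern, where post_piece() is a hypothetical stand-in for queueing asynchronous work such as call_rcu():

    #include <asm/atomic.h>
    #include <linux/completion.h>

    static atomic_t pieces_pending;
    static struct completion all_pieces_done;

    static void one_piece_finished(void)
    {
            if (atomic_dec_and_test(&pieces_pending))
                    complete(&all_pieces_done);
    }

    /* Hypothetical stand-in for queueing asynchronous work (e.g. call_rcu()). */
    static void post_piece(void (*cb)(void))
    {
            cb();                   /* sketch only: run it synchronously */
    }

    static void wait_for_all_pieces(int npieces)
    {
            int i;

            init_completion(&all_pieces_done);
            atomic_set(&pieces_pending, 1);         /* extra count held by this function */
            for (i = 0; i < npieces; i++) {
                    atomic_inc(&pieces_pending);
                    post_piece(one_piece_finished);
            }
            /* Drop the extra count; completion fires only once all pieces ran. */
            if (atomic_dec_and_test(&pieces_pending))
                    complete(&all_pieces_done);
            wait_for_completion(&all_pieces_done);
    }
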
1387/* 1476/*
1388 * Do boot-time initialization of a CPU's per-CPU RCU data. 1477 * Do boot-time initialization of a CPU's per-CPU RCU data.
1389 */ 1478 */
@@ -1457,20 +1546,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1457 rnp = rnp->parent; 1546 rnp = rnp->parent;
1458 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1547 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1459 1548
1460 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1549 spin_unlock_irqrestore(&rsp->onofflock, flags);
1461
1462 /*
1463 * A new grace period might start here. If so, we will be part of
1464 * it, and its gpnum will be greater than ours, so we will
1465 * participate. It is also possible for the gpnum to have been
1466 * incremented before this function was called, and the bitmasks
1467 * to not be filled out until now, in which case we will also
1468 * participate due to our gpnum being behind.
1469 */
1470
1471 /* Since it is coming online, the CPU is in a quiescent state. */
1472 cpu_quiet(cpu, rsp, rdp, lastcomp);
1473 local_irq_restore(flags);
1474} 1550}
1475 1551
1476static void __cpuinit rcu_online_cpu(int cpu) 1552static void __cpuinit rcu_online_cpu(int cpu)
@@ -1493,6 +1569,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1493 case CPU_UP_PREPARE_FROZEN: 1569 case CPU_UP_PREPARE_FROZEN:
1494 rcu_online_cpu(cpu); 1570 rcu_online_cpu(cpu);
1495 break; 1571 break;
1572 case CPU_DYING:
1573 case CPU_DYING_FROZEN:
1574 /*
1575 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
 1576 * so when "on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);"
1577 * returns, all online cpus have queued rcu_barrier_func().
1578 * The dying CPU clears its cpu_online_mask bit and
1579 * moves all of its RCU callbacks to ->orphan_cbs_list
1580 * in the context of stop_machine(), so subsequent calls
1581 * to _rcu_barrier() will adopt these callbacks and only
1582 * then queue rcu_barrier_func() on all remaining CPUs.
1583 */
1584 rcu_send_cbs_to_orphanage(&rcu_bh_state);
1585 rcu_send_cbs_to_orphanage(&rcu_sched_state);
1586 rcu_preempt_send_cbs_to_orphanage();
1587 break;
1496 case CPU_DEAD: 1588 case CPU_DEAD:
1497 case CPU_DEAD_FROZEN: 1589 case CPU_DEAD_FROZEN:
1498 case CPU_UP_CANCELED: 1590 case CPU_UP_CANCELED:
@@ -1555,7 +1647,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1555 cpustride *= rsp->levelspread[i]; 1647 cpustride *= rsp->levelspread[i];
1556 rnp = rsp->level[i]; 1648 rnp = rsp->level[i];
1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1649 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1558 spin_lock_init(&rnp->lock); 1650 if (rnp != rcu_get_root(rsp))
1651 spin_lock_init(&rnp->lock);
1559 rnp->gpnum = 0; 1652 rnp->gpnum = 0;
1560 rnp->qsmask = 0; 1653 rnp->qsmask = 0;
1561 rnp->qsmaskinit = 0; 1654 rnp->qsmaskinit = 0;
@@ -1578,6 +1671,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1671 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1579 } 1672 }
1580 } 1673 }
1674 spin_lock_init(&rcu_get_root(rsp)->lock);
1581} 1675}
1582 1676
1583/* 1677/*
@@ -1587,6 +1681,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1587 */ 1681 */
1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1682#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1589do { \ 1683do { \
1684 int i; \
1685 int j; \
1686 struct rcu_node *rnp; \
1687 \
1590 rcu_init_one(rsp); \ 1688 rcu_init_one(rsp); \
1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1689 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1592 j = 0; \ 1690 j = 0; \
@@ -1599,31 +1697,8 @@ do { \
1599 } \ 1697 } \
1600} while (0) 1698} while (0)
1601 1699
1602#ifdef CONFIG_TREE_PREEMPT_RCU
1603
1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{
1617}
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1620
1621void __init __rcu_init(void) 1700void __init __rcu_init(void)
1622{ 1701{
1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1624 int j;
1625 struct rcu_node *rnp;
1626
1627 rcu_bootup_announce(); 1702 rcu_bootup_announce();
1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1703#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1704 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
@@ -1634,6 +1709,4 @@ void __init __rcu_init(void)
1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1709 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1635} 1710}
1636 1711
1637module_param(blimit, int, 0); 1712#include "rcutree_plugin.h"
1638module_param(qhimark, int, 0);
1639module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index bf8a6f9f134d..b40ac5706040 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -48,14 +48,14 @@
48#elif NR_CPUS <= RCU_FANOUT_SQ 48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) 51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) 57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) 58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 59# define NUM_RCU_LVL_3 NR_CPUS
60#else 60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
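
To make the DIV_ROUND_UP() cleanups concrete with an illustrative configuration (CONFIG_RCU_FANOUT = 64, so RCU_FANOUT_SQ = 4096; these values are not part of the patch): NR_CPUS = 4096 still fits the two-level case, giving 1 root node, NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 64) = 64 leaf rcu_node structures, and 4096 rcu_data structures below them. A hypothetical NR_CPUS = 4100 would fall into the three-level case instead, with NUM_RCU_LVL_1 = DIV_ROUND_UP(4100, 4096) = 2 and NUM_RCU_LVL_2 = DIV_ROUND_UP(4100, 64) = 65.
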
@@ -79,15 +79,21 @@ struct rcu_dynticks {
79 * Definition for node within the RCU grace-period-detection hierarchy. 79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */ 80 */
81struct rcu_node { 81struct rcu_node {
82 spinlock_t lock; 82 spinlock_t lock; /* Root rcu_node's lock protects some */
83 /* rcu_state fields as well as following. */
83 long gpnum; /* Current grace period for this node. */ 84 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */ 85 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */ 86 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */ 87 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/ 88 /* order for current grace period to proceed.*/
89 /* In leaf rcu_node, each bit corresponds to */
90 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */
92 /* structure. */
88 unsigned long qsmaskinit; 93 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */ 94 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 95 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */
91 int grplo; /* lowest-numbered CPU or group here. */ 97 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */ 98 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */ 99 u8 grpnum; /* CPU/group number for next level up. */
@@ -95,8 +101,23 @@ struct rcu_node {
95 struct rcu_node *parent; 101 struct rcu_node *parent;
96 struct list_head blocked_tasks[2]; 102 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */ 103 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */
106 /* blocked_tasks[] array. */
98} ____cacheline_internodealigned_in_smp; 107} ____cacheline_internodealigned_in_smp;
99 108
109/*
110 * Do a full breadth-first scan of the rcu_node structures for the
111 * specified rcu_state structure.
112 */
113#define rcu_for_each_node_breadth_first(rsp, rnp) \
114 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116
117#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
120
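
Both new iterators exploit the breadth-first layout of the ->node[] array, where the root sits at node[0] and the leaves form the final contiguous run beginning at level[NUM_RCU_LVLS - 1]. A small illustration under an assumed two-level, four-leaf configuration (not taken from the patch):

    /*
     * Assumed layout with NUM_RCU_NODES == 5 (one root plus four leaves):
     *
     *   node[0]            root    (level[0] == &node[0])
     *   node[1] .. node[4] leaves  (level[1] == &node[1])
     *
     * rcu_for_each_node_breadth_first(rsp, rnp) then walks node[0..4],
     * while rcu_for_each_leaf_node(rsp, rnp) starts at
     * level[NUM_RCU_LVLS - 1] (== &node[1]) and walks node[1..4] only,
     * e.g. to print the CPU range covered by each leaf:
     *
     *      struct rcu_node *rnp;
     *
     *      rcu_for_each_leaf_node(rsp, rnp)
     *              printk(KERN_INFO "leaf: CPUs %d-%d\n",
     *                     rnp->grplo, rnp->grphi);
     */
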
100/* Index values for nxttail array in struct rcu_data. */ 121/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 122#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 123#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
@@ -126,23 +147,26 @@ struct rcu_data {
126 * Any of the partitions might be empty, in which case the 147 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for 148 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of 149 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL. 150 * the nxttail elements point to the ->nxtlist pointer itself,
151 * which in that case is NULL.
130 * 152 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]): 153 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed 154 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and 155 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved 156 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks(). 157 * here temporarily in rcu_process_callbacks().
158 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
159 * Entries that batch # <= ->completed - 1: waiting for current GP
160 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
161 * Entries known to have arrived before current GP ended
162 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
163 * Entries that might have arrived after current GP ended
164 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
165 * always be NULL, as this is the end of the list.
142 */ 166 */
143 struct rcu_head *nxtlist; 167 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 168 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */ 169 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */ 170 long blimit; /* Upper limit on a processed batch */
147 171
148#ifdef CONFIG_NO_HZ 172#ifdef CONFIG_NO_HZ
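
The reordered ->nxttail[] documentation above is easier to picture with concrete callbacks. Assuming five queued callbacks A through E (purely illustrative), the single ->nxtlist is carved into the four segments solely by where the tail pointers aim:

    /*
     *   nxtlist --> A --> B --> C --> D --> E --> NULL
     *
     *   nxttail[RCU_DONE_TAIL]       == &B->next  =>  done segment:       A, B
     *   nxttail[RCU_WAIT_TAIL]       == &C->next  =>  waiting segment:    C
     *   nxttail[RCU_NEXT_READY_TAIL] == &D->next  =>  next-ready segment: D
     *   nxttail[RCU_NEXT_TAIL]       == &E->next  =>  next segment:       E
     *                                                 (*nxttail[RCU_NEXT_TAIL] == NULL)
     *
     * With the list empty, nxtlist is NULL and all four tail pointers
     * equal &nxtlist, exactly as the text above states.
     */
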
@@ -216,8 +240,19 @@ struct rcu_state {
216 /* Force QS state. */ 240 /* Force QS state. */
217 long gpnum; /* Current gp number. */ 241 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */ 242 long completed; /* # of last completed gp. */
243
244 /* End of fields guarded by root rcu_node's lock. */
245
219 spinlock_t onofflock; /* exclude on/offline and */ 246 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */ 247 /* starting new GP. Also */
248 /* protects the following */
249 /* orphan_cbs fields. */
250 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
251 /* orphaned by all CPUs in */
252 /* a given leaf rcu_node */
253 /* going offline. */
254 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
255 long orphan_qlen; /* Number of orphaned cbs. */
221 spinlock_t fqslock; /* Only one task forcing */ 256 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */ 257 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */ 258 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -255,5 +290,30 @@ extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 290DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 291#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257 292
258#endif /* #ifdef RCU_TREE_NONCORE */ 293#else /* #ifdef RCU_TREE_NONCORE */
294
295/* Forward declarations for rcutree_plugin.h */
296static inline void rcu_bootup_announce(void);
297long rcu_batches_completed(void);
298static void rcu_preempt_note_context_switch(int cpu);
299static int rcu_preempted_readers(struct rcu_node *rnp);
300#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
301static void rcu_print_task_stall(struct rcu_node *rnp);
302#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
303static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
304#ifdef CONFIG_HOTPLUG_CPU
305static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
306 struct rcu_node *rnp,
307 struct rcu_data *rdp);
308static void rcu_preempt_offline_cpu(int cpu);
309#endif /* #ifdef CONFIG_HOTPLUG_CPU */
310static void rcu_preempt_check_callbacks(int cpu);
311static void rcu_preempt_process_callbacks(void);
312void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
313static int rcu_preempt_pending(int cpu);
314static int rcu_preempt_needs_cpu(int cpu);
315static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
316static void rcu_preempt_send_cbs_to_orphanage(void);
317static void __init __rcu_init_preempt(void);
259 318
319#endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 47789369ea59..c0cb783aa16a 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
64 * not in a quiescent state. There might be any number of tasks blocked 64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section. 65 * while in an RCU read-side critical section.
66 */ 66 */
67static void rcu_preempt_qs_record(int cpu) 67static void rcu_preempt_qs(int cpu)
68{ 68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed; 70 rdp->passed_quiesc_completed = rdp->completed;
71 barrier();
72 rdp->passed_quiesc = 1;
72} 73}
73 74
74/* 75/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd. 76 * We have entered the scheduler, and the current task might soon be
76 * If we are in an RCU read-side critical section, we need to reflect 77 * context-switched away from. If this task is in an RCU read-side
77 * that in the state of the rcu_node structure corresponding to this CPU. 78 * critical section, we will no longer be able to rely on the CPU to
78 * Caller must disable hardirqs. 79 * record that fact, so we enqueue the task on the appropriate entry
80 * of the blocked_tasks[] array. The task will dequeue itself when
81 * it exits the outermost enclosing RCU read-side critical section.
82 * Therefore, the current grace period cannot be permitted to complete
83 * until the blocked_tasks[] entry indexed by the low-order bit of
84 * rnp->gpnum empties.
85 *
86 * Caller must disable preemption.
79 */ 87 */
80static void rcu_preempt_qs(int cpu) 88static void rcu_preempt_note_context_switch(int cpu)
81{ 89{
82 struct task_struct *t = current; 90 struct task_struct *t = current;
91 unsigned long flags;
83 int phase; 92 int phase;
84 struct rcu_data *rdp; 93 struct rcu_data *rdp;
85 struct rcu_node *rnp; 94 struct rcu_node *rnp;
@@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu)
90 /* Possibly blocking in an RCU read-side critical section. */ 99 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu]; 100 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode; 101 rnp = rdp->mynode;
93 spin_lock(&rnp->lock); 102 spin_lock_irqsave(&rnp->lock, flags);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp; 104 t->rcu_blocked_node = rnp;
96 105
@@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu)
103 * state for the current grace period), then as long 112 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period 113 * as that task remains queued, the current grace period
105 * cannot end. 114 * cannot end.
115 *
116 * But first, note that the current CPU must still be
117 * on line!
106 */ 118 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); 119 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
120 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
121 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */ 123 spin_unlock_irqrestore(&rnp->lock, flags);
110 spin_unlock(&rnp->lock);
111 } 124 }
112 125
113 /* 126 /*
@@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu)
119 * grace period, then the fact that the task has been enqueued 132 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period. 133 * means that we continue to block the current grace period.
121 */ 134 */
122 rcu_preempt_qs_record(cpu); 135 rcu_preempt_qs(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | 136 local_irq_save(flags);
124 RCU_READ_UNLOCK_GOT_QS); 137 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
138 local_irq_restore(flags);
125} 139}
126 140
127/* 141/*
@@ -136,6 +150,16 @@ void __rcu_read_lock(void)
136} 150}
137EXPORT_SYMBOL_GPL(__rcu_read_lock); 151EXPORT_SYMBOL_GPL(__rcu_read_lock);
138 152
153/*
154 * Check for preempted RCU readers blocking the current grace period
155 * for the specified rcu_node structure. If the caller needs a reliable
156 * answer, it must hold the rcu_node's ->lock.
157 */
158static int rcu_preempted_readers(struct rcu_node *rnp)
159{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
161}
162
139static void rcu_read_unlock_special(struct task_struct *t) 163static void rcu_read_unlock_special(struct task_struct *t)
140{ 164{
141 int empty; 165 int empty;
@@ -157,7 +181,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
157 special = t->rcu_read_unlock_special; 181 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) { 182 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 183 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; 184 rcu_preempt_qs(smp_processor_id());
161 } 185 }
162 186
163 /* Hardware IRQ handlers cannot block. */ 187 /* Hardware IRQ handlers cannot block. */
@@ -177,12 +201,12 @@ static void rcu_read_unlock_special(struct task_struct *t)
177 */ 201 */
178 for (;;) { 202 for (;;) {
179 rnp = t->rcu_blocked_node; 203 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock); 204 spin_lock(&rnp->lock); /* irqs already disabled. */
181 if (rnp == t->rcu_blocked_node) 205 if (rnp == t->rcu_blocked_node)
182 break; 206 break;
183 spin_unlock(&rnp->lock); 207 spin_unlock(&rnp->lock); /* irqs remain disabled. */
184 } 208 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 209 empty = !rcu_preempted_readers(rnp);
186 list_del_init(&t->rcu_node_entry); 210 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL; 211 t->rcu_blocked_node = NULL;
188 212
@@ -193,10 +217,9 @@ static void rcu_read_unlock_special(struct task_struct *t)
193 * drop rnp->lock and restore irq. 217 * drop rnp->lock and restore irq.
194 */ 218 */
195 if (!empty && rnp->qsmask == 0 && 219 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { 220 !rcu_preempted_readers(rnp)) {
197 t->rcu_read_unlock_special &= 221 struct rcu_node *rnp_p;
198 ~(RCU_READ_UNLOCK_NEED_QS | 222
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) { 223 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */ 224 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags); 225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
@@ -205,9 +228,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
205 /* Report up the rest of the hierarchy. */ 228 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask; 229 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags); 230 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent; 231 rnp_p = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags); 232 spin_lock_irqsave(&rnp_p->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); 233 WARN_ON_ONCE(rnp->qsmask);
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
211 return; 235 return;
212 } 236 }
213 spin_unlock(&rnp->lock); 237 spin_unlock(&rnp->lock);
@@ -243,12 +267,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
243{ 267{
244 unsigned long flags; 268 unsigned long flags;
245 struct list_head *lp; 269 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1; 270 int phase;
247 struct task_struct *t; 271 struct task_struct *t;
248 272
249 if (!list_empty(&rnp->blocked_tasks[phase])) { 273 if (rcu_preempted_readers(rnp)) {
250 spin_lock_irqsave(&rnp->lock, flags); 274 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */ 275 phase = rnp->gpnum & 0x1;
252 lp = &rnp->blocked_tasks[phase]; 276 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry) 277 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid); 278 printk(" P%d", t->pid);
@@ -259,13 +283,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 283#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260 284
261/* 285/*
262 * Check for preempted RCU readers for the specified rcu_node structure. 286 * Check that the list of blocked tasks for the newly completed grace
263 * If the caller needs a reliable answer, it must hold the rcu_node's 287 * period is in fact empty. It is a serious bug to complete a grace
264 * ->lock. 288 * period that still has RCU readers blocked! This function must be
289 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
290 * must be held by the caller.
265 */ 291 */
266static int rcu_preempted_readers(struct rcu_node *rnp) 292static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
267{ 293{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 294 WARN_ON_ONCE(rcu_preempted_readers(rnp));
295 WARN_ON_ONCE(rnp->qsmask);
269} 296}
270 297
271#ifdef CONFIG_HOTPLUG_CPU 298#ifdef CONFIG_HOTPLUG_CPU
@@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
280 * The caller must hold rnp->lock with irqs disabled. 307 * The caller must hold rnp->lock with irqs disabled.
281 */ 308 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 309static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp) 310 struct rcu_node *rnp,
311 struct rcu_data *rdp)
284{ 312{
285 int i; 313 int i;
286 struct list_head *lp; 314 struct list_head *lp;
@@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
292 WARN_ONCE(1, "Last CPU thought to be offlined?"); 320 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */ 321 return; /* Shouldn't happen: at least one CPU online. */
294 } 322 }
323 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1])));
295 326
296 /* 327 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the 328 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu)
335 struct task_struct *t = current; 366 struct task_struct *t = current;
336 367
337 if (t->rcu_read_lock_nesting == 0) { 368 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &= 369 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); 370 rcu_preempt_qs(cpu);
340 rcu_preempt_qs_record(cpu);
341 return; 371 return;
342 } 372 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) { 373 if (per_cpu(rcu_preempt_data, cpu).qs_pending)
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { 374 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352} 375}
353 376
354/* 377/*
@@ -387,6 +410,15 @@ static int rcu_preempt_needs_cpu(int cpu)
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388} 411}
389 412
413/**
414 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
415 */
416void rcu_barrier(void)
417{
418 _rcu_barrier(&rcu_preempt_state, call_rcu);
419}
420EXPORT_SYMBOL_GPL(rcu_barrier);
421
390/* 422/*
391 * Initialize preemptable RCU's per-CPU data. 423 * Initialize preemptable RCU's per-CPU data.
392 */ 424 */
@@ -396,6 +428,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
396} 428}
397 429
398/* 430/*
431 * Move preemptable RCU's callbacks to ->orphan_cbs_list.
432 */
433static void rcu_preempt_send_cbs_to_orphanage(void)
434{
435 rcu_send_cbs_to_orphanage(&rcu_preempt_state);
436}
437
438/*
439 * Initialize preemptable RCU's state structures.
440 */
441static void __init __rcu_init_preempt(void)
442{
443 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
444}
445
446/*
399 * Check for a task exiting while in a preemptable-RCU read-side 447 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings, 448 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep 449 * as debug_check_no_locks_held() already does this if lockdep
@@ -434,8 +482,17 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
434 * Because preemptable RCU does not exist, we never have to check for 482 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states. 483 * CPUs being in quiescent states.
436 */ 484 */
437static void rcu_preempt_qs(int cpu) 485static void rcu_preempt_note_context_switch(int cpu)
486{
487}
488
489/*
490 * Because preemptable RCU does not exist, there are never any preempted
491 * RCU readers.
492 */
493static int rcu_preempted_readers(struct rcu_node *rnp)
438{ 494{
495 return 0;
439} 496}
440 497
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 498#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -451,12 +508,13 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 508#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452 509
453/* 510/*
454 * Because preemptable RCU does not exist, there are never any preempted 511 * Because there is no preemptable RCU, there can be no readers blocked,
455 * RCU readers. 512 * so there is no need to check for blocked tasks. So check only for
513 * bogus qsmask values.
456 */ 514 */
457static int rcu_preempted_readers(struct rcu_node *rnp) 515static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
458{ 516{
459 return 0; 517 WARN_ON_ONCE(rnp->qsmask);
460} 518}
461 519
462#ifdef CONFIG_HOTPLUG_CPU 520#ifdef CONFIG_HOTPLUG_CPU
@@ -466,7 +524,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
466 * tasks that were blocked within RCU read-side critical sections. 524 * tasks that were blocked within RCU read-side critical sections.
467 */ 525 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 526static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp) 527 struct rcu_node *rnp,
528 struct rcu_data *rdp)
470{ 529{
471} 530}
472 531
@@ -484,7 +543,7 @@ static void rcu_preempt_offline_cpu(int cpu)
484 * Because preemptable RCU does not exist, it never has any callbacks 543 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check. 544 * to check.
486 */ 545 */
487void rcu_preempt_check_callbacks(int cpu) 546static void rcu_preempt_check_callbacks(int cpu)
488{ 547{
489} 548}
490 549
@@ -492,7 +551,7 @@ void rcu_preempt_check_callbacks(int cpu)
492 * Because preemptable RCU does not exist, it never has any callbacks 551 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process. 552 * to process.
494 */ 553 */
495void rcu_preempt_process_callbacks(void) 554static void rcu_preempt_process_callbacks(void)
496{ 555{
497} 556}
498 557
@@ -522,6 +581,16 @@ static int rcu_preempt_needs_cpu(int cpu)
522} 581}
523 582
524/* 583/*
584 * Because preemptable RCU does not exist, rcu_barrier() is just
585 * another name for rcu_barrier_sched().
586 */
587void rcu_barrier(void)
588{
589 rcu_barrier_sched();
590}
591EXPORT_SYMBOL_GPL(rcu_barrier);
592
593/*
525 * Because preemptable RCU does not exist, there is no per-CPU 594 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize. 595 * data to initialize.
527 */ 596 */
@@ -529,4 +598,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{ 598{
530} 599}
531 600
601/*
602 * Because there is no preemptable RCU, there are no callbacks to move.
603 */
604static void rcu_preempt_send_cbs_to_orphanage(void)
605{
606}
607
608/*
609 * Because preemptable RCU does not exist, it need not be initialized.
610 */
611static void __init __rcu_init_preempt(void)
612{
613}
614
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 615#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
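A note on the reworked enqueue logic in rcu_preempt_note_context_switch() above: a preempted reader is queued on rnp->blocked_tasks[phase] with phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1, while the new rcu_preempted_readers() only consults index rnp->gpnum & 0x1. The small program below just evaluates that expression for the two interesting cases; it is my own arithmetic illustration, not kernel code, and the mask values are invented.

#include <stdio.h>

/* Toy evaluation of the phase expression from the hunk above. */
static int phase(long gpnum, unsigned long qsmask, unsigned long grpmask)
{
	return (int)((gpnum + !(qsmask & grpmask)) & 0x1);
}

int main(void)
{
	long gpnum = 4;			/* an even grace-period number   */
	unsigned long grpmask = 0x2;	/* this CPU's bit in the node    */

	/* CPU has not yet passed a quiescent state: it still blocks the
	 * current GP, so the task lands on the list the current GP
	 * (index gpnum & 0x1) waits on. */
	printf("still owes QS : queued on blocked_tasks[%d], GP scans [%ld]\n",
	       phase(gpnum, 0x2, grpmask), gpnum & 0x1);

	/* CPU already reported its quiescent state: only the *next* GP
	 * can be blocked by this task, so it lands on the other list. */
	printf("QS reported   : queued on blocked_tasks[%d], GP scans [%ld]\n",
	       phase(gpnum, 0x0, grpmask), gpnum & 0x1);
	return 0;
}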
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0ea1bff69727..4b31c779e62e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 21 *
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 * 24 *
25 */ 25 */
26#include <linux/types.h> 26#include <linux/types.h>
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
93 return single_open(file, show_rcudata, NULL); 93 return single_open(file, show_rcudata, NULL);
94} 94}
95 95
96static struct file_operations rcudata_fops = { 96static const struct file_operations rcudata_fops = {
97 .owner = THIS_MODULE, 97 .owner = THIS_MODULE,
98 .open = rcudata_open, 98 .open = rcudata_open,
99 .read = seq_read, 99 .read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
145 return single_open(file, show_rcudata_csv, NULL); 145 return single_open(file, show_rcudata_csv, NULL);
146} 146}
147 147
148static struct file_operations rcudata_csv_fops = { 148static const struct file_operations rcudata_csv_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = rcudata_csv_open, 150 .open = rcudata_csv_open,
151 .read = seq_read, 151 .read = seq_read,
@@ -159,13 +159,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
159 struct rcu_node *rnp; 159 struct rcu_node *rnp;
160 160
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 163 rsp->completed, rsp->gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 164 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 165 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 166 rsp->n_force_qs, rsp->n_force_qs_ngp,
167 rsp->n_force_qs - rsp->n_force_qs_ngp, 167 rsp->n_force_qs - rsp->n_force_qs_ngp,
168 rsp->n_force_qs_lh); 168 rsp->n_force_qs_lh, rsp->orphan_qlen);
169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
170 if (rnp->level != level) { 170 if (rnp->level != level) {
171 seq_puts(m, "\n"); 171 seq_puts(m, "\n");
@@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
196 return single_open(file, show_rcuhier, NULL); 196 return single_open(file, show_rcuhier, NULL);
197} 197}
198 198
199static struct file_operations rcuhier_fops = { 199static const struct file_operations rcuhier_fops = {
200 .owner = THIS_MODULE, 200 .owner = THIS_MODULE,
201 .open = rcuhier_open, 201 .open = rcuhier_open,
202 .read = seq_read, 202 .read = seq_read,
@@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
222 return single_open(file, show_rcugp, NULL); 222 return single_open(file, show_rcugp, NULL);
223} 223}
224 224
225static struct file_operations rcugp_fops = { 225static const struct file_operations rcugp_fops = {
226 .owner = THIS_MODULE, 226 .owner = THIS_MODULE,
227 .open = rcugp_open, 227 .open = rcugp_open,
228 .read = seq_read, 228 .read = seq_read,
@@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
276 return single_open(file, show_rcu_pending, NULL); 276 return single_open(file, show_rcu_pending, NULL);
277} 277}
278 278
279static struct file_operations rcu_pending_fops = { 279static const struct file_operations rcu_pending_fops = {
280 .owner = THIS_MODULE, 280 .owner = THIS_MODULE,
281 .open = rcu_pending_open, 281 .open = rcu_pending_open,
282 .read = seq_read, 282 .read = seq_read,
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..760c26209a3c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
60/* 60/*
61 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
62 */ 62 */
63static struct vm_operations_struct relay_file_mmap_ops = { 63static const struct vm_operations_struct relay_file_mmap_ops = {
64 .fault = relay_buf_fault, 64 .fault = relay_buf_fault,
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 102 return &counter->limit;
102 case RES_FAILCNT: 103 case RES_FAILCNT:
103 return &counter->failcnt; 104 return &counter->failcnt;
105 case RES_SOFT_LIMIT:
106 return &counter->soft_limit;
104 }; 107 };
105 108
106 BUG(); 109 BUG();
diff --git a/kernel/resource.c b/kernel/resource.c
index 78b087221c15..fb11a58b9594 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
223 223
224EXPORT_SYMBOL(release_resource); 224EXPORT_SYMBOL(release_resource);
225 225
226#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) 226#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
227/* 227/*
228 * Finds the lowest memory resource existing within [res->start, res->end) 228 * Finds the lowest memory resource existing within [res->start, res->end)
229 * the caller must specify res->start, res->end, res->flags. 229 * the caller must specify res->start, res->end, res->flags and "name".
230 * If found, returns 0, res is overwritten, if not found, returns -1. 230 * If found, returns 0, res is overwritten, if not found, returns -1.
231 */ 231 */
232static int find_next_system_ram(struct resource *res) 232static int find_next_system_ram(struct resource *res, char *name)
233{ 233{
234 resource_size_t start, end; 234 resource_size_t start, end;
235 struct resource *p; 235 struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
245 /* system ram is just marked as IORESOURCE_MEM */ 245 /* system ram is just marked as IORESOURCE_MEM */
246 if (p->flags != res->flags) 246 if (p->flags != res->flags)
247 continue; 247 continue;
248 if (name && strcmp(p->name, name))
249 continue;
248 if (p->start > end) { 250 if (p->start > end) {
249 p = NULL; 251 p = NULL;
250 break; 252 break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
262 res->end = p->end; 264 res->end = p->end;
263 return 0; 265 return 0;
264} 266}
265int 267
266walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, 268/*
267 int (*func)(unsigned long, unsigned long, void *)) 269 * This function calls the callback against all memory ranges of "System RAM"
270 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
271 * Now, this function is only for "System RAM".
272 */
273int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *))
268{ 275{
269 struct resource res; 276 struct resource res;
270 unsigned long pfn, len; 277 unsigned long pfn, len;
271 u64 orig_end; 278 u64 orig_end;
272 int ret = -1; 279 int ret = -1;
280
273 res.start = (u64) start_pfn << PAGE_SHIFT; 281 res.start = (u64) start_pfn << PAGE_SHIFT;
274 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 282 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
275 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 283 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
276 orig_end = res.end; 284 orig_end = res.end;
277 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { 285 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) {
278 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 287 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
279 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
280 ret = (*func)(pfn, len, arg); 289 ret = (*func)(pfn, len, arg);
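For orientation, a hypothetical caller of the renamed walk_system_ram_range(), sketched purely from the signature shown in this hunk; count_ram_pages() and system_ram_pages_in() are my own names, nothing in the tree. The walk hands each matching "System RAM" chunk to func(pfn, nr_pages, arg), and returning 0 from the callback lets it continue.

/* Prototype exactly as introduced in the hunk above. */
extern int walk_system_ram_range(unsigned long start_pfn,
				 unsigned long nr_pages, void *arg,
				 int (*func)(unsigned long, unsigned long,
					     void *));

static int count_ram_pages(unsigned long start_pfn, unsigned long nr_pages,
			   void *arg)
{
	*(unsigned long *)arg += nr_pages;	/* accumulate pages seen */
	return 0;				/* 0 == keep walking     */
}

static unsigned long system_ram_pages_in(unsigned long start_pfn,
					 unsigned long nr_pages)
{
	unsigned long total = 0;

	walk_system_ram_range(start_pfn, nr_pages, &total, count_ram_pages);
	return total;
}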
diff --git a/kernel/sched.c b/kernel/sched.c
index e27a53685ed9..e88689522e66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -119,8 +119,6 @@
119 */ 119 */
120#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
121 121
122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
123
124static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
125{ 123{
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -295,12 +293,12 @@ struct task_group root_task_group;
295/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
296static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
297/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
298static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
299#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
300 298
301#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
302static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
303static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
304#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
305#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
306#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
378 376
379#else 377#else
380 378
381#ifdef CONFIG_SMP
382static int root_task_group_empty(void)
383{
384 return 1;
385}
386#endif
387
388static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
389static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
390{ 381{
@@ -514,14 +505,6 @@ struct root_domain {
514#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
515 struct cpupri cpupri; 506 struct cpupri cpupri;
516#endif 507#endif
517#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
518 /*
519 * Preferred wake up cpu nominated by sched_mc balance that will be
520 * used when most cpus are idle in the system indicating overall very
521 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
522 */
523 unsigned int sched_mc_preferred_wakeup_cpu;
524#endif
525}; 508};
526 509
527/* 510/*
@@ -646,9 +629,10 @@ struct rq {
646 629
647static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
648 631
649static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
650{ 634{
651 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
652} 636}
653 637
654static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -692,20 +676,15 @@ inline void update_rq_clock(struct rq *rq)
692 676
693/** 677/**
694 * runqueue_is_locked 678 * runqueue_is_locked
679 * @cpu: the processor in question.
695 * 680 *
696 * Returns true if the current cpu runqueue is locked. 681 * Returns true if the current cpu runqueue is locked.
697 * This interface allows printk to be called with the runqueue lock 682 * This interface allows printk to be called with the runqueue lock
698 * held and know whether or not it is OK to wake up the klogd. 683 * held and know whether or not it is OK to wake up the klogd.
699 */ 684 */
700int runqueue_is_locked(void) 685int runqueue_is_locked(int cpu)
701{ 686{
702 int cpu = get_cpu(); 687 return spin_is_locked(&cpu_rq(cpu)->lock);
703 struct rq *rq = cpu_rq(cpu);
704 int ret;
705
706 ret = spin_is_locked(&rq->lock);
707 put_cpu();
708 return ret;
709} 688}
710 689
711/* 690/*
@@ -802,7 +781,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
802 return single_open(filp, sched_feat_show, NULL); 781 return single_open(filp, sched_feat_show, NULL);
803} 782}
804 783
805static struct file_operations sched_feat_fops = { 784static const struct file_operations sched_feat_fops = {
806 .open = sched_feat_open, 785 .open = sched_feat_open,
807 .write = sched_feat_write, 786 .write = sched_feat_write,
808 .read = seq_read, 787 .read = seq_read,
@@ -1509,8 +1488,65 @@ static int tg_nop(struct task_group *tg, void *data)
1509#endif 1488#endif
1510 1489
1511#ifdef CONFIG_SMP 1490#ifdef CONFIG_SMP
1512static unsigned long source_load(int cpu, int type); 1491/* Used instead of source_load when we know the type == 0 */
1513static unsigned long target_load(int cpu, int type); 1492static unsigned long weighted_cpuload(const int cpu)
1493{
1494 return cpu_rq(cpu)->load.weight;
1495}
1496
1497/*
1498 * Return a low guess at the load of a migration-source cpu weighted
1499 * according to the scheduling class and "nice" value.
1500 *
1501 * We want to under-estimate the load of migration sources, to
1502 * balance conservatively.
1503 */
1504static unsigned long source_load(int cpu, int type)
1505{
1506 struct rq *rq = cpu_rq(cpu);
1507 unsigned long total = weighted_cpuload(cpu);
1508
1509 if (type == 0 || !sched_feat(LB_BIAS))
1510 return total;
1511
1512 return min(rq->cpu_load[type-1], total);
1513}
1514
1515/*
1516 * Return a high guess at the load of a migration-target cpu weighted
1517 * according to the scheduling class and "nice" value.
1518 */
1519static unsigned long target_load(int cpu, int type)
1520{
1521 struct rq *rq = cpu_rq(cpu);
1522 unsigned long total = weighted_cpuload(cpu);
1523
1524 if (type == 0 || !sched_feat(LB_BIAS))
1525 return total;
1526
1527 return max(rq->cpu_load[type-1], total);
1528}
1529
1530static struct sched_group *group_of(int cpu)
1531{
1532 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1533
1534 if (!sd)
1535 return NULL;
1536
1537 return sd->groups;
1538}
1539
1540static unsigned long power_of(int cpu)
1541{
1542 struct sched_group *group = group_of(cpu);
1543
1544 if (!group)
1545 return SCHED_LOAD_SCALE;
1546
1547 return group->cpu_power;
1548}
1549
1514static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1550static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1515 1551
1516static unsigned long cpu_avg_load_per_task(int cpu) 1552static unsigned long cpu_avg_load_per_task(int cpu)
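A quick numeric illustration of the asymmetry in the relocated helpers above: source_load() deliberately under-estimates (min of the decayed cpu_load[] history and the instantaneous weighted load) while target_load() over-estimates (max of the two), so balancing stays conservative. The numbers below are made up; only the min/max shape comes from the functions in this hunk.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	/* Pretend the instantaneous load and the decayed history disagree,
	 * e.g. a burst of work just started on this CPU. */
	unsigned long now  = 2048;	/* weighted_cpuload(cpu)   */
	unsigned long hist = 1536;	/* rq->cpu_load[type - 1]  */

	printf("source_load (migration source, low guess) : %lu\n",
	       MIN(hist, now));
	printf("target_load (migration target, high guess): %lu\n",
	       MAX(hist, now));
	return 0;
}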
@@ -1695,6 +1731,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1695 1731
1696#ifdef CONFIG_PREEMPT 1732#ifdef CONFIG_PREEMPT
1697 1733
1734static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1735
1698/* 1736/*
1699 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1737 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1700 * way at the expense of forcing extra atomic operations in all 1738 * way at the expense of forcing extra atomic operations in all
@@ -1959,13 +1997,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1959} 1997}
1960 1998
1961#ifdef CONFIG_SMP 1999#ifdef CONFIG_SMP
1962
1963/* Used instead of source_load when we know the type == 0 */
1964static unsigned long weighted_cpuload(const int cpu)
1965{
1966 return cpu_rq(cpu)->load.weight;
1967}
1968
1969/* 2000/*
1970 * Is this task likely cache-hot: 2001 * Is this task likely cache-hot:
1971 */ 2002 */
@@ -2023,7 +2054,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2023 if (task_hot(p, old_rq->clock, NULL)) 2054 if (task_hot(p, old_rq->clock, NULL))
2024 schedstat_inc(p, se.nr_forced2_migrations); 2055 schedstat_inc(p, se.nr_forced2_migrations);
2025#endif 2056#endif
2026 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2057 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2027 1, 1, NULL, 0); 2058 1, 1, NULL, 0);
2028 } 2059 }
2029 p->se.vruntime -= old_cfsrq->min_vruntime - 2060 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2239,185 +2270,6 @@ void kick_process(struct task_struct *p)
2239 preempt_enable(); 2270 preempt_enable();
2240} 2271}
2241EXPORT_SYMBOL_GPL(kick_process); 2272EXPORT_SYMBOL_GPL(kick_process);
2242
2243/*
2244 * Return a low guess at the load of a migration-source cpu weighted
2245 * according to the scheduling class and "nice" value.
2246 *
2247 * We want to under-estimate the load of migration sources, to
2248 * balance conservatively.
2249 */
2250static unsigned long source_load(int cpu, int type)
2251{
2252 struct rq *rq = cpu_rq(cpu);
2253 unsigned long total = weighted_cpuload(cpu);
2254
2255 if (type == 0 || !sched_feat(LB_BIAS))
2256 return total;
2257
2258 return min(rq->cpu_load[type-1], total);
2259}
2260
2261/*
2262 * Return a high guess at the load of a migration-target cpu weighted
2263 * according to the scheduling class and "nice" value.
2264 */
2265static unsigned long target_load(int cpu, int type)
2266{
2267 struct rq *rq = cpu_rq(cpu);
2268 unsigned long total = weighted_cpuload(cpu);
2269
2270 if (type == 0 || !sched_feat(LB_BIAS))
2271 return total;
2272
2273 return max(rq->cpu_load[type-1], total);
2274}
2275
2276/*
2277 * find_idlest_group finds and returns the least busy CPU group within the
2278 * domain.
2279 */
2280static struct sched_group *
2281find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2282{
2283 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2284 unsigned long min_load = ULONG_MAX, this_load = 0;
2285 int load_idx = sd->forkexec_idx;
2286 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2287
2288 do {
2289 unsigned long load, avg_load;
2290 int local_group;
2291 int i;
2292
2293 /* Skip over this group if it has no CPUs allowed */
2294 if (!cpumask_intersects(sched_group_cpus(group),
2295 &p->cpus_allowed))
2296 continue;
2297
2298 local_group = cpumask_test_cpu(this_cpu,
2299 sched_group_cpus(group));
2300
2301 /* Tally up the load of all CPUs in the group */
2302 avg_load = 0;
2303
2304 for_each_cpu(i, sched_group_cpus(group)) {
2305 /* Bias balancing toward cpus of our domain */
2306 if (local_group)
2307 load = source_load(i, load_idx);
2308 else
2309 load = target_load(i, load_idx);
2310
2311 avg_load += load;
2312 }
2313
2314 /* Adjust by relative CPU power of the group */
2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2316
2317 if (local_group) {
2318 this_load = avg_load;
2319 this = group;
2320 } else if (avg_load < min_load) {
2321 min_load = avg_load;
2322 idlest = group;
2323 }
2324 } while (group = group->next, group != sd->groups);
2325
2326 if (!idlest || 100*this_load < imbalance*min_load)
2327 return NULL;
2328 return idlest;
2329}
2330
2331/*
2332 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2333 */
2334static int
2335find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2336{
2337 unsigned long load, min_load = ULONG_MAX;
2338 int idlest = -1;
2339 int i;
2340
2341 /* Traverse only the allowed CPUs */
2342 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2343 load = weighted_cpuload(i);
2344
2345 if (load < min_load || (load == min_load && i == this_cpu)) {
2346 min_load = load;
2347 idlest = i;
2348 }
2349 }
2350
2351 return idlest;
2352}
2353
2354/*
2355 * sched_balance_self: balance the current task (running on cpu) in domains
2356 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2357 * SD_BALANCE_EXEC.
2358 *
2359 * Balance, ie. select the least loaded group.
2360 *
2361 * Returns the target CPU number, or the same CPU if no balancing is needed.
2362 *
2363 * preempt must be disabled.
2364 */
2365static int sched_balance_self(int cpu, int flag)
2366{
2367 struct task_struct *t = current;
2368 struct sched_domain *tmp, *sd = NULL;
2369
2370 for_each_domain(cpu, tmp) {
2371 /*
2372 * If power savings logic is enabled for a domain, stop there.
2373 */
2374 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2375 break;
2376 if (tmp->flags & flag)
2377 sd = tmp;
2378 }
2379
2380 if (sd)
2381 update_shares(sd);
2382
2383 while (sd) {
2384 struct sched_group *group;
2385 int new_cpu, weight;
2386
2387 if (!(sd->flags & flag)) {
2388 sd = sd->child;
2389 continue;
2390 }
2391
2392 group = find_idlest_group(sd, t, cpu);
2393 if (!group) {
2394 sd = sd->child;
2395 continue;
2396 }
2397
2398 new_cpu = find_idlest_cpu(group, t, cpu);
2399 if (new_cpu == -1 || new_cpu == cpu) {
2400 /* Now try balancing at a lower domain level of cpu */
2401 sd = sd->child;
2402 continue;
2403 }
2404
2405 /* Now try balancing at a lower domain level of new_cpu */
2406 cpu = new_cpu;
2407 weight = cpumask_weight(sched_domain_span(sd));
2408 sd = NULL;
2409 for_each_domain(cpu, tmp) {
2410 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2411 break;
2412 if (tmp->flags & flag)
2413 sd = tmp;
2414 }
2415 /* while loop will break here if sd == NULL */
2416 }
2417
2418 return cpu;
2419}
2420
2421#endif /* CONFIG_SMP */ 2273#endif /* CONFIG_SMP */
2422 2274
2423/** 2275/**
@@ -2455,37 +2307,22 @@ void task_oncpu_function_call(struct task_struct *p,
2455 * 2307 *
2456 * returns failure only if the task is already active. 2308 * returns failure only if the task is already active.
2457 */ 2309 */
2458static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2310static int try_to_wake_up(struct task_struct *p, unsigned int state,
2311 int wake_flags)
2459{ 2312{
2460 int cpu, orig_cpu, this_cpu, success = 0; 2313 int cpu, orig_cpu, this_cpu, success = 0;
2461 unsigned long flags; 2314 unsigned long flags;
2462 long old_state; 2315 struct rq *rq, *orig_rq;
2463 struct rq *rq;
2464 2316
2465 if (!sched_feat(SYNC_WAKEUPS)) 2317 if (!sched_feat(SYNC_WAKEUPS))
2466 sync = 0; 2318 wake_flags &= ~WF_SYNC;
2467 2319
2468#ifdef CONFIG_SMP 2320 this_cpu = get_cpu();
2469 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2470 struct sched_domain *sd;
2471
2472 this_cpu = raw_smp_processor_id();
2473 cpu = task_cpu(p);
2474
2475 for_each_domain(this_cpu, sd) {
2476 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2477 update_shares(sd);
2478 break;
2479 }
2480 }
2481 }
2482#endif
2483 2321
2484 smp_wmb(); 2322 smp_wmb();
2485 rq = task_rq_lock(p, &flags); 2323 rq = orig_rq = task_rq_lock(p, &flags);
2486 update_rq_clock(rq); 2324 update_rq_clock(rq);
2487 old_state = p->state; 2325 if (!(p->state & state))
2488 if (!(old_state & state))
2489 goto out; 2326 goto out;
2490 2327
2491 if (p->se.on_rq) 2328 if (p->se.on_rq)
@@ -2493,27 +2330,33 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2493 2330
2494 cpu = task_cpu(p); 2331 cpu = task_cpu(p);
2495 orig_cpu = cpu; 2332 orig_cpu = cpu;
2496 this_cpu = smp_processor_id();
2497 2333
2498#ifdef CONFIG_SMP 2334#ifdef CONFIG_SMP
2499 if (unlikely(task_running(rq, p))) 2335 if (unlikely(task_running(rq, p)))
2500 goto out_activate; 2336 goto out_activate;
2501 2337
2502 cpu = p->sched_class->select_task_rq(p, sync); 2338 /*
2503 if (cpu != orig_cpu) { 2339 * In order to handle concurrent wakeups and release the rq->lock
2340 * we put the task in TASK_WAKING state.
2341 *
2342 * First fix up the nr_uninterruptible count:
2343 */
2344 if (task_contributes_to_load(p))
2345 rq->nr_uninterruptible--;
2346 p->state = TASK_WAKING;
2347 task_rq_unlock(rq, &flags);
2348
2349 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2350 if (cpu != orig_cpu)
2504 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2505 task_rq_unlock(rq, &flags);
2506 /* might preempt at this point */
2507 rq = task_rq_lock(p, &flags);
2508 old_state = p->state;
2509 if (!(old_state & state))
2510 goto out;
2511 if (p->se.on_rq)
2512 goto out_running;
2513 2352
2514 this_cpu = smp_processor_id(); 2353 rq = task_rq_lock(p, &flags);
2515 cpu = task_cpu(p); 2354
2516 } 2355 if (rq != orig_rq)
2356 update_rq_clock(rq);
2357
2358 WARN_ON(p->state != TASK_WAKING);
2359 cpu = task_cpu(p);
2517 2360
2518#ifdef CONFIG_SCHEDSTATS 2361#ifdef CONFIG_SCHEDSTATS
2519 schedstat_inc(rq, ttwu_count); 2362 schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2376,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2533out_activate: 2376out_activate:
2534#endif /* CONFIG_SMP */ 2377#endif /* CONFIG_SMP */
2535 schedstat_inc(p, se.nr_wakeups); 2378 schedstat_inc(p, se.nr_wakeups);
2536 if (sync) 2379 if (wake_flags & WF_SYNC)
2537 schedstat_inc(p, se.nr_wakeups_sync); 2380 schedstat_inc(p, se.nr_wakeups_sync);
2538 if (orig_cpu != cpu) 2381 if (orig_cpu != cpu)
2539 schedstat_inc(p, se.nr_wakeups_migrate); 2382 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,7 +2405,7 @@ out_activate:
2562 2405
2563out_running: 2406out_running:
2564 trace_sched_wakeup(rq, p, success); 2407 trace_sched_wakeup(rq, p, success);
2565 check_preempt_curr(rq, p, sync); 2408 check_preempt_curr(rq, p, wake_flags);
2566 2409
2567 p->state = TASK_RUNNING; 2410 p->state = TASK_RUNNING;
2568#ifdef CONFIG_SMP 2411#ifdef CONFIG_SMP
@@ -2571,6 +2414,7 @@ out_running:
2571#endif 2414#endif
2572out: 2415out:
2573 task_rq_unlock(rq, &flags); 2416 task_rq_unlock(rq, &flags);
2417 put_cpu();
2574 2418
2575 return success; 2419 return success;
2576} 2420}
@@ -2613,6 +2457,7 @@ static void __sched_fork(struct task_struct *p)
2613 p->se.avg_overlap = 0; 2457 p->se.avg_overlap = 0;
2614 p->se.start_runtime = 0; 2458 p->se.start_runtime = 0;
2615 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2459 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2460 p->se.avg_running = 0;
2616 2461
2617#ifdef CONFIG_SCHEDSTATS 2462#ifdef CONFIG_SCHEDSTATS
2618 p->se.wait_start = 0; 2463 p->se.wait_start = 0;
@@ -2674,28 +2519,18 @@ void sched_fork(struct task_struct *p, int clone_flags)
2674 2519
2675 __sched_fork(p); 2520 __sched_fork(p);
2676 2521
2677#ifdef CONFIG_SMP
2678 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2679#endif
2680 set_task_cpu(p, cpu);
2681
2682 /*
2683 * Make sure we do not leak PI boosting priority to the child.
2684 */
2685 p->prio = current->normal_prio;
2686
2687 /* 2522 /*
2688 * Revert to default priority/policy on fork if requested. 2523 * Revert to default priority/policy on fork if requested.
2689 */ 2524 */
2690 if (unlikely(p->sched_reset_on_fork)) { 2525 if (unlikely(p->sched_reset_on_fork)) {
2691 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) 2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2692 p->policy = SCHED_NORMAL; 2527 p->policy = SCHED_NORMAL;
2693 2528 p->normal_prio = p->static_prio;
2694 if (p->normal_prio < DEFAULT_PRIO) 2529 }
2695 p->prio = DEFAULT_PRIO;
2696 2530
2697 if (PRIO_TO_NICE(p->static_prio) < 0) { 2531 if (PRIO_TO_NICE(p->static_prio) < 0) {
2698 p->static_prio = NICE_TO_PRIO(0); 2532 p->static_prio = NICE_TO_PRIO(0);
2533 p->normal_prio = p->static_prio;
2699 set_load_weight(p); 2534 set_load_weight(p);
2700 } 2535 }
2701 2536
@@ -2706,9 +2541,19 @@ void sched_fork(struct task_struct *p, int clone_flags)
2706 p->sched_reset_on_fork = 0; 2541 p->sched_reset_on_fork = 0;
2707 } 2542 }
2708 2543
2544 /*
2545 * Make sure we do not leak PI boosting priority to the child.
2546 */
2547 p->prio = current->normal_prio;
2548
2709 if (!rt_prio(p->prio)) 2549 if (!rt_prio(p->prio))
2710 p->sched_class = &fair_sched_class; 2550 p->sched_class = &fair_sched_class;
2711 2551
2552#ifdef CONFIG_SMP
2553 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2554#endif
2555 set_task_cpu(p, cpu);
2556
2712#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2557#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2713 if (likely(sched_info_on())) 2558 if (likely(sched_info_on()))
2714 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2559 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2741,8 +2586,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2741 BUG_ON(p->state != TASK_RUNNING); 2586 BUG_ON(p->state != TASK_RUNNING);
2742 update_rq_clock(rq); 2587 update_rq_clock(rq);
2743 2588
2744 p->prio = effective_prio(p);
2745
2746 if (!p->sched_class->task_new || !current->se.on_rq) { 2589 if (!p->sched_class->task_new || !current->se.on_rq) {
2747 activate_task(rq, p, 0); 2590 activate_task(rq, p, 0);
2748 } else { 2591 } else {
@@ -2754,7 +2597,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2754 inc_nr_running(rq); 2597 inc_nr_running(rq);
2755 } 2598 }
2756 trace_sched_wakeup_new(rq, p, 1); 2599 trace_sched_wakeup_new(rq, p, 1);
2757 check_preempt_curr(rq, p, 0); 2600 check_preempt_curr(rq, p, WF_FORK);
2758#ifdef CONFIG_SMP 2601#ifdef CONFIG_SMP
2759 if (p->sched_class->task_wake_up) 2602 if (p->sched_class->task_wake_up)
2760 p->sched_class->task_wake_up(rq, p); 2603 p->sched_class->task_wake_up(rq, p);
@@ -2878,7 +2721,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2878 */ 2721 */
2879 prev_state = prev->state; 2722 prev_state = prev->state;
2880 finish_arch_switch(prev); 2723 finish_arch_switch(prev);
2881 perf_counter_task_sched_in(current, cpu_of(rq)); 2724 perf_event_task_sched_in(current, cpu_of(rq));
2882 finish_lock_switch(rq, prev); 2725 finish_lock_switch(rq, prev);
2883 2726
2884 fire_sched_in_preempt_notifiers(current); 2727 fire_sched_in_preempt_notifiers(current);
@@ -3064,6 +2907,19 @@ unsigned long nr_iowait(void)
3064 return sum; 2907 return sum;
3065} 2908}
3066 2909
2910unsigned long nr_iowait_cpu(void)
2911{
2912 struct rq *this = this_rq();
2913 return atomic_read(&this->nr_iowait);
2914}
2915
2916unsigned long this_cpu_load(void)
2917{
2918 struct rq *this = this_rq();
2919 return this->cpu_load[0];
2920}
2921
2922
3067/* Variables and functions for calc_load */ 2923/* Variables and functions for calc_load */
3068static atomic_long_t calc_load_tasks; 2924static atomic_long_t calc_load_tasks;
3069static unsigned long calc_load_update; 2925static unsigned long calc_load_update;
@@ -3263,7 +3119,7 @@ out:
3263void sched_exec(void) 3119void sched_exec(void)
3264{ 3120{
3265 int new_cpu, this_cpu = get_cpu(); 3121 int new_cpu, this_cpu = get_cpu();
3266 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3122 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3267 put_cpu(); 3123 put_cpu();
3268 if (new_cpu != this_cpu) 3124 if (new_cpu != this_cpu)
3269 sched_migrate_task(current, new_cpu); 3125 sched_migrate_task(current, new_cpu);
@@ -3683,11 +3539,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3683 *imbalance = sds->min_load_per_task; 3539 *imbalance = sds->min_load_per_task;
3684 sds->busiest = sds->group_min; 3540 sds->busiest = sds->group_min;
3685 3541
3686 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3687 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3688 group_first_cpu(sds->group_leader);
3689 }
3690
3691 return 1; 3542 return 1;
3692 3543
3693} 3544}
@@ -3711,7 +3562,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3711} 3562}
3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3563#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3713 3564
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3565
3566unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3567{
3568 return SCHED_LOAD_SCALE;
3569}
3570
3571unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3572{
3573 return default_scale_freq_power(sd, cpu);
3574}
3575
3576unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3715{ 3577{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3578 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain; 3579 unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3583,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3721 return smt_gain; 3583 return smt_gain;
3722} 3584}
3723 3585
3586unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3587{
3588 return default_scale_smt_power(sd, cpu);
3589}
3590
3724unsigned long scale_rt_power(int cpu) 3591unsigned long scale_rt_power(int cpu)
3725{ 3592{
3726 struct rq *rq = cpu_rq(cpu); 3593 struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3612,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3745 unsigned long power = SCHED_LOAD_SCALE; 3612 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups; 3613 struct sched_group *sdg = sd->groups;
3747 3614
3748 /* here we could scale based on cpufreq */ 3615 if (sched_feat(ARCH_POWER))
3616 power *= arch_scale_freq_power(sd, cpu);
3617 else
3618 power *= default_scale_freq_power(sd, cpu);
3619
3620 power >>= SCHED_LOAD_SHIFT;
3749 3621
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3622 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu); 3623 if (sched_feat(ARCH_POWER))
3624 power *= arch_scale_smt_power(sd, cpu);
3625 else
3626 power *= default_scale_smt_power(sd, cpu);
3627
3752 power >>= SCHED_LOAD_SHIFT; 3628 power >>= SCHED_LOAD_SHIFT;
3753 } 3629 }
3754 3630
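The rewritten update_cpu_power() above chains the arch/default scaling hooks with multiply-then-shift steps, i.e. fixed-point arithmetic in units of SCHED_LOAD_SCALE. A small worked example of that composition follows; the factor values are invented, and SCHED_LOAD_SHIFT == 10 (so SCHED_LOAD_SCALE == 1024) is my assumption about the era's constants. Only the *= / >>= pattern is taken from the hunk.

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10				/* assumed value   */
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)	/* 1024 == "100%"  */

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;

	/* Invented factors, each also expressed in 1024ths: */
	unsigned long freq_factor = 768;	/* CPU running at ~75%      */
	unsigned long smt_factor  = 589;	/* ~57.5% per SMT sibling   */

	power *= freq_factor;
	power >>= SCHED_LOAD_SHIFT;		/* 1024 * 768 / 1024 = 768  */

	power *= smt_factor;
	power >>= SCHED_LOAD_SHIFT;		/* 768 * 589 / 1024 = 441   */

	printf("effective cpu_power: %lu of %lu\n", power, SCHED_LOAD_SCALE);
	return 0;
}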
@@ -3785,6 +3661,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3785 3661
3786/** 3662/**
3787 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3663 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3664 * @sd: The sched_domain whose statistics are to be updated.
3788 * @group: sched_group whose statistics are to be updated. 3665 * @group: sched_group whose statistics are to be updated.
3789 * @this_cpu: Cpu for which load balance is currently performed. 3666 * @this_cpu: Cpu for which load balance is currently performed.
3790 * @idle: Idle status of this_cpu 3667 * @idle: Idle status of this_cpu
@@ -4161,26 +4038,6 @@ ret:
4161 return NULL; 4038 return NULL;
4162} 4039}
4163 4040
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
4184/* 4041/*
4185 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4042 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4186 */ 4043 */
@@ -5239,17 +5096,16 @@ void account_idle_time(cputime_t cputime)
5239 */ 5096 */
5240void account_process_tick(struct task_struct *p, int user_tick) 5097void account_process_tick(struct task_struct *p, int user_tick)
5241{ 5098{
5242 cputime_t one_jiffy = jiffies_to_cputime(1); 5099 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5243 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5244 struct rq *rq = this_rq(); 5100 struct rq *rq = this_rq();
5245 5101
5246 if (user_tick) 5102 if (user_tick)
5247 account_user_time(p, one_jiffy, one_jiffy_scaled); 5103 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5248 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5104 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5249 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 5105 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5250 one_jiffy_scaled); 5106 one_jiffy_scaled);
5251 else 5107 else
5252 account_idle_time(one_jiffy); 5108 account_idle_time(cputime_one_jiffy);
5253} 5109}
5254 5110
5255/* 5111/*
@@ -5353,7 +5209,7 @@ void scheduler_tick(void)
5353 curr->sched_class->task_tick(rq, curr, 0); 5209 curr->sched_class->task_tick(rq, curr, 0);
5354 spin_unlock(&rq->lock); 5210 spin_unlock(&rq->lock);
5355 5211
5356 perf_counter_task_tick(curr, cpu); 5212 perf_event_task_tick(curr, cpu);
5357 5213
5358#ifdef CONFIG_SMP 5214#ifdef CONFIG_SMP
5359 rq->idle_at_tick = idle_cpu(cpu); 5215 rq->idle_at_tick = idle_cpu(cpu);
@@ -5465,14 +5321,13 @@ static inline void schedule_debug(struct task_struct *prev)
5465#endif 5321#endif
5466} 5322}
5467 5323
5468static void put_prev_task(struct rq *rq, struct task_struct *prev) 5324static void put_prev_task(struct rq *rq, struct task_struct *p)
5469{ 5325{
5470 if (prev->state == TASK_RUNNING) { 5326 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5471 u64 runtime = prev->se.sum_exec_runtime;
5472 5327
5473 runtime -= prev->se.prev_sum_exec_runtime; 5328 update_avg(&p->se.avg_running, runtime);
5474 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5475 5329
5330 if (p->state == TASK_RUNNING) {
5476 /* 5331 /*
5477 * In order to avoid avg_overlap growing stale when we are 5332 * In order to avoid avg_overlap growing stale when we are
5478 * indeed overlapping and hence not getting put to sleep, grow 5333 * indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5337,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5482 * correlates to the amount of cache footprint a task can 5337 * correlates to the amount of cache footprint a task can
5483 * build up. 5338 * build up.
5484 */ 5339 */
5485 update_avg(&prev->se.avg_overlap, runtime); 5340 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5341 update_avg(&p->se.avg_overlap, runtime);
5342 } else {
5343 update_avg(&p->se.avg_running, 0);
5486 } 5344 }
5487 prev->sched_class->put_prev_task(rq, prev); 5345 p->sched_class->put_prev_task(rq, p);
5488} 5346}
5489 5347
5490/* 5348/*
@@ -5567,7 +5425,7 @@ need_resched_nonpreemptible:
5567 5425
5568 if (likely(prev != next)) { 5426 if (likely(prev != next)) {
5569 sched_info_switch(prev, next); 5427 sched_info_switch(prev, next);
5570 perf_counter_task_sched_out(prev, next, cpu); 5428 perf_event_task_sched_out(prev, next, cpu);
5571 5429
5572 rq->nr_switches++; 5430 rq->nr_switches++;
5573 rq->curr = next; 5431 rq->curr = next;
@@ -5716,10 +5574,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5716 5574
5717#endif /* CONFIG_PREEMPT */ 5575#endif /* CONFIG_PREEMPT */
5718 5576
5719int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5577int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5720 void *key) 5578 void *key)
5721{ 5579{
5722 return try_to_wake_up(curr->private, mode, sync); 5580 return try_to_wake_up(curr->private, mode, wake_flags);
5723} 5581}
5724EXPORT_SYMBOL(default_wake_function); 5582EXPORT_SYMBOL(default_wake_function);
5725 5583
@@ -5733,14 +5591,14 @@ EXPORT_SYMBOL(default_wake_function);
5733 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5591 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5734 */ 5592 */
5735static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5593static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5736 int nr_exclusive, int sync, void *key) 5594 int nr_exclusive, int wake_flags, void *key)
5737{ 5595{
5738 wait_queue_t *curr, *next; 5596 wait_queue_t *curr, *next;
5739 5597
5740 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5598 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5741 unsigned flags = curr->flags; 5599 unsigned flags = curr->flags;
5742 5600
5743 if (curr->func(curr, mode, sync, key) && 5601 if (curr->func(curr, mode, wake_flags, key) &&
5744 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5602 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5745 break; 5603 break;
5746 } 5604 }
@@ -5801,16 +5659,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5801 int nr_exclusive, void *key) 5659 int nr_exclusive, void *key)
5802{ 5660{
5803 unsigned long flags; 5661 unsigned long flags;
5804 int sync = 1; 5662 int wake_flags = WF_SYNC;
5805 5663
5806 if (unlikely(!q)) 5664 if (unlikely(!q))
5807 return; 5665 return;
5808 5666
5809 if (unlikely(!nr_exclusive)) 5667 if (unlikely(!nr_exclusive))
5810 sync = 0; 5668 wake_flags = 0;
5811 5669
5812 spin_lock_irqsave(&q->lock, flags); 5670 spin_lock_irqsave(&q->lock, flags);
5813 __wake_up_common(q, mode, nr_exclusive, sync, key); 5671 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5814 spin_unlock_irqrestore(&q->lock, flags); 5672 spin_unlock_irqrestore(&q->lock, flags);
5815} 5673}
5816EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5674EXPORT_SYMBOL_GPL(__wake_up_sync_key);
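The wakeup plumbing above replaces the bare sync integer with a wake_flags bitmask: default_wake_function() forwards it to try_to_wake_up(), __wake_up_common() passes it to each waiter's callback, and __wake_up_sync_key() sets WF_SYNC. Only WF_SYNC appears in these hunks; the WF_FORK value in the sketch below is an assumption based on the sched.h of this era, included just to show why a bitmask composes better than a boolean.

#include <stdio.h>

/* Assumed flag values, mirroring this era's <linux/sched.h> definitions. */
#define WF_SYNC  0x01  /* waker goes to sleep right after the wakeup */
#define WF_FORK  0x02  /* wakeup of a freshly forked child */

static void wake_up_task(const char *who, unsigned int wake_flags)
{
	/* Individual hints can now be tested independently. */
	printf("%s: sync=%d fork=%d\n", who,
	       !!(wake_flags & WF_SYNC), !!(wake_flags & WF_FORK));
}

int main(void)
{
	wake_up_task("pipe reader", WF_SYNC);   /* old 'sync = 1' case */
	wake_up_task("new child", WF_FORK);     /* not expressible with a bool */
	wake_up_task("plain wakeup", 0);
	return 0;
}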
@@ -6866,9 +6724,6 @@ EXPORT_SYMBOL(yield);
6866/* 6724/*
6867 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6725 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6868 * that process accounting knows that this is a task in IO wait state. 6726 * that process accounting knows that this is a task in IO wait state.
6869 *
6870 * But don't do that if it is a deliberate, throttling IO wait (this task
6871 * has set its backing_dev_info: the queue against which it should throttle)
6872 */ 6727 */
6873void __sched io_schedule(void) 6728void __sched io_schedule(void)
6874{ 6729{
@@ -6977,23 +6832,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6977 if (retval) 6832 if (retval)
6978 goto out_unlock; 6833 goto out_unlock;
6979 6834
6980 /* 6835 time_slice = p->sched_class->get_rr_interval(p);
6981 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6982 * tasks that are on an otherwise idle runqueue:
6983 */
6984 time_slice = 0;
6985 if (p->policy == SCHED_RR) {
6986 time_slice = DEF_TIMESLICE;
6987 } else if (p->policy != SCHED_FIFO) {
6988 struct sched_entity *se = &p->se;
6989 unsigned long flags;
6990 struct rq *rq;
6991 6836
6992 rq = task_rq_lock(p, &flags);
6993 if (rq->cfs.load.weight)
6994 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6995 task_rq_unlock(rq, &flags);
6996 }
6997 read_unlock(&tasklist_lock); 6837 read_unlock(&tasklist_lock);
6998 jiffies_to_timespec(time_slice, &t); 6838 jiffies_to_timespec(time_slice, &t);
6999 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6839 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
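sched_rr_get_interval() above stops open-coding the per-policy timeslice rules and instead asks the task's scheduling class through the new ->get_rr_interval() hook; the class-side implementations (DEF_TIMESLICE for SCHED_RR, a CFS slice for SCHED_OTHER, zero for SCHED_FIFO and the idle class) appear later in this diff. A toy model of that dispatch, with invented structures standing in for sched_class and task_struct:

#include <stdio.h>

struct task;   /* forward declaration for the "class" vtable */

/* Minimal stand-in for the kernel's sched_class vtable. */
struct sched_class {
	const char *name;
	unsigned int (*get_rr_interval)(struct task *t);
};

struct task {
	const struct sched_class *cls;
};

static unsigned int rr_interval_rt(struct task *t)   { (void)t; return 25; } /* e.g. ~100 ms of jiffies at HZ=250 */
static unsigned int rr_interval_idle(struct task *t) { (void)t; return 0; }

static const struct sched_class rt_class   = { "rt",   rr_interval_rt };
static const struct sched_class idle_class = { "idle", rr_interval_idle };

/* The syscall-side code only needs this one line now. */
static unsigned int query_timeslice(struct task *t)
{
	return t->cls->get_rr_interval(t);
}

int main(void)
{
	struct task rr_task = { &rt_class }, idle_task = { &idle_class };
	printf("rt: %u jiffies, idle: %u jiffies\n",
	       query_timeslice(&rr_task), query_timeslice(&idle_task));
	return 0;
}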
@@ -7844,7 +7684,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7844/* 7684/*
7845 * Register at high priority so that task migration (migrate_all_tasks) 7685 * Register at high priority so that task migration (migrate_all_tasks)
7846 * happens before everything else. This has to be lower priority than 7686 * happens before everything else. This has to be lower priority than
7847 * the notifier in the perf_counter subsystem, though. 7687 * the notifier in the perf_event subsystem, though.
7848 */ 7688 */
7849static struct notifier_block __cpuinitdata migration_notifier = { 7689static struct notifier_block __cpuinitdata migration_notifier = {
7850 .notifier_call = migration_call, 7690 .notifier_call = migration_call,
@@ -8000,9 +7840,7 @@ static int sd_degenerate(struct sched_domain *sd)
8000 } 7840 }
8001 7841
8002 /* Following flags don't use groups */ 7842 /* Following flags don't use groups */
8003 if (sd->flags & (SD_WAKE_IDLE | 7843 if (sd->flags & (SD_WAKE_AFFINE))
8004 SD_WAKE_AFFINE |
8005 SD_WAKE_BALANCE))
8006 return 0; 7844 return 0;
8007 7845
8008 return 1; 7846 return 1;
@@ -8019,10 +7857,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
8019 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7857 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
8020 return 0; 7858 return 0;
8021 7859
8022 /* Does parent contain flags not in child? */
8023 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
8024 if (cflags & SD_WAKE_AFFINE)
8025 pflags &= ~SD_WAKE_BALANCE;
8026 /* Flags needing groups don't count if only 1 group in parent */ 7860 /* Flags needing groups don't count if only 1 group in parent */
8027 if (parent->groups == parent->groups->next) { 7861 if (parent->groups == parent->groups->next) {
8028 pflags &= ~(SD_LOAD_BALANCE | 7862 pflags &= ~(SD_LOAD_BALANCE |
@@ -8708,10 +8542,10 @@ static void set_domain_attribute(struct sched_domain *sd,
8708 request = attr->relax_domain_level; 8542 request = attr->relax_domain_level;
8709 if (request < sd->level) { 8543 if (request < sd->level) {
8710 /* turn off idle balance on this domain */ 8544 /* turn off idle balance on this domain */
8711 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8545 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8712 } else { 8546 } else {
8713 /* turn on idle balance on this domain */ 8547 /* turn on idle balance on this domain */
8714 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8548 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8715 } 8549 }
8716} 8550}
8717 8551
@@ -9329,6 +9163,7 @@ void __init sched_init_smp(void)
9329 cpumask_var_t non_isolated_cpus; 9163 cpumask_var_t non_isolated_cpus;
9330 9164
9331 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9165 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9166 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9332 9167
9333#if defined(CONFIG_NUMA) 9168#if defined(CONFIG_NUMA)
9334 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9169 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9360,7 +9195,6 @@ void __init sched_init_smp(void)
9360 sched_init_granularity(); 9195 sched_init_granularity();
9361 free_cpumask_var(non_isolated_cpus); 9196 free_cpumask_var(non_isolated_cpus);
9362 9197
9363 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9364 init_sched_rt_class(); 9198 init_sched_rt_class();
9365} 9199}
9366#else 9200#else
@@ -9707,7 +9541,7 @@ void __init sched_init(void)
9707 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9541 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9708#endif /* SMP */ 9542#endif /* SMP */
9709 9543
9710 perf_counter_init(); 9544 perf_event_init();
9711 9545
9712 scheduler_running = 1; 9546 scheduler_running = 1;
9713} 9547}
@@ -10479,7 +10313,7 @@ static int sched_rt_global_constraints(void)
10479#endif /* CONFIG_RT_GROUP_SCHED */ 10313#endif /* CONFIG_RT_GROUP_SCHED */
10480 10314
10481int sched_rt_handler(struct ctl_table *table, int write, 10315int sched_rt_handler(struct ctl_table *table, int write,
10482 struct file *filp, void __user *buffer, size_t *lenp, 10316 void __user *buffer, size_t *lenp,
10483 loff_t *ppos) 10317 loff_t *ppos)
10484{ 10318{
10485 int ret; 10319 int ret;
@@ -10490,7 +10324,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10490 old_period = sysctl_sched_rt_period; 10324 old_period = sysctl_sched_rt_period;
10491 old_runtime = sysctl_sched_rt_runtime; 10325 old_runtime = sysctl_sched_rt_runtime;
10492 10326
10493 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10327 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10494 10328
10495 if (!ret && write) { 10329 if (!ret && write) {
10496 ret = sched_rt_global_constraints(); 10330 ret = sched_rt_global_constraints();
@@ -10544,8 +10378,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10544} 10378}
10545 10379
10546static int 10380static int
10547cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10381cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10548 struct task_struct *tsk)
10549{ 10382{
10550#ifdef CONFIG_RT_GROUP_SCHED 10383#ifdef CONFIG_RT_GROUP_SCHED
10551 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10555,15 +10388,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10555 if (tsk->sched_class != &fair_sched_class) 10388 if (tsk->sched_class != &fair_sched_class)
10556 return -EINVAL; 10389 return -EINVAL;
10557#endif 10390#endif
10391 return 0;
10392}
10558 10393
10394static int
10395cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10396 struct task_struct *tsk, bool threadgroup)
10397{
10398 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10399 if (retval)
10400 return retval;
10401 if (threadgroup) {
10402 struct task_struct *c;
10403 rcu_read_lock();
10404 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10405 retval = cpu_cgroup_can_attach_task(cgrp, c);
10406 if (retval) {
10407 rcu_read_unlock();
10408 return retval;
10409 }
10410 }
10411 rcu_read_unlock();
10412 }
10559 return 0; 10413 return 0;
10560} 10414}
10561 10415
10562static void 10416static void
10563cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10417cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10564 struct cgroup *old_cont, struct task_struct *tsk) 10418 struct cgroup *old_cont, struct task_struct *tsk,
10419 bool threadgroup)
10565{ 10420{
10566 sched_move_task(tsk); 10421 sched_move_task(tsk);
10422 if (threadgroup) {
10423 struct task_struct *c;
10424 rcu_read_lock();
10425 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10426 sched_move_task(c);
10427 }
10428 rcu_read_unlock();
10429 }
10567} 10430}
10568 10431
10569#ifdef CONFIG_FAIR_GROUP_SCHED 10432#ifdef CONFIG_FAIR_GROUP_SCHED
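The cpu cgroup callbacks above gain a threadgroup flag: when an entire process is moved, cpu_cgroup_can_attach() and cpu_cgroup_attach() walk tsk->thread_group under rcu_read_lock() and apply the per-task check or sched_move_task() to every sibling, bailing out of can_attach on the first failure. The sketch below keeps only that iteration shape, with a plain circular list standing in for the kernel's RCU-protected thread_group list:

#include <stdio.h>

struct task {
	const char *comm;
	int rt_policy;          /* stand-in for "is this an RT task?" */
	struct task *next;      /* circular thread_group list, leader included */
};

/* Per-task check, analogous to cpu_cgroup_can_attach_task(). */
static int can_attach_task(struct task *t)
{
	return t->rt_policy ? -1 /* -EINVAL in the kernel */ : 0;
}

/* Group-wide check: leader first, then every sibling, stop on first failure. */
static int can_attach(struct task *leader, int threadgroup)
{
	int ret = can_attach_task(leader);
	if (ret || !threadgroup)
		return ret;
	for (struct task *c = leader->next; c != leader; c = c->next) {
		ret = can_attach_task(c);
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct task leader = { "leader", 0, NULL }, worker = { "worker", 1, &leader };
	leader.next = &worker;
	printf("attach whole group: %s\n", can_attach(&leader, 1) ? "rejected" : "ok");
	return 0;
}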
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both locks
149 * taken to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg64(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
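The sched_clock.c changes above remove the per-cpu raw spinlock entirely: sched_clock_local() builds a candidate time from tick_raw/tick_gtod, clamps it between max(tick_gtod, old_clock) and max(old_clock, tick_gtod + TICK_NSEC) so it can never run backwards or race ahead of the tick window, and publishes it with cmpxchg64(), retrying if another updater won the race; sched_clock_remote() couples two CPUs by CAS-ing the smaller clock up to the larger one in the same way. A compact userspace model of that clamp-then-compare-and-swap loop, using GCC/Clang __atomic builtins in place of cmpxchg64():

#include <stdio.h>
#include <stdint.h>

static uint64_t clock_val;          /* shared clock word, like scd->clock */

static uint64_t wrap_max(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t wrap_min(uint64_t a, uint64_t b) { return a < b ? a : b; }

/*
 * Lockless monotonic update: compute a clamped candidate from the old
 * value, then try to publish it atomically; if another updater changed
 * the clock in the meantime, redo the computation against the new value.
 */
static uint64_t clock_update(uint64_t raw, uint64_t tick_gtod, uint64_t window)
{
	uint64_t old, clock;

again:
	old = __atomic_load_n(&clock_val, __ATOMIC_RELAXED);
	clock = wrap_max(raw, wrap_max(tick_gtod, old));           /* never go backwards */
	clock = wrap_min(clock, wrap_max(old, tick_gtod + window)); /* stay inside the tick window */

	if (!__atomic_compare_exchange_n(&clock_val, &old, clock, 0,
					 __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		goto again;                                         /* lost the race, retry */
	return clock;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)clock_update(1500, 1000, 1000));
	printf("%llu\n", (unsigned long long)clock_update(1200, 1100, 1000)); /* clamped up to the old value */
	return 0;
}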
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd0891267..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..4e777b47eeda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
384 384
385#ifdef CONFIG_SCHED_DEBUG 385#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 386int sched_nr_latency_handler(struct ctl_table *table, int write,
387 struct file *filp, void __user *buffer, size_t *lenp, 387 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 388 loff_t *ppos)
389{ 389{
390 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
391 391
392 if (ret || !write) 392 if (ret || !write)
393 return ret; 393 return ret;
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
513 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
514 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
515 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
516 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
517 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
518 } 519 }
@@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
709 if (initial && sched_feat(START_DEBIT)) 710 if (initial && sched_feat(START_DEBIT))
710 vruntime += sched_vslice(cfs_rq, se); 711 vruntime += sched_vslice(cfs_rq, se);
711 712
712 if (!initial) { 713 /* sleeps up to a single latency don't count. */
713 /* sleeps upto a single latency don't count. */ 714 if (!initial && sched_feat(FAIR_SLEEPERS)) {
714 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 unsigned long thresh = sysctl_sched_latency;
715 unsigned long thresh = sysctl_sched_latency;
716 716
717 /* 717 /*
718 * Convert the sleeper threshold into virtual time. 718 * Convert the sleeper threshold into virtual time.
719 * SCHED_IDLE is a special sub-class. We care about 719 * SCHED_IDLE is a special sub-class. We care about
720 * fairness only relative to other SCHED_IDLE tasks, 720 * fairness only relative to other SCHED_IDLE tasks,
721 * all of which have the same weight. 721 * all of which have the same weight.
722 */ 722 */
723 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
724 (!entity_is_task(se) || 724 task_of(se)->policy != SCHED_IDLE))
725 task_of(se)->policy != SCHED_IDLE)) 725 thresh = calc_delta_fair(thresh, se);
726 thresh = calc_delta_fair(thresh, se);
727 726
728 vruntime -= thresh; 727 /*
729 } 728 * Halve their sleep time's effect, to allow
729 * for a gentler effect of sleepers:
730 */
731 if (sched_feat(GENTLE_FAIR_SLEEPERS))
732 thresh >>= 1;
733
734 vruntime -= thresh;
730 } 735 }
731 736
732 /* ensure we never gain time by being placed backwards. */ 737 /* ensure we never gain time by being placed backwards. */
@@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
757 762
758static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 763static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
759{ 764{
760 if (cfs_rq->last == se) 765 if (!se || cfs_rq->last == se)
761 cfs_rq->last = NULL; 766 cfs_rq->last = NULL;
762 767
763 if (cfs_rq->next == se) 768 if (!se || cfs_rq->next == se)
764 cfs_rq->next = NULL; 769 cfs_rq->next = NULL;
765} 770}
766 771
@@ -1062,83 +1067,6 @@ static void yield_task_fair(struct rq *rq)
1062 se->vruntime = rightmost->vruntime + 1; 1067 se->vruntime = rightmost->vruntime + 1;
1063} 1068}
1064 1069
1065/*
1066 * wake_idle() will wake a task on an idle cpu if task->cpu is
1067 * not idle and an idle cpu is available. The span of cpus to
1068 * search starts with cpus closest then further out as needed,
1069 * so we always favor a closer, idle cpu.
1070 * Domains may include CPUs that are not usable for migration,
1071 * hence we need to mask them out (rq->rd->online)
1072 *
1073 * Returns the CPU we should wake onto.
1074 */
1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1079static int wake_idle(int cpu, struct task_struct *p)
1080{
1081 struct sched_domain *sd;
1082 int i;
1083 unsigned int chosen_wakeup_cpu;
1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1086
1087 /*
1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1089 * are idle and this is not a kernel thread and this task's affinity
1090 * allows it to be moved to preferred cpu, then just move!
1091 */
1092
1093 this_cpu = smp_processor_id();
1094 chosen_wakeup_cpu =
1095 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1096
1097 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1098 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1099 p->mm && !(p->flags & PF_KTHREAD) &&
1100 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1101 return chosen_wakeup_cpu;
1102
1103 /*
1104 * If it is idle, then it is the best cpu to run this task.
1105 *
1106 * This cpu is also the best, if it has more than one task already.
1107 * Siblings must be also busy(in most cases) as they didn't already
1108 * pickup the extra load from this cpu and hence we need not check
1109 * sibling runqueue info. This will avoid the checks and cache miss
1110 * penalities associated with that.
1111 */
1112 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1113 return cpu;
1114
1115 for_each_domain(cpu, sd) {
1116 if ((sd->flags & SD_WAKE_IDLE)
1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1118 && !task_hot(p, task_rq->clock, sd))) {
1119 for_each_cpu_and(i, sched_domain_span(sd),
1120 &p->cpus_allowed) {
1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1122 if (i != task_cpu(p)) {
1123 schedstat_inc(p,
1124 se.nr_wakeups_idle);
1125 }
1126 return i;
1127 }
1128 }
1129 } else {
1130 break;
1131 }
1132 }
1133 return cpu;
1134}
1135#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1136static inline int wake_idle(int cpu, struct task_struct *p)
1137{
1138 return cpu;
1139}
1140#endif
1141
1142#ifdef CONFIG_SMP 1070#ifdef CONFIG_SMP
1143 1071
1144#ifdef CONFIG_FAIR_GROUP_SCHED 1072#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1225 1153
1226#endif 1154#endif
1227 1155
1228static int 1156static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1229wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1230 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1231 int idx, unsigned long load, unsigned long this_load,
1232 unsigned int imbalance)
1233{ 1157{
1234 struct task_struct *curr = this_rq->curr; 1158 struct task_struct *curr = current;
1235 struct task_group *tg; 1159 unsigned long this_load, load;
1236 unsigned long tl = this_load; 1160 int idx, this_cpu, prev_cpu;
1237 unsigned long tl_per_task; 1161 unsigned long tl_per_task;
1162 unsigned int imbalance;
1163 struct task_group *tg;
1238 unsigned long weight; 1164 unsigned long weight;
1239 int balanced; 1165 int balanced;
1240 1166
1241 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1167 idx = sd->wake_idx;
1242 return 0; 1168 this_cpu = smp_processor_id();
1169 prev_cpu = task_cpu(p);
1170 load = source_load(prev_cpu, idx);
1171 this_load = target_load(this_cpu, idx);
1243 1172
1244 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1173 if (sync) {
1245 p->se.avg_overlap > sysctl_sched_migration_cost)) 1174 if (sched_feat(SYNC_LESS) &&
1246 sync = 0; 1175 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1177 sync = 0;
1178 } else {
1179 if (sched_feat(SYNC_MORE) &&
1180 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1181 p->se.avg_overlap < sysctl_sched_migration_cost))
1182 sync = 1;
1183 }
1247 1184
1248 /* 1185 /*
1249 * If sync wakeup then subtract the (maximum possible) 1186 * If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1254 tg = task_group(current); 1191 tg = task_group(current);
1255 weight = current->se.load.weight; 1192 weight = current->se.load.weight;
1256 1193
1257 tl += effective_load(tg, this_cpu, -weight, -weight); 1194 this_load += effective_load(tg, this_cpu, -weight, -weight);
1258 load += effective_load(tg, prev_cpu, 0, -weight); 1195 load += effective_load(tg, prev_cpu, 0, -weight);
1259 } 1196 }
1260 1197
1261 tg = task_group(p); 1198 tg = task_group(p);
1262 weight = p->se.load.weight; 1199 weight = p->se.load.weight;
1263 1200
1201 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1202
1264 /* 1203 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1204 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have 1205 * due to the sync cause above having dropped this_load to 0, we'll
1267 * an imbalance, but there's really nothing you can do about that, so 1206 * always have an imbalance, but there's really nothing you can do
1268 * that's good too. 1207 * about that, so that's good too.
1269 * 1208 *
1270 * Otherwise check if either cpus are near enough in load to allow this 1209 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu. 1210 * task to be woken on this_cpu.
1272 */ 1211 */
1273 balanced = !tl || 1212 balanced = !this_load ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1213 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1214 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1276 1215
1277 /* 1216 /*
@@ -1285,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1285 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1224 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1286 tl_per_task = cpu_avg_load_per_task(this_cpu); 1225 tl_per_task = cpu_avg_load_per_task(this_cpu);
1287 1226
1288 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1227 if (balanced ||
1289 tl_per_task)) { 1228 (this_load <= load &&
1229 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1290 /* 1230 /*
1291 * This domain has SD_WAKE_AFFINE and 1231 * This domain has SD_WAKE_AFFINE and
1292 * p is cache cold in this domain, and 1232 * p is cache cold in this domain, and
1293 * there is no bad imbalance. 1233 * there is no bad imbalance.
1294 */ 1234 */
1295 schedstat_inc(this_sd, ttwu_move_affine); 1235 schedstat_inc(sd, ttwu_move_affine);
1296 schedstat_inc(p, se.nr_wakeups_affine); 1236 schedstat_inc(p, se.nr_wakeups_affine);
1297 1237
1298 return 1; 1238 return 1;
@@ -1300,65 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1300 return 0; 1240 return 0;
1301} 1241}
1302 1242
1303static int select_task_rq_fair(struct task_struct *p, int sync) 1243/*
1244 * find_idlest_group finds and returns the least busy CPU group within the
1245 * domain.
1246 */
1247static struct sched_group *
1248find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1249 int this_cpu, int load_idx)
1304{ 1250{
1305 struct sched_domain *sd, *this_sd = NULL; 1251 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1306 int prev_cpu, this_cpu, new_cpu; 1252 unsigned long min_load = ULONG_MAX, this_load = 0;
1307 unsigned long load, this_load; 1253 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1308 struct rq *this_rq;
1309 unsigned int imbalance;
1310 int idx;
1311 1254
1312 prev_cpu = task_cpu(p); 1255 do {
1313 this_cpu = smp_processor_id(); 1256 unsigned long load, avg_load;
1314 this_rq = cpu_rq(this_cpu); 1257 int local_group;
1315 new_cpu = prev_cpu; 1258 int i;
1316 1259
1317 /* 1260 /* Skip over this group if it has no CPUs allowed */
1318 * 'this_sd' is the first domain that both 1261 if (!cpumask_intersects(sched_group_cpus(group),
1319 * this_cpu and prev_cpu are present in: 1262 &p->cpus_allowed))
1320 */ 1263 continue;
1321 for_each_domain(this_cpu, sd) { 1264
1322 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1265 local_group = cpumask_test_cpu(this_cpu,
1323 this_sd = sd; 1266 sched_group_cpus(group));
1324 break; 1267
1268 /* Tally up the load of all CPUs in the group */
1269 avg_load = 0;
1270
1271 for_each_cpu(i, sched_group_cpus(group)) {
1272 /* Bias balancing toward cpus of our domain */
1273 if (local_group)
1274 load = source_load(i, load_idx);
1275 else
1276 load = target_load(i, load_idx);
1277
1278 avg_load += load;
1279 }
1280
1281 /* Adjust by relative CPU power of the group */
1282 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1283
1284 if (local_group) {
1285 this_load = avg_load;
1286 this = group;
1287 } else if (avg_load < min_load) {
1288 min_load = avg_load;
1289 idlest = group;
1290 }
1291 } while (group = group->next, group != sd->groups);
1292
1293 if (!idlest || 100*this_load < imbalance*min_load)
1294 return NULL;
1295 return idlest;
1296}
1297
1298/*
1299 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1300 */
1301static int
1302find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1303{
1304 unsigned long load, min_load = ULONG_MAX;
1305 int idlest = -1;
1306 int i;
1307
1308 /* Traverse only the allowed CPUs */
1309 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1310 load = weighted_cpuload(i);
1311
1312 if (load < min_load || (load == min_load && i == this_cpu)) {
1313 min_load = load;
1314 idlest = i;
1325 } 1315 }
1326 } 1316 }
1327 1317
1328 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1318 return idlest;
1329 goto out; 1319}
1330 1320
1331 /* 1321/*
1332 * Check for affine wakeup and passive balancing possibilities. 1322 * sched_balance_self: balance the current task (running on cpu) in domains
1333 */ 1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1334 if (!this_sd) 1324 * SD_BALANCE_EXEC.
1325 *
1326 * Balance, ie. select the least loaded group.
1327 *
1328 * Returns the target CPU number, or the same CPU if no balancing is needed.
1329 *
1330 * preempt must be disabled.
1331 */
1332static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1333{
1334 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1335 int cpu = smp_processor_id();
1336 int prev_cpu = task_cpu(p);
1337 int new_cpu = cpu;
1338 int want_affine = 0;
1339 int want_sd = 1;
1340 int sync = wake_flags & WF_SYNC;
1341
1342 if (sd_flag & SD_BALANCE_WAKE) {
1343 if (sched_feat(AFFINE_WAKEUPS) &&
1344 cpumask_test_cpu(cpu, &p->cpus_allowed))
1345 want_affine = 1;
1346 new_cpu = prev_cpu;
1347 }
1348
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) {
1351 /*
1352 * If power savings logic is enabled for a domain, see if we
1353 * are not overloaded, if so, don't balance wider.
1354 */
1355 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1356 unsigned long power = 0;
1357 unsigned long nr_running = 0;
1358 unsigned long capacity;
1359 int i;
1360
1361 for_each_cpu(i, sched_domain_span(tmp)) {
1362 power += power_of(i);
1363 nr_running += cpu_rq(i)->cfs.nr_running;
1364 }
1365
1366 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1367
1368 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1369 nr_running /= 2;
1370
1371 if (nr_running < capacity)
1372 want_sd = 0;
1373 }
1374
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1377
1378 affine_sd = tmp;
1379 want_affine = 0;
1380 }
1381
1382 if (!want_sd && !want_affine)
1383 break;
1384
1385 if (!(tmp->flags & sd_flag))
1386 continue;
1387
1388 if (want_sd)
1389 sd = tmp;
1390 }
1391
1392 if (sched_feat(LB_SHARES_UPDATE)) {
1393 /*
1394 * Pick the largest domain to update shares over
1395 */
1396 tmp = sd;
1397 if (affine_sd && (!tmp ||
1398 cpumask_weight(sched_domain_span(affine_sd)) >
1399 cpumask_weight(sched_domain_span(sd))))
1400 tmp = affine_sd;
1401
1402 if (tmp)
1403 update_shares(tmp);
1404 }
1405
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1407 new_cpu = cpu;
1335 goto out; 1408 goto out;
1409 }
1336 1410
1337 idx = this_sd->wake_idx; 1411 while (sd) {
1412 int load_idx = sd->forkexec_idx;
1413 struct sched_group *group;
1414 int weight;
1338 1415
1339 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1416 if (!(sd->flags & sd_flag)) {
1417 sd = sd->child;
1418 continue;
1419 }
1340 1420
1341 load = source_load(prev_cpu, idx); 1421 if (sd_flag & SD_BALANCE_WAKE)
1342 this_load = target_load(this_cpu, idx); 1422 load_idx = sd->wake_idx;
1343 1423
1344 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1424 group = find_idlest_group(sd, p, cpu, load_idx);
1345 load, this_load, imbalance)) 1425 if (!group) {
1346 return this_cpu; 1426 sd = sd->child;
1427 continue;
1428 }
1347 1429
1348 /* 1430 new_cpu = find_idlest_cpu(group, p, cpu);
1349 * Start passive balancing when half the imbalance_pct 1431 if (new_cpu == -1 || new_cpu == cpu) {
1350 * limit is reached. 1432 /* Now try balancing at a lower domain level of cpu */
1351 */ 1433 sd = sd->child;
1352 if (this_sd->flags & SD_WAKE_BALANCE) { 1434 continue;
1353 if (imbalance*this_load <= 100*load) { 1435 }
1354 schedstat_inc(this_sd, ttwu_move_balance); 1436
1355 schedstat_inc(p, se.nr_wakeups_passive); 1437 /* Now try balancing at a lower domain level of new_cpu */
1356 return this_cpu; 1438 cpu = new_cpu;
1439 weight = cpumask_weight(sched_domain_span(sd));
1440 sd = NULL;
1441 for_each_domain(cpu, tmp) {
1442 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1443 break;
1444 if (tmp->flags & sd_flag)
1445 sd = tmp;
1357 } 1446 }
1447 /* while loop will break here if sd == NULL */
1358 } 1448 }
1359 1449
1360out: 1450out:
1361 return wake_idle(new_cpu, p); 1451 rcu_read_unlock();
1452 return new_cpu;
1362} 1453}
1363#endif /* CONFIG_SMP */ 1454#endif /* CONFIG_SMP */
1364 1455
@@ -1471,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
1471/* 1562/*
1472 * Preempt the current task with a newly woken task if needed: 1563 * Preempt the current task with a newly woken task if needed:
1473 */ 1564 */
1474static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1565static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1475{ 1566{
1476 struct task_struct *curr = rq->curr; 1567 struct task_struct *curr = rq->curr;
1477 struct sched_entity *se = &curr->se, *pse = &p->se; 1568 struct sched_entity *se = &curr->se, *pse = &p->se;
1478 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC;
1479 1571
1480 update_curr(cfs_rq); 1572 update_curr(cfs_rq);
1481 1573
@@ -1501,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1501 */ 1593 */
1502 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1503 set_last_buddy(se); 1595 set_last_buddy(se);
1504 set_next_buddy(pse); 1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse);
1505 1598
1506 /* 1599 /*
1507 * We can come here with TIF_NEED_RESCHED already set from new task 1600 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1523 return; 1616 return;
1524 } 1617 }
1525 1618
1526 if (!sched_feat(WAKEUP_PREEMPT)) 1619 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1527 return; 1620 (sched_feat(WAKEUP_OVERLAP) &&
1528 1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1529 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1530 (se->avg_overlap < sysctl_sched_migration_cost &&
1531 pse->avg_overlap < sysctl_sched_migration_cost))) {
1532 resched_task(curr); 1623 resched_task(curr);
1533 return; 1624 return;
1534 } 1625 }
1535 1626
1627 if (sched_feat(WAKEUP_RUNNING)) {
1628 if (pse->avg_running < se->avg_running) {
1629 set_next_buddy(pse);
1630 resched_task(curr);
1631 return;
1632 }
1633 }
1634
1635 if (!sched_feat(WAKEUP_PREEMPT))
1636 return;
1637
1536 find_matching_se(&se, &pse); 1638 find_matching_se(&se, &pse);
1537 1639
1538 BUG_ON(!pse); 1640 BUG_ON(!pse);
@@ -1555,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1555 /* 1657 /*
1556 * If se was a buddy, clear it so that it will have to earn 1658 * If se was a buddy, clear it so that it will have to earn
1557 * the favour again. 1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was eligible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1558 */ 1665 */
1559 __clear_buddies(cfs_rq, se); 1666 __clear_buddies(cfs_rq, NULL);
1560 set_next_entity(cfs_rq, se); 1667 set_next_entity(cfs_rq, se);
1561 cfs_rq = group_cfs_rq(se); 1668 cfs_rq = group_cfs_rq(se);
1562 } while (cfs_rq); 1669 } while (cfs_rq);
@@ -1832,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
1832} 1939}
1833#endif 1940#endif
1834 1941
1942unsigned int get_rr_interval_fair(struct task_struct *task)
1943{
1944 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0;
1948
1949 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue:
1952 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957
1958 return rr_interval;
1959}
1960
1835/* 1961/*
1836 * All the scheduling class methods: 1962 * All the scheduling class methods:
1837 */ 1963 */
@@ -1860,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
1860 .prio_changed = prio_changed_fair, 1986 .prio_changed = prio_changed_fair,
1861 .switched_to = switched_to_fair, 1987 .switched_to = switched_to_fair,
1862 1988
1989 .get_rr_interval = get_rr_interval_fair,
1990
1863#ifdef CONFIG_FAIR_GROUP_SCHED 1991#ifdef CONFIG_FAIR_GROUP_SCHED
1864 .moved_group = moved_group_fair, 1992 .moved_group = moved_group_fair,
1865#endif 1993#endif
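select_task_rq_fair() above replaces wake_idle() and the old passive-balancing path with a two-stage walk: wake_affine() first decides whether the waking CPU is an acceptable target, otherwise the code descends the domain hierarchy, at each level choosing the least-loaded group via find_idlest_group() and the least-loaded allowed CPU inside it via find_idlest_cpu(). The following self-contained model keeps only that selection skeleton; real load indices, cpu_power scaling and cpumask handling are deliberately left out, and the load numbers are made up:

#include <stdio.h>

#define NR_CPUS 8
#define NR_GROUPS 2
#define CPUS_PER_GROUP (NR_CPUS / NR_GROUPS)

static unsigned long cpu_load[NR_CPUS] = { 90, 80, 70, 60, 10, 20, 95, 5 };
static int cpu_allowed[NR_CPUS]        = {  1,  1,  1,  1,  1,  1,  1, 0 };

/* Stage 1: least loaded group (sum of its CPUs' load). */
static int find_idlest_group(void)
{
	unsigned long best = ~0UL;
	int idlest = 0;

	for (int g = 0; g < NR_GROUPS; g++) {
		unsigned long load = 0;
		for (int c = 0; c < CPUS_PER_GROUP; c++)
			load += cpu_load[g * CPUS_PER_GROUP + c];
		if (load < best) {
			best = load;
			idlest = g;
		}
	}
	return idlest;
}

/* Stage 2: least loaded CPU inside that group that the task may run on. */
static int find_idlest_cpu(int group)
{
	unsigned long best = ~0UL;
	int idlest = -1;

	for (int c = 0; c < CPUS_PER_GROUP; c++) {
		int cpu = group * CPUS_PER_GROUP + c;
		if (cpu_allowed[cpu] && cpu_load[cpu] < best) {
			best = cpu_load[cpu];
			idlest = cpu;
		}
	}
	return idlest;
}

int main(void)
{
	int group = find_idlest_group();
	printf("group %d, cpu %d\n", group, find_idlest_cpu(group));
	return 0;
}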
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e2dc63a5815d..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effectively
17 * longer period, and lighter tasks an effectively shorter period during
18 * which they are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint; pipes and the like use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot, decreases the likelihood of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
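Each SCHED_FEAT(name, default) line above is consumed elsewhere (in kernel/sched.c, outside this diff) by including the header several times with different definitions of SCHED_FEAT, which is how one list can yield an enum of bit positions, the default feature mask and the name strings behind the runtime sched_features toggle. A minimal standalone demonstration of that X-macro pattern; the feature list in the sketch is invented and only the expansion technique is the point:

#include <stdio.h>

/* The "header": one line per feature, exactly like sched_features.h. */
#define FEATURE_LIST(F)        \
	F(FAIR_SLEEPERS, 1)    \
	F(NEXT_BUDDY, 0)       \
	F(LAST_BUDDY, 1)

/* Expansion 1: bit positions. */
enum {
#define F(name, enabled) FEAT_##name,
	FEATURE_LIST(F)
#undef F
	NR_FEATURES
};

/* Expansion 2: default enable mask. */
static unsigned int features =
#define F(name, enabled) ((enabled) << FEAT_##name) |
	FEATURE_LIST(F)
#undef F
	0;

/* Expansion 3: names, for a sched_features-style listing. */
static const char *feat_names[] = {
#define F(name, enabled) #name,
	FEATURE_LIST(F)
#undef F
};

#define sched_feat(x) (features & (1U << FEAT_##x))

int main(void)
{
	for (int i = 0; i < NR_FEATURES; i++)
		printf("%s%s\n", (features & (1U << i)) ? "" : "NO_", feat_names[i]);
	printf("NEXT_BUDDY on? %d\n", !!sched_feat(NEXT_BUDDY));
	return 0;
}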
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 11 return task_cpu(p); /* IDLE tasks as never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task)
101{
102 return 0;
103}
104
100/* 105/*
101 * Simple, special scheduling class for the per-CPU idle tasks: 106 * Simple, special scheduling class for the per-CPU idle tasks:
102 */ 107 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
122 .set_curr_task = set_curr_task_idle, 127 .set_curr_task = set_curr_task_idle,
123 .task_tick = task_tick_idle, 128 .task_tick = task_tick_idle,
124 129
130 .get_rr_interval = get_rr_interval_idle,
131
125 .prio_changed = prio_changed_idle, 132 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle, 133 .switched_to = switched_to_idle,
127 134
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526c..a4d790cddb19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
940 940
941static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
942{ 942{
943 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
944 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
945 /* 948 /*
946 * If the current task is an RT task, then 949 * If the current task is an RT task, then
947 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
999/* 1002/*
1000 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
1001 */ 1004 */
1002static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1003{ 1006{
1004 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
1005 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1731,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
1731 dequeue_pushable_task(rq, p); 1734 dequeue_pushable_task(rq, p);
1732} 1735}
1733 1736
1737unsigned int get_rr_interval_rt(struct task_struct *task)
1738{
1739 /*
1740 * Time slice is 0 for SCHED_FIFO tasks
1741 */
1742 if (task->policy == SCHED_RR)
1743 return DEF_TIMESLICE;
1744 else
1745 return 0;
1746}
1747
1734static const struct sched_class rt_sched_class = { 1748static const struct sched_class rt_sched_class = {
1735 .next = &fair_sched_class, 1749 .next = &fair_sched_class,
1736 .enqueue_task = enqueue_task_rt, 1750 .enqueue_task = enqueue_task_rt,
@@ -1759,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
1759 .set_curr_task = set_curr_task_rt, 1773 .set_curr_task = set_curr_task_rt,
1760 .task_tick = task_tick_rt, 1774 .task_tick = task_tick_rt,
1761 1775
1776 .get_rr_interval = get_rr_interval_rt,
1777
1762 .prio_changed = prio_changed_rt, 1778 .prio_changed = prio_changed_rt,
1763 .switched_to = switched_to_rt, 1779 .switched_to = switched_to_rt,
1764}; 1780};
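Two small RT-class changes sit above: select_task_rq_rt() now only runs its wakeup placement logic for SD_BALANCE_WAKE and otherwise keeps the task on the current CPU, and get_rr_interval_rt() encodes the classic rule that only SCHED_RR has a timeslice while SCHED_FIFO runs until it blocks or is preempted. DEF_TIMESLICE itself is not defined in this hunk; the HZ and timeslice values below are illustrative only:

#include <stdio.h>

#define HZ 250
#define DEF_TIMESLICE (100 * HZ / 1000)   /* roughly 100 ms worth of jiffies */

enum policy { SCHED_FIFO, SCHED_RR };

/* Mirrors get_rr_interval_rt(): RR round-robins, FIFO has no timeslice. */
static unsigned int rt_rr_interval(enum policy p)
{
	return p == SCHED_RR ? DEF_TIMESLICE : 0;
}

int main(void)
{
	printf("SCHED_RR:   %u jiffies\n", rt_rr_interval(SCHED_RR));
	printf("SCHED_FIFO: %u jiffies\n", rt_rr_interval(SCHED_FIFO));
	return 0;
}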
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..6705320784fd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 705
706 if (why) { 706 if (why) {
707 /* 707 /*
708 * The first thread which returns from finish_stop() 708 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 709 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 710 * notify its parent. See get_signal_to_deliver().
711 */ 711 */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 971 return send_signal(sig, info, t, 0);
972} 972}
973 973
974int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
975 bool group)
976{
977 unsigned long flags;
978 int ret = -ESRCH;
979
980 if (lock_task_sighand(p, &flags)) {
981 ret = send_signal(sig, info, p, group);
982 unlock_task_sighand(p, &flags);
983 }
984
985 return ret;
986}
987
974/* 988/*
975 * Force a signal that the process can't ignore: if necessary 989 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 990 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1050 }
1037} 1051}
1038 1052
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1053struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1054{
1047 struct sighand_struct *sighand; 1055 struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1076 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1077int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1078{
1071 unsigned long flags; 1079 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073 1080
1074 ret = check_kill_permission(sig, info, p); 1081 if (!ret && sig)
1075 1082 ret = do_send_sig_info(sig, info, p, true);
1076 if (!ret && sig) {
1077 ret = -ESRCH;
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1083
1084 return ret; 1084 return ret;
1085} 1085}
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1224 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1225 */
1226 1226
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1227int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1228send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1229{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1230 /*
1237 * Make sure legacy kernel users don't send in bad values 1231 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1232 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1234 if (!valid_signal(sig))
1241 return -EINVAL; 1235 return -EINVAL;
1242 1236
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1237 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1238}
1248 1239
1249#define __si_special(priv) \ 1240#define __si_special(priv) \
@@ -1383,15 +1374,6 @@ ret:
1383} 1374}
1384 1375
1385/* 1376/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1377 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1378 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1379 *
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1655 spin_unlock_irq(&current->sighand->siglock);
1674} 1656}
1675 1657
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1658/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1659 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1660 * We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1664static int do_signal_stop(int signr)
1706{ 1665{
1707 struct signal_struct *sig = current->signal; 1666 struct signal_struct *sig = current->signal;
1708 int stop_count; 1667 int notify;
1709 1668
1710 if (sig->group_stop_count > 0) { 1669 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1670 struct task_struct *t;
1718 1671
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1672 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
1725 */ 1678 */
1726 sig->group_exit_code = signr; 1679 sig->group_exit_code = signr;
1727 1680
1728 stop_count = 0; 1681 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1682 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1683 /*
1731 * Setting state to TASK_STOPPED for a group 1684 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
1734 */ 1687 */
1735 if (!(t->flags & PF_EXITING) && 1688 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1689 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1690 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1691 signal_wake_up(t, 0);
1739 } 1692 }
1740 sig->group_stop_count = stop_count;
1741 } 1693 }
1694 /*
1695 * If there are no other threads in the group, or if there is
1696 * a group stop in progress and we are the last to stop, report
1697 * to the parent. When ptraced, every thread reports itself.
1698 */
1699 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1700 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1701 /*
1702 * tracehook_notify_jctl() can drop and reacquire siglock, so
1703 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1704 * or SIGKILL comes in between ->group_stop_count == 0.
1705 */
1706 if (sig->group_stop_count) {
1707 if (!--sig->group_stop_count)
1708 sig->flags = SIGNAL_STOP_STOPPED;
1709 current->exit_code = sig->group_exit_code;
1710 __set_current_state(TASK_STOPPED);
1711 }
1712 spin_unlock_irq(&current->sighand->siglock);
1742 1713
1743 if (stop_count == 0) 1714 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1715 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1716 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1717 read_unlock(&tasklist_lock);
1718 }
1719
1720 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1721 do {
1722 schedule();
1723 } while (try_to_freeze());
1724
1725 tracehook_finish_jctl();
1726 current->exit_code = 0;
1747 1727
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1728 return 1;
1751} 1729}
1752 1730
@@ -1815,14 +1793,15 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1793 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1794 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1795 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1796
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1797 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1798 spin_unlock_irq(&sighand->siglock);
1822 1799
1823 read_lock(&tasklist_lock); 1800 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1801 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1802 do_notify_parent_cldstop(current->group_leader, why);
1803 read_unlock(&tasklist_lock);
1804 }
1826 goto relock; 1805 goto relock;
1827 } 1806 }
1828 1807
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 1966 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 1967 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 1968 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 1969 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 1970 }
1992out: 1971out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 1972 spin_unlock_irq(&tsk->sighand->siglock);
1994 1973
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 1974 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 1975 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1976 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 1977 read_unlock(&tasklist_lock);
1999 } 1978 }
2000} 1979}
@@ -2290,7 +2269,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2269do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2270{
2292 struct task_struct *p; 2271 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2272 int error = -ESRCH;
2295 2273
2296 rcu_read_lock(); 2274 rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2278 /*
2301 * The null signal is a permissions and process existence 2279 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2280 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2281 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2282 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2283 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2284 /*
2285 * If lock_task_sighand() failed we pretend the task
2286 * dies after receiving the signal. The window is tiny,
2287 * and the signal is private anyway.
2288 */
2289 if (unlikely(error == -ESRCH))
2290 error = 0;
2311 } 2291 }
2312 } 2292 }
2313 rcu_read_unlock(); 2293 rcu_read_unlock();
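
The do_signal_stop() rework above folds the old finish_stop() helper into its caller: the initiating thread counts the group members that still have to stop, every stopping thread decrements ->group_stop_count under siglock, and whichever thread brings the count to zero reports CLD_STOPPED to the parent before scheduling away. A minimal userspace sketch of that "last thread to stop notifies" pattern, using plain pthreads and made-up names (this is an illustration, not the kernel code itself):

/*
 * Hypothetical illustration of the do_signal_stop() counting scheme:
 * decrement a shared counter under a lock; only the thread that takes
 * it to zero performs the notification.
 */
#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int group_stop_count = NTHREADS;		/* threads still to stop */

static void *worker(void *arg)
{
        int last;

        pthread_mutex_lock(&lock);
        last = (--group_stop_count == 0);	/* decrement under the lock */
        pthread_mutex_unlock(&lock);

        if (last)				/* only the final stopper notifies */
                printf("thread %ld: group stop complete, notifying parent\n",
                       (long)arg);
        return NULL;
}

int main(void)
{
        pthread_t tid[NTHREADS];
        long i;

        for (i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, worker, (void *)i);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}
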
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..0d31135efbf4 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 26static void slow_work_oom_timeout(unsigned long);
27 27
28#ifdef CONFIG_SYSCTL 28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 29static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
31 31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 32static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 33 void __user *, size_t *, loff_t *);
34#endif 34#endif
35 35
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 493 * Handle adjustment of the minimum number of threads
494 */ 494 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 496 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 497 size_t *lenp, loff_t *ppos)
498{ 498{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 499 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 500 int n;
501 501
502 if (ret == 0) { 502 if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 521 * Handle adjustment of the maximum number of threads
522 */ 522 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 524 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 525 size_t *lenp, loff_t *ppos)
526{ 526{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 527 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 528 int n;
529 529
530 if (ret == 0) { 530 if (ret == 0) {
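
The slow-work.c hunks are part of the tree-wide change (continued in kernel/sysctl.c below) that drops the unused struct file * argument from proc handlers; both slow-work handlers now call proc_dointvec_minmax() with five arguments. A hedged sketch of a custom handler written against the new prototype; the "example_threads" knob and the reaction to a write are invented for illustration:

/*
 * Sketch only: a custom proc handler using the post-change signature
 * (struct ctl_table *, int, void __user *, size_t *, loff_t *).
 */
#include <linux/kernel.h>
#include <linux/sysctl.h>

static int example_threads = 4;

static int example_threads_sysctl(struct ctl_table *table, int write,
                                  void __user *buffer,
                                  size_t *lenp, loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (ret == 0 && write) {
                /* react to the new value, e.g. resize a worker pool */
                pr_info("example_threads is now %d\n", example_threads);
        }
        return ret;
}

The corresponding ctl_table entry would point .data at example_threads and .proc_handler at this function; a full table sketch follows the sysctl.c diff below.
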
diff --git a/kernel/smp.c b/kernel/smp.c
index 94188b8ecc33..c9d1c7835c2f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,8 +29,7 @@ enum {
29 29
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data csd; 31 struct call_single_data csd;
32 spinlock_t lock; 32 atomic_t refs;
33 unsigned int refs;
34 cpumask_var_t cpumask; 33 cpumask_var_t cpumask;
35}; 34};
36 35
@@ -39,9 +38,7 @@ struct call_single_queue {
39 spinlock_t lock; 38 spinlock_t lock;
40}; 39};
41 40
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45 42
46static int 43static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 174 int cpu = get_cpu();
178 175
179 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online.
178 */
179 WARN_ON_ONCE(!cpu_online(cpu));
180
181 /*
180 * Ensure entry is visible on call_function_queue after we have 182 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 183 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 184 * If we don't have this, then we may miss an entry on the list
@@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 193 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
192 int refs; 194 int refs;
193 195
194 spin_lock(&data->lock); 196 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
197 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
201 198
202 data->csd.func(data->csd.info); 199 data->csd.func(data->csd.info);
203 200
204 spin_lock(&data->lock); 201 refs = atomic_dec_return(&data->refs);
205 WARN_ON(data->refs == 0); 202 WARN_ON(refs < 0);
206 refs = --data->refs;
207 if (!refs) { 203 if (!refs) {
208 spin_lock(&call_function.lock); 204 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock); 206 spin_unlock(&call_function.lock);
211 } 207 }
212 spin_unlock(&data->lock);
213 208
214 if (refs) 209 if (refs)
215 continue; 210 continue;
@@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 225 unsigned int data_flags;
231 LIST_HEAD(list); 226 LIST_HEAD(list);
232 227
228 /*
229 * Shouldn't receive this interrupt on a cpu that is not yet online.
230 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232
233 spin_lock(&q->lock); 233 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 234 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 235 spin_unlock(&q->lock);
@@ -285,8 +285,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 285 */
286 this_cpu = get_cpu(); 286 this_cpu = get_cpu();
287 287
288 /* Can deadlock when called with interrupts disabled */ 288 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 289 * Can deadlock when called with interrupts disabled.
290 * We allow cpu's that are not yet online though, as no one else can
291 * send smp call function interrupt to this cpu and as such deadlocks
292 * can't happen.
293 */
294 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
295 && !oops_in_progress);
290 296
291 if (cpu == this_cpu) { 297 if (cpu == this_cpu) {
292 local_irq_save(flags); 298 local_irq_save(flags);
@@ -329,19 +335,18 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 335{
330 csd_lock(data); 336 csd_lock(data);
331 337
332 /* Can deadlock when called with interrupts disabled */ 338 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 339 * Can deadlock when called with interrupts disabled.
340 * We allow cpu's that are not yet online though, as no one else can
341 * send smp call function interrupt to this cpu and as such deadlocks
342 * can't happen.
343 */
344 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
345 && !oops_in_progress);
334 346
335 generic_exec_single(cpu, data, wait); 347 generic_exec_single(cpu, data, wait);
336} 348}
337 349
338/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339
340#ifndef arch_send_call_function_ipi_mask
341# define arch_send_call_function_ipi_mask(maskp) \
342 arch_send_call_function_ipi(*(maskp))
343#endif
344
345/** 350/**
346 * smp_call_function_many(): Run a function on a set of other CPUs. 351 * smp_call_function_many(): Run a function on a set of other CPUs.
347 * @mask: The set of cpus to run on (only runs on online subset). 352 * @mask: The set of cpus to run on (only runs on online subset).
@@ -365,8 +370,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 370 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 371 int cpu, next_cpu, this_cpu = smp_processor_id();
367 372
368 /* Can deadlock when called with interrupts disabled */ 373 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 374 * Can deadlock when called with interrupts disabled.
375 * We allow cpu's that are not yet online though, as no one else can
376 * send smp call function interrupt to this cpu and as such deadlocks
377 * can't happen.
378 */
379 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
380 && !oops_in_progress);
370 381
371 /* So, what's a CPU they want? Ignoring this one. */ 382 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 383 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -391,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask,
391 data = &__get_cpu_var(cfd_data); 402 data = &__get_cpu_var(cfd_data);
392 csd_lock(&data->csd); 403 csd_lock(&data->csd);
393 404
394 spin_lock_irqsave(&data->lock, flags);
395 data->csd.func = func; 405 data->csd.func = func;
396 data->csd.info = info; 406 data->csd.info = info;
397 cpumask_and(data->cpumask, mask, cpu_online_mask); 407 cpumask_and(data->cpumask, mask, cpu_online_mask);
398 cpumask_clear_cpu(this_cpu, data->cpumask); 408 cpumask_clear_cpu(this_cpu, data->cpumask);
399 data->refs = cpumask_weight(data->cpumask); 409 atomic_set(&data->refs, cpumask_weight(data->cpumask));
400 410
401 spin_lock(&call_function.lock); 411 spin_lock_irqsave(&call_function.lock, flags);
402 /* 412 /*
403 * Place entry at the _HEAD_ of the list, so that any cpu still 413 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt() 414 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries: 415 * will not miss any other list entries:
406 */ 416 */
407 list_add_rcu(&data->csd.list, &call_function.queue); 417 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock); 418 spin_unlock_irqrestore(&call_function.lock, flags);
409
410 spin_unlock_irqrestore(&data->lock, flags);
411 419
412 /* 420 /*
413 * Make the list addition visible before sending the ipi. 421 * Make the list addition visible before sending the ipi.
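
The smp.c conversion above replaces the per-entry spinlock in struct call_function_data with an atomic_t reference count: each target CPU removes itself with cpumask_test_and_clear_cpu(), runs the callback, and drops its reference via atomic_dec_return(); the CPU that releases the last reference unlinks the entry from call_function.queue. The following standalone sketch shows the same "last reference tears down" idiom with C11 atomics instead of the kernel's atomic_t (names are illustrative, not the kernel implementation):

/*
 * Minimal C11 analogue of the atomic_dec_return() refcounting used in
 * generic_smp_call_function_interrupt().  All names are made up.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct call_data {
        atomic_int refs;                /* one reference per pending "CPU" */
        void (*func)(void *info);
        void *info;
};

static void hello(void *info)
{
        printf("callback ran for %s\n", (const char *)info);
}

/* Called once per consumer of the entry. */
static void consume(struct call_data *data)
{
        int refs;

        data->func(data->info);
        refs = atomic_fetch_sub(&data->refs, 1) - 1;    /* like atomic_dec_return() */
        if (refs == 0)
                free(data);             /* last consumer frees the entry */
}

int main(void)
{
        struct call_data *data = malloc(sizeof(*data));
        int cpu;

        atomic_init(&data->refs, 3);
        data->func = hello;
        data->info = "cpu batch";
        for (cpu = 0; cpu < 3; cpu++)
                consume(data);
        return 0;
}
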
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7db25067cd2d..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..255475d163e0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/resource.h> 18#include <linux/resource.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338 unsigned long flags; 1338 unsigned long flags;
1339 cputime_t utime, stime; 1339 cputime_t utime, stime;
1340 struct task_cputime cputime; 1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0;
1341 1342
1342 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1343 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1346 utime = task_utime(current); 1347 utime = task_utime(current);
1347 stime = task_stime(current); 1348 stime = task_stime(current);
1348 accumulate_thread_rusage(p, r); 1349 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss;
1349 goto out; 1351 goto out;
1350 } 1352 }
1351 1353
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1363 r->ru_majflt = p->signal->cmaj_flt; 1365 r->ru_majflt = p->signal->cmaj_flt;
1364 r->ru_inblock = p->signal->cinblock; 1366 r->ru_inblock = p->signal->cinblock;
1365 r->ru_oublock = p->signal->coublock; 1367 r->ru_oublock = p->signal->coublock;
1368 maxrss = p->signal->cmaxrss;
1366 1369
1367 if (who == RUSAGE_CHILDREN) 1370 if (who == RUSAGE_CHILDREN)
1368 break; 1371 break;
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1377 r->ru_majflt += p->signal->maj_flt; 1380 r->ru_majflt += p->signal->maj_flt;
1378 r->ru_inblock += p->signal->inblock; 1381 r->ru_inblock += p->signal->inblock;
1379 r->ru_oublock += p->signal->oublock; 1382 r->ru_oublock += p->signal->oublock;
1383 if (maxrss < p->signal->maxrss)
1384 maxrss = p->signal->maxrss;
1380 t = p; 1385 t = p;
1381 do { 1386 do {
1382 accumulate_thread_rusage(t, r); 1387 accumulate_thread_rusage(t, r);
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1392out: 1397out:
1393 cputime_to_timeval(utime, &r->ru_utime); 1398 cputime_to_timeval(utime, &r->ru_utime);
1394 cputime_to_timeval(stime, &r->ru_stime); 1399 cputime_to_timeval(stime, &r->ru_stime);
1400
1401 if (who != RUSAGE_CHILDREN) {
1402 struct mm_struct *mm = get_task_mm(p);
1403 if (mm) {
1404 setmax_mm_hiwater_rss(&maxrss, mm);
1405 mmput(mm);
1406 }
1407 }
1408 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1395} 1409}
1396 1410
1397int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1411int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1511 case PR_SET_TSC: 1525 case PR_SET_TSC:
1512 error = SET_TSC_CTL(arg2); 1526 error = SET_TSC_CTL(arg2);
1513 break; 1527 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE: 1528 case PR_TASK_PERF_EVENTS_DISABLE:
1515 error = perf_counter_task_disable(); 1529 error = perf_event_task_disable();
1516 break; 1530 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE: 1531 case PR_TASK_PERF_EVENTS_ENABLE:
1518 error = perf_counter_task_enable(); 1532 error = perf_event_task_enable();
1519 break; 1533 break;
1520 case PR_GET_TIMERSLACK: 1534 case PR_GET_TIMERSLACK:
1521 error = current->timer_slack_ns; 1535 error = current->timer_slack_ns;
@@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1528 current->timer_slack_ns = arg2; 1542 current->timer_slack_ns = arg2;
1529 error = 0; 1543 error = 0;
1530 break; 1544 break;
1545 case PR_MCE_KILL:
1546 if (arg4 | arg5)
1547 return -EINVAL;
1548 switch (arg2) {
1549 case 0:
1550 if (arg3 != 0)
1551 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS;
1553 break;
1554 case 1:
1555 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0)
1557 current->flags |= PF_MCE_EARLY;
1558 else
1559 current->flags &= ~PF_MCE_EARLY;
1560 break;
1561 default:
1562 return -EINVAL;
1563 }
1564 error = 0;
1565 break;
1566
1531 default: 1567 default:
1532 error = -EINVAL; 1568 error = -EINVAL;
1533 break; 1569 break;
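
Two user-visible additions sit in the sys.c hunks: k_getrusage() now reports peak RSS through ru_maxrss (taken from signal->maxrss / cmaxrss and the mm hiwater RSS, converted to kilobytes), and prctl() gains PR_MCE_KILL for toggling the PF_MCE_PROCESS and PF_MCE_EARLY flags. A small userspace probe of the getrusage() side, using only standard calls (the printed value is obviously system dependent):

/* Read back the new ru_maxrss accounting from userspace. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>

int main(void)
{
        struct rusage ru;
        char *buf = malloc(16 << 20);           /* touch ~16 MiB to grow the RSS */

        if (buf)
                memset(buf, 1, 16 << 20);
        if (getrusage(RUSAGE_SELF, &ru) == 0)
                printf("peak RSS: %ld kB\n", ru.ru_maxrss);
        free(buf);
        return 0;
}

Per the argument handling above, a process would opt into early machine-check killing with something like prctl(PR_MCE_KILL, 1, 1, 0, 0) and revert to the default with prctl(PR_MCE_KILL, 0, 0, 0, 0).
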
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..e06d0b8d1951 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(compat_sys_recvmsg); 51cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom);
52cond_syscall(sys_socketcall); 53cond_syscall(sys_socketcall);
53cond_syscall(sys_futex); 54cond_syscall(sys_futex);
54cond_syscall(compat_sys_futex); 55cond_syscall(compat_sys_futex);
@@ -177,4 +178,4 @@ cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 178cond_syscall(sys_eventfd2);
178 179
179/* performance counters: */ 180/* performance counters: */
180cond_syscall(sys_perf_counter_open); 181cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6bb59f707402..0d949c517412 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -50,7 +49,7 @@
50#include <linux/reboot.h> 49#include <linux/reboot.h>
51#include <linux/ftrace.h> 50#include <linux/ftrace.h>
52#include <linux/slow-work.h> 51#include <linux/slow-work.h>
53#include <linux/perf_counter.h> 52#include <linux/perf_event.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/processor.h> 55#include <asm/processor.h>
@@ -77,6 +76,7 @@ extern int max_threads;
77extern int core_uses_pid; 76extern int core_uses_pid;
78extern int suid_dumpable; 77extern int suid_dumpable;
79extern char core_pattern[]; 78extern char core_pattern[];
79extern unsigned int core_pipe_limit;
80extern int pid_max; 80extern int pid_max;
81extern int min_free_kbytes; 81extern int min_free_kbytes;
82extern int pid_max_min, pid_max_max; 82extern int pid_max_min, pid_max_max;
@@ -91,7 +91,9 @@ extern int sysctl_nr_trim_pages;
91#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable; 92extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 95extern int blk_iopoll_enabled;
96#endif
95 97
96/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
97#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -104,6 +106,9 @@ static int __maybe_unused one = 1;
104static int __maybe_unused two = 2; 106static int __maybe_unused two = 2;
105static unsigned long one_ul = 1; 107static unsigned long one_ul = 1;
106static int one_hundred = 100; 108static int one_hundred = 100;
109#ifdef CONFIG_PRINTK
110static int ten_thousand = 10000;
111#endif
107 112
108/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ 113/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
109static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; 114static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -158,9 +163,9 @@ extern int max_lock_depth;
158#endif 163#endif
159 164
160#ifdef CONFIG_PROC_SYSCTL 165#ifdef CONFIG_PROC_SYSCTL
161static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 166static int proc_do_cad_pid(struct ctl_table *table, int write,
162 void __user *buffer, size_t *lenp, loff_t *ppos); 167 void __user *buffer, size_t *lenp, loff_t *ppos);
163static int proc_taint(struct ctl_table *table, int write, struct file *filp, 168static int proc_taint(struct ctl_table *table, int write,
164 void __user *buffer, size_t *lenp, loff_t *ppos); 169 void __user *buffer, size_t *lenp, loff_t *ppos);
165#endif 170#endif
166 171
@@ -419,6 +424,14 @@ static struct ctl_table kern_table[] = {
419 .proc_handler = &proc_dostring, 424 .proc_handler = &proc_dostring,
420 .strategy = &sysctl_string, 425 .strategy = &sysctl_string,
421 }, 426 },
427 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int),
432 .mode = 0644,
433 .proc_handler = &proc_dointvec,
434 },
422#ifdef CONFIG_PROC_SYSCTL 435#ifdef CONFIG_PROC_SYSCTL
423 { 436 {
424 .procname = "tainted", 437 .procname = "tainted",
@@ -720,6 +733,17 @@ static struct ctl_table kern_table[] = {
720 .mode = 0644, 733 .mode = 0644,
721 .proc_handler = &proc_dointvec, 734 .proc_handler = &proc_dointvec,
722 }, 735 },
736 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay",
739 .data = &printk_delay_msec,
740 .maxlen = sizeof(int),
741 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero,
745 .extra2 = &ten_thousand,
746 },
723#endif 747#endif
724 { 748 {
725 .ctl_name = KERN_NGROUPS_MAX, 749 .ctl_name = KERN_NGROUPS_MAX,
@@ -962,28 +986,28 @@ static struct ctl_table kern_table[] = {
962 .child = slow_work_sysctls, 986 .child = slow_work_sysctls,
963 }, 987 },
964#endif 988#endif
965#ifdef CONFIG_PERF_COUNTERS 989#ifdef CONFIG_PERF_EVENTS
966 { 990 {
967 .ctl_name = CTL_UNNUMBERED, 991 .ctl_name = CTL_UNNUMBERED,
968 .procname = "perf_counter_paranoid", 992 .procname = "perf_event_paranoid",
969 .data = &sysctl_perf_counter_paranoid, 993 .data = &sysctl_perf_event_paranoid,
970 .maxlen = sizeof(sysctl_perf_counter_paranoid), 994 .maxlen = sizeof(sysctl_perf_event_paranoid),
971 .mode = 0644, 995 .mode = 0644,
972 .proc_handler = &proc_dointvec, 996 .proc_handler = &proc_dointvec,
973 }, 997 },
974 { 998 {
975 .ctl_name = CTL_UNNUMBERED, 999 .ctl_name = CTL_UNNUMBERED,
976 .procname = "perf_counter_mlock_kb", 1000 .procname = "perf_event_mlock_kb",
977 .data = &sysctl_perf_counter_mlock, 1001 .data = &sysctl_perf_event_mlock,
978 .maxlen = sizeof(sysctl_perf_counter_mlock), 1002 .maxlen = sizeof(sysctl_perf_event_mlock),
979 .mode = 0644, 1003 .mode = 0644,
980 .proc_handler = &proc_dointvec, 1004 .proc_handler = &proc_dointvec,
981 }, 1005 },
982 { 1006 {
983 .ctl_name = CTL_UNNUMBERED, 1007 .ctl_name = CTL_UNNUMBERED,
984 .procname = "perf_counter_max_sample_rate", 1008 .procname = "perf_event_max_sample_rate",
985 .data = &sysctl_perf_counter_sample_rate, 1009 .data = &sysctl_perf_event_sample_rate,
986 .maxlen = sizeof(sysctl_perf_counter_sample_rate), 1010 .maxlen = sizeof(sysctl_perf_event_sample_rate),
987 .mode = 0644, 1011 .mode = 0644,
988 .proc_handler = &proc_dointvec, 1012 .proc_handler = &proc_dointvec,
989 }, 1013 },
@@ -998,6 +1022,7 @@ static struct ctl_table kern_table[] = {
998 .proc_handler = &proc_dointvec, 1022 .proc_handler = &proc_dointvec,
999 }, 1023 },
1000#endif 1024#endif
1025#ifdef CONFIG_BLOCK
1001 { 1026 {
1002 .ctl_name = CTL_UNNUMBERED, 1027 .ctl_name = CTL_UNNUMBERED,
1003 .procname = "blk_iopoll", 1028 .procname = "blk_iopoll",
@@ -1006,6 +1031,7 @@ static struct ctl_table kern_table[] = {
1006 .mode = 0644, 1031 .mode = 0644,
1007 .proc_handler = &proc_dointvec, 1032 .proc_handler = &proc_dointvec,
1008 }, 1033 },
1034#endif
1009/* 1035/*
1010 * NOTE: do not add new entries to this table unless you have read 1036 * NOTE: do not add new entries to this table unless you have read
1011 * Documentation/sysctl/ctl_unnumbered.txt 1037 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1372,6 +1398,31 @@ static struct ctl_table vm_table[] = {
1372 .mode = 0644, 1398 .mode = 0644,
1373 .proc_handler = &scan_unevictable_handler, 1399 .proc_handler = &scan_unevictable_handler,
1374 }, 1400 },
1401#ifdef CONFIG_MEMORY_FAILURE
1402 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero,
1411 .extra2 = &one,
1412 },
1413 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero,
1422 .extra2 = &one,
1423 },
1424#endif
1425
1375/* 1426/*
1376 * NOTE: do not add new entries to this table unless you have read 1427 * NOTE: do not add new entries to this table unless you have read
1377 * Documentation/sysctl/ctl_unnumbered.txt 1428 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2200,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2200#ifdef CONFIG_PROC_SYSCTL 2251#ifdef CONFIG_PROC_SYSCTL
2201 2252
2202static int _proc_do_string(void* data, int maxlen, int write, 2253static int _proc_do_string(void* data, int maxlen, int write,
2203 struct file *filp, void __user *buffer, 2254 void __user *buffer,
2204 size_t *lenp, loff_t *ppos) 2255 size_t *lenp, loff_t *ppos)
2205{ 2256{
2206 size_t len; 2257 size_t len;
@@ -2261,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2261 * proc_dostring - read a string sysctl 2312 * proc_dostring - read a string sysctl
2262 * @table: the sysctl table 2313 * @table: the sysctl table
2263 * @write: %TRUE if this is a write to the sysctl file 2314 * @write: %TRUE if this is a write to the sysctl file
2264 * @filp: the file structure
2265 * @buffer: the user buffer 2315 * @buffer: the user buffer
2266 * @lenp: the size of the user buffer 2316 * @lenp: the size of the user buffer
2267 * @ppos: file position 2317 * @ppos: file position
@@ -2275,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2275 * 2325 *
2276 * Returns 0 on success. 2326 * Returns 0 on success.
2277 */ 2327 */
2278int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2328int proc_dostring(struct ctl_table *table, int write,
2279 void __user *buffer, size_t *lenp, loff_t *ppos) 2329 void __user *buffer, size_t *lenp, loff_t *ppos)
2280{ 2330{
2281 return _proc_do_string(table->data, table->maxlen, write, filp, 2331 return _proc_do_string(table->data, table->maxlen, write,
2282 buffer, lenp, ppos); 2332 buffer, lenp, ppos);
2283} 2333}
2284 2334
@@ -2303,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2303} 2353}
2304 2354
2305static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2355static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2306 int write, struct file *filp, void __user *buffer, 2356 int write, void __user *buffer,
2307 size_t *lenp, loff_t *ppos, 2357 size_t *lenp, loff_t *ppos,
2308 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2358 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2309 int write, void *data), 2359 int write, void *data),
@@ -2410,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2410#undef TMPBUFLEN 2460#undef TMPBUFLEN
2411} 2461}
2412 2462
2413static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2463static int do_proc_dointvec(struct ctl_table *table, int write,
2414 void __user *buffer, size_t *lenp, loff_t *ppos, 2464 void __user *buffer, size_t *lenp, loff_t *ppos,
2415 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2465 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2416 int write, void *data), 2466 int write, void *data),
2417 void *data) 2467 void *data)
2418{ 2468{
2419 return __do_proc_dointvec(table->data, table, write, filp, 2469 return __do_proc_dointvec(table->data, table, write,
2420 buffer, lenp, ppos, conv, data); 2470 buffer, lenp, ppos, conv, data);
2421} 2471}
2422 2472
@@ -2424,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2424 * proc_dointvec - read a vector of integers 2474 * proc_dointvec - read a vector of integers
2425 * @table: the sysctl table 2475 * @table: the sysctl table
2426 * @write: %TRUE if this is a write to the sysctl file 2476 * @write: %TRUE if this is a write to the sysctl file
2427 * @filp: the file structure
2428 * @buffer: the user buffer 2477 * @buffer: the user buffer
2429 * @lenp: the size of the user buffer 2478 * @lenp: the size of the user buffer
2430 * @ppos: file position 2479 * @ppos: file position
@@ -2434,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2434 * 2483 *
2435 * Returns 0 on success. 2484 * Returns 0 on success.
2436 */ 2485 */
2437int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2486int proc_dointvec(struct ctl_table *table, int write,
2438 void __user *buffer, size_t *lenp, loff_t *ppos) 2487 void __user *buffer, size_t *lenp, loff_t *ppos)
2439{ 2488{
2440 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2489 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2441 NULL,NULL); 2490 NULL,NULL);
2442} 2491}
2443 2492
@@ -2445,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2445 * Taint values can only be increased 2494 * Taint values can only be increased
2446 * This means we can safely use a temporary. 2495 * This means we can safely use a temporary.
2447 */ 2496 */
2448static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2497static int proc_taint(struct ctl_table *table, int write,
2449 void __user *buffer, size_t *lenp, loff_t *ppos) 2498 void __user *buffer, size_t *lenp, loff_t *ppos)
2450{ 2499{
2451 struct ctl_table t; 2500 struct ctl_table t;
@@ -2457,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2457 2506
2458 t = *table; 2507 t = *table;
2459 t.data = &tmptaint; 2508 t.data = &tmptaint;
2460 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2509 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2461 if (err < 0) 2510 if (err < 0)
2462 return err; 2511 return err;
2463 2512
@@ -2509,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2509 * proc_dointvec_minmax - read a vector of integers with min/max values 2558 * proc_dointvec_minmax - read a vector of integers with min/max values
2510 * @table: the sysctl table 2559 * @table: the sysctl table
2511 * @write: %TRUE if this is a write to the sysctl file 2560 * @write: %TRUE if this is a write to the sysctl file
2512 * @filp: the file structure
2513 * @buffer: the user buffer 2561 * @buffer: the user buffer
2514 * @lenp: the size of the user buffer 2562 * @lenp: the size of the user buffer
2515 * @ppos: file position 2563 * @ppos: file position
@@ -2522,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2522 * 2570 *
2523 * Returns 0 on success. 2571 * Returns 0 on success.
2524 */ 2572 */
2525int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2573int proc_dointvec_minmax(struct ctl_table *table, int write,
2526 void __user *buffer, size_t *lenp, loff_t *ppos) 2574 void __user *buffer, size_t *lenp, loff_t *ppos)
2527{ 2575{
2528 struct do_proc_dointvec_minmax_conv_param param = { 2576 struct do_proc_dointvec_minmax_conv_param param = {
2529 .min = (int *) table->extra1, 2577 .min = (int *) table->extra1,
2530 .max = (int *) table->extra2, 2578 .max = (int *) table->extra2,
2531 }; 2579 };
2532 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2580 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2533 do_proc_dointvec_minmax_conv, &param); 2581 do_proc_dointvec_minmax_conv, &param);
2534} 2582}
2535 2583
2536static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2584static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2537 struct file *filp,
2538 void __user *buffer, 2585 void __user *buffer,
2539 size_t *lenp, loff_t *ppos, 2586 size_t *lenp, loff_t *ppos,
2540 unsigned long convmul, 2587 unsigned long convmul,
@@ -2639,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2639} 2686}
2640 2687
2641static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2688static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2642 struct file *filp,
2643 void __user *buffer, 2689 void __user *buffer,
2644 size_t *lenp, loff_t *ppos, 2690 size_t *lenp, loff_t *ppos,
2645 unsigned long convmul, 2691 unsigned long convmul,
2646 unsigned long convdiv) 2692 unsigned long convdiv)
2647{ 2693{
2648 return __do_proc_doulongvec_minmax(table->data, table, write, 2694 return __do_proc_doulongvec_minmax(table->data, table, write,
2649 filp, buffer, lenp, ppos, convmul, convdiv); 2695 buffer, lenp, ppos, convmul, convdiv);
2650} 2696}
2651 2697
2652/** 2698/**
2653 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2699 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2654 * @table: the sysctl table 2700 * @table: the sysctl table
2655 * @write: %TRUE if this is a write to the sysctl file 2701 * @write: %TRUE if this is a write to the sysctl file
2656 * @filp: the file structure
2657 * @buffer: the user buffer 2702 * @buffer: the user buffer
2658 * @lenp: the size of the user buffer 2703 * @lenp: the size of the user buffer
2659 * @ppos: file position 2704 * @ppos: file position
@@ -2666,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2666 * 2711 *
2667 * Returns 0 on success. 2712 * Returns 0 on success.
2668 */ 2713 */
2669int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2714int proc_doulongvec_minmax(struct ctl_table *table, int write,
2670 void __user *buffer, size_t *lenp, loff_t *ppos) 2715 void __user *buffer, size_t *lenp, loff_t *ppos)
2671{ 2716{
2672 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2717 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2673} 2718}
2674 2719
2675/** 2720/**
2676 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2721 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2677 * @table: the sysctl table 2722 * @table: the sysctl table
2678 * @write: %TRUE if this is a write to the sysctl file 2723 * @write: %TRUE if this is a write to the sysctl file
2679 * @filp: the file structure
2680 * @buffer: the user buffer 2724 * @buffer: the user buffer
2681 * @lenp: the size of the user buffer 2725 * @lenp: the size of the user buffer
2682 * @ppos: file position 2726 * @ppos: file position
@@ -2691,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2691 * Returns 0 on success. 2735 * Returns 0 on success.
2692 */ 2736 */
2693int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2737int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2694 struct file *filp,
2695 void __user *buffer, 2738 void __user *buffer,
2696 size_t *lenp, loff_t *ppos) 2739 size_t *lenp, loff_t *ppos)
2697{ 2740{
2698 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2741 return do_proc_doulongvec_minmax(table, write, buffer,
2699 lenp, ppos, HZ, 1000l); 2742 lenp, ppos, HZ, 1000l);
2700} 2743}
2701 2744
@@ -2771,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2771 * proc_dointvec_jiffies - read a vector of integers as seconds 2814 * proc_dointvec_jiffies - read a vector of integers as seconds
2772 * @table: the sysctl table 2815 * @table: the sysctl table
2773 * @write: %TRUE if this is a write to the sysctl file 2816 * @write: %TRUE if this is a write to the sysctl file
2774 * @filp: the file structure
2775 * @buffer: the user buffer 2817 * @buffer: the user buffer
2776 * @lenp: the size of the user buffer 2818 * @lenp: the size of the user buffer
2777 * @ppos: file position 2819 * @ppos: file position
@@ -2783,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2783 * 2825 *
2784 * Returns 0 on success. 2826 * Returns 0 on success.
2785 */ 2827 */
2786int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2828int proc_dointvec_jiffies(struct ctl_table *table, int write,
2787 void __user *buffer, size_t *lenp, loff_t *ppos) 2829 void __user *buffer, size_t *lenp, loff_t *ppos)
2788{ 2830{
2789 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2831 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2790 do_proc_dointvec_jiffies_conv,NULL); 2832 do_proc_dointvec_jiffies_conv,NULL);
2791} 2833}
2792 2834
@@ -2794,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2794 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2836 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2795 * @table: the sysctl table 2837 * @table: the sysctl table
2796 * @write: %TRUE if this is a write to the sysctl file 2838 * @write: %TRUE if this is a write to the sysctl file
2797 * @filp: the file structure
2798 * @buffer: the user buffer 2839 * @buffer: the user buffer
2799 * @lenp: the size of the user buffer 2840 * @lenp: the size of the user buffer
2800 * @ppos: pointer to the file position 2841 * @ppos: pointer to the file position
@@ -2806,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2806 * 2847 *
2807 * Returns 0 on success. 2848 * Returns 0 on success.
2808 */ 2849 */
2809int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2850int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2810 void __user *buffer, size_t *lenp, loff_t *ppos) 2851 void __user *buffer, size_t *lenp, loff_t *ppos)
2811{ 2852{
2812 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2853 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2813 do_proc_dointvec_userhz_jiffies_conv,NULL); 2854 do_proc_dointvec_userhz_jiffies_conv,NULL);
2814} 2855}
2815 2856
@@ -2817,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2817 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2858 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2818 * @table: the sysctl table 2859 * @table: the sysctl table
2819 * @write: %TRUE if this is a write to the sysctl file 2860 * @write: %TRUE if this is a write to the sysctl file
2820 * @filp: the file structure
2821 * @buffer: the user buffer 2861 * @buffer: the user buffer
2822 * @lenp: the size of the user buffer 2862 * @lenp: the size of the user buffer
2823 * @ppos: file position 2863 * @ppos: file position
@@ -2830,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2830 * 2870 *
2831 * Returns 0 on success. 2871 * Returns 0 on success.
2832 */ 2872 */
2833int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2873int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2834 void __user *buffer, size_t *lenp, loff_t *ppos) 2874 void __user *buffer, size_t *lenp, loff_t *ppos)
2835{ 2875{
2836 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2876 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2837 do_proc_dointvec_ms_jiffies_conv, NULL); 2877 do_proc_dointvec_ms_jiffies_conv, NULL);
2838} 2878}
2839 2879
2840static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2880static int proc_do_cad_pid(struct ctl_table *table, int write,
2841 void __user *buffer, size_t *lenp, loff_t *ppos) 2881 void __user *buffer, size_t *lenp, loff_t *ppos)
2842{ 2882{
2843 struct pid *new_pid; 2883 struct pid *new_pid;
@@ -2846,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2846 2886
2847 tmp = pid_vnr(cad_pid); 2887 tmp = pid_vnr(cad_pid);
2848 2888
2849 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2889 r = __do_proc_dointvec(&tmp, table, write, buffer,
2850 lenp, ppos, NULL, NULL); 2890 lenp, ppos, NULL, NULL);
2851 if (r || !write) 2891 if (r || !write)
2852 return r; 2892 return r;
@@ -2861,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2861 2901
2862#else /* CONFIG_PROC_FS */ 2902#else /* CONFIG_PROC_FS */
2863 2903
2864int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2904int proc_dostring(struct ctl_table *table, int write,
2865 void __user *buffer, size_t *lenp, loff_t *ppos) 2905 void __user *buffer, size_t *lenp, loff_t *ppos)
2866{ 2906{
2867 return -ENOSYS; 2907 return -ENOSYS;
2868} 2908}
2869 2909
2870int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2910int proc_dointvec(struct ctl_table *table, int write,
2871 void __user *buffer, size_t *lenp, loff_t *ppos) 2911 void __user *buffer, size_t *lenp, loff_t *ppos)
2872{ 2912{
2873 return -ENOSYS; 2913 return -ENOSYS;
2874} 2914}
2875 2915
2876int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2916int proc_dointvec_minmax(struct ctl_table *table, int write,
2877 void __user *buffer, size_t *lenp, loff_t *ppos) 2917 void __user *buffer, size_t *lenp, loff_t *ppos)
2878{ 2918{
2879 return -ENOSYS; 2919 return -ENOSYS;
2880} 2920}
2881 2921
2882int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2922int proc_dointvec_jiffies(struct ctl_table *table, int write,
2883 void __user *buffer, size_t *lenp, loff_t *ppos) 2923 void __user *buffer, size_t *lenp, loff_t *ppos)
2884{ 2924{
2885 return -ENOSYS; 2925 return -ENOSYS;
2886} 2926}
2887 2927
2888int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2928int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2889 void __user *buffer, size_t *lenp, loff_t *ppos) 2929 void __user *buffer, size_t *lenp, loff_t *ppos)
2890{ 2930{
2891 return -ENOSYS; 2931 return -ENOSYS;
2892} 2932}
2893 2933
2894int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2934int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2895 void __user *buffer, size_t *lenp, loff_t *ppos) 2935 void __user *buffer, size_t *lenp, loff_t *ppos)
2896{ 2936{
2897 return -ENOSYS; 2937 return -ENOSYS;
2898} 2938}
2899 2939
2900int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2940int proc_doulongvec_minmax(struct ctl_table *table, int write,
2901 void __user *buffer, size_t *lenp, loff_t *ppos) 2941 void __user *buffer, size_t *lenp, loff_t *ppos)
2902{ 2942{
2903 return -ENOSYS; 2943 return -ENOSYS;
2904} 2944}
2905 2945
2906int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2946int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2907 struct file *filp,
2908 void __user *buffer, 2947 void __user *buffer,
2909 size_t *lenp, loff_t *ppos) 2948 size_t *lenp, loff_t *ppos)
2910{ 2949{
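
The bulk of the sysctl.c diff is the removal of the unused struct file * parameter from every proc handler, so the canonical prototype becomes (struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); the remaining hunks add the core_pipe_limit, printk_delay, memory_failure_* and blk_iopoll entries and rename the perf_counter sysctls to perf_event. A hedged module-side sketch against the post-change API of this era (register_sysctl_table() with ctl_name/strategy fields); every "example_*" name is invented:

/*
 * Sketch of a module sysctl table using the new filp-less handlers.
 * Not from the tree; illustrative names only.
 */
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/sysctl.h>

static int example_threshold = 10;
static int example_min, example_max = 100;
static struct ctl_table_header *example_header;

static struct ctl_table example_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example_threshold",
                .data           = &example_threshold,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_minmax,  /* 5-argument form */
                .strategy       = &sysctl_intvec,
                .extra1         = &example_min,
                .extra2         = &example_max,
        },
        { .ctl_name = 0 }
};

static struct ctl_table example_root[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example",
                .mode           = 0555,
                .child          = example_table,
        },
        { .ctl_name = 0 }
};

static int __init example_init(void)
{
        example_header = register_sysctl_table(example_root);
        return example_header ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
        unregister_sysctl_table(example_header);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
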
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
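
set_normalized_timespec() now takes an s64 nanosecond count, and the empty asm("" : "+rm"(nsec)) statements act as optimization barriers: without them the compiler may collapse the subtraction loops into a 64-bit division/modulo, which the comment (and __iter_div_u64_rem()) deliberately avoids. A standalone illustration of the same barrier idiom with GCC-style inline asm; the program itself is a made-up example, not kernel code:

/* Normalize a (sec, nsec) pair the same way, keeping the loop as a loop. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

int main(void)
{
        int64_t sec = 5, nsec = 3 * NSEC_PER_SEC + 123;

        while (nsec >= NSEC_PER_SEC) {
                asm("" : "+rm"(nsec));  /* barrier: don't turn this into a divide */
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        while (nsec < 0) {
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        printf("normalized: %lld.%09lld\n", (long long)sec, (long long)nsec);
        return 0;
}
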
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..5e18c6ab2c6a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
165 162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
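
Illustrative sketch (not part of this patch): the deviation check that clocksource_watchdog() performs above, reduced to plain integer math. It assumes the kernel's clocksource_cyc2ns() is ((u64)cycles * mult) >> shift and a watchdog threshold of roughly 62.5 ms (the WATCHDOG_THRESHOLD value of this era); the mult/shift numbers below are invented for the example.

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* same arithmetic as clocksource_cyc2ns(): (cycles * mult) >> shift */
	static int64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
	{
		return (int64_t)((cycles * mult) >> shift);
	}

	int main(void)
	{
		const int64_t threshold_ns = 62500000;	/* assumed WATCHDOG_THRESHOLD */
		/* watchdog clock: 1 cycle == 1000 ns */
		uint64_t wd_delta = 250000;		/* cycles since last run */
		int64_t wd_nsec = cyc2ns(wd_delta, 1000, 0);	/* 250 ms */
		/* watched TSC-like clocksource that has drifted */
		uint64_t cs_delta = 330000000;		/* cycles since last run */
		int64_t cs_nsec = cyc2ns(cs_delta, 1000, 10);	/* ~322.3 ms */

		if (llabs(cs_nsec - wd_nsec) > threshold_ns)
			printf("mark unstable: deviation %lld ns\n",
			       (long long)(cs_nsec - wd_nsec));
		else
			printf("clocksource ok\n");
		return 0;
	}

The real watchdog also reschedules itself on the next online CPU so per-CPU counters get cross-checked, and defers the rating change to the watchdog work; that bookkeeping is left out of the sketch.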
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 list_for_each_entry(cs, &clocksource_list, list)
300
301 list_for_each_entry(cs, &clocksource_list, list) {
302 if (cs->resume) 398 if (cs->resume)
303 cs->resume(); 399 cs->resume();
304 }
305 400
306 clocksource_resume_watchdog(); 401 clocksource_resume_watchdog();
307
308 spin_unlock_irqrestore(&clocksource_lock, flags);
309} 402}
310 403
311/** 404/**
@@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 413 clocksource_resume_watchdog();
321} 414}
322 415
416#ifdef CONFIG_GENERIC_TIME
417
323/** 418/**
324 * clocksource_get_next - Returns the selected clocksource 419 * clocksource_select - Select the best clocksource available
325 * 420 *
421 * Private function. Must hold clocksource_mutex when called.
422 *
423 * Select the clocksource with the best rating, or the clocksource,
424 * which is selected by userspace override.
326 */ 425 */
327struct clocksource *clocksource_get_next(void) 426static void clocksource_select(void)
328{ 427{
329 unsigned long flags; 428 struct clocksource *best, *cs;
330 429
331 spin_lock_irqsave(&clocksource_lock, flags); 430 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 431 return;
333 curr_clocksource = next_clocksource; 432 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 433 best = list_first_entry(&clocksource_list, struct clocksource, list);
434 /* Check for the override clocksource. */
435 list_for_each_entry(cs, &clocksource_list, list) {
436 if (strcmp(cs->name, override_name) != 0)
437 continue;
438 /*
439 * Check to make sure we don't switch to a non-highres
440 * capable clocksource if the tick code is in oneshot
441 * mode (highres or nohz)
442 */
443 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
444 tick_oneshot_mode_active()) {
445 /* Override clocksource cannot be used. */
446 printk(KERN_WARNING "Override clocksource %s is not "
447 "HRT compatible. Cannot switch while in "
448 "HRT/NOHZ mode\n", cs->name);
449 override_name[0] = 0;
450 } else
451 /* Override clocksource can be used. */
452 best = cs;
453 break;
454 }
455 if (curr_clocksource != best) {
456 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
457 curr_clocksource = best;
458 timekeeping_notify(curr_clocksource);
335 } 459 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 460}
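
Illustrative sketch (not part of this patch) of the selection policy in clocksource_select() above: take the head of the rating-sorted list, and only honour a user override if it is HRT-capable or the tick layer is not in oneshot mode. The struct and helper below are invented for the example and are not the kernel's types.

	#include <stdio.h>
	#include <string.h>

	struct cs {
		const char *name;
		int rating;
		int valid_for_hres;	/* models CLOCK_SOURCE_VALID_FOR_HRES */
	};

	static const struct cs *pick(const struct cs *sorted, int n,
				     const char *override, int oneshot_mode)
	{
		const struct cs *best = &sorted[0];	/* list head = best rating */
		for (int i = 0; i < n; i++) {
			if (strcmp(sorted[i].name, override) != 0)
				continue;
			if (!sorted[i].valid_for_hres && oneshot_mode)
				break;		/* override unusable, keep best */
			best = &sorted[i];	/* override usable */
			break;
		}
		return best;
	}

	int main(void)
	{
		/* sorted by rating, highest first, as clocksource_enqueue() keeps it */
		struct cs list[] = {
			{ "tsc",     300, 1 },
			{ "hpet",    250, 1 },
			{ "jiffies",   1, 0 },
		};
		printf("%s\n", pick(list, 3, "hpet", 1)->name);		/* hpet */
		printf("%s\n", pick(list, 3, "jiffies", 1)->name);	/* tsc */
		return 0;
	}

The kernel version additionally clears override_name and prints a warning when the override is rejected; that part is omitted here.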
340 461
341/** 462#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 463
343 * 464static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 465
466#endif
467
468/*
469 * clocksource_done_booting - Called near the end of core bootup
345 * 470 *
346 * Select the clocksource with the best rating, or the clocksource, 471 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 472 * We use fs_initcall because we want this to start before
473 * device_initcall but after subsys_initcall.
348 */ 474 */
349static struct clocksource *select_clocksource(void) 475static int __init clocksource_done_booting(void)
350{ 476{
351 struct clocksource *next; 477 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 478
362 if (next == curr_clocksource) 479 /*
363 return NULL; 480 * Run the watchdog first to eliminate unstable clock sources
481 */
482 clocksource_watchdog_kthread(NULL);
364 483
365 return next; 484 mutex_lock(&clocksource_mutex);
485 clocksource_select();
486 mutex_unlock(&clocksource_mutex);
487 return 0;
366} 488}
489fs_initcall(clocksource_done_booting);
367 490
368/* 491/*
369 * Enqueue the clocksource sorted by rating 492 * Enqueue the clocksource sorted by rating
370 */ 493 */
371static int clocksource_enqueue(struct clocksource *c) 494static void clocksource_enqueue(struct clocksource *cs)
372{ 495{
373 struct list_head *tmp, *entry = &clocksource_list; 496 struct list_head *entry = &clocksource_list;
497 struct clocksource *tmp;
374 498
375 list_for_each(tmp, &clocksource_list) { 499 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place, where to insert */ 500 /* Keep track of the place, where to insert */
382 if (cs->rating >= c->rating) 501 if (tmp->rating >= cs->rating)
383 entry = tmp; 502 entry = &tmp->list;
384 } 503 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 504}
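
Illustrative sketch (not part of this patch): the descending-rating insertion order that clocksource_enqueue() maintains, redone on a plain array instead of the kernel's struct list_head. Keeping the list sorted is what lets clocksource_select() treat list_first_entry() as the best candidate.

	#include <stdio.h>

	#define MAXSRC 8

	static int ratings[MAXSRC];
	static int nsrc;

	static void enqueue(int rating)
	{
		int pos = nsrc;
		/* find the first existing entry with a lower rating and insert
		 * in front of it, i.e. after every entry rated at least as high */
		for (int i = 0; i < nsrc; i++)
			if (ratings[i] < rating) {
				pos = i;
				break;
			}
		for (int i = nsrc; i > pos; i--)
			ratings[i] = ratings[i - 1];
		ratings[pos] = rating;
		nsrc++;
	}

	int main(void)
	{
		int sample[] = { 100, 300, 250, 400 };	/* arbitrary ratings, registration order */
		for (int i = 0; i < 4; i++)
			enqueue(sample[i]);
		for (int i = 0; i < nsrc; i++)
			printf("%d ", ratings[i]);	/* 400 300 250 100 */
		printf("\n");
		return 0;
	}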
393 505
394/** 506/**
@@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 509 *
398 * Returns -EBUSY if registration fails, zero otherwise. 510 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 511 */
400int clocksource_register(struct clocksource *c) 512int clocksource_register(struct clocksource *cs)
401{ 513{
402 unsigned long flags; 514 mutex_lock(&clocksource_mutex);
403 int ret; 515 clocksource_enqueue(cs);
404 516 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 517 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 518 mutex_unlock(&clocksource_mutex);
407 if (!ret) 519 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 520}
414EXPORT_SYMBOL(clocksource_register); 521EXPORT_SYMBOL(clocksource_register);
415 522
523static void __clocksource_change_rating(struct clocksource *cs, int rating)
524{
525 list_del(&cs->list);
526 cs->rating = rating;
527 clocksource_enqueue(cs);
528 clocksource_select();
529}
530
416/** 531/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 532 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 533 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 534void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 535{
422 unsigned long flags; 536 mutex_lock(&clocksource_mutex);
423 537 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 538 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 539}
540EXPORT_SYMBOL(clocksource_change_rating);
431 541
432/** 542/**
433 * clocksource_unregister - remove a registered clocksource 543 * clocksource_unregister - remove a registered clocksource
434 */ 544 */
435void clocksource_unregister(struct clocksource *cs) 545void clocksource_unregister(struct clocksource *cs)
436{ 546{
437 unsigned long flags; 547 mutex_lock(&clocksource_mutex);
438 548 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 549 list_del(&cs->list);
441 if (clocksource_override == cs) 550 clocksource_select();
442 clocksource_override = NULL; 551 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 552}
553EXPORT_SYMBOL(clocksource_unregister);
446 554
447#ifdef CONFIG_SYSFS 555#ifdef CONFIG_SYSFS
448/** 556/**
@@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 566{
459 ssize_t count = 0; 567 ssize_t count = 0;
460 568
461 spin_lock_irq(&clocksource_lock); 569 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 570 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 571 mutex_unlock(&clocksource_mutex);
464 572
465 return count; 573 return count;
466} 574}
@@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 586 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 587 const char *buf, size_t count)
480{ 588{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 589 size_t ret = count;
483 int len;
484 590
485 /* strings from sysfs write are not 0 terminated! */ 591 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 592 if (count >= sizeof(override_name))
@@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 596 if (buf[count-1] == '\n')
491 count--; 597 count--;
492 598
493 spin_lock_irq(&clocksource_lock); 599 mutex_lock(&clocksource_mutex);
494 600
495 if (count > 0) 601 if (count > 0)
496 memcpy(override_name, buf, count); 602 memcpy(override_name, buf, count);
497 override_name[count] = 0; 603 override_name[count] = 0;
604 clocksource_select();
498 605
499 len = strlen(override_name); 606 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 607
532 return ret; 608 return ret;
533} 609}
@@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 623 struct clocksource *src;
548 ssize_t count = 0; 624 ssize_t count = 0;
549 625
550 spin_lock_irq(&clocksource_lock); 626 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 627 list_for_each_entry(src, &clocksource_list, list) {
552 /* 628 /*
553 * Don't show non-HRES clocksource if the tick code is 629 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 635 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 636 "%s ", src->name);
561 } 637 }
562 spin_unlock_irq(&clocksource_lock); 638 mutex_unlock(&clocksource_mutex);
563 639
564 count += snprintf(buf + count, 640 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 641 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 690 */
615static int __init boot_override_clocksource(char* str) 691static int __init boot_override_clocksource(char* str)
616{ 692{
617 unsigned long flags; 693 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 694 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 695 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 696 mutex_unlock(&clocksource_mutex);
622 return 1; 697 return 1;
623} 698}
624 699
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..89aed5933ed4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle)
231 if (!inidle && !ts->inidle) 231 if (!inidle && !ts->inidle)
232 goto end; 232 goto end;
233 233
234 /*
235 * Set ts->inidle unconditionally. Even if the system did not
236 * switch to NOHZ mode the cpu frequency governers rely on the
237 * update of the idle time accounting in tick_nohz_start_idle().
238 */
239 ts->inidle = 1;
240
234 now = tick_nohz_start_idle(ts); 241 now = tick_nohz_start_idle(ts);
235 242
236 /* 243 /*
@@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle)
248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 255 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
249 goto end; 256 goto end;
250 257
251 ts->inidle = 1;
252
253 if (need_resched()) 258 if (need_resched())
254 goto end; 259 goto end;
255 260
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Nonzero if YEAR is a leap year (every 4 years,
35 * except every 100th isn't, and every 400th is).
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
42/* do a mathdiv for long type */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
48/* How many leap years between y1 and y2, y1 must less or equal to y2 */
49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* How many days come before each month (0-12). */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time to local broken-down time
71 *
72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC).
74 * @offset offset seconds adding to totalsecs.
75 * @result pointer to struct tm variable to receive broken-down time
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday. */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
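
Standalone sanity check (not part of this patch): with offset 0, time_to_tm() should agree with the C library's gmtime(), since both implement the same Gregorian conversion. The well-known timestamp 1234567890 makes a convenient test value.

	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		time_t t = 1234567890;	/* 2009-02-13 23:31:30 UTC, a Friday */
		struct tm *tm = gmtime(&t);

		printf("%04d-%02d-%02d %02d:%02d:%02d wday=%d yday=%d\n",
		       tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
		       tm->tm_hour, tm->tm_min, tm->tm_sec,
		       tm->tm_wday, tm->tm_yday);
		/* expected: 2009-02-13 23:31:30 wday=5 yday=43 */
		return 0;
	}

The hand calculation matches: 1234567890 s is 14288 whole days plus 84690 s, (4 + 14288) % 7 = 5 gives Friday, and day 43 of 2009 is February 13.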
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..c3a4e2907eaa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -13,12 +13,123 @@
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h>
16#include <linux/sysdev.h> 17#include <linux/sysdev.h>
17#include <linux/clocksource.h> 18#include <linux/clocksource.h>
18#include <linux/jiffies.h> 19#include <linux/jiffies.h>
19#include <linux/time.h> 20#include <linux/time.h>
20#include <linux/tick.h> 21#include <linux/tick.h>
22#include <linux/stop_machine.h>
23
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* The shift value of the current clocksource. */
29 int shift;
30
31 /* Number of clock cycles in one NTP interval. */
32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval;
35 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval;
37
38 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
39 u64 xtime_nsec;
40 /* Difference between accumulated time and NTP time in ntp
41 * shifted nano seconds. */
42 s64 ntp_error;
43 /* Shift conversion between clock shifted nano seconds and
44 * ntp shifted nano seconds. */
45 int ntp_error_shift;
46 /* NTP adjusted clock multiplier */
47 u32 mult;
48};
49
50struct timekeeper timekeeper;
51
52/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock.
54 *
55 * @clock: Pointer to clocksource.
56 *
57 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
58 * pair and interval request.
59 *
60 * Unless you're the timekeeping code, you should not be using this!
61 */
62static void timekeeper_setup_internals(struct clocksource *clock)
63{
64 cycle_t interval;
65 u64 tmp;
66
67 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock);
21 69
70 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift;
73 tmp += clock->mult/2;
74 do_div(tmp, clock->mult);
75 if (tmp == 0)
76 tmp = 1;
77
78 interval = (cycle_t) tmp;
79 timekeeper.cycle_interval = interval;
80
81 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult;
83 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift;
85
86 timekeeper.xtime_nsec = 0;
87 timekeeper.shift = clock->shift;
88
89 timekeeper.ntp_error = 0;
90 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
91
92 /*
93 * The timekeeper keeps its own mult values for the currently
94 * active clocksource. These value will be adjusted via NTP
95 * to counteract clock drifting.
96 */
97 timekeeper.mult = clock->mult;
98}
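
Worked example (not part of this patch) of the cycle_interval/xtime_interval math in timekeeper_setup_internals() above. The numbers are invented: an NTP interval of 1,000,000 ns (HZ=1000) and a clocksource whose mult/shift give exactly 1 ns per cycle (mult = 1 << 22, shift = 22).

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t ntp_interval_ns = 1000000;	/* assumed NTP_INTERVAL_LENGTH */
		uint32_t mult = 1 << 22, shift = 22;	/* 1 cycle == 1 ns */

		/* ns -> cycles, rounded to nearest, exactly as in the diff */
		uint64_t tmp = ntp_interval_ns;
		tmp <<= shift;
		tmp += mult / 2;
		tmp /= mult;
		if (tmp == 0)
			tmp = 1;
		uint64_t cycle_interval = tmp;

		/* cycles -> shifted ns and plain ns, as stored in the timekeeper */
		uint64_t xtime_interval = cycle_interval * mult;
		uint64_t raw_interval = (cycle_interval * mult) >> shift;

		printf("cycle_interval=%llu xtime_interval=%llu raw_interval=%llu\n",
		       (unsigned long long)cycle_interval,
		       (unsigned long long)xtime_interval,
		       (unsigned long long)raw_interval);
		/* prints 1000000 cycles, 4194304000000 shifted ns, 1000000 ns */
		return 0;
	}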
99
100/* Timekeeper helper functions. */
101static inline s64 timekeeping_get_ns(void)
102{
103 cycle_t cycle_now, cycle_delta;
104 struct clocksource *clock;
105
106 /* read clocksource: */
107 clock = timekeeper.clock;
108 cycle_now = clock->read(clock);
109
110 /* calculate the delta since the last update_wall_time: */
111 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
112
113 /* return delta convert to nanoseconds using ntp adjusted mult. */
114 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
115 timekeeper.shift);
116}
117
118static inline s64 timekeeping_get_ns_raw(void)
119{
120 cycle_t cycle_now, cycle_delta;
121 struct clocksource *clock;
122
123 /* read clocksource: */
124 clock = timekeeper.clock;
125 cycle_now = clock->read(clock);
126
127 /* calculate the delta since the last update_wall_time: */
128 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
129
130 /* return delta convert to nanoseconds using ntp adjusted mult. */
131 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
132}
22 133
23/* 134/*
24 * This read-write spinlock protects us from races in SMP while 135 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +155,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 155 */
45struct timespec xtime __attribute__ ((aligned (16))); 156struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 158static struct timespec total_sleep_time;
159
160/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */
163struct timespec raw_time;
48 164
49/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
@@ -56,35 +172,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 172 timespec_add_ns(&xtime_cache, nsec);
57} 173}
58 174
59struct clocksource *clock; 175/* must hold xtime_lock */
60 176void timekeeping_leap_insert(int leapsecond)
177{
178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock);
181}
61 182
62#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
184
63/** 185/**
64 * clocksource_forward_now - update clock to the current time 186 * timekeeping_forward_now - update clock to the current time
65 * 187 *
66 * Forward the current clock to update its state since the last call to 188 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 189 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 190 * as it avoids having to deal with this time offset explicitly.
69 */ 191 */
70static void clocksource_forward_now(void) 192static void timekeeping_forward_now(void)
71{ 193{
72 cycle_t cycle_now, cycle_delta; 194 cycle_t cycle_now, cycle_delta;
195 struct clocksource *clock;
73 s64 nsec; 196 s64 nsec;
74 197
75 cycle_now = clocksource_read(clock); 198 clock = timekeeper.clock;
199 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 200 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 201 clock->cycle_last = cycle_now;
78 202
79 nsec = cyc2ns(clock, cycle_delta); 203 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
204 timekeeper.shift);
80 205
81 /* If arch requires, add in gettimeoffset() */ 206 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 207 nsec += arch_gettimeoffset();
83 208
84 timespec_add_ns(&xtime, nsec); 209 timespec_add_ns(&xtime, nsec);
85 210
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 211 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 212 timespec_add_ns(&raw_time, nsec);
88} 213}
89 214
90/** 215/**
@@ -95,7 +220,6 @@ static void clocksource_forward_now(void)
95 */ 220 */
96void getnstimeofday(struct timespec *ts) 221void getnstimeofday(struct timespec *ts)
97{ 222{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 223 unsigned long seq;
100 s64 nsecs; 224 s64 nsecs;
101 225
@@ -105,15 +229,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 229 seq = read_seqbegin(&xtime_lock);
106 230
107 *ts = xtime; 231 *ts = xtime;
108 232 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 233
118 /* If arch requires, add in gettimeoffset() */ 234 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 235 nsecs += arch_gettimeoffset();
@@ -125,6 +241,57 @@ void getnstimeofday(struct timespec *ts)
125 241
126EXPORT_SYMBOL(getnstimeofday); 242EXPORT_SYMBOL(getnstimeofday);
127 243
244ktime_t ktime_get(void)
245{
246 unsigned int seq;
247 s64 secs, nsecs;
248
249 WARN_ON(timekeeping_suspended);
250
251 do {
252 seq = read_seqbegin(&xtime_lock);
253 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
254 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
255 nsecs += timekeeping_get_ns();
256
257 } while (read_seqretry(&xtime_lock, seq));
258 /*
259 * Use ktime_set/ktime_add_ns to create a proper ktime on
260 * 32-bit architectures without CONFIG_KTIME_SCALAR.
261 */
262 return ktime_add_ns(ktime_set(secs, 0), nsecs);
263}
264EXPORT_SYMBOL_GPL(ktime_get);
265
266/**
267 * ktime_get_ts - get the monotonic clock in timespec format
268 * @ts: pointer to timespec variable
269 *
270 * The function calculates the monotonic clock from the realtime
271 * clock and the wall_to_monotonic offset and stores the result
272 * in normalized timespec format in the variable pointed to by @ts.
273 */
274void ktime_get_ts(struct timespec *ts)
275{
276 struct timespec tomono;
277 unsigned int seq;
278 s64 nsecs;
279
280 WARN_ON(timekeeping_suspended);
281
282 do {
283 seq = read_seqbegin(&xtime_lock);
284 *ts = xtime;
285 tomono = wall_to_monotonic;
286 nsecs = timekeeping_get_ns();
287
288 } while (read_seqretry(&xtime_lock, seq));
289
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs);
292}
293EXPORT_SYMBOL_GPL(ktime_get_ts);
294
128/** 295/**
129 * do_gettimeofday - Returns the time of day in a timeval 296 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 297 * @tv: pointer to the timeval to be set
@@ -157,7 +324,7 @@ int do_settimeofday(struct timespec *tv)
157 324
158 write_seqlock_irqsave(&xtime_lock, flags); 325 write_seqlock_irqsave(&xtime_lock, flags);
159 326
160 clocksource_forward_now(); 327 timekeeping_forward_now();
161 328
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 329 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 330 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +334,10 @@ int do_settimeofday(struct timespec *tv)
167 334
168 update_xtime_cache(0); 335 update_xtime_cache(0);
169 336
170 clock->error = 0; 337 timekeeper.ntp_error = 0;
171 ntp_clear(); 338 ntp_clear();
172 339
173 update_vsyscall(&xtime, clock); 340 update_vsyscall(&xtime, timekeeper.clock);
174 341
175 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
176 343
@@ -187,44 +354,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 354 *
188 * Accumulates current time interval and initializes new clocksource 355 * Accumulates current time interval and initializes new clocksource
189 */ 356 */
190static void change_clocksource(void) 357static int change_clocksource(void *data)
191{ 358{
192 struct clocksource *new, *old; 359 struct clocksource *new, *old;
193 360
194 new = clocksource_get_next(); 361 new = (struct clocksource *) data;
362
363 timekeeping_forward_now();
364 if (!new->enable || new->enable(new) == 0) {
365 old = timekeeper.clock;
366 timekeeper_setup_internals(new);
367 if (old->disable)
368 old->disable(old);
369 }
370 return 0;
371}
195 372
196 if (clock == new) 373/**
374 * timekeeping_notify - Install a new clock source
375 * @clock: pointer to the clock source
376 *
377 * This function is called from clocksource.c after a new, better clock
378 * source has been registered. The caller holds the clocksource_mutex.
379 */
380void timekeeping_notify(struct clocksource *clock)
381{
382 if (timekeeper.clock == clock)
197 return; 383 return;
384 stop_machine(change_clocksource, clock, NULL);
385 tick_clock_notify();
386}
198 387
199 clocksource_forward_now(); 388#else /* GENERIC_TIME */
200 389
201 if (clocksource_enable(new)) 390static inline void timekeeping_forward_now(void) { }
202 return;
203 391
204 new->raw_time = clock->raw_time; 392/**
205 old = clock; 393 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 394 *
207 clocksource_disable(old); 395 * returns the time in ktime_t format
396 */
397ktime_t ktime_get(void)
398{
399 struct timespec now;
208 400
209 clock->cycle_last = 0; 401 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 402
215 tick_clock_notify(); 403 return timespec_to_ktime(now);
404}
405EXPORT_SYMBOL_GPL(ktime_get);
216 406
217 /* 407/**
218 * We're holding xtime lock and waking up klogd would deadlock 408 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 409 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 410 *
221 clock->name); 411 * The function calculates the monotonic clock from the realtime
222 */ 412 * clock and the wall_to_monotonic offset and stores the result
413 * in normalized timespec format in the variable pointed to by @ts.
414 */
415void ktime_get_ts(struct timespec *ts)
416{
417 struct timespec tomono;
418 unsigned long seq;
419
420 do {
421 seq = read_seqbegin(&xtime_lock);
422 getnstimeofday(ts);
423 tomono = wall_to_monotonic;
424
425 } while (read_seqretry(&xtime_lock, seq));
426
427 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
428 ts->tv_nsec + tomono.tv_nsec);
223} 429}
224#else 430EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 431
226static inline void change_clocksource(void) { } 432#endif /* !GENERIC_TIME */
227#endif 433
434/**
435 * ktime_get_real - get the real (wall-) time in ktime_t format
436 *
437 * returns the time in ktime_t format
438 */
439ktime_t ktime_get_real(void)
440{
441 struct timespec now;
442
443 getnstimeofday(&now);
444
445 return timespec_to_ktime(now);
446}
447EXPORT_SYMBOL_GPL(ktime_get_real);
228 448
229/** 449/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 450 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +456,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 456{
237 unsigned long seq; 457 unsigned long seq;
238 s64 nsecs; 458 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 459
241 do { 460 do {
242 seq = read_seqbegin(&xtime_lock); 461 seq = read_seqbegin(&xtime_lock);
243 462 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 463 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 464
255 } while (read_seqretry(&xtime_lock, seq)); 465 } while (read_seqretry(&xtime_lock, seq));
256 466
@@ -270,7 +480,7 @@ int timekeeping_valid_for_hres(void)
270 do { 480 do {
271 seq = read_seqbegin(&xtime_lock); 481 seq = read_seqbegin(&xtime_lock);
272 482
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 483 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 484
275 } while (read_seqretry(&xtime_lock, seq)); 485 } while (read_seqretry(&xtime_lock, seq));
276 486
@@ -278,17 +488,33 @@ int timekeeping_valid_for_hres(void)
278} 488}
279 489
280/** 490/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 491 * read_persistent_clock - Return time from the persistent clock.
282 * 492 *
283 * Weak dummy function for arches that do not yet support it. 493 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 494 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 495 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 496 *
287 * XXX - Do be sure to remove it once all arches implement it. 497 * XXX - Do be sure to remove it once all arches implement it.
288 */ 498 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 499void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 500{
291 return 0; 501 ts->tv_sec = 0;
502 ts->tv_nsec = 0;
503}
504
505/**
506 * read_boot_clock - Return time of the system start.
507 *
508 * Weak dummy function for arches that do not yet support it.
509 * Function to read the exact time the system has been started.
510 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
511 *
512 * XXX - Do be sure to remove it once all arches implement it.
513 */
514void __attribute__((weak)) read_boot_clock(struct timespec *ts)
515{
516 ts->tv_sec = 0;
517 ts->tv_nsec = 0;
292} 518}
293 519
294/* 520/*
@@ -296,29 +522,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 522 */
297void __init timekeeping_init(void) 523void __init timekeeping_init(void)
298{ 524{
525 struct clocksource *clock;
299 unsigned long flags; 526 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 527 struct timespec now, boot;
528
529 read_persistent_clock(&now);
530 read_boot_clock(&boot);
301 531
302 write_seqlock_irqsave(&xtime_lock, flags); 532 write_seqlock_irqsave(&xtime_lock, flags);
303 533
304 ntp_init(); 534 ntp_init();
305 535
306 clock = clocksource_get_next(); 536 clock = clocksource_default_clock();
307 clocksource_enable(clock); 537 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 538 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 539 timekeeper_setup_internals(clock);
310 540
311 xtime.tv_sec = sec; 541 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 542 xtime.tv_nsec = now.tv_nsec;
543 raw_time.tv_sec = 0;
544 raw_time.tv_nsec = 0;
545 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
546 boot.tv_sec = xtime.tv_sec;
547 boot.tv_nsec = xtime.tv_nsec;
548 }
313 set_normalized_timespec(&wall_to_monotonic, 549 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 550 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 551 update_xtime_cache(0);
316 total_sleep_time = 0; 552 total_sleep_time.tv_sec = 0;
553 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 554 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 555}
319 556
320/* time in seconds when suspend began */ 557/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 558static struct timespec timekeeping_suspend_time;
322 559
323/** 560/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 561 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +568,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 568static int timekeeping_resume(struct sys_device *dev)
332{ 569{
333 unsigned long flags; 570 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 571 struct timespec ts;
572
573 read_persistent_clock(&ts);
335 574
336 clocksource_resume(); 575 clocksource_resume();
337 576
338 write_seqlock_irqsave(&xtime_lock, flags); 577 write_seqlock_irqsave(&xtime_lock, flags);
339 578
340 if (now && (now > timekeeping_suspend_time)) { 579 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 580 ts = timespec_sub(ts, timekeeping_suspend_time);
342 581 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 582 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 583 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 584 }
347 update_xtime_cache(0); 585 update_xtime_cache(0);
348 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
349 clock->cycle_last = 0; 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 588 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 589 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 590 write_sequnlock_irqrestore(&xtime_lock, flags);
354 591
@@ -366,10 +603,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 603{
367 unsigned long flags; 604 unsigned long flags;
368 605
369 timekeeping_suspend_time = read_persistent_clock(); 606 read_persistent_clock(&timekeeping_suspend_time);
370 607
371 write_seqlock_irqsave(&xtime_lock, flags); 608 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 609 timekeeping_forward_now();
373 timekeeping_suspended = 1; 610 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&xtime_lock, flags);
375 612
@@ -404,7 +641,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 641 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 642 * to compensate for late or lost adjustments.
406 */ 643 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 644static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 645 s64 *offset)
409{ 646{
410 s64 tick_error, i; 647 s64 tick_error, i;
@@ -420,7 +657,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 657 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 658 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 659 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 660 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 661 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 662 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 663 error2 >>= 2;
@@ -429,8 +666,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 666 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 667 * remove the single look ahead already included in the error.
431 */ 668 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 669 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 670 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 671 error = ((error - tick_error) >> look_ahead) + tick_error;
435 672
436 /* Finally calculate the adjustment shift value. */ 673 /* Finally calculate the adjustment shift value. */
@@ -455,18 +692,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 692 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 693 * for other values we can do a bit more work.
457 */ 694 */
458static void clocksource_adjust(s64 offset) 695static void timekeeping_adjust(s64 offset)
459{ 696{
460 s64 error, interval = clock->cycle_interval; 697 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 698 int adj;
462 699
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 700 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 701 if (error > interval) {
465 error >>= 2; 702 error >>= 2;
466 if (likely(error <= interval)) 703 if (likely(error <= interval))
467 adj = 1; 704 adj = 1;
468 else 705 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 706 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 707 } else if (error < -interval) {
471 error >>= 2; 708 error >>= 2;
472 if (likely(error >= -interval)) { 709 if (likely(error >= -interval)) {
@@ -474,15 +711,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 711 interval = -interval;
475 offset = -offset; 712 offset = -offset;
476 } else 713 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 714 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 715 } else
479 return; 716 return;
480 717
481 clock->mult += adj; 718 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 719 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 720 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 721 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 722 timekeeper.ntp_error_shift;
486} 723}
487 724
488/** 725/**
@@ -492,53 +729,59 @@ static void clocksource_adjust(s64 offset)
492 */ 729 */
493void update_wall_time(void) 730void update_wall_time(void)
494{ 731{
732 struct clocksource *clock;
495 cycle_t offset; 733 cycle_t offset;
734 u64 nsecs;
496 735
497 /* Make sure we're fully resumed: */ 736 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 737 if (unlikely(timekeeping_suspended))
499 return; 738 return;
500 739
740 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 741#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 742 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 743#else
504 offset = clock->cycle_interval; 744 offset = timekeeper.cycle_interval;
505#endif 745#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 747
508 /* normally this loop will run just once, however in the 748 /* normally this loop will run just once, however in the
509 * case of lost or late ticks, it will accumulate correctly. 749 * case of lost or late ticks, it will accumulate correctly.
510 */ 750 */
511 while (offset >= clock->cycle_interval) { 751 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
753
512 /* accumulate one interval */ 754 /* accumulate one interval */
513 offset -= clock->cycle_interval; 755 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 756 clock->cycle_last += timekeeper.cycle_interval;
515 757
516 clock->xtime_nsec += clock->xtime_interval; 758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 759 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 760 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 761 xtime.tv_sec++;
520 second_overflow(); 762 second_overflow();
521 } 763 }
522 764
523 clock->raw_time.tv_nsec += clock->raw_interval; 765 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 767 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 768 raw_time.tv_sec++;
527 } 769 }
528 770
529 /* accumulate error between NTP and clock interval */ 771 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 772 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
532 } 775 }
533 776
534 /* correct the clock when NTP error is too big */ 777 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 778 timekeeping_adjust(offset);
536 779
537 /* 780 /*
538 * Since in the loop above, we accumulate any amount of time 781 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, its possible for 782 * in xtime_nsec over a second into xtime.tv_sec, its possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 783 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 784 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * its possible the required corrective factor to xtime_nsec could 785 * its possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 786 * cause it to underflow.
544 * 787 *
@@ -550,24 +793,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 793 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 794 * xtime_nsec is not as small.
552 */ 795 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 796 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 797 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 798 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 799 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 800 }
558 801
559 /* store full nanoseconds into xtime after rounding it up and 802 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 803 * add the remainder to the error difference.
561 */ 804 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 805 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 806 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 807 timekeeper.ntp_error += timekeeper.xtime_nsec <<
808 timekeeper.ntp_error_shift;
565 809
566 update_xtime_cache(cyc2ns(clock, offset)); 810 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
811 update_xtime_cache(nsecs);
567 812
568 /* check to see if there is a new clocksource to use */ 813 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 814 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 815}
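
Illustrative sketch (not part of this patch) of the accumulation loop in update_wall_time() above: whole cycle_intervals are folded into xtime in shifted nanoseconds, a seconds overflow bumps tv_sec, and whatever is left of offset waits for the next tick. Same invented numbers as the timekeeper_setup_internals example (1 cycle == 1 ns, shift 22, 1 ms interval); the ntp_error and raw_time bookkeeping is omitted.

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		const uint32_t shift = 22;
		const uint64_t cycle_interval = 1000000;		/* cycles */
		const uint64_t xtime_interval = cycle_interval << shift; /* shifted ns */
		const uint64_t nsecps = (uint64_t)1000000000 << shift;

		uint64_t offset = 2500000;	/* 2.5 ms of unaccumulated cycles */
		uint64_t xtime_sec = 0;
		uint64_t xtime_nsec = (uint64_t)999500000 << shift;	/* 999.5 ms */

		while (offset >= cycle_interval) {
			offset -= cycle_interval;
			xtime_nsec += xtime_interval;
			if (xtime_nsec >= nsecps) {
				xtime_nsec -= nsecps;
				xtime_sec++;	/* second_overflow() would run here */
			}
		}
		printf("accumulated %llu s, %llu ns, %llu cycles left over\n",
		       (unsigned long long)xtime_sec,
		       (unsigned long long)(xtime_nsec >> shift),
		       (unsigned long long)offset);
		/* prints: accumulated 1 s, 1500000 ns, 500000 cycles left over */
		return 0;
	}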
572 816
573/** 817/**
@@ -583,9 +827,12 @@ void update_wall_time(void)
583 */ 827 */
584void getboottime(struct timespec *ts) 828void getboottime(struct timespec *ts)
585{ 829{
586 set_normalized_timespec(ts, 830 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 831 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 832 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
833 };
834
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 836}
590 837
591/** 838/**
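getboottime() now sums wall_to_monotonic and total_sleep_time as a timespec before negating it, and set_normalized_timespec() folds any nanosecond overflow or underflow back into the seconds field. A small userspace sketch of that normalization, using made-up values:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { long sec; long nsec; };

static struct ts normalize(long sec, long nsec)
{
	while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; sec++; }
	while (nsec < 0)             { nsec += NSEC_PER_SEC; sec--; }
	return (struct ts){ sec, nsec };
}

int main(void)
{
	/* made-up stand-ins for wall_to_monotonic and total_sleep_time */
	struct ts boot = normalize(-(5L + 2L), -(900000000L + 300000000L));

	printf("boottime = %ld s + %ld ns\n", boot.sec, boot.nsec);
	return 0;
}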
@@ -594,7 +841,7 @@ void getboottime(struct timespec *ts)
594 */ 841 */
595void monotonic_to_bootbased(struct timespec *ts) 842void monotonic_to_bootbased(struct timespec *ts)
596{ 843{
597 ts->tv_sec += total_sleep_time; 844 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 845}
599 846
600unsigned long get_seconds(void) 847unsigned long get_seconds(void)
@@ -603,6 +850,10 @@ unsigned long get_seconds(void)
603} 850}
604EXPORT_SYMBOL(get_seconds); 851EXPORT_SYMBOL(get_seconds);
605 852
853struct timespec __current_kernel_time(void)
854{
855 return xtime_cache;
856}
606 857
607struct timespec current_kernel_time(void) 858struct timespec current_kernel_time(void)
608{ 859{
@@ -618,3 +869,20 @@ struct timespec current_kernel_time(void)
618 return now; 869 return now;
619} 870}
620EXPORT_SYMBOL(current_kernel_time); 871EXPORT_SYMBOL(current_kernel_time);
872
873struct timespec get_monotonic_coarse(void)
874{
875 struct timespec now, mono;
876 unsigned long seq;
877
878 do {
879 seq = read_seqbegin(&xtime_lock);
880
881 now = xtime_cache;
882 mono = wall_to_monotonic;
883 } while (read_seqretry(&xtime_lock, seq));
884
885 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
886 now.tv_nsec + mono.tv_nsec);
887 return now;
888}
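get_monotonic_coarse() uses the usual xtime_lock read side: snapshot the protected values, then retry if a writer touched them in the meantime. A stripped-down userspace analogue of that retry loop is below; the names are hypothetical and it elides the memory barriers and the writer side that a real seqlock provides.

#include <stdatomic.h>

struct coarse_time { long sec; long nsec; };

static _Atomic unsigned int my_seq;   /* even: idle, odd: writer active */
static struct coarse_time my_time;    /* stands in for xtime_cache + wall_to_monotonic */

static struct coarse_time read_coarse(void)
{
	struct coarse_time snap;
	unsigned int start;

	do {
		do {
			start = atomic_load(&my_seq);
		} while (start & 1);                   /* writer in progress, spin */
		snap = my_time;                        /* copy the protected data */
	} while (atomic_load(&my_seq) != start);       /* a writer intervened, retry */

	return snap;
}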
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..1b5b7aa2fdfd 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
275 return single_open(filp, timer_list_show, NULL); 275 return single_open(filp, timer_list_show, NULL);
276} 276}
277 277
278static struct file_operations timer_list_fops = { 278static const struct file_operations timer_list_fops = {
279 .open = timer_list_open, 279 .open = timer_list_open,
280 .read = seq_read, 280 .read = seq_read,
281 .llseek = seq_lseek, 281 .llseek = seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..ee5681f8d7ec 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
395 return single_open(filp, tstats_show, NULL); 395 return single_open(filp, tstats_show, NULL);
396} 396}
397 397
398static struct file_operations tstats_fops = { 398static const struct file_operations tstats_fops = {
399 .open = tstats_open, 399 .open = tstats_open,
400 .read = seq_read, 400 .read = seq_read,
401 .write = tstats_write, 401 .write = tstats_write,
diff --git a/kernel/timer.c b/kernel/timer.c
index a3d25f415019..5db5a8d26811 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42 42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -46,6 +46,9 @@
46#include <asm/timex.h> 46#include <asm/timex.h>
47#include <asm/io.h> 47#include <asm/io.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/timer.h>
51
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 52u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50 53
51EXPORT_SYMBOL(jiffies_64); 54EXPORT_SYMBOL(jiffies_64);
@@ -72,6 +75,7 @@ struct tvec_base {
72 spinlock_t lock; 75 spinlock_t lock;
73 struct timer_list *running_timer; 76 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 77 unsigned long timer_jiffies;
78 unsigned long next_timer;
75 struct tvec_root tv1; 79 struct tvec_root tv1;
76 struct tvec tv2; 80 struct tvec tv2;
77 struct tvec tv3; 81 struct tvec tv3;
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
520static inline void debug_timer_deactivate(struct timer_list *timer) { } 524static inline void debug_timer_deactivate(struct timer_list *timer) { }
521#endif 525#endif
522 526
527static inline void debug_init(struct timer_list *timer)
528{
529 debug_timer_init(timer);
530 trace_timer_init(timer);
531}
532
533static inline void
534debug_activate(struct timer_list *timer, unsigned long expires)
535{
536 debug_timer_activate(timer);
537 trace_timer_start(timer, expires);
538}
539
540static inline void debug_deactivate(struct timer_list *timer)
541{
542 debug_timer_deactivate(timer);
543 trace_timer_cancel(timer);
544}
545
523static void __init_timer(struct timer_list *timer, 546static void __init_timer(struct timer_list *timer,
524 const char *name, 547 const char *name,
525 struct lock_class_key *key) 548 struct lock_class_key *key)
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
548 const char *name, 571 const char *name,
549 struct lock_class_key *key) 572 struct lock_class_key *key)
550{ 573{
551 debug_timer_init(timer); 574 debug_init(timer);
552 __init_timer(timer, name, key); 575 __init_timer(timer, name, key);
553} 576}
554EXPORT_SYMBOL(init_timer_key); 577EXPORT_SYMBOL(init_timer_key);
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
567{ 590{
568 struct list_head *entry = &timer->entry; 591 struct list_head *entry = &timer->entry;
569 592
570 debug_timer_deactivate(timer); 593 debug_deactivate(timer);
571 594
572 __list_del(entry->prev, entry->next); 595 __list_del(entry->prev, entry->next);
573 if (clear_pending) 596 if (clear_pending)
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 645
623 if (timer_pending(timer)) { 646 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 647 detach_timer(timer, 0);
648 if (timer->expires == base->next_timer &&
649 !tbase_get_deferrable(timer->base))
650 base->next_timer = base->timer_jiffies;
625 ret = 1; 651 ret = 1;
626 } else { 652 } else {
627 if (pending_only) 653 if (pending_only)
628 goto out_unlock; 654 goto out_unlock;
629 } 655 }
630 656
631 debug_timer_activate(timer); 657 debug_activate(timer, expires);
632 658
633 new_base = __get_cpu_var(tvec_bases); 659 new_base = __get_cpu_var(tvec_bases);
634 660
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 689 }
664 690
665 timer->expires = expires; 691 timer->expires = expires;
692 if (time_before(timer->expires, base->next_timer) &&
693 !tbase_get_deferrable(timer->base))
694 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 695 internal_add_timer(base, timer);
667 696
668out_unlock: 697out_unlock:
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
780 BUG_ON(timer_pending(timer) || !timer->function); 809 BUG_ON(timer_pending(timer) || !timer->function);
781 spin_lock_irqsave(&base->lock, flags); 810 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 811 timer_set_base(timer, base);
783 debug_timer_activate(timer); 812 debug_activate(timer, timer->expires);
813 if (time_before(timer->expires, base->next_timer) &&
814 !tbase_get_deferrable(timer->base))
815 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 816 internal_add_timer(base, timer);
785 /* 817 /*
786 * Check whether the other CPU is idle and needs to be 818 * Check whether the other CPU is idle and needs to be
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 849 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 850 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 851 detach_timer(timer, 1);
852 if (timer->expires == base->next_timer &&
853 !tbase_get_deferrable(timer->base))
854 base->next_timer = base->timer_jiffies;
820 ret = 1; 855 ret = 1;
821 } 856 }
822 spin_unlock_irqrestore(&base->lock, flags); 857 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 885 ret = 0;
851 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 887 detach_timer(timer, 1);
888 if (timer->expires == base->next_timer &&
889 !tbase_get_deferrable(timer->base))
890 base->next_timer = base->timer_jiffies;
853 ret = 1; 891 ret = 1;
854 } 892 }
855out: 893out:
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
984 */ 1022 */
985 lock_map_acquire(&lockdep_map); 1023 lock_map_acquire(&lockdep_map);
986 1024
1025 trace_timer_expire_entry(timer);
987 fn(data); 1026 fn(data);
1027 trace_timer_expire_exit(timer);
988 1028
989 lock_map_release(&lockdep_map); 1029 lock_map_release(&lockdep_map);
990 1030
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1047#ifdef CONFIG_NO_HZ
1008/* 1048/*
1009 * Find out when the next timer event is due to happen. This 1049 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1050 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1051 * This function needs to be called with interrupts disabled.
1012 */ 1052 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1053static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1054{
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1174 unsigned long expires;
1135 1175
1136 spin_lock(&base->lock); 1176 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1177 if (time_before_eq(base->next_timer, base->timer_jiffies))
1178 base->next_timer = __next_timer_interrupt(base);
1179 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1180 spin_unlock(&base->lock);
1139 1181
1140 if (time_before_eq(expires, now)) 1182 if (time_before_eq(expires, now))
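The next_timer field added to struct tvec_base above is a lazily maintained cache of the earliest pending expiry: adding an earlier timer lowers it, removing the timer it points at resets it to timer_jiffies (meaning "unknown"), and get_next_timer_interrupt() only redoes the expensive cascade walk when the cache is stale. A userspace sketch of that pattern with hypothetical names; the kernel compares jiffies with time_before()/time_before_eq() to cope with wraparound, which the plain comparisons below do not:

struct timer_base {
	unsigned long timer_jiffies;    /* also serves as the "cache is stale" marker */
	unsigned long next_timer;       /* cached earliest expiry */
};

/* Stand-in for the expensive scan done by __next_timer_interrupt(). */
static unsigned long scan_all_timers(struct timer_base *base)
{
	return base->timer_jiffies + 100;
}

static void cache_on_add(struct timer_base *base, unsigned long expires)
{
	if (expires < base->next_timer)
		base->next_timer = expires;
}

static void cache_on_remove(struct timer_base *base, unsigned long expires)
{
	if (expires == base->next_timer)
		base->next_timer = base->timer_jiffies;   /* invalidate */
}

static unsigned long next_event(struct timer_base *base)
{
	if (base->next_timer <= base->timer_jiffies)      /* stale, rescan */
		base->next_timer = scan_all_timers(base);
	return base->next_timer;
}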
@@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
1169{ 1211{
1170 struct tvec_base *base = __get_cpu_var(tvec_bases); 1212 struct tvec_base *base = __get_cpu_var(tvec_bases);
1171 1213
1172 perf_counter_do_pending(); 1214 perf_event_do_pending();
1173 1215
1174 hrtimer_run_pending(); 1216 hrtimer_run_pending();
1175 1217
@@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1522 INIT_LIST_HEAD(base->tv1.vec + j); 1564 INIT_LIST_HEAD(base->tv1.vec + j);
1523 1565
1524 base->timer_jiffies = jiffies; 1566 base->timer_jiffies = jiffies;
1567 base->next_timer = base->timer_jiffies;
1525 return 0; 1568 return 0;
1526} 1569}
1527 1570
@@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1534 timer = list_first_entry(head, struct timer_list, entry); 1577 timer = list_first_entry(head, struct timer_list, entry);
1535 detach_timer(timer, 0); 1578 detach_timer(timer, 0);
1536 timer_set_base(timer, new_base); 1579 timer_set_base(timer, new_base);
1580 if (time_before(timer->expires, new_base->next_timer) &&
1581 !tbase_get_deferrable(timer->base))
1582 new_base->next_timer = timer->expires;
1537 internal_add_timer(new_base, timer); 1583 internal_add_timer(new_base, timer);
1538 } 1584 }
1539} 1585}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5efeb4229ea0..06c3d5be6759 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_SYSCALL_TRACEPOINTS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
73# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
74# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
75# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
76# hidding of the automatic options options. 86# hidding of the automatic options.
77 87
78config TRACING 88config TRACING
79 bool 89 bool
@@ -490,6 +500,18 @@ config FTRACE_STARTUP_TEST
490 functioning properly. It will do tests on all the configured 500 functioning properly. It will do tests on all the configured
491 tracers of ftrace. 501 tracers of ftrace.
492 502
503config EVENT_TRACE_TEST_SYSCALLS
504 bool "Run selftest on syscall events"
505 depends on FTRACE_STARTUP_TEST
506 help
507 This option will also enable testing every syscall event.
508 It only enables the event and disables it and runs various loads
509 with the event enabled. This adds a bit more time for kernel boot
510 up since it runs this on every system call defined.
511
512 TBD - enable a way to actually call the syscalls as we test their
513 events
514
493config MMIOTRACE 515config MMIOTRACE
494 bool "Memory mapped IO tracing" 516 bool "Memory mapped IO tracing"
495 depends on HAVE_MMIOTRACE_SUPPORT && PCI 517 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index ce3b1cd02732..0f84c52e58fe 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -55,5 +54,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 56obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
57obj-$(CONFIG_EVENT_TRACING) += power-traces.o
58 58
59libftrace-y := ftrace.o 59libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3eb159c277c8..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
856} 856}
857 857
858/** 858/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
860 * @q: queue the io is for
861 * @rq: the source request
862 * @dev: target device
863 * @from: source sector
864 *
865 * Description:
866 * Device mapper remaps request to other devices.
867 * Add a trace for that action.
868 *
869 **/
870static void blk_add_trace_rq_remap(struct request_queue *q,
871 struct request *rq, dev_t dev,
872 sector_t from)
873{
874 struct blk_trace *bt = q->blk_trace;
875 struct blk_io_trace_remap r;
876
877 if (likely(!bt))
878 return;
879
880 r.device_from = cpu_to_be32(dev);
881 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
882 r.sector_from = cpu_to_be64(from);
883
884 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
885 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
886 sizeof(r), &r);
887}
888
889/**
859 * blk_add_driver_data - Add binary message with driver-specific data 890 * blk_add_driver_data - Add binary message with driver-specific data
860 * @q: queue the io is for 891 * @q: queue the io is for
861 * @rq: io request 892 * @rq: io request
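The new blk_add_trace_rq_remap() mirrors the existing bio remap handler: the payload is written with cpu_to_be32()/cpu_to_be64() so the record has one defined byte order and a consumer can decode it on any architecture. A userspace sketch of the same packing idea, with htobe32()/htobe64() standing in for the kernel helpers and hypothetical field names:

#include <endian.h>
#include <stdint.h>

struct remap_payload {
	uint32_t device_from;
	uint32_t device_to;
	uint64_t sector_from;
};

static void pack_remap(struct remap_payload *r, uint32_t from_dev,
		       uint32_t to_dev, uint64_t from_sector)
{
	r->device_from = htobe32(from_dev);
	r->device_to   = htobe32(to_dev);
	r->sector_from = htobe64(from_sector);
}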
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
922 WARN_ON(ret); 953 WARN_ON(ret);
923 ret = register_trace_block_remap(blk_add_trace_remap); 954 ret = register_trace_block_remap(blk_add_trace_remap);
924 WARN_ON(ret); 955 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
957 WARN_ON(ret);
925} 958}
926 959
927static void blk_unregister_tracepoints(void) 960static void blk_unregister_tracepoints(void)
928{ 961{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
929 unregister_trace_block_remap(blk_add_trace_remap); 963 unregister_trace_block_remap(blk_add_trace_remap);
930 unregister_trace_block_split(blk_add_trace_split); 964 unregister_trace_block_split(blk_add_trace_split);
931 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
@@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev)
1657 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1691 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1658} 1692}
1659 1693
1694void blk_trace_remove_sysfs(struct device *dev)
1695{
1696 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
1697}
1698
1660#endif /* CONFIG_BLK_DEV_IO_TRACE */ 1699#endif /* CONFIG_BLK_DEV_IO_TRACE */
1661 1700
1662#ifdef CONFIG_EVENT_TRACING 1701#ifdef CONFIG_EVENT_TRACING
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c804e24f96f..b10c0d90a6ff 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -60,6 +60,13 @@ static int last_ftrace_enabled;
60/* Quick disabling of function tracer. */ 60/* Quick disabling of function tracer. */
61int function_trace_stop; 61int function_trace_stop;
62 62
63/* List for set_ftrace_pid's pids. */
64LIST_HEAD(ftrace_pids);
65struct ftrace_pid {
66 struct list_head list;
67 struct pid *pid;
68};
69
63/* 70/*
64 * ftrace_disabled is set when an anomaly is discovered. 71 * ftrace_disabled is set when an anomaly is discovered.
65 * ftrace_disabled is much stronger than ftrace_enabled. 72 * ftrace_disabled is much stronger than ftrace_enabled.
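The single ftrace_pid_trace pointer is being replaced here by a list, so set_ftrace_pid can filter on several pids at once; entries are de-duplicated when added (see ftrace_pid_add() further down in this patch). A userspace sketch of that shape, with hypothetical names and no locking:

#include <stdlib.h>

struct traced_pid {
	struct traced_pid *next;
	int pid;
};

static struct traced_pid *traced_pids;    /* was: a single traced pid */

static int traced_pid_add(int pid)
{
	struct traced_pid *p;

	for (p = traced_pids; p; p = p->next)
		if (p->pid == pid)
			return 0;             /* already being traced */

	p = malloc(sizeof(*p));
	if (!p)
		return -1;
	p->pid = pid;
	p->next = traced_pids;
	traced_pids = p;
	return 0;
}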
@@ -78,6 +85,10 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
78ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 85ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
79ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 86ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
80 87
88#ifdef CONFIG_FUNCTION_GRAPH_TRACER
89static int ftrace_set_func(unsigned long *array, int *idx, char *buffer);
90#endif
91
81static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 92static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
82{ 93{
83 struct ftrace_ops *op = ftrace_list; 94 struct ftrace_ops *op = ftrace_list;
@@ -155,7 +166,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
155 else 166 else
156 func = ftrace_list_func; 167 func = ftrace_list_func;
157 168
158 if (ftrace_pid_trace) { 169 if (!list_empty(&ftrace_pids)) {
159 set_ftrace_pid_function(func); 170 set_ftrace_pid_function(func);
160 func = ftrace_pid_func; 171 func = ftrace_pid_func;
161 } 172 }
@@ -203,7 +214,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
203 if (ftrace_list->next == &ftrace_list_end) { 214 if (ftrace_list->next == &ftrace_list_end) {
204 ftrace_func_t func = ftrace_list->func; 215 ftrace_func_t func = ftrace_list->func;
205 216
206 if (ftrace_pid_trace) { 217 if (!list_empty(&ftrace_pids)) {
207 set_ftrace_pid_function(func); 218 set_ftrace_pid_function(func);
208 func = ftrace_pid_func; 219 func = ftrace_pid_func;
209 } 220 }
@@ -225,9 +236,13 @@ static void ftrace_update_pid_func(void)
225 if (ftrace_trace_function == ftrace_stub) 236 if (ftrace_trace_function == ftrace_stub)
226 return; 237 return;
227 238
239#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
228 func = ftrace_trace_function; 240 func = ftrace_trace_function;
241#else
242 func = __ftrace_trace_function;
243#endif
229 244
230 if (ftrace_pid_trace) { 245 if (!list_empty(&ftrace_pids)) {
231 set_ftrace_pid_function(func); 246 set_ftrace_pid_function(func);
232 func = ftrace_pid_func; 247 func = ftrace_pid_func;
233 } else { 248 } else {
@@ -817,8 +832,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
817} 832}
818#endif /* CONFIG_FUNCTION_PROFILER */ 833#endif /* CONFIG_FUNCTION_PROFILER */
819 834
820/* set when tracing only a pid */
821struct pid *ftrace_pid_trace;
822static struct pid * const ftrace_swapper_pid = &init_struct_pid; 835static struct pid * const ftrace_swapper_pid = &init_struct_pid;
823 836
824#ifdef CONFIG_DYNAMIC_FTRACE 837#ifdef CONFIG_DYNAMIC_FTRACE
@@ -1074,14 +1087,9 @@ static void ftrace_replace_code(int enable)
1074 failed = __ftrace_replace_code(rec, enable); 1087 failed = __ftrace_replace_code(rec, enable);
1075 if (failed) { 1088 if (failed) {
1076 rec->flags |= FTRACE_FL_FAILED; 1089 rec->flags |= FTRACE_FL_FAILED;
1077 if ((system_state == SYSTEM_BOOTING) || 1090 ftrace_bug(failed, rec->ip);
1078 !core_kernel_text(rec->ip)) { 1091 /* Stop processing */
1079 ftrace_free_rec(rec); 1092 return;
1080 } else {
1081 ftrace_bug(failed, rec->ip);
1082 /* Stop processing */
1083 return;
1084 }
1085 } 1093 }
1086 } while_for_each_ftrace_rec(); 1094 } while_for_each_ftrace_rec();
1087} 1095}
@@ -1262,12 +1270,34 @@ static int ftrace_update_code(struct module *mod)
1262 ftrace_new_addrs = p->newlist; 1270 ftrace_new_addrs = p->newlist;
1263 p->flags = 0L; 1271 p->flags = 0L;
1264 1272
1265 /* convert record (i.e, patch mcount-call with NOP) */ 1273 /*
1266 if (ftrace_code_disable(mod, p)) { 1274 * Do the initial record convertion from mcount jump
1267 p->flags |= FTRACE_FL_CONVERTED; 1275 * to the NOP instructions.
1268 ftrace_update_cnt++; 1276 */
1269 } else 1277 if (!ftrace_code_disable(mod, p)) {
1270 ftrace_free_rec(p); 1278 ftrace_free_rec(p);
1279 continue;
1280 }
1281
1282 p->flags |= FTRACE_FL_CONVERTED;
1283 ftrace_update_cnt++;
1284
1285 /*
1286 * If the tracing is enabled, go ahead and enable the record.
1287 *
1288 * The reason not to enable the record immediatelly is the
1289 * inherent check of ftrace_make_nop/ftrace_make_call for
1290 * correct previous instructions. Making first the NOP
1291 * conversion puts the module to the correct state, thus
1292 * passing the ftrace_make_call check.
1293 */
1294 if (ftrace_start_up) {
1295 int failed = __ftrace_replace_code(p, 1);
1296 if (failed) {
1297 ftrace_bug(failed, p->ip);
1298 ftrace_free_rec(p);
1299 }
1300 }
1271 } 1301 }
1272 1302
1273 stop = ftrace_now(raw_smp_processor_id()); 1303 stop = ftrace_now(raw_smp_processor_id());
@@ -1323,11 +1353,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1323 1353
1324enum { 1354enum {
1325 FTRACE_ITER_FILTER = (1 << 0), 1355 FTRACE_ITER_FILTER = (1 << 0),
1326 FTRACE_ITER_CONT = (1 << 1), 1356 FTRACE_ITER_NOTRACE = (1 << 1),
1327 FTRACE_ITER_NOTRACE = (1 << 2), 1357 FTRACE_ITER_FAILURES = (1 << 2),
1328 FTRACE_ITER_FAILURES = (1 << 3), 1358 FTRACE_ITER_PRINTALL = (1 << 3),
1329 FTRACE_ITER_PRINTALL = (1 << 4), 1359 FTRACE_ITER_HASH = (1 << 4),
1330 FTRACE_ITER_HASH = (1 << 5),
1331}; 1360};
1332 1361
1333#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1362#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1337,8 +1366,7 @@ struct ftrace_iterator {
1337 int hidx; 1366 int hidx;
1338 int idx; 1367 int idx;
1339 unsigned flags; 1368 unsigned flags;
1340 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1369 struct trace_parser parser;
1341 unsigned buffer_idx;
1342}; 1370};
1343 1371
1344static void * 1372static void *
@@ -1407,7 +1435,7 @@ static int t_hash_show(struct seq_file *m, void *v)
1407 if (rec->ops->print) 1435 if (rec->ops->print)
1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1436 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1409 1437
1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); 1438 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1411 1439
1412 if (rec->data) 1440 if (rec->data)
1413 seq_printf(m, ":%p", rec->data); 1441 seq_printf(m, ":%p", rec->data);
@@ -1517,12 +1545,12 @@ static int t_show(struct seq_file *m, void *v)
1517 if (!rec) 1545 if (!rec)
1518 return 0; 1546 return 0;
1519 1547
1520 seq_printf(m, "%pf\n", (void *)rec->ip); 1548 seq_printf(m, "%ps\n", (void *)rec->ip);
1521 1549
1522 return 0; 1550 return 0;
1523} 1551}
1524 1552
1525static struct seq_operations show_ftrace_seq_ops = { 1553static const struct seq_operations show_ftrace_seq_ops = {
1526 .start = t_start, 1554 .start = t_start,
1527 .next = t_next, 1555 .next = t_next,
1528 .stop = t_stop, 1556 .stop = t_stop,
@@ -1604,6 +1632,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1604 if (!iter) 1632 if (!iter)
1605 return -ENOMEM; 1633 return -ENOMEM;
1606 1634
1635 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1636 kfree(iter);
1637 return -ENOMEM;
1638 }
1639
1607 mutex_lock(&ftrace_regex_lock); 1640 mutex_lock(&ftrace_regex_lock);
1608 if ((file->f_mode & FMODE_WRITE) && 1641 if ((file->f_mode & FMODE_WRITE) &&
1609 (file->f_flags & O_TRUNC)) 1642 (file->f_flags & O_TRUNC))
@@ -1618,8 +1651,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1618 if (!ret) { 1651 if (!ret) {
1619 struct seq_file *m = file->private_data; 1652 struct seq_file *m = file->private_data;
1620 m->private = iter; 1653 m->private = iter;
1621 } else 1654 } else {
1655 trace_parser_put(&iter->parser);
1622 kfree(iter); 1656 kfree(iter);
1657 }
1623 } else 1658 } else
1624 file->private_data = iter; 1659 file->private_data = iter;
1625 mutex_unlock(&ftrace_regex_lock); 1660 mutex_unlock(&ftrace_regex_lock);
@@ -1652,60 +1687,6 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1652 return ret; 1687 return ret;
1653} 1688}
1654 1689
1655enum {
1656 MATCH_FULL,
1657 MATCH_FRONT_ONLY,
1658 MATCH_MIDDLE_ONLY,
1659 MATCH_END_ONLY,
1660};
1661
1662/*
1663 * (static function - no need for kernel doc)
1664 *
1665 * Pass in a buffer containing a glob and this function will
1666 * set search to point to the search part of the buffer and
1667 * return the type of search it is (see enum above).
1668 * This does modify buff.
1669 *
1670 * Returns enum type.
1671 * search returns the pointer to use for comparison.
1672 * not returns 1 if buff started with a '!'
1673 * 0 otherwise.
1674 */
1675static int
1676ftrace_setup_glob(char *buff, int len, char **search, int *not)
1677{
1678 int type = MATCH_FULL;
1679 int i;
1680
1681 if (buff[0] == '!') {
1682 *not = 1;
1683 buff++;
1684 len--;
1685 } else
1686 *not = 0;
1687
1688 *search = buff;
1689
1690 for (i = 0; i < len; i++) {
1691 if (buff[i] == '*') {
1692 if (!i) {
1693 *search = buff + 1;
1694 type = MATCH_END_ONLY;
1695 } else {
1696 if (type == MATCH_END_ONLY)
1697 type = MATCH_MIDDLE_ONLY;
1698 else
1699 type = MATCH_FRONT_ONLY;
1700 buff[i] = 0;
1701 break;
1702 }
1703 }
1704 }
1705
1706 return type;
1707}
1708
1709static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1710{ 1691{
1711 int matched = 0; 1692 int matched = 0;
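The removed ftrace_setup_glob() lives on as filter_parse_regex() in the events filter code: the pattern is classified once (full match, prefix, substring, or suffix, with a leading '!' meaning negate), and ftrace_match() then compares each symbol against the pre-classified form. A compact userspace sketch of that two-step scheme; the MATCH_* names mirror the enum deleted above and the suffix case is simplified to a plain ends-with test:

#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

static int classify(char *glob, char **search, int *not)
{
	int type = MATCH_FULL;
	int i, len = strlen(glob);

	*not = (glob[0] == '!');
	if (*not) {
		glob++;
		len--;
	}
	*search = glob;

	for (i = 0; i < len; i++) {
		if (glob[i] != '*')
			continue;
		if (i == 0) {                          /* "*foo"  -> suffix match */
			*search = glob + 1;
			type = MATCH_END_ONLY;
		} else {                               /* "foo*" or "*foo*" */
			type = (type == MATCH_END_ONLY) ?
				MATCH_MIDDLE_ONLY : MATCH_FRONT_ONLY;
			glob[i] = '\0';
			break;
		}
	}
	return type;
}

static int match(const char *str, const char *search, int type)
{
	size_t slen = strlen(search), len = strlen(str);

	switch (type) {
	case MATCH_FULL:        return strcmp(str, search) == 0;
	case MATCH_FRONT_ONLY:  return strncmp(str, search, slen) == 0;
	case MATCH_MIDDLE_ONLY: return strstr(str, search) != NULL;
	case MATCH_END_ONLY:    return len >= slen &&
					strcmp(str + len - slen, search) == 0;
	}
	return 0;
}

int main(void)
{
	char pattern[] = "sched_*";
	char *search;
	int not, type = classify(pattern, &search, &not);

	printf("type=%d match=%d\n", type, match("sched_switch", search, type));
	return 0;
}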
@@ -1754,7 +1735,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1754 int not; 1735 int not;
1755 1736
1756 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1757 type = ftrace_setup_glob(buff, len, &search, &not); 1738 type = filter_parse_regex(buff, len, &search, &not);
1758 1739
1759 search_len = strlen(search); 1740 search_len = strlen(search);
1760 1741
@@ -1822,7 +1803,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1822 } 1803 }
1823 1804
1824 if (strlen(buff)) { 1805 if (strlen(buff)) {
1825 type = ftrace_setup_glob(buff, strlen(buff), &search, &not); 1806 type = filter_parse_regex(buff, strlen(buff), &search, &not);
1826 search_len = strlen(search); 1807 search_len = strlen(search);
1827 } 1808 }
1828 1809
@@ -1987,7 +1968,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1987 int count = 0; 1968 int count = 0;
1988 char *search; 1969 char *search;
1989 1970
1990 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 1971 type = filter_parse_regex(glob, strlen(glob), &search, &not);
1991 len = strlen(search); 1972 len = strlen(search);
1992 1973
1993 /* we do not support '!' for function probes */ 1974 /* we do not support '!' for function probes */
@@ -2059,12 +2040,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2059 int i, len = 0; 2040 int i, len = 0;
2060 char *search; 2041 char *search;
2061 2042
2062 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2043 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2063 glob = NULL; 2044 glob = NULL;
2064 else { 2045 else if (glob) {
2065 int not; 2046 int not;
2066 2047
2067 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2048 type = filter_parse_regex(glob, strlen(glob), &search, &not);
2068 len = strlen(search); 2049 len = strlen(search);
2069 2050
2070 /* we do not support '!' for function probes */ 2051 /* we do not support '!' for function probes */
@@ -2196,11 +2177,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos, int enable) 2177 size_t cnt, loff_t *ppos, int enable)
2197{ 2178{
2198 struct ftrace_iterator *iter; 2179 struct ftrace_iterator *iter;
2199 char ch; 2180 struct trace_parser *parser;
2200 size_t read = 0; 2181 ssize_t ret, read;
2201 ssize_t ret;
2202 2182
2203 if (!cnt || cnt < 0) 2183 if (!cnt)
2204 return 0; 2184 return 0;
2205 2185
2206 mutex_lock(&ftrace_regex_lock); 2186 mutex_lock(&ftrace_regex_lock);
@@ -2211,72 +2191,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2211 } else 2191 } else
2212 iter = file->private_data; 2192 iter = file->private_data;
2213 2193
2214 if (!*ppos) { 2194 parser = &iter->parser;
2215 iter->flags &= ~FTRACE_ITER_CONT; 2195 read = trace_get_user(parser, ubuf, cnt, ppos);
2216 iter->buffer_idx = 0;
2217 }
2218
2219 ret = get_user(ch, ubuf++);
2220 if (ret)
2221 goto out;
2222 read++;
2223 cnt--;
2224 2196
2225 /* 2197 if (read >= 0 && trace_parser_loaded(parser) &&
2226 * If the parser haven't finished with the last write, 2198 !trace_parser_cont(parser)) {
2227 * continue reading the user input without skipping spaces. 2199 ret = ftrace_process_regex(parser->buffer,
2228 */ 2200 parser->idx, enable);
2229 if (!(iter->flags & FTRACE_ITER_CONT)) {
2230 /* skip white space */
2231 while (cnt && isspace(ch)) {
2232 ret = get_user(ch, ubuf++);
2233 if (ret)
2234 goto out;
2235 read++;
2236 cnt--;
2237 }
2238
2239 /* only spaces were written */
2240 if (isspace(ch)) {
2241 *ppos += read;
2242 ret = read;
2243 goto out;
2244 }
2245
2246 iter->buffer_idx = 0;
2247 }
2248
2249 while (cnt && !isspace(ch)) {
2250 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2251 iter->buffer[iter->buffer_idx++] = ch;
2252 else {
2253 ret = -EINVAL;
2254 goto out;
2255 }
2256 ret = get_user(ch, ubuf++);
2257 if (ret) 2201 if (ret)
2258 goto out; 2202 goto out;
2259 read++;
2260 cnt--;
2261 }
2262 2203
2263 if (isspace(ch)) { 2204 trace_parser_clear(parser);
2264 iter->buffer[iter->buffer_idx] = 0;
2265 ret = ftrace_process_regex(iter->buffer,
2266 iter->buffer_idx, enable);
2267 if (ret)
2268 goto out;
2269 iter->buffer_idx = 0;
2270 } else {
2271 iter->flags |= FTRACE_ITER_CONT;
2272 iter->buffer[iter->buffer_idx++] = ch;
2273 } 2205 }
2274 2206
2275 *ppos += read;
2276 ret = read; 2207 ret = read;
2277 out:
2278 mutex_unlock(&ftrace_regex_lock);
2279 2208
2209 mutex_unlock(&ftrace_regex_lock);
2210out:
2280 return ret; 2211 return ret;
2281} 2212}
2282 2213
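ftrace_regex_write() now delegates the character-by-character copying to the shared trace_parser helpers: the parser pulls one whitespace-delimited token per write and remembers whether the token was cut off so the next write can continue it. A small userspace sketch of that idea follows; the names are hypothetical, and the real trace_parser additionally reports whether anything was loaded and rejects oversized tokens instead of silently truncating them.

#include <ctype.h>
#include <stddef.h>

struct parser {
	char   buffer[128];
	size_t idx;
	int    cont;             /* token continues in the next chunk */
};

/* Feed one chunk of user input; returns the number of bytes consumed. */
static size_t parser_feed(struct parser *p, const char *buf, size_t len)
{
	size_t i = 0;

	if (!p->cont) {
		p->idx = 0;
		while (i < len && isspace((unsigned char)buf[i]))
			i++;                           /* skip leading whitespace */
	}

	while (i < len && !isspace((unsigned char)buf[i]) &&
	       p->idx < sizeof(p->buffer) - 1)
		p->buffer[p->idx++] = buf[i++];

	p->cont = (i == len);                          /* no delimiter seen yet */
	p->buffer[p->idx] = '\0';
	return i;
}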
@@ -2343,6 +2274,7 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
2343#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE 2274#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE
2344static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; 2275static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
2345static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; 2276static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
2277static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
2346 2278
2347static int __init set_ftrace_notrace(char *str) 2279static int __init set_ftrace_notrace(char *str)
2348{ 2280{
@@ -2358,6 +2290,31 @@ static int __init set_ftrace_filter(char *str)
2358} 2290}
2359__setup("ftrace_filter=", set_ftrace_filter); 2291__setup("ftrace_filter=", set_ftrace_filter);
2360 2292
2293#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2294static int __init set_graph_function(char *str)
2295{
2296 strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
2297 return 1;
2298}
2299__setup("ftrace_graph_filter=", set_graph_function);
2300
2301static void __init set_ftrace_early_graph(char *buf)
2302{
2303 int ret;
2304 char *func;
2305
2306 while (buf) {
2307 func = strsep(&buf, ",");
2308 /* we allow only one expression at a time */
2309 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2310 func);
2311 if (ret)
2312 printk(KERN_DEBUG "ftrace: function %s not "
2313 "traceable\n", func);
2314 }
2315}
2316#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2317
2361static void __init set_ftrace_early_filter(char *buf, int enable) 2318static void __init set_ftrace_early_filter(char *buf, int enable)
2362{ 2319{
2363 char *func; 2320 char *func;
@@ -2374,6 +2331,10 @@ static void __init set_ftrace_early_filters(void)
2374 set_ftrace_early_filter(ftrace_filter_buf, 1); 2331 set_ftrace_early_filter(ftrace_filter_buf, 1);
2375 if (ftrace_notrace_buf[0]) 2332 if (ftrace_notrace_buf[0])
2376 set_ftrace_early_filter(ftrace_notrace_buf, 0); 2333 set_ftrace_early_filter(ftrace_notrace_buf, 0);
2334#ifdef CONFIG_FUNCTION_GRAPH_TRACER
2335 if (ftrace_graph_buf[0])
2336 set_ftrace_early_graph(ftrace_graph_buf);
2337#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2377} 2338}
2378 2339
2379static int 2340static int
@@ -2381,6 +2342,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2381{ 2342{
2382 struct seq_file *m = (struct seq_file *)file->private_data; 2343 struct seq_file *m = (struct seq_file *)file->private_data;
2383 struct ftrace_iterator *iter; 2344 struct ftrace_iterator *iter;
2345 struct trace_parser *parser;
2384 2346
2385 mutex_lock(&ftrace_regex_lock); 2347 mutex_lock(&ftrace_regex_lock);
2386 if (file->f_mode & FMODE_READ) { 2348 if (file->f_mode & FMODE_READ) {
@@ -2390,9 +2352,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2390 } else 2352 } else
2391 iter = file->private_data; 2353 iter = file->private_data;
2392 2354
2393 if (iter->buffer_idx) { 2355 parser = &iter->parser;
2394 iter->buffer[iter->buffer_idx] = 0; 2356 if (trace_parser_loaded(parser)) {
2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2357 parser->buffer[parser->idx] = 0;
2358 ftrace_match_records(parser->buffer, parser->idx, enable);
2396 } 2359 }
2397 2360
2398 mutex_lock(&ftrace_lock); 2361 mutex_lock(&ftrace_lock);
@@ -2400,7 +2363,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2400 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2363 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2401 mutex_unlock(&ftrace_lock); 2364 mutex_unlock(&ftrace_lock);
2402 2365
2366 trace_parser_put(parser);
2403 kfree(iter); 2367 kfree(iter);
2368
2404 mutex_unlock(&ftrace_regex_lock); 2369 mutex_unlock(&ftrace_regex_lock);
2405 return 0; 2370 return 0;
2406} 2371}
@@ -2457,11 +2422,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2457static void * 2422static void *
2458__g_next(struct seq_file *m, loff_t *pos) 2423__g_next(struct seq_file *m, loff_t *pos)
2459{ 2424{
2460 unsigned long *array = m->private;
2461
2462 if (*pos >= ftrace_graph_count) 2425 if (*pos >= ftrace_graph_count)
2463 return NULL; 2426 return NULL;
2464 return &array[*pos]; 2427 return &ftrace_graph_funcs[*pos];
2465} 2428}
2466 2429
2467static void * 2430static void *
@@ -2499,12 +2462,12 @@ static int g_show(struct seq_file *m, void *v)
2499 return 0; 2462 return 0;
2500 } 2463 }
2501 2464
2502 seq_printf(m, "%pf\n", v); 2465 seq_printf(m, "%ps\n", (void *)*ptr);
2503 2466
2504 return 0; 2467 return 0;
2505} 2468}
2506 2469
2507static struct seq_operations ftrace_graph_seq_ops = { 2470static const struct seq_operations ftrace_graph_seq_ops = {
2508 .start = g_start, 2471 .start = g_start,
2509 .next = g_next, 2472 .next = g_next,
2510 .stop = g_stop, 2473 .stop = g_stop,
@@ -2525,16 +2488,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2525 ftrace_graph_count = 0; 2488 ftrace_graph_count = 0;
2526 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2489 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2527 } 2490 }
2491 mutex_unlock(&graph_lock);
2528 2492
2529 if (file->f_mode & FMODE_READ) { 2493 if (file->f_mode & FMODE_READ)
2530 ret = seq_open(file, &ftrace_graph_seq_ops); 2494 ret = seq_open(file, &ftrace_graph_seq_ops);
2531 if (!ret) {
2532 struct seq_file *m = file->private_data;
2533 m->private = ftrace_graph_funcs;
2534 }
2535 } else
2536 file->private_data = ftrace_graph_funcs;
2537 mutex_unlock(&graph_lock);
2538 2495
2539 return ret; 2496 return ret;
2540} 2497}
@@ -2563,7 +2520,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2563 return -ENODEV; 2520 return -ENODEV;
2564 2521
2565 /* decode regex */ 2522 /* decode regex */
2566 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not); 2523 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2567 if (not) 2524 if (not)
2568 return -EINVAL; 2525 return -EINVAL;
2569 2526
@@ -2602,12 +2559,8 @@ static ssize_t
2602ftrace_graph_write(struct file *file, const char __user *ubuf, 2559ftrace_graph_write(struct file *file, const char __user *ubuf,
2603 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2604{ 2561{
2605 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2562 struct trace_parser parser;
2606 unsigned long *array; 2563 ssize_t read, ret;
2607 size_t read = 0;
2608 ssize_t ret;
2609 int index = 0;
2610 char ch;
2611 2564
2612 if (!cnt || cnt < 0) 2565 if (!cnt || cnt < 0)
2613 return 0; 2566 return 0;
@@ -2616,60 +2569,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2616 2569
2617 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { 2570 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2618 ret = -EBUSY; 2571 ret = -EBUSY;
2619 goto out; 2572 goto out_unlock;
2620 } 2573 }
2621 2574
2622 if (file->f_mode & FMODE_READ) { 2575 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2623 struct seq_file *m = file->private_data; 2576 ret = -ENOMEM;
2624 array = m->private; 2577 goto out_unlock;
2625 } else
2626 array = file->private_data;
2627
2628 ret = get_user(ch, ubuf++);
2629 if (ret)
2630 goto out;
2631 read++;
2632 cnt--;
2633
2634 /* skip white space */
2635 while (cnt && isspace(ch)) {
2636 ret = get_user(ch, ubuf++);
2637 if (ret)
2638 goto out;
2639 read++;
2640 cnt--;
2641 } 2578 }
2642 2579
2643 if (isspace(ch)) { 2580 read = trace_get_user(&parser, ubuf, cnt, ppos);
2644 *ppos += read;
2645 ret = read;
2646 goto out;
2647 }
2648 2581
2649 while (cnt && !isspace(ch)) { 2582 if (read >= 0 && trace_parser_loaded((&parser))) {
2650 if (index < FTRACE_BUFF_MAX) 2583 parser.buffer[parser.idx] = 0;
2651 buffer[index++] = ch; 2584
2652 else { 2585 /* we allow only one expression at a time */
2653 ret = -EINVAL; 2586 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2654 goto out; 2587 parser.buffer);
2655 }
2656 ret = get_user(ch, ubuf++);
2657 if (ret) 2588 if (ret)
2658 goto out; 2589 goto out_free;
2659 read++;
2660 cnt--;
2661 } 2590 }
2662 buffer[index] = 0;
2663
2664 /* we allow only one expression at a time */
2665 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2666 if (ret)
2667 goto out;
2668
2669 file->f_pos += read;
2670 2591
2671 ret = read; 2592 ret = read;
2672 out: 2593
2594out_free:
2595 trace_parser_put(&parser);
2596out_unlock:
2673 mutex_unlock(&graph_lock); 2597 mutex_unlock(&graph_lock);
2674 2598
2675 return ret; 2599 return ret;
@@ -2707,7 +2631,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
2707 return 0; 2631 return 0;
2708} 2632}
2709 2633
2710static int ftrace_convert_nops(struct module *mod, 2634static int ftrace_process_locs(struct module *mod,
2711 unsigned long *start, 2635 unsigned long *start,
2712 unsigned long *end) 2636 unsigned long *end)
2713{ 2637{
@@ -2740,19 +2664,17 @@ static int ftrace_convert_nops(struct module *mod,
2740} 2664}
2741 2665
2742#ifdef CONFIG_MODULES 2666#ifdef CONFIG_MODULES
2743void ftrace_release(void *start, void *end) 2667void ftrace_release_mod(struct module *mod)
2744{ 2668{
2745 struct dyn_ftrace *rec; 2669 struct dyn_ftrace *rec;
2746 struct ftrace_page *pg; 2670 struct ftrace_page *pg;
2747 unsigned long s = (unsigned long)start;
2748 unsigned long e = (unsigned long)end;
2749 2671
2750 if (ftrace_disabled || !start || start == end) 2672 if (ftrace_disabled)
2751 return; 2673 return;
2752 2674
2753 mutex_lock(&ftrace_lock); 2675 mutex_lock(&ftrace_lock);
2754 do_for_each_ftrace_rec(pg, rec) { 2676 do_for_each_ftrace_rec(pg, rec) {
2755 if ((rec->ip >= s) && (rec->ip < e)) { 2677 if (within_module_core(rec->ip, mod)) {
2756 /* 2678 /*
2757 * rec->ip is changed in ftrace_free_rec() 2679 * rec->ip is changed in ftrace_free_rec()
2758 * It should not between s and e if record was freed. 2680 * It should not between s and e if record was freed.
@@ -2769,7 +2691,7 @@ static void ftrace_init_module(struct module *mod,
2769{ 2691{
2770 if (ftrace_disabled || start == end) 2692 if (ftrace_disabled || start == end)
2771 return; 2693 return;
2772 ftrace_convert_nops(mod, start, end); 2694 ftrace_process_locs(mod, start, end);
2773} 2695}
2774 2696
2775static int ftrace_module_notify(struct notifier_block *self, 2697static int ftrace_module_notify(struct notifier_block *self,
@@ -2784,9 +2706,7 @@ static int ftrace_module_notify(struct notifier_block *self,
2784 mod->num_ftrace_callsites); 2706 mod->num_ftrace_callsites);
2785 break; 2707 break;
2786 case MODULE_STATE_GOING: 2708 case MODULE_STATE_GOING:
2787 ftrace_release(mod->ftrace_callsites, 2709 ftrace_release_mod(mod);
2788 mod->ftrace_callsites +
2789 mod->num_ftrace_callsites);
2790 break; 2710 break;
2791 } 2711 }
2792 2712
@@ -2832,7 +2752,7 @@ void __init ftrace_init(void)
2832 2752
2833 last_ftrace_enabled = ftrace_enabled = 1; 2753 last_ftrace_enabled = ftrace_enabled = 1;
2834 2754
2835 ret = ftrace_convert_nops(NULL, 2755 ret = ftrace_process_locs(NULL,
2836 __start_mcount_loc, 2756 __start_mcount_loc,
2837 __stop_mcount_loc); 2757 __stop_mcount_loc);
2838 2758
@@ -2865,23 +2785,6 @@ static inline void ftrace_startup_enable(int command) { }
2865# define ftrace_shutdown_sysctl() do { } while (0) 2785# define ftrace_shutdown_sysctl() do { } while (0)
2866#endif /* CONFIG_DYNAMIC_FTRACE */ 2786#endif /* CONFIG_DYNAMIC_FTRACE */
2867 2787
2868static ssize_t
2869ftrace_pid_read(struct file *file, char __user *ubuf,
2870 size_t cnt, loff_t *ppos)
2871{
2872 char buf[64];
2873 int r;
2874
2875 if (ftrace_pid_trace == ftrace_swapper_pid)
2876 r = sprintf(buf, "swapper tasks\n");
2877 else if (ftrace_pid_trace)
2878 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
2879 else
2880 r = sprintf(buf, "no pid\n");
2881
2882 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2883}
2884
2885static void clear_ftrace_swapper(void) 2788static void clear_ftrace_swapper(void)
2886{ 2789{
2887 struct task_struct *p; 2790 struct task_struct *p;
@@ -2932,14 +2835,12 @@ static void set_ftrace_pid(struct pid *pid)
2932 rcu_read_unlock(); 2835 rcu_read_unlock();
2933} 2836}
2934 2837
2935static void clear_ftrace_pid_task(struct pid **pid) 2838static void clear_ftrace_pid_task(struct pid *pid)
2936{ 2839{
2937 if (*pid == ftrace_swapper_pid) 2840 if (pid == ftrace_swapper_pid)
2938 clear_ftrace_swapper(); 2841 clear_ftrace_swapper();
2939 else 2842 else
2940 clear_ftrace_pid(*pid); 2843 clear_ftrace_pid(pid);
2941
2942 *pid = NULL;
2943} 2844}
2944 2845
2945static void set_ftrace_pid_task(struct pid *pid) 2846static void set_ftrace_pid_task(struct pid *pid)
@@ -2950,11 +2851,140 @@ static void set_ftrace_pid_task(struct pid *pid)
2950 set_ftrace_pid(pid); 2851 set_ftrace_pid(pid);
2951} 2852}
2952 2853
2854static int ftrace_pid_add(int p)
2855{
2856 struct pid *pid;
2857 struct ftrace_pid *fpid;
2858 int ret = -EINVAL;
2859
2860 mutex_lock(&ftrace_lock);
2861
2862 if (!p)
2863 pid = ftrace_swapper_pid;
2864 else
2865 pid = find_get_pid(p);
2866
2867 if (!pid)
2868 goto out;
2869
2870 ret = 0;
2871
2872 list_for_each_entry(fpid, &ftrace_pids, list)
2873 if (fpid->pid == pid)
2874 goto out_put;
2875
2876 ret = -ENOMEM;
2877
2878 fpid = kmalloc(sizeof(*fpid), GFP_KERNEL);
2879 if (!fpid)
2880 goto out_put;
2881
2882 list_add(&fpid->list, &ftrace_pids);
2883 fpid->pid = pid;
2884
2885 set_ftrace_pid_task(pid);
2886
2887 ftrace_update_pid_func();
2888 ftrace_startup_enable(0);
2889
2890 mutex_unlock(&ftrace_lock);
2891 return 0;
2892
2893out_put:
2894 if (pid != ftrace_swapper_pid)
2895 put_pid(pid);
2896
2897out:
2898 mutex_unlock(&ftrace_lock);
2899 return ret;
2900}
2901
2902static void ftrace_pid_reset(void)
2903{
2904 struct ftrace_pid *fpid, *safe;
2905
2906 mutex_lock(&ftrace_lock);
2907 list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) {
2908 struct pid *pid = fpid->pid;
2909
2910 clear_ftrace_pid_task(pid);
2911
2912 list_del(&fpid->list);
2913 kfree(fpid);
2914 }
2915
2916 ftrace_update_pid_func();
2917 ftrace_startup_enable(0);
2918
2919 mutex_unlock(&ftrace_lock);
2920}
2921
2922static void *fpid_start(struct seq_file *m, loff_t *pos)
2923{
2924 mutex_lock(&ftrace_lock);
2925
2926 if (list_empty(&ftrace_pids) && (!*pos))
2927 return (void *) 1;
2928
2929 return seq_list_start(&ftrace_pids, *pos);
2930}
2931
2932static void *fpid_next(struct seq_file *m, void *v, loff_t *pos)
2933{
2934 if (v == (void *)1)
2935 return NULL;
2936
2937 return seq_list_next(v, &ftrace_pids, pos);
2938}
2939
2940static void fpid_stop(struct seq_file *m, void *p)
2941{
2942 mutex_unlock(&ftrace_lock);
2943}
2944
2945static int fpid_show(struct seq_file *m, void *v)
2946{
2947 const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list);
2948
2949 if (v == (void *)1) {
2950 seq_printf(m, "no pid\n");
2951 return 0;
2952 }
2953
2954 if (fpid->pid == ftrace_swapper_pid)
2955 seq_printf(m, "swapper tasks\n");
2956 else
2957 seq_printf(m, "%u\n", pid_vnr(fpid->pid));
2958
2959 return 0;
2960}
2961
2962static const struct seq_operations ftrace_pid_sops = {
2963 .start = fpid_start,
2964 .next = fpid_next,
2965 .stop = fpid_stop,
2966 .show = fpid_show,
2967};
2968
2969static int
2970ftrace_pid_open(struct inode *inode, struct file *file)
2971{
2972 int ret = 0;
2973
2974 if ((file->f_mode & FMODE_WRITE) &&
2975 (file->f_flags & O_TRUNC))
2976 ftrace_pid_reset();
2977
2978 if (file->f_mode & FMODE_READ)
2979 ret = seq_open(file, &ftrace_pid_sops);
2980
2981 return ret;
2982}
2983
2953static ssize_t 2984static ssize_t
2954ftrace_pid_write(struct file *filp, const char __user *ubuf, 2985ftrace_pid_write(struct file *filp, const char __user *ubuf,
2955 size_t cnt, loff_t *ppos) 2986 size_t cnt, loff_t *ppos)
2956{ 2987{
2957 struct pid *pid;
2958 char buf[64]; 2988 char buf[64];
2959 long val; 2989 long val;
2960 int ret; 2990 int ret;
@@ -2967,57 +2997,38 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
2967 2997
2968 buf[cnt] = 0; 2998 buf[cnt] = 0;
2969 2999
3000 /*
3001 * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid"
3002 * to clean the filter quietly.
3003 */
3004 strstrip(buf);
3005 if (strlen(buf) == 0)
3006 return 1;
3007
2970 ret = strict_strtol(buf, 10, &val); 3008 ret = strict_strtol(buf, 10, &val);
2971 if (ret < 0) 3009 if (ret < 0)
2972 return ret; 3010 return ret;
2973 3011
2974 mutex_lock(&ftrace_lock); 3012 ret = ftrace_pid_add(val);
2975 if (val < 0) {
2976 /* disable pid tracing */
2977 if (!ftrace_pid_trace)
2978 goto out;
2979
2980 clear_ftrace_pid_task(&ftrace_pid_trace);
2981
2982 } else {
2983 /* swapper task is special */
2984 if (!val) {
2985 pid = ftrace_swapper_pid;
2986 if (pid == ftrace_pid_trace)
2987 goto out;
2988 } else {
2989 pid = find_get_pid(val);
2990
2991 if (pid == ftrace_pid_trace) {
2992 put_pid(pid);
2993 goto out;
2994 }
2995 }
2996
2997 if (ftrace_pid_trace)
2998 clear_ftrace_pid_task(&ftrace_pid_trace);
2999
3000 if (!pid)
3001 goto out;
3002 3013
3003 ftrace_pid_trace = pid; 3014 return ret ? ret : cnt;
3004 3015}
3005 set_ftrace_pid_task(ftrace_pid_trace);
3006 }
3007
3008 /* update the function call */
3009 ftrace_update_pid_func();
3010 ftrace_startup_enable(0);
3011 3016
3012 out: 3017static int
3013 mutex_unlock(&ftrace_lock); 3018ftrace_pid_release(struct inode *inode, struct file *file)
3019{
3020 if (file->f_mode & FMODE_READ)
3021 seq_release(inode, file);
3014 3022
3015 return cnt; 3023 return 0;
3016} 3024}
3017 3025
3018static const struct file_operations ftrace_pid_fops = { 3026static const struct file_operations ftrace_pid_fops = {
3019 .read = ftrace_pid_read, 3027 .open = ftrace_pid_open,
3020 .write = ftrace_pid_write, 3028 .write = ftrace_pid_write,
3029 .read = seq_read,
3030 .llseek = seq_lseek,
3031 .release = ftrace_pid_release,
3021}; 3032};
3022 3033
3023static __init int ftrace_init_debugfs(void) 3034static __init int ftrace_init_debugfs(void)
@@ -3100,7 +3111,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3100 3111
3101int 3112int
3102ftrace_enable_sysctl(struct ctl_table *table, int write, 3113ftrace_enable_sysctl(struct ctl_table *table, int write,
3103 struct file *file, void __user *buffer, size_t *lenp, 3114 void __user *buffer, size_t *lenp,
3104 loff_t *ppos) 3115 loff_t *ppos)
3105{ 3116{
3106 int ret; 3117 int ret;
@@ -3110,7 +3121,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3110 3121
3111 mutex_lock(&ftrace_lock); 3122 mutex_lock(&ftrace_lock);
3112 3123
3113 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3124 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3114 3125
3115 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3126 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3116 goto out; 3127 goto out;
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 81b1645c8549..a91da69f153a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void)
501 return 1; 501 return 1;
502 } 502 }
503 503
504 if (!register_tracer(&kmem_tracer)) { 504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n"); 505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1; 506 return 1;
507 } 507 }
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 454e74e718cf..e43c928356ee 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -399,18 +397,21 @@ int ring_buffer_print_page_header(struct trace_seq *s)
399 int ret; 397 int ret;
400 398
401 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" 399 ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
402 "offset:0;\tsize:%u;\n", 400 "offset:0;\tsize:%u;\tsigned:%u;\n",
403 (unsigned int)sizeof(field.time_stamp)); 401 (unsigned int)sizeof(field.time_stamp),
402 (unsigned int)is_signed_type(u64));
404 403
405 ret = trace_seq_printf(s, "\tfield: local_t commit;\t" 404 ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
406 "offset:%u;\tsize:%u;\n", 405 "offset:%u;\tsize:%u;\tsigned:%u;\n",
407 (unsigned int)offsetof(typeof(field), commit), 406 (unsigned int)offsetof(typeof(field), commit),
408 (unsigned int)sizeof(field.commit)); 407 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long));
409 409
410 ret = trace_seq_printf(s, "\tfield: char data;\t" 410 ret = trace_seq_printf(s, "\tfield: char data;\t"
411 "offset:%u;\tsize:%u;\n", 411 "offset:%u;\tsize:%u;\tsigned:%u;\n",
412 (unsigned int)offsetof(typeof(field), data), 412 (unsigned int)offsetof(typeof(field), data),
413 (unsigned int)BUF_PAGE_SIZE); 413 (unsigned int)BUF_PAGE_SIZE,
414 (unsigned int)is_signed_type(char));
414 415
415 return ret; 416 return ret;
416} 417}
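The extra "signed:%u" column in the exported page-header format comes from is_signed_type(), which boils down to asking whether casting -1 to the type yields a negative value. A userspace sketch with one possible spelling of the macro (the kernel's exact definition may differ):

#include <stdio.h>

#define is_signed_type(type)	(((type)(-1)) < (type)1)

int main(void)
{
	printf("u64  signed:%u\n", (unsigned int)is_signed_type(unsigned long long));
	printf("long signed:%u\n", (unsigned int)is_signed_type(long));
	printf("char signed:%u\n", (unsigned int)is_signed_type(char));
	return 0;
}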
@@ -701,8 +702,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
701 702
702 val &= ~RB_FLAG_MASK; 703 val &= ~RB_FLAG_MASK;
703 704
704 ret = (unsigned long)cmpxchg(&list->next, 705 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag); 706 val | old_flag, val | new_flag);
706 707
707 /* check if the reader took the page */ 708 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val) 709 if ((ret & ~RB_FLAG_MASK) != val)
@@ -794,7 +795,7 @@ static int rb_head_page_replace(struct buffer_page *old,
794 val = *ptr & ~RB_FLAG_MASK; 795 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD; 796 val |= RB_PAGE_HEAD;
796 797
797 ret = cmpxchg(ptr, val, &new->list); 798 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798 799
799 return ret == val; 800 return ret == val;
800} 801}
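The two cmpxchg() call sites above operate on a tagged pointer: the list pointers involved are at least four-byte aligned, so the low bits are free to carry the HEAD/UPDATE flags, and the whole word is compared and swapped atomically as an unsigned long (the hunks only adjust where the unsigned long cast is applied). A userspace sketch of that trick, using a GCC/Clang builtin in place of the kernel's cmpxchg() and locally defined flag values:

#include <stdint.h>

#define RB_FLAG_MASK   3UL
#define RB_PAGE_HEAD   1UL

static int set_head_flag(unsigned long *slot, unsigned long old_flag)
{
	unsigned long val = *slot & ~RB_FLAG_MASK;     /* pointer part only */
	unsigned long ret;

	ret = __sync_val_compare_and_swap(slot, val | old_flag,
					  val | RB_PAGE_HEAD);

	return (ret & ~RB_FLAG_MASK) == val;           /* 0 if someone else changed it */
}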
@@ -2997,15 +2998,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2997} 2998}
2998 2999
2999static struct ring_buffer_event * 3000static struct ring_buffer_event *
3000rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3001rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3001{ 3002{
3002 struct ring_buffer_per_cpu *cpu_buffer;
3003 struct ring_buffer_event *event; 3003 struct ring_buffer_event *event;
3004 struct buffer_page *reader; 3004 struct buffer_page *reader;
3005 int nr_loops = 0; 3005 int nr_loops = 0;
3006 3006
3007 cpu_buffer = buffer->buffers[cpu];
3008
3009 again: 3007 again:
3010 /* 3008 /*
3011 * We repeat when a timestamp is encountered. It is possible 3009 * We repeat when a timestamp is encountered. It is possible
@@ -3049,7 +3047,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3049 case RINGBUF_TYPE_DATA: 3047 case RINGBUF_TYPE_DATA:
3050 if (ts) { 3048 if (ts) {
3051 *ts = cpu_buffer->read_stamp + event->time_delta; 3049 *ts = cpu_buffer->read_stamp + event->time_delta;
3052 ring_buffer_normalize_time_stamp(buffer, 3050 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3053 cpu_buffer->cpu, ts); 3051 cpu_buffer->cpu, ts);
3054 } 3052 }
3055 return event; 3053 return event;
@@ -3168,7 +3166,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3168 local_irq_save(flags); 3166 local_irq_save(flags);
3169 if (dolock) 3167 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock); 3168 spin_lock(&cpu_buffer->reader_lock);
3171 event = rb_buffer_peek(buffer, cpu, ts); 3169 event = rb_buffer_peek(cpu_buffer, ts);
3172 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3170 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer); 3171 rb_advance_reader(cpu_buffer);
3174 if (dolock) 3172 if (dolock)
@@ -3237,7 +3235,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3237 if (dolock) 3235 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3239 3237
3240 event = rb_buffer_peek(buffer, cpu, ts); 3238 event = rb_buffer_peek(cpu_buffer, ts);
3241 if (event) 3239 if (event)
3242 rb_advance_reader(cpu_buffer); 3240 rb_advance_reader(cpu_buffer);
3243 3241
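With the narrower rb_buffer_peek() signature, callers resolve the per-cpu buffer once (they already need it for the reader lock) and pass it straight down instead of handing over buffer plus cpu. Stripped of the dolock/irq handling, the call sites above reduce to this shape:

    struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];

    spin_lock(&cpu_buffer->reader_lock);
    event = rb_buffer_peek(cpu_buffer, ts);	/* no re-lookup of buffers[cpu] inside */
    if (event)
            rb_advance_reader(cpu_buffer);
    spin_unlock(&cpu_buffer->reader_lock);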
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe30..026e715a0c7a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,19 +125,19 @@ int ftrace_dump_on_oops;
125 125
126static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
127 127
128#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
129static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_cmdline_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
136 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
138 return 1; 138 return 1;
139} 139}
140__setup("ftrace=", set_ftrace); 140__setup("ftrace=", set_cmdline_ftrace);
141 141
142static int __init set_ftrace_dump_on_oops(char *str) 142static int __init set_ftrace_dump_on_oops(char *str)
143{ 143{
@@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
242static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
243 243
244/* 244/*
245 * max_tracer_type_len is used to simplify the allocating of
246 * buffers to read userspace tracer names. We keep track of
247 * the longest tracer name registered.
248 */
249static int max_tracer_type_len;
250
251/*
252 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
253 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
254 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
275 */ 268 */
276void trace_wake_up(void) 269void trace_wake_up(void)
277{ 270{
271 int cpu;
272
273 if (trace_flags & TRACE_ITER_BLOCK)
274 return;
278 /* 275 /*
279 * The runqueue_is_locked() can fail, but this is the best we 276 * The runqueue_is_locked() can fail, but this is the best we
280 * have for now: 277 * have for now:
281 */ 278 */
282 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) 279 cpu = get_cpu();
280 if (!runqueue_is_locked(cpu))
283 wake_up(&trace_wait); 281 wake_up(&trace_wait);
282 put_cpu();
284} 283}
285 284
286static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
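Since runqueue_is_locked() now takes a cpu argument, the caller has to stay on one CPU while it asks; get_cpu()/put_cpu() provide that by disabling preemption. The change above boils down to:

    int cpu = get_cpu();		/* disables preemption, returns this CPU's id */

    if (!runqueue_is_locked(cpu))	/* cpu stays valid until put_cpu() */
            wake_up(&trace_wait);
    put_cpu();				/* re-enables preemption */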
@@ -339,6 +338,112 @@ static struct {
339 338
340int trace_clock_id; 339int trace_clock_id;
341 340
341/*
342 * trace_parser_get_init - gets the buffer for trace parser
343 */
344int trace_parser_get_init(struct trace_parser *parser, int size)
345{
346 memset(parser, 0, sizeof(*parser));
347
348 parser->buffer = kmalloc(size, GFP_KERNEL);
349 if (!parser->buffer)
350 return 1;
351
352 parser->size = size;
353 return 0;
354}
355
356/*
357 * trace_parser_put - frees the buffer for trace parser
358 */
359void trace_parser_put(struct trace_parser *parser)
360{
361 kfree(parser->buffer);
362}
363
364/*
365 * trace_get_user - reads the user input string separated by space
366 * (matched by isspace(ch))
367 *
368 * For each string found the 'struct trace_parser' is updated,
369 * and the function returns.
370 *
371 * Returns number of bytes read.
372 *
373 * See kernel/trace/trace.h for 'struct trace_parser' details.
374 */
375int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
376 size_t cnt, loff_t *ppos)
377{
378 char ch;
379 size_t read = 0;
380 ssize_t ret;
381
382 if (!*ppos)
383 trace_parser_clear(parser);
384
385 ret = get_user(ch, ubuf++);
386 if (ret)
387 goto out;
388
389 read++;
390 cnt--;
391
392 /*
393 * The parser is not finished with the last write,
394 * continue reading the user input without skipping spaces.
395 */
396 if (!parser->cont) {
397 /* skip white space */
398 while (cnt && isspace(ch)) {
399 ret = get_user(ch, ubuf++);
400 if (ret)
401 goto out;
402 read++;
403 cnt--;
404 }
405
406 /* only spaces were written */
407 if (isspace(ch)) {
408 *ppos += read;
409 ret = read;
410 goto out;
411 }
412
413 parser->idx = 0;
414 }
415
416 /* read the non-space input */
417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size - 1)
419 parser->buffer[parser->idx++] = ch;
420 else {
421 ret = -EINVAL;
422 goto out;
423 }
424 ret = get_user(ch, ubuf++);
425 if (ret)
426 goto out;
427 read++;
428 cnt--;
429 }
430
431 /* We either got finished input or we have to wait for another call. */
432 if (isspace(ch)) {
433 parser->buffer[parser->idx] = 0;
434 parser->cont = false;
435 } else {
436 parser->cont = true;
437 parser->buffer[parser->idx++] = ch;
438 }
439
440 *ppos += read;
441 ret = read;
442
443out:
444 return ret;
445}
446
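A write handler drives the parser in an init/parse/act sequence; the ftrace_event_write() rewrite further down in this merge uses essentially this shape (ubuf, cnt and ppos are the handler's own parameters, and the token size of 128 is an arbitrary choice for this sketch):

    struct trace_parser parser;
    ssize_t read;

    if (trace_parser_get_init(&parser, 128))	/* max token length, caller's choice */
            return -ENOMEM;

    read = trace_get_user(&parser, ubuf, cnt, ppos);

    /* act once a token has been read into the buffer */
    if (read >= 0 && trace_parser_loaded(&parser)) {
            parser.buffer[parser.idx] = 0;
            /* ... consume parser.buffer here ... */
    }

    trace_parser_put(&parser);
    return read;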
342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 447ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
343{ 448{
344 int len; 449 int len;
@@ -513,7 +618,6 @@ __releases(kernel_lock)
513__acquires(kernel_lock) 618__acquires(kernel_lock)
514{ 619{
515 struct tracer *t; 620 struct tracer *t;
516 int len;
517 int ret = 0; 621 int ret = 0;
518 622
519 if (!type->name) { 623 if (!type->name) {
@@ -521,6 +625,11 @@ __acquires(kernel_lock)
521 return -1; 625 return -1;
522 } 626 }
523 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
524 /* 633 /*
525 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
526 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -535,7 +644,7 @@ __acquires(kernel_lock)
535 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
536 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
537 /* already found */ 646 /* already found */
538 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
539 type->name); 648 type->name);
540 ret = -1; 649 ret = -1;
541 goto out; 650 goto out;
@@ -586,9 +695,6 @@ __acquires(kernel_lock)
586 695
587 type->next = trace_types; 696 type->next = trace_types;
588 trace_types = type; 697 trace_types = type;
589 len = strlen(type->name);
590 if (len > max_tracer_type_len)
591 max_tracer_type_len = len;
592 698
593 out: 699 out:
594 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -597,7 +703,7 @@ __acquires(kernel_lock)
597 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
598 goto out_unlock; 704 goto out_unlock;
599 705
600 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
601 goto out_unlock; 707 goto out_unlock;
602 708
603 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -619,14 +725,13 @@ __acquires(kernel_lock)
619void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
620{ 726{
621 struct tracer **t; 727 struct tracer **t;
622 int len;
623 728
624 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
625 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
626 if (*t == type) 731 if (*t == type)
627 goto found; 732 goto found;
628 } 733 }
629 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
630 goto out; 735 goto out;
631 736
632 found: 737 found:
@@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type)
639 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
640 current_trace = &nop_trace; 745 current_trace = &nop_trace;
641 } 746 }
642 747out:
643 if (strlen(type->name) != max_tracer_type_len)
644 goto out;
645
646 max_tracer_type_len = 0;
647 for (t = &trace_types; *t; t = &(*t)->next) {
648 len = strlen((*t)->name);
649 if (len > max_tracer_type_len)
650 max_tracer_type_len = len;
651 }
652 out:
653 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
654} 749}
655 750
@@ -719,6 +814,11 @@ static void trace_init_cmdlines(void)
719 cmdline_idx = 0; 814 cmdline_idx = 0;
720} 815}
721 816
817int is_tracing_stopped(void)
818{
819 return trace_stop_count;
820}
821
722/** 822/**
723 * ftrace_off_permanent - disable all ftrace code permanently 823 * ftrace_off_permanent - disable all ftrace code permanently
724 * 824 *
@@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
886 986
887 entry->preempt_count = pc & 0xff; 987 entry->preempt_count = pc & 0xff;
888 entry->pid = (tsk) ? tsk->pid : 0; 988 entry->pid = (tsk) ? tsk->pid : 0;
889 entry->tgid = (tsk) ? tsk->tgid : 0; 989 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
890 entry->flags = 990 entry->flags =
891#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 991#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
892 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 992 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1068 return; 1168 return;
1069 entry = ring_buffer_event_data(event); 1169 entry = ring_buffer_event_data(event);
1070 1170
1171 entry->tgid = current->tgid;
1071 memset(&entry->caller, 0, sizeof(entry->caller)); 1172 memset(&entry->caller, 0, sizeof(entry->caller));
1072 1173
1073 trace.nr_entries = 0; 1174 trace.nr_entries = 0;
@@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr,
1094 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1195 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1095 int pc) 1196 int pc)
1096{ 1197{
1198 struct ftrace_event_call *call = &event_special;
1097 struct ring_buffer_event *event; 1199 struct ring_buffer_event *event;
1098 struct trace_array *tr = __tr; 1200 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer; 1201 struct ring_buffer *buffer = tr->buffer;
@@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr,
1107 entry->arg1 = arg1; 1209 entry->arg1 = arg1;
1108 entry->arg2 = arg2; 1210 entry->arg2 = arg2;
1109 entry->arg3 = arg3; 1211 entry->arg3 = arg3;
1110 trace_buffer_unlock_commit(buffer, event, 0, pc); 1212
1213 if (!filter_check_discard(call, entry, buffer, event))
1214 trace_buffer_unlock_commit(buffer, event, 0, pc);
1111} 1215}
1112 1216
1113void 1217void
@@ -1289,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr,
1289 1393
1290int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1291{ 1395{
1292 return trace_array_printk(&global_trace, ip, fmt, args); 1396 return trace_array_vprintk(&global_trace, ip, fmt, args);
1293} 1397}
1294EXPORT_SYMBOL_GPL(trace_vprintk); 1398EXPORT_SYMBOL_GPL(trace_vprintk);
1295 1399
@@ -1530,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m)
1530 seq_puts(m, "# | / _----=> need-resched \n"); 1634 seq_puts(m, "# | / _----=> need-resched \n");
1531 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1635 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1532 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1636 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1533 seq_puts(m, "# |||| / \n"); 1637 seq_puts(m, "# |||| /_--=> lock-depth \n");
1534 seq_puts(m, "# ||||| delay \n"); 1638 seq_puts(m, "# |||||/ delay \n");
1535 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1639 seq_puts(m, "# cmd pid |||||| time | caller \n");
1536 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1640 seq_puts(m, "# \\ / |||||| \\ | / \n");
1537} 1641}
1538 1642
1539static void print_func_help_header(struct seq_file *m) 1643static void print_func_help_header(struct seq_file *m)
@@ -1845,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
1845 return 0; 1949 return 0;
1846} 1950}
1847 1951
1848static struct seq_operations tracer_seq_ops = { 1952static const struct seq_operations tracer_seq_ops = {
1849 .start = s_start, 1953 .start = s_start,
1850 .next = s_next, 1954 .next = s_next,
1851 .stop = s_stop, 1955 .stop = s_stop,
@@ -1880,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file)
1880 if (current_trace) 1984 if (current_trace)
1881 *iter->trace = *current_trace; 1985 *iter->trace = *current_trace;
1882 1986
1883 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) 1987 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
1884 goto fail; 1988 goto fail;
1885 1989
1886 cpumask_clear(iter->started);
1887
1888 if (current_trace && current_trace->print_max) 1990 if (current_trace && current_trace->print_max)
1889 iter->tr = &max_tr; 1991 iter->tr = &max_tr;
1890 else 1992 else
@@ -2059,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v)
2059 return 0; 2161 return 0;
2060} 2162}
2061 2163
2062static struct seq_operations show_traces_seq_ops = { 2164static const struct seq_operations show_traces_seq_ops = {
2063 .start = t_start, 2165 .start = t_start,
2064 .next = t_next, 2166 .next = t_next,
2065 .stop = t_stop, 2167 .stop = t_stop,
@@ -2489,7 +2591,7 @@ static ssize_t
2489tracing_set_trace_read(struct file *filp, char __user *ubuf, 2591tracing_set_trace_read(struct file *filp, char __user *ubuf,
2490 size_t cnt, loff_t *ppos) 2592 size_t cnt, loff_t *ppos)
2491{ 2593{
2492 char buf[max_tracer_type_len+2]; 2594 char buf[MAX_TRACER_SIZE+2];
2493 int r; 2595 int r;
2494 2596
2495 mutex_lock(&trace_types_lock); 2597 mutex_lock(&trace_types_lock);
@@ -2639,15 +2741,15 @@ static ssize_t
2639tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2741tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2640 size_t cnt, loff_t *ppos) 2742 size_t cnt, loff_t *ppos)
2641{ 2743{
2642 char buf[max_tracer_type_len+1]; 2744 char buf[MAX_TRACER_SIZE+1];
2643 int i; 2745 int i;
2644 size_t ret; 2746 size_t ret;
2645 int err; 2747 int err;
2646 2748
2647 ret = cnt; 2749 ret = cnt;
2648 2750
2649 if (cnt > max_tracer_type_len) 2751 if (cnt > MAX_TRACER_SIZE)
2650 cnt = max_tracer_type_len; 2752 cnt = MAX_TRACER_SIZE;
2651 2753
2652 if (copy_from_user(&buf, ubuf, cnt)) 2754 if (copy_from_user(&buf, ubuf, cnt))
2653 return -EFAULT; 2755 return -EFAULT;
@@ -4285,7 +4387,7 @@ __init static int tracer_alloc_buffers(void)
4285 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4286 goto out_free_buffer_mask; 4388 goto out_free_buffer_mask;
4287 4389
4288 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) 4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4289 goto out_free_tracing_cpumask; 4391 goto out_free_tracing_cpumask;
4290 4392
4291 /* To save memory, keep the ring buffer size to its minimum */ 4393 /* To save memory, keep the ring buffer size to its minimum */
@@ -4296,7 +4398,6 @@ __init static int tracer_alloc_buffers(void)
4296 4398
4297 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4399 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4298 cpumask_copy(tracing_cpumask, cpu_all_mask); 4400 cpumask_copy(tracing_cpumask, cpu_all_mask);
4299 cpumask_clear(tracing_reader_cpumask);
4300 4401
4301 /* TODO: make the number of buffers hot pluggable with CPUS */ 4402 /* TODO: make the number of buffers hot pluggable with CPUS */
4302 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4403 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ea7e0bcbd539..91c3d0e9a5a1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,10 +7,10 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 16#include <linux/ftrace_event.h>
@@ -40,164 +40,60 @@ enum trace_type {
40 TRACE_HW_BRANCHES, 40 TRACE_HW_BRANCHES,
41 TRACE_KMEM_ALLOC, 41 TRACE_KMEM_ALLOC,
42 TRACE_KMEM_FREE, 42 TRACE_KMEM_FREE,
43 TRACE_POWER,
44 TRACE_BLK, 43 TRACE_BLK,
45 TRACE_KSYM, 44 TRACE_KSYM,
46 45
47 __TRACE_LAST_TYPE, 46 __TRACE_LAST_TYPE,
48}; 47};
49 48
50/* 49enum kmemtrace_type_id {
51 * Function trace entry - function address and parent function addres: 50 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
52 */ 51 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
53struct ftrace_entry { 52 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
54 struct trace_entry ent;
55 unsigned long ip;
56 unsigned long parent_ip;
57};
58
59/* Function call entry */
60struct ftrace_graph_ent_entry {
61 struct trace_entry ent;
62 struct ftrace_graph_ent graph_ent;
63}; 53};
64 54
65/* Function return entry */
66struct ftrace_graph_ret_entry {
67 struct trace_entry ent;
68 struct ftrace_graph_ret ret;
69};
70extern struct tracer boot_tracer; 55extern struct tracer boot_tracer;
71 56
72/* 57#undef __field
73 * Context switch trace entry - which task (and prio) we switched from/to: 58#define __field(type, item) type item;
74 */
75struct ctx_switch_entry {
76 struct trace_entry ent;
77 unsigned int prev_pid;
78 unsigned char prev_prio;
79 unsigned char prev_state;
80 unsigned int next_pid;
81 unsigned char next_prio;
82 unsigned char next_state;
83 unsigned int next_cpu;
84};
85
86/*
87 * Special (free-form) trace entry:
88 */
89struct special_entry {
90 struct trace_entry ent;
91 unsigned long arg1;
92 unsigned long arg2;
93 unsigned long arg3;
94};
95
96/*
97 * Stack-trace entry:
98 */
99
100#define FTRACE_STACK_ENTRIES 8
101
102struct stack_entry {
103 struct trace_entry ent;
104 unsigned long caller[FTRACE_STACK_ENTRIES];
105};
106
107struct userstack_entry {
108 struct trace_entry ent;
109 unsigned long caller[FTRACE_STACK_ENTRIES];
110};
111
112/*
113 * trace_printk entry:
114 */
115struct bprint_entry {
116 struct trace_entry ent;
117 unsigned long ip;
118 const char *fmt;
119 u32 buf[];
120};
121
122struct print_entry {
123 struct trace_entry ent;
124 unsigned long ip;
125 char buf[];
126};
127
128#define TRACE_OLD_SIZE 88
129
130struct trace_field_cont {
131 unsigned char type;
132 /* Temporary till we get rid of this completely */
133 char buf[TRACE_OLD_SIZE - 1];
134};
135 59
136struct trace_mmiotrace_rw { 60#undef __field_struct
137 struct trace_entry ent; 61#define __field_struct(type, item) __field(type, item)
138 struct mmiotrace_rw rw;
139};
140 62
141struct trace_mmiotrace_map { 63#undef __field_desc
142 struct trace_entry ent; 64#define __field_desc(type, container, item)
143 struct mmiotrace_map map;
144};
145 65
146struct trace_boot_call { 66#undef __array
147 struct trace_entry ent; 67#define __array(type, item, size) type item[size];
148 struct boot_trace_call boot_call;
149};
150 68
151struct trace_boot_ret { 69#undef __array_desc
152 struct trace_entry ent; 70#define __array_desc(type, container, item, size)
153 struct boot_trace_ret boot_ret;
154};
155 71
156#define TRACE_FUNC_SIZE 30 72#undef __dynamic_array
157#define TRACE_FILE_SIZE 20 73#define __dynamic_array(type, item) type item[];
158struct trace_branch {
159 struct trace_entry ent;
160 unsigned line;
161 char func[TRACE_FUNC_SIZE+1];
162 char file[TRACE_FILE_SIZE+1];
163 char correct;
164};
165 74
166struct hw_branch_entry { 75#undef F_STRUCT
167 struct trace_entry ent; 76#define F_STRUCT(args...) args
168 u64 from;
169 u64 to;
170};
171 77
172struct trace_power { 78#undef FTRACE_ENTRY
173 struct trace_entry ent; 79#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
174 struct power_trace state_data; 80 struct struct_name { \
175}; 81 struct trace_entry ent; \
82 tstruct \
83 }
176 84
177enum kmemtrace_type_id { 85#undef TP_ARGS
178 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 86#define TP_ARGS(args...) args
179 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
180 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
181};
182 87
183struct kmemtrace_alloc_entry { 88#undef FTRACE_ENTRY_DUP
184 struct trace_entry ent; 89#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
185 enum kmemtrace_type_id type_id;
186 unsigned long call_site;
187 const void *ptr;
188 size_t bytes_req;
189 size_t bytes_alloc;
190 gfp_t gfp_flags;
191 int node;
192};
193 90
194struct kmemtrace_free_entry { 91#include "trace_entries.h"
195 struct trace_entry ent;
196 enum kmemtrace_type_id type_id;
197 unsigned long call_site;
198 const void *ptr;
199};
200 92
93/*
94 * syscalls are special, and need special handling, this is why
95 * they are not included in trace_entries.h
96 */
201struct syscall_trace_enter { 97struct syscall_trace_enter {
202 struct trace_entry ent; 98 struct trace_entry ent;
203 int nr; 99 int nr;
@@ -210,23 +106,12 @@ struct syscall_trace_exit {
210 unsigned long ret; 106 unsigned long ret;
211}; 107};
212 108
213#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
214extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
215
216struct ksym_trace_entry {
217 struct trace_entry ent;
218 unsigned long ip;
219 unsigned char type;
220 char ksym_name[KSYM_NAME_LEN];
221 char cmd[TASK_COMM_LEN];
222};
223
224/* 109/*
225 * trace_flag_type is an enumeration that holds different 110 * trace_flag_type is an enumeration that holds different
226 * states when a trace occurs. These are: 111 * states when a trace occurs. These are:
227 * IRQS_OFF - interrupts were disabled 112 * IRQS_OFF - interrupts were disabled
228 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 113 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
229 * NEED_RESCED - reschedule is requested 114 * NEED_RESCHED - reschedule is requested
230 * HARDIRQ - inside an interrupt handler 115 * HARDIRQ - inside an interrupt handler
231 * SOFTIRQ - inside a softirq handler 116 * SOFTIRQ - inside a softirq handler
232 */ 117 */
@@ -325,7 +210,6 @@ extern void __ftrace_bad_type(void);
325 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 210 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
326 TRACE_GRAPH_RET); \ 211 TRACE_GRAPH_RET); \
327 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 212 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
328 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
329 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 213 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
330 TRACE_KMEM_ALLOC); \ 214 TRACE_KMEM_ALLOC); \
331 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 215 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -406,7 +290,6 @@ struct tracer {
406 struct tracer *next; 290 struct tracer *next;
407 int print_max; 291 int print_max;
408 struct tracer_flags *flags; 292 struct tracer_flags *flags;
409 struct tracer_stat *stats;
410}; 293};
411 294
412 295
@@ -485,6 +368,10 @@ void tracing_stop_sched_switch_record(void);
485void tracing_start_sched_switch_record(void); 368void tracing_start_sched_switch_record(void);
486int register_tracer(struct tracer *type); 369int register_tracer(struct tracer *type);
487void unregister_tracer(struct tracer *type); 370void unregister_tracer(struct tracer *type);
371int is_tracing_stopped(void);
372
373#define KSYM_SELFTEST_ENTRY "ksym_selftest_dummy"
374extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr);
488 375
489extern unsigned long nsecs_to_usecs(unsigned long nsecs); 376extern unsigned long nsecs_to_usecs(unsigned long nsecs);
490 377
@@ -525,20 +412,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
525 412
526extern cycle_t ftrace_now(int cpu); 413extern cycle_t ftrace_now(int cpu);
527 414
528#ifdef CONFIG_CONTEXT_SWITCH_TRACER
529typedef void
530(*tracer_switch_func_t)(void *private,
531 void *__rq,
532 struct task_struct *prev,
533 struct task_struct *next);
534
535struct tracer_switch_ops {
536 tracer_switch_func_t func;
537 void *private;
538 struct tracer_switch_ops *next;
539};
540#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
541
542extern void trace_find_cmdline(int pid, char comm[]); 415extern void trace_find_cmdline(int pid, char comm[]);
543 416
544#ifdef CONFIG_DYNAMIC_FTRACE 417#ifdef CONFIG_DYNAMIC_FTRACE
@@ -621,10 +494,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
621 return 0; 494 return 0;
622} 495}
623#else 496#else
624static inline int ftrace_trace_addr(unsigned long addr)
625{
626 return 1;
627}
628static inline int ftrace_graph_addr(unsigned long addr) 497static inline int ftrace_graph_addr(unsigned long addr)
629{ 498{
630 return 1; 499 return 1;
@@ -638,12 +507,12 @@ print_graph_function(struct trace_iterator *iter)
638} 507}
639#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 508#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
640 509
641extern struct pid *ftrace_pid_trace; 510extern struct list_head ftrace_pids;
642 511
643#ifdef CONFIG_FUNCTION_TRACER 512#ifdef CONFIG_FUNCTION_TRACER
644static inline int ftrace_trace_task(struct task_struct *task) 513static inline int ftrace_trace_task(struct task_struct *task)
645{ 514{
646 if (!ftrace_pid_trace) 515 if (list_empty(&ftrace_pids))
647 return 1; 516 return 1;
648 517
649 return test_tsk_trace_trace(task); 518 return test_tsk_trace_trace(task);
@@ -656,6 +525,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
656#endif 525#endif
657 526
658/* 527/*
 528 * struct trace_parser - serves for reading the user input separated by spaces
 529 * @cont: set if the input is not complete - no final space char was found
 530 * @buffer: holds the parsed user input
 531 * @idx: user input length
532 * @size: buffer size
533 */
534struct trace_parser {
535 bool cont;
536 char *buffer;
537 unsigned idx;
538 unsigned size;
539};
540
541static inline bool trace_parser_loaded(struct trace_parser *parser)
542{
543 return (parser->idx != 0);
544}
545
546static inline bool trace_parser_cont(struct trace_parser *parser)
547{
548 return parser->cont;
549}
550
551static inline void trace_parser_clear(struct trace_parser *parser)
552{
553 parser->cont = false;
554 parser->idx = 0;
555}
556
557extern int trace_parser_get_init(struct trace_parser *parser, int size);
558extern void trace_parser_put(struct trace_parser *parser);
559extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
560 size_t cnt, loff_t *ppos);
561
562/*
659 * trace_iterator_flags is an enumeration that defines bit 563 * trace_iterator_flags is an enumeration that defines bit
660 * positions into trace_flags that controls the output. 564 * positions into trace_flags that controls the output.
661 * 565 *
@@ -790,7 +694,6 @@ struct event_filter {
790 int n_preds; 694 int n_preds;
791 struct filter_pred **preds; 695 struct filter_pred **preds;
792 char *filter_string; 696 char *filter_string;
793 bool no_reset;
794}; 697};
795 698
796struct event_subsystem { 699struct event_subsystem {
@@ -802,22 +705,40 @@ struct event_subsystem {
802}; 705};
803 706
804struct filter_pred; 707struct filter_pred;
708struct regex;
805 709
806typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, 710typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
807 int val1, int val2); 711 int val1, int val2);
808 712
809struct filter_pred { 713typedef int (*regex_match_func)(char *str, struct regex *r, int len);
810 filter_pred_fn_t fn; 714
811 u64 val; 715enum regex_type {
812 char str_val[MAX_FILTER_STR_VAL]; 716 MATCH_FULL = 0,
813 int str_len; 717 MATCH_FRONT_ONLY,
814 char *field_name; 718 MATCH_MIDDLE_ONLY,
815 int offset; 719 MATCH_END_ONLY,
816 int not; 720};
817 int op; 721
818 int pop_n; 722struct regex {
723 char pattern[MAX_FILTER_STR_VAL];
724 int len;
725 int field_len;
726 regex_match_func match;
819}; 727};
820 728
729struct filter_pred {
730 filter_pred_fn_t fn;
731 u64 val;
732 struct regex regex;
733 char *field_name;
734 int offset;
735 int not;
736 int op;
737 int pop_n;
738};
739
740extern enum regex_type
741filter_parse_regex(char *buff, int len, char **search, int *not);
821extern void print_event_filter(struct ftrace_event_call *call, 742extern void print_event_filter(struct ftrace_event_call *call,
822 struct trace_seq *s); 743 struct trace_seq *s);
823extern int apply_event_filter(struct ftrace_event_call *call, 744extern int apply_event_filter(struct ftrace_event_call *call,
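struct regex carries a pre-parsed pattern together with a match callback of type regex_match_func, and filter_parse_regex() returns one of the MATCH_* types for the user's pattern. A hedged sketch of what one such callback could look like (the in-tree callbacks live in the event-filter code and may differ):

    /* MATCH_FRONT_ONLY: the field only has to start with the pattern */
    static int regex_match_front(char *str, struct regex *r, int len)
    {
            return strncmp(str, r->pattern, r->len) == 0;
    }

    /* e.g. a filter pattern like "sys_*" could be stored as pattern = "sys_",
     * len = 4, match = regex_match_front (illustrative values only) */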
@@ -833,7 +754,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
833 struct ring_buffer *buffer, 754 struct ring_buffer *buffer,
834 struct ring_buffer_event *event) 755 struct ring_buffer_event *event)
835{ 756{
836 if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { 757 if (unlikely(call->filter_active) &&
758 !filter_match_preds(call->filter, rec)) {
837 ring_buffer_discard_commit(buffer, event); 759 ring_buffer_discard_commit(buffer, event);
838 return 1; 760 return 1;
839 } 761 }
@@ -841,58 +763,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
841 return 0; 763 return 0;
842} 764}
843 765
844#define DEFINE_COMPARISON_PRED(type) \
845static int filter_pred_##type(struct filter_pred *pred, void *event, \
846 int val1, int val2) \
847{ \
848 type *addr = (type *)(event + pred->offset); \
849 type val = (type)pred->val; \
850 int match = 0; \
851 \
852 switch (pred->op) { \
853 case OP_LT: \
854 match = (*addr < val); \
855 break; \
856 case OP_LE: \
857 match = (*addr <= val); \
858 break; \
859 case OP_GT: \
860 match = (*addr > val); \
861 break; \
862 case OP_GE: \
863 match = (*addr >= val); \
864 break; \
865 default: \
866 break; \
867 } \
868 \
869 return match; \
870}
871
872#define DEFINE_EQUALITY_PRED(size) \
873static int filter_pred_##size(struct filter_pred *pred, void *event, \
874 int val1, int val2) \
875{ \
876 u##size *addr = (u##size *)(event + pred->offset); \
877 u##size val = (u##size)pred->val; \
878 int match; \
879 \
880 match = (val == *addr) ^ pred->not; \
881 \
882 return match; \
883}
884
885extern struct mutex event_mutex; 766extern struct mutex event_mutex;
886extern struct list_head ftrace_events; 767extern struct list_head ftrace_events;
887 768
888extern const char *__start___trace_bprintk_fmt[]; 769extern const char *__start___trace_bprintk_fmt[];
889extern const char *__stop___trace_bprintk_fmt[]; 770extern const char *__stop___trace_bprintk_fmt[];
890 771
891#undef TRACE_EVENT_FORMAT 772#undef FTRACE_ENTRY
892#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 773#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
893 extern struct ftrace_event_call event_##call; 774 extern struct ftrace_event_call event_##call;
894#undef TRACE_EVENT_FORMAT_NOFILTER 775#undef FTRACE_ENTRY_DUP
895#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 776#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
896#include "trace_event_types.h" 777 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
778#include "trace_entries.h"
897 779
898#endif /* _LINUX_KERNEL_TRACE_H */ 780#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 19bfc75d467e..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =
129 129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
132 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
133 struct ring_buffer *buffer; 134 struct ring_buffer *buffer;
134 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
@@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(buffer, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
161 struct ring_buffer *buffer; 164 struct ring_buffer *buffer;
162 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
@@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
175 goto out; 178 goto out;
176 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
177 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
178 trace_buffer_unlock_commit(buffer, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
179 out: 183 out:
180 preempt_enable(); 184 preempt_enable();
181} 185}
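Both boot-trace hooks now follow the same reserve/fill/commit-or-discard shape: filter_check_discard() (see the trace.h hunk above) drops the reserved ring-buffer slot when the event does not match the active filter, and only a surviving event gets committed. Stripped of the locking details:

    entry = ring_buffer_event_data(event);
    entry->boot_call = *bt;				/* fill the payload */

    if (!filter_check_discard(call, entry, buffer, event))
            trace_buffer_unlock_commit(buffer, event, 0, 0);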
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a9..4a194f08f88c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer;
37 unsigned long flags; 38 unsigned long flags;
38 int cpu, pc; 39 int cpu, pc;
39 const char *p; 40 const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
54 goto out; 55 goto out;
55 56
56 pc = preempt_count(); 57 pc = preempt_count();
57 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, 58 buffer = tr->buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
58 sizeof(*entry), flags, pc); 60 sizeof(*entry), flags, pc);
59 if (!event) 61 if (!event)
60 goto out; 62 goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
74 entry->line = f->line; 76 entry->line = f->line;
75 entry->correct = val == expect; 77 entry->correct = val == expect;
76 78
77 if (!filter_check_discard(call, entry, tr->buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
79 81
80 out: 82 out:
81 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..e19747d4f860
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,382 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
 10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
 34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
 42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
 56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
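Together with the FTRACE_ENTRY()/__field() definitions added to trace.h above, this entry expands to the same structure that used to be open-coded there, while the F_printk() part only feeds the generated format file:

    struct ftrace_entry {
            struct trace_entry	ent;
            unsigned long		ip;
            unsigned long		parent_ip;
    };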
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
367
368FTRACE_ENTRY(ksym_trace, ksym_trace_entry,
369
370 TRACE_KSYM,
371
372 F_STRUCT(
373 __field( unsigned long, ip )
374 __field( unsigned char, type )
375 __array( char , ksym_name, KSYM_NAME_LEN )
376 __array( char , cmd, TASK_COMM_LEN )
377 ),
378
379 F_printk("ip: %pF type: %d ksym_name: %s cmd: %s",
380 (void *)__entry->ip, (unsigned int)__entry->type,
381 __entry->ksym_name, __entry->cmd)
382);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..8d5c171cc998 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,8 +5,65 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
11/*
 12 * alloc_percpu() takes a type rather than a size,
 13 * so let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable();
49 if (!ret) {
50 total_profile_count++;
51 return 0;
52 }
53
54fail_buf_nmi:
55 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi);
57 free_percpu(trace_profile_buf);
58 trace_profile_buf_nmi = NULL;
59 trace_profile_buf = NULL;
60 }
61fail_buf:
62 atomic_dec(&event->profile_count);
63
64 return ret;
65}
66
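On the read side, event handlers pick up their per-cpu slot from trace_profile_buf with preemption disabled, which is what the synchronize_sched() in the disable path below waits for before freeing. A hedged sketch of such a consumer (the real callers live in the per-event profiling code):

    char *base, *slot;
    int cpu = get_cpu();			/* preempt off until put_cpu() */

    base = rcu_dereference(trace_profile_buf);
    if (base) {
            slot = per_cpu_ptr(base, cpu);
            /* write at most FTRACE_MAX_PROFILE_SIZE bytes of event data into slot */
    }
    put_cpu();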
10int ftrace_profile_enable(int event_id) 67int ftrace_profile_enable(int event_id)
11{ 68{
12 struct ftrace_event_call *event; 69 struct ftrace_event_call *event;
@@ -14,8 +71,9 @@ int ftrace_profile_enable(int event_id)
14 71
15 mutex_lock(&event_mutex); 72 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 73 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 74 if (event->id == event_id && event->profile_enable &&
18 ret = event->profile_enable(event); 75 try_module_get(event->mod)) {
76 ret = ftrace_profile_enable_event(event);
19 break; 77 break;
20 } 78 }
21 } 79 }
@@ -24,6 +82,33 @@ int ftrace_profile_enable(int event_id)
24 return ret; 82 return ret;
25} 83}
26 84
85static void ftrace_profile_disable_event(struct ftrace_event_call *event)
86{
87 char *buf, *nmi_buf;
88
89 if (!atomic_add_negative(-1, &event->profile_count))
90 return;
91
92 event->profile_disable();
93
94 if (!--total_profile_count) {
95 buf = trace_profile_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL);
97
98 nmi_buf = trace_profile_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
100
101 /*
 101 * Ensure all events in profiling have finished before
103 * releasing the buffers
104 */
105 synchronize_sched();
106
107 free_percpu(buf);
108 free_percpu(nmi_buf);
109 }
110}
111
27void ftrace_profile_disable(int event_id) 112void ftrace_profile_disable(int event_id)
28{ 113{
29 struct ftrace_event_call *event; 114 struct ftrace_event_call *event;
@@ -31,7 +116,8 @@ void ftrace_profile_disable(int event_id)
31 mutex_lock(&event_mutex); 116 mutex_lock(&event_mutex);
32 list_for_each_entry(event, &ftrace_events, list) { 117 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 118 if (event->id == event_id) {
34 event->profile_disable(event); 119 ftrace_profile_disable_event(event);
120 module_put(event->mod);
35 break; 121 break;
36 } 122 }
37 } 123 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e12487..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO_CHAR(buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO_CHAR(buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 78b1ed230177..7c18d154ea28 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,6 +21,7 @@
21 21
22#include "trace_output.h" 22#include "trace_output.h"
23 23
24#undef TRACE_SYSTEM
24#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
25 26
26DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
@@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
86 __common_field(unsigned char, flags); 87 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count); 88 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid); 89 __common_field(int, pid);
89 __common_field(int, tgid); 90 __common_field(int, lock_depth);
90 91
91 return ret; 92 return ret;
92} 93}
@@ -230,73 +231,38 @@ static ssize_t
230ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
231 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
232{ 233{
233 size_t read = 0; 234 struct trace_parser parser;
234 int i, set = 1; 235 ssize_t read, ret;
235 ssize_t ret;
236 char *buf;
237 char ch;
238 236
239 if (!cnt || cnt < 0) 237 if (!cnt)
240 return 0; 238 return 0;
241 239
242 ret = tracing_update_buffers(); 240 ret = tracing_update_buffers();
243 if (ret < 0) 241 if (ret < 0)
244 return ret; 242 return ret;
245 243
246 ret = get_user(ch, ubuf++); 244 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
247 if (ret)
248 return ret;
249 read++;
250 cnt--;
251
252 /* skip white space */
253 while (cnt && isspace(ch)) {
254 ret = get_user(ch, ubuf++);
255 if (ret)
256 return ret;
257 read++;
258 cnt--;
259 }
260
261 /* Only white space found? */
262 if (isspace(ch)) {
263 file->f_pos += read;
264 ret = read;
265 return ret;
266 }
267
268 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
269 if (!buf)
270 return -ENOMEM; 245 return -ENOMEM;
271 246
272 if (cnt > EVENT_BUF_SIZE) 247 read = trace_get_user(&parser, ubuf, cnt, ppos);
273 cnt = EVENT_BUF_SIZE; 248
249 if (read >= 0 && trace_parser_loaded((&parser))) {
250 int set = 1;
274 251
275 i = 0; 252 if (*parser.buffer == '!')
276 while (cnt && !isspace(ch)) {
277 if (!i && ch == '!')
278 set = 0; 253 set = 0;
279 else
280 buf[i++] = ch;
281 254
282 ret = get_user(ch, ubuf++); 255 parser.buffer[parser.idx] = 0;
256
257 ret = ftrace_set_clr_event(parser.buffer + !set, set);
283 if (ret) 258 if (ret)
284 goto out_free; 259 goto out_put;
285 read++;
286 cnt--;
287 } 260 }
288 buf[i] = 0;
289
290 file->f_pos += read;
291
292 ret = ftrace_set_clr_event(buf, set);
293 if (ret)
294 goto out_free;
295 261
296 ret = read; 262 ret = read;
297 263
298 out_free: 264 out_put:
299 kfree(buf); 265 trace_parser_put(&parser);
300 266
301 return ret; 267 return ret;
302} 268}
@@ -304,42 +270,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
304static void * 270static void *
305t_next(struct seq_file *m, void *v, loff_t *pos) 271t_next(struct seq_file *m, void *v, loff_t *pos)
306{ 272{
307 struct list_head *list = m->private; 273 struct ftrace_event_call *call = v;
308 struct ftrace_event_call *call;
309 274
310 (*pos)++; 275 (*pos)++;
311 276
312 for (;;) { 277 list_for_each_entry_continue(call, &ftrace_events, list) {
313 if (list == &ftrace_events)
314 return NULL;
315
316 call = list_entry(list, struct ftrace_event_call, list);
317
318 /* 278 /*
319 * The ftrace subsystem is for showing formats only. 279 * The ftrace subsystem is for showing formats only.
320 * They can not be enabled or disabled via the event files. 280 * They can not be enabled or disabled via the event files.
321 */ 281 */
322 if (call->regfunc) 282 if (call->regfunc)
323 break; 283 return call;
324
325 list = list->next;
326 } 284 }
327 285
328 m->private = list->next; 286 return NULL;
329
330 return call;
331} 287}
332 288
333static void *t_start(struct seq_file *m, loff_t *pos) 289static void *t_start(struct seq_file *m, loff_t *pos)
334{ 290{
335 struct ftrace_event_call *call = NULL; 291 struct ftrace_event_call *call;
336 loff_t l; 292 loff_t l;
337 293
338 mutex_lock(&event_mutex); 294 mutex_lock(&event_mutex);
339 295
340 m->private = ftrace_events.next; 296 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
341 for (l = 0; l <= *pos; ) { 297 for (l = 0; l <= *pos; ) {
342 call = t_next(m, NULL, &l); 298 call = t_next(m, call, &l);
343 if (!call) 299 if (!call)
344 break; 300 break;
345 } 301 }
@@ -349,37 +305,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
349static void * 305static void *
350s_next(struct seq_file *m, void *v, loff_t *pos) 306s_next(struct seq_file *m, void *v, loff_t *pos)
351{ 307{
352 struct list_head *list = m->private; 308 struct ftrace_event_call *call = v;
353 struct ftrace_event_call *call;
354 309
355 (*pos)++; 310 (*pos)++;
356 311
357 retry: 312 list_for_each_entry_continue(call, &ftrace_events, list) {
358 if (list == &ftrace_events) 313 if (call->enabled)
359 return NULL; 314 return call;
360
361 call = list_entry(list, struct ftrace_event_call, list);
362
363 if (!call->enabled) {
364 list = list->next;
365 goto retry;
366 } 315 }
367 316
368 m->private = list->next; 317 return NULL;
369
370 return call;
371} 318}
372 319
373static void *s_start(struct seq_file *m, loff_t *pos) 320static void *s_start(struct seq_file *m, loff_t *pos)
374{ 321{
375 struct ftrace_event_call *call = NULL; 322 struct ftrace_event_call *call;
376 loff_t l; 323 loff_t l;
377 324
378 mutex_lock(&event_mutex); 325 mutex_lock(&event_mutex);
379 326
380 m->private = ftrace_events.next; 327 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
381 for (l = 0; l <= *pos; ) { 328 for (l = 0; l <= *pos; ) {
382 call = s_next(m, NULL, &l); 329 call = s_next(m, call, &l);
383 if (!call) 330 if (!call)
384 break; 331 break;
385 } 332 }
@@ -560,7 +507,7 @@ extern char *__bad_type_size(void);
560#define FIELD(type, name) \ 507#define FIELD(type, name) \
561 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ 508 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
562 #type, "common_" #name, offsetof(typeof(field), name), \ 509 #type, "common_" #name, offsetof(typeof(field), name), \
563 sizeof(field.name) 510 sizeof(field.name), is_signed_type(type)
564 511
565static int trace_write_header(struct trace_seq *s) 512static int trace_write_header(struct trace_seq *s)
566{ 513{
@@ -568,17 +515,17 @@ static int trace_write_header(struct trace_seq *s)
568 515
569 /* struct trace_entry */ 516 /* struct trace_entry */
570 return trace_seq_printf(s, 517 return trace_seq_printf(s,
571 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
572 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 519 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
573 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 520 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
574 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 521 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
575 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 522 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
576 "\n", 523 "\n",
577 FIELD(unsigned short, type), 524 FIELD(unsigned short, type),
578 FIELD(unsigned char, flags), 525 FIELD(unsigned char, flags),
579 FIELD(unsigned char, preempt_count), 526 FIELD(unsigned char, preempt_count),
580 FIELD(int, pid), 527 FIELD(int, pid),
581 FIELD(int, tgid)); 528 FIELD(int, lock_depth));
582} 529}
583 530
584static ssize_t 531static ssize_t
@@ -931,9 +878,9 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
931 "'%s/filter' entry\n", name); 878 "'%s/filter' entry\n", name);
932 } 879 }
933 880
934 entry = trace_create_file("enable", 0644, system->entry, 881 trace_create_file("enable", 0644, system->entry,
935 (void *)system->name, 882 (void *)system->name,
936 &ftrace_system_enable_fops); 883 &ftrace_system_enable_fops);
937 884
938 return system->entry; 885 return system->entry;
939} 886}
@@ -945,7 +892,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
945 const struct file_operations *filter, 892 const struct file_operations *filter,
946 const struct file_operations *format) 893 const struct file_operations *format)
947{ 894{
948 struct dentry *entry;
949 int ret; 895 int ret;
950 896
951 /* 897 /*
@@ -963,12 +909,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
963 } 909 }
964 910
965 if (call->regfunc) 911 if (call->regfunc)
966 entry = trace_create_file("enable", 0644, call->dir, call, 912 trace_create_file("enable", 0644, call->dir, call,
967 enable); 913 enable);
968 914
969 if (call->id && call->profile_enable) 915 if (call->id && call->profile_enable)
970 entry = trace_create_file("id", 0444, call->dir, call, 916 trace_create_file("id", 0444, call->dir, call,
971 id); 917 id);
972 918
973 if (call->define_fields) { 919 if (call->define_fields) {
974 ret = call->define_fields(call); 920 ret = call->define_fields(call);
@@ -977,16 +923,16 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
977 " events/%s\n", call->name); 923 " events/%s\n", call->name);
978 return ret; 924 return ret;
979 } 925 }
980 entry = trace_create_file("filter", 0644, call->dir, call, 926 trace_create_file("filter", 0644, call->dir, call,
981 filter); 927 filter);
982 } 928 }
983 929
984 /* A trace may not want to export its format */ 930 /* A trace may not want to export its format */
985 if (!call->show_format) 931 if (!call->show_format)
986 return 0; 932 return 0;
987 933
988 entry = trace_create_file("format", 0444, call->dir, call, 934 trace_create_file("format", 0444, call->dir, call,
989 format); 935 format);
990 936
991 return 0; 937 return 0;
992} 938}
@@ -1187,7 +1133,7 @@ static int trace_module_notify(struct notifier_block *self,
1187} 1133}
1188#endif /* CONFIG_MODULES */ 1134#endif /* CONFIG_MODULES */
1189 1135
1190struct notifier_block trace_module_nb = { 1136static struct notifier_block trace_module_nb = {
1191 .notifier_call = trace_module_notify, 1137 .notifier_call = trace_module_notify,
1192 .priority = 0, 1138 .priority = 0,
1193}; 1139};
@@ -1359,6 +1305,18 @@ static __init void event_trace_self_tests(void)
1359 if (!call->regfunc) 1305 if (!call->regfunc)
1360 continue; 1306 continue;
1361 1307
1308/*
1309 * Testing syscall events here is pretty useless, but
1310 * we still do it if configured. But this is time consuming.
1311 * What we really need is a user thread to perform the
1312 * syscalls as we test.
1313 */
1314#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1315 if (call->system &&
1316 strcmp(call->system, "syscalls") == 0)
1317 continue;
1318#endif
1319
1362 pr_info("Testing event %s: ", call->name); 1320 pr_info("Testing event %s: ", call->name);
1363 1321
1364 /* 1322 /*
@@ -1432,7 +1390,7 @@ static __init void event_trace_self_tests(void)
1432 1390
1433#ifdef CONFIG_FUNCTION_TRACER 1391#ifdef CONFIG_FUNCTION_TRACER
1434 1392
1435static DEFINE_PER_CPU(atomic_t, test_event_disable); 1393static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1436 1394
1437static void 1395static void
1438function_test_events_call(unsigned long ip, unsigned long parent_ip) 1396function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1449,7 +1407,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1449 pc = preempt_count(); 1407 pc = preempt_count();
1450 resched = ftrace_preempt_disable(); 1408 resched = ftrace_preempt_disable();
1451 cpu = raw_smp_processor_id(); 1409 cpu = raw_smp_processor_id();
1452 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1410 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1453 1411
1454 if (disabled != 1) 1412 if (disabled != 1)
1455 goto out; 1413 goto out;
@@ -1468,7 +1426,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1468 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1426 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1469 1427
1470 out: 1428 out:
1471 atomic_dec(&per_cpu(test_event_disable, cpu)); 1429 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1472 ftrace_preempt_enable(resched); 1430 ftrace_preempt_enable(resched);
1473} 1431}
1474 1432
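
Note on the trace_events.c hunk above: ftrace_event_write() now delegates its token parsing to the shared trace_parser helpers (trace_parser_get_init(), trace_get_user(), trace_parser_put()) instead of the hand-rolled get_user() loop, while keeping the set_event semantics visible in the new code: a written token enables the named event and a leading '!' disables it. A minimal user-space sketch of that interface follows; the debugfs mount point and the sched:sched_switch event name are illustrative assumptions, not part of this patch.

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumes debugfs is mounted at /sys/kernel/debug. */
	int fd = open("/sys/kernel/debug/tracing/set_event", O_WRONLY);
	/* A leading '!' ("!sched:sched_switch") would disable the event instead. */
	const char *ev = "sched:sched_switch";

	if (fd < 0)
		return 1;

	/* The kernel side parses this token in ftrace_event_write(). */
	if (write(fd, ev, strlen(ev)) < 0) {
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}
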
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fbbf629..21d34757b955 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -18,11 +18,10 @@
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com> 18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */ 19 */
20 20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h> 21#include <linux/module.h>
24#include <linux/ctype.h> 22#include <linux/ctype.h>
25#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h>
26 25
27#include "trace.h" 26#include "trace.h"
28#include "trace_output.h" 27#include "trace_output.h"
@@ -31,6 +30,7 @@ enum filter_op_ids
31{ 30{
32 OP_OR, 31 OP_OR,
33 OP_AND, 32 OP_AND,
33 OP_GLOB,
34 OP_NE, 34 OP_NE,
35 OP_EQ, 35 OP_EQ,
36 OP_LT, 36 OP_LT,
@@ -48,16 +48,17 @@ struct filter_op {
48}; 48};
49 49
50static struct filter_op filter_ops[] = { 50static struct filter_op filter_ops[] = {
51 { OP_OR, "||", 1 }, 51 { OP_OR, "||", 1 },
52 { OP_AND, "&&", 2 }, 52 { OP_AND, "&&", 2 },
53 { OP_NE, "!=", 4 }, 53 { OP_GLOB, "~", 4 },
54 { OP_EQ, "==", 4 }, 54 { OP_NE, "!=", 4 },
55 { OP_LT, "<", 5 }, 55 { OP_EQ, "==", 4 },
56 { OP_LE, "<=", 5 }, 56 { OP_LT, "<", 5 },
57 { OP_GT, ">", 5 }, 57 { OP_LE, "<=", 5 },
58 { OP_GE, ">=", 5 }, 58 { OP_GT, ">", 5 },
59 { OP_NONE, "OP_NONE", 0 }, 59 { OP_GE, ">=", 5 },
60 { OP_OPEN_PAREN, "(", 0 }, 60 { OP_NONE, "OP_NONE", 0 },
61 { OP_OPEN_PAREN, "(", 0 },
61}; 62};
62 63
63enum { 64enum {
@@ -121,6 +122,47 @@ struct filter_parse_state {
121 } operand; 122 } operand;
122}; 123};
123 124
125#define DEFINE_COMPARISON_PRED(type) \
126static int filter_pred_##type(struct filter_pred *pred, void *event, \
127 int val1, int val2) \
128{ \
129 type *addr = (type *)(event + pred->offset); \
130 type val = (type)pred->val; \
131 int match = 0; \
132 \
133 switch (pred->op) { \
134 case OP_LT: \
135 match = (*addr < val); \
136 break; \
137 case OP_LE: \
138 match = (*addr <= val); \
139 break; \
140 case OP_GT: \
141 match = (*addr > val); \
142 break; \
143 case OP_GE: \
144 match = (*addr >= val); \
145 break; \
146 default: \
147 break; \
148 } \
149 \
150 return match; \
151}
152
153#define DEFINE_EQUALITY_PRED(size) \
154static int filter_pred_##size(struct filter_pred *pred, void *event, \
155 int val1, int val2) \
156{ \
157 u##size *addr = (u##size *)(event + pred->offset); \
158 u##size val = (u##size)pred->val; \
159 int match; \
160 \
161 match = (val == *addr) ^ pred->not; \
162 \
163 return match; \
164}
165
124DEFINE_COMPARISON_PRED(s64); 166DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 167DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 168DEFINE_COMPARISON_PRED(s32);
@@ -156,9 +198,9 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
156 char *addr = (char *)(event + pred->offset); 198 char *addr = (char *)(event + pred->offset);
157 int cmp, match; 199 int cmp, match;
158 200
159 cmp = strncmp(addr, pred->str_val, pred->str_len); 201 cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len);
160 202
161 match = (!cmp) ^ pred->not; 203 match = cmp ^ pred->not;
162 204
163 return match; 205 return match;
164} 206}
@@ -170,9 +212,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
170 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
171 int cmp, match; 213 int cmp, match;
172 214
173 cmp = strncmp(*addr, pred->str_val, pred->str_len); 215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len);
174 216
175 match = (!cmp) ^ pred->not; 217 match = cmp ^ pred->not;
176 218
177 return match; 219 return match;
178} 220}
@@ -196,9 +238,9 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
196 char *addr = (char *)(event + str_loc); 238 char *addr = (char *)(event + str_loc);
197 int cmp, match; 239 int cmp, match;
198 240
199 cmp = strncmp(addr, pred->str_val, str_len); 241 cmp = pred->regex.match(addr, &pred->regex, str_len);
200 242
201 match = (!cmp) ^ pred->not; 243 match = cmp ^ pred->not;
202 244
203 return match; 245 return match;
204} 246}
@@ -209,10 +251,121 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
209 return 0; 251 return 0;
210} 252}
211 253
254/* Basic regex callbacks */
255static int regex_match_full(char *str, struct regex *r, int len)
256{
257 if (strncmp(str, r->pattern, len) == 0)
258 return 1;
259 return 0;
260}
261
262static int regex_match_front(char *str, struct regex *r, int len)
263{
264 if (strncmp(str, r->pattern, len) == 0)
265 return 1;
266 return 0;
267}
268
269static int regex_match_middle(char *str, struct regex *r, int len)
270{
271 if (strstr(str, r->pattern))
272 return 1;
273 return 0;
274}
275
276static int regex_match_end(char *str, struct regex *r, int len)
277{
278 char *ptr = strstr(str, r->pattern);
279
280 if (ptr && (ptr[r->len] == 0))
281 return 1;
282 return 0;
283}
284
285/**
286 * filter_parse_regex - parse a basic regex
287 * @buff: the raw regex
288 * @len: length of the regex
289 * @search: will point to the beginning of the string to compare
290 * @not: tell whether the match will have to be inverted
291 *
292 * This passes in a buffer containing a regex and this function will
293 * set search to point to the search part of the buffer and
294 * return the type of search it is (see enum above).
295 * This does modify buff.
296 *
297 * Returns enum type.
298 * search returns the pointer to use for comparison.
299 * not returns 1 if buff started with a '!'
300 * 0 otherwise.
301 */
302enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not)
303{
304 int type = MATCH_FULL;
305 int i;
306
307 if (buff[0] == '!') {
308 *not = 1;
309 buff++;
310 len--;
311 } else
312 *not = 0;
313
314 *search = buff;
315
316 for (i = 0; i < len; i++) {
317 if (buff[i] == '*') {
318 if (!i) {
319 *search = buff + 1;
320 type = MATCH_END_ONLY;
321 } else {
322 if (type == MATCH_END_ONLY)
323 type = MATCH_MIDDLE_ONLY;
324 else
325 type = MATCH_FRONT_ONLY;
326 buff[i] = 0;
327 break;
328 }
329 }
330 }
331
332 return type;
333}
334
335static void filter_build_regex(struct filter_pred *pred)
336{
337 struct regex *r = &pred->regex;
338 char *search;
339 enum regex_type type = MATCH_FULL;
340 int not = 0;
341
342 if (pred->op == OP_GLOB) {
343 type = filter_parse_regex(r->pattern, r->len, &search, &not);
344 r->len = strlen(search);
345 memmove(r->pattern, search, r->len+1);
346 }
347
348 switch (type) {
349 case MATCH_FULL:
350 r->match = regex_match_full;
351 break;
352 case MATCH_FRONT_ONLY:
353 r->match = regex_match_front;
354 break;
355 case MATCH_MIDDLE_ONLY:
356 r->match = regex_match_middle;
357 break;
358 case MATCH_END_ONLY:
359 r->match = regex_match_end;
360 break;
361 }
362
363 pred->not ^= not;
364}
365
212/* return 1 if event matches, 0 otherwise (discard) */ 366/* return 1 if event matches, 0 otherwise (discard) */
213int filter_match_preds(struct ftrace_event_call *call, void *rec) 367int filter_match_preds(struct event_filter *filter, void *rec)
214{ 368{
215 struct event_filter *filter = call->filter;
216 int match, top = 0, val1 = 0, val2 = 0; 369 int match, top = 0, val1 = 0, val2 = 0;
217 int stack[MAX_FILTER_PRED]; 370 int stack[MAX_FILTER_PRED];
218 struct filter_pred *pred; 371 struct filter_pred *pred;
@@ -355,7 +508,7 @@ static void filter_clear_pred(struct filter_pred *pred)
355{ 508{
356 kfree(pred->field_name); 509 kfree(pred->field_name);
357 pred->field_name = NULL; 510 pred->field_name = NULL;
358 pred->str_len = 0; 511 pred->regex.len = 0;
359} 512}
360 513
361static int filter_set_pred(struct filter_pred *dest, 514static int filter_set_pred(struct filter_pred *dest,
@@ -385,9 +538,8 @@ static void filter_disable_preds(struct ftrace_event_call *call)
385 filter->preds[i]->fn = filter_pred_none; 538 filter->preds[i]->fn = filter_pred_none;
386} 539}
387 540
388void destroy_preds(struct ftrace_event_call *call) 541static void __free_preds(struct event_filter *filter)
389{ 542{
390 struct event_filter *filter = call->filter;
391 int i; 543 int i;
392 544
393 if (!filter) 545 if (!filter)
@@ -400,21 +552,24 @@ void destroy_preds(struct ftrace_event_call *call)
400 kfree(filter->preds); 552 kfree(filter->preds);
401 kfree(filter->filter_string); 553 kfree(filter->filter_string);
402 kfree(filter); 554 kfree(filter);
555}
556
557void destroy_preds(struct ftrace_event_call *call)
558{
559 __free_preds(call->filter);
403 call->filter = NULL; 560 call->filter = NULL;
561 call->filter_active = 0;
404} 562}
405 563
406static int init_preds(struct ftrace_event_call *call) 564static struct event_filter *__alloc_preds(void)
407{ 565{
408 struct event_filter *filter; 566 struct event_filter *filter;
409 struct filter_pred *pred; 567 struct filter_pred *pred;
410 int i; 568 int i;
411 569
412 if (call->filter) 570 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
413 return 0; 571 if (!filter)
414 572 return ERR_PTR(-ENOMEM);
415 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
416 if (!call->filter)
417 return -ENOMEM;
418 573
419 filter->n_preds = 0; 574 filter->n_preds = 0;
420 575
@@ -430,12 +585,24 @@ static int init_preds(struct ftrace_event_call *call)
430 filter->preds[i] = pred; 585 filter->preds[i] = pred;
431 } 586 }
432 587
433 return 0; 588 return filter;
434 589
435oom: 590oom:
436 destroy_preds(call); 591 __free_preds(filter);
592 return ERR_PTR(-ENOMEM);
593}
437 594
438 return -ENOMEM; 595static int init_preds(struct ftrace_event_call *call)
596{
597 if (call->filter)
598 return 0;
599
600 call->filter_active = 0;
601 call->filter = __alloc_preds();
602 if (IS_ERR(call->filter))
603 return PTR_ERR(call->filter);
604
605 return 0;
439} 606}
440 607
441static int init_subsystem_preds(struct event_subsystem *system) 608static int init_subsystem_preds(struct event_subsystem *system)
@@ -458,14 +625,7 @@ static int init_subsystem_preds(struct event_subsystem *system)
458 return 0; 625 return 0;
459} 626}
460 627
461enum { 628static void filter_free_subsystem_preds(struct event_subsystem *system)
462 FILTER_DISABLE_ALL,
463 FILTER_INIT_NO_RESET,
464 FILTER_SKIP_NO_RESET,
465};
466
467static void filter_free_subsystem_preds(struct event_subsystem *system,
468 int flag)
469{ 629{
470 struct ftrace_event_call *call; 630 struct ftrace_event_call *call;
471 631
@@ -476,14 +636,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
476 if (strcmp(call->system, system->name) != 0) 636 if (strcmp(call->system, system->name) != 0)
477 continue; 637 continue;
478 638
479 if (flag == FILTER_INIT_NO_RESET) {
480 call->filter->no_reset = false;
481 continue;
482 }
483
484 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
485 continue;
486
487 filter_disable_preds(call); 639 filter_disable_preds(call);
488 remove_filter_string(call->filter); 640 remove_filter_string(call->filter);
489 } 641 }
@@ -491,10 +643,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system,
491 643
492static int filter_add_pred_fn(struct filter_parse_state *ps, 644static int filter_add_pred_fn(struct filter_parse_state *ps,
493 struct ftrace_event_call *call, 645 struct ftrace_event_call *call,
646 struct event_filter *filter,
494 struct filter_pred *pred, 647 struct filter_pred *pred,
495 filter_pred_fn_t fn) 648 filter_pred_fn_t fn)
496{ 649{
497 struct event_filter *filter = call->filter;
498 int idx, err; 650 int idx, err;
499 651
500 if (filter->n_preds == MAX_FILTER_PRED) { 652 if (filter->n_preds == MAX_FILTER_PRED) {
@@ -509,7 +661,6 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
509 return err; 661 return err;
510 662
511 filter->n_preds++; 663 filter->n_preds++;
512 call->filter_active = 1;
513 664
514 return 0; 665 return 0;
515} 666}
@@ -534,7 +685,10 @@ static bool is_string_field(struct ftrace_event_field *field)
534 685
535static int is_legal_op(struct ftrace_event_field *field, int op) 686static int is_legal_op(struct ftrace_event_field *field, int op)
536{ 687{
537 if (is_string_field(field) && (op != OP_EQ && op != OP_NE)) 688 if (is_string_field(field) &&
689 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
690 return 0;
691 if (!is_string_field(field) && op == OP_GLOB)
538 return 0; 692 return 0;
539 693
540 return 1; 694 return 1;
@@ -585,6 +739,7 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
585 739
586static int filter_add_pred(struct filter_parse_state *ps, 740static int filter_add_pred(struct filter_parse_state *ps,
587 struct ftrace_event_call *call, 741 struct ftrace_event_call *call,
742 struct event_filter *filter,
588 struct filter_pred *pred, 743 struct filter_pred *pred,
589 bool dry_run) 744 bool dry_run)
590{ 745{
@@ -619,21 +774,22 @@ static int filter_add_pred(struct filter_parse_state *ps,
619 } 774 }
620 775
621 if (is_string_field(field)) { 776 if (is_string_field(field)) {
622 pred->str_len = field->size; 777 filter_build_regex(pred);
623 778
624 if (field->filter_type == FILTER_STATIC_STRING) 779 if (field->filter_type == FILTER_STATIC_STRING) {
625 fn = filter_pred_string; 780 fn = filter_pred_string;
626 else if (field->filter_type == FILTER_DYN_STRING) 781 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING)
627 fn = filter_pred_strloc; 783 fn = filter_pred_strloc;
628 else { 784 else {
629 fn = filter_pred_pchar; 785 fn = filter_pred_pchar;
630 pred->str_len = strlen(pred->str_val); 786 pred->regex.field_len = strlen(pred->regex.pattern);
631 } 787 }
632 } else { 788 } else {
633 if (field->is_signed) 789 if (field->is_signed)
634 ret = strict_strtoll(pred->str_val, 0, &val); 790 ret = strict_strtoll(pred->regex.pattern, 0, &val);
635 else 791 else
636 ret = strict_strtoull(pred->str_val, 0, &val); 792 ret = strict_strtoull(pred->regex.pattern, 0, &val);
637 if (ret) { 793 if (ret) {
638 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); 794 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
639 return -EINVAL; 795 return -EINVAL;
@@ -653,45 +809,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
653 809
654add_pred_fn: 810add_pred_fn:
655 if (!dry_run) 811 if (!dry_run)
656 return filter_add_pred_fn(ps, call, pred, fn); 812 return filter_add_pred_fn(ps, call, filter, pred, fn);
657 return 0;
658}
659
660static int filter_add_subsystem_pred(struct filter_parse_state *ps,
661 struct event_subsystem *system,
662 struct filter_pred *pred,
663 char *filter_string,
664 bool dry_run)
665{
666 struct ftrace_event_call *call;
667 int err = 0;
668 bool fail = true;
669
670 list_for_each_entry(call, &ftrace_events, list) {
671
672 if (!call->define_fields)
673 continue;
674
675 if (strcmp(call->system, system->name))
676 continue;
677
678 if (call->filter->no_reset)
679 continue;
680
681 err = filter_add_pred(ps, call, pred, dry_run);
682 if (err)
683 call->filter->no_reset = true;
684 else
685 fail = false;
686
687 if (!dry_run)
688 replace_filter_string(call->filter, filter_string);
689 }
690
691 if (fail) {
692 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
693 return err;
694 }
695 return 0; 813 return 0;
696} 814}
697 815
@@ -892,8 +1010,9 @@ static void postfix_clear(struct filter_parse_state *ps)
892 1010
893 while (!list_empty(&ps->postfix)) { 1011 while (!list_empty(&ps->postfix)) {
894 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 1012 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
895 kfree(elt->operand);
896 list_del(&elt->list); 1013 list_del(&elt->list);
1014 kfree(elt->operand);
1015 kfree(elt);
897 } 1016 }
898} 1017}
899 1018
@@ -1003,8 +1122,8 @@ static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
1003 return NULL; 1122 return NULL;
1004 } 1123 }
1005 1124
1006 strcpy(pred->str_val, operand2); 1125 strcpy(pred->regex.pattern, operand2);
1007 pred->str_len = strlen(operand2); 1126 pred->regex.len = strlen(pred->regex.pattern);
1008 1127
1009 pred->op = op; 1128 pred->op = op;
1010 1129
@@ -1048,8 +1167,8 @@ static int check_preds(struct filter_parse_state *ps)
1048 return 0; 1167 return 0;
1049} 1168}
1050 1169
1051static int replace_preds(struct event_subsystem *system, 1170static int replace_preds(struct ftrace_event_call *call,
1052 struct ftrace_event_call *call, 1171 struct event_filter *filter,
1053 struct filter_parse_state *ps, 1172 struct filter_parse_state *ps,
1054 char *filter_string, 1173 char *filter_string,
1055 bool dry_run) 1174 bool dry_run)
@@ -1096,11 +1215,7 @@ static int replace_preds(struct event_subsystem *system,
1096add_pred: 1215add_pred:
1097 if (!pred) 1216 if (!pred)
1098 return -ENOMEM; 1217 return -ENOMEM;
1099 if (call) 1218 err = filter_add_pred(ps, call, filter, pred, dry_run);
1100 err = filter_add_pred(ps, call, pred, false);
1101 else
1102 err = filter_add_subsystem_pred(ps, system, pred,
1103 filter_string, dry_run);
1104 filter_free_pred(pred); 1219 filter_free_pred(pred);
1105 if (err) 1220 if (err)
1106 return err; 1221 return err;
@@ -1111,10 +1226,50 @@ add_pred:
1111 return 0; 1226 return 0;
1112} 1227}
1113 1228
1114int apply_event_filter(struct ftrace_event_call *call, char *filter_string) 1229static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps,
1231 char *filter_string)
1115{ 1232{
1233 struct event_filter *filter = system->filter;
1234 struct ftrace_event_call *call;
1235 bool fail = true;
1116 int err; 1236 int err;
1117 1237
1238 list_for_each_entry(call, &ftrace_events, list) {
1239
1240 if (!call->define_fields)
1241 continue;
1242
1243 if (strcmp(call->system, system->name) != 0)
1244 continue;
1245
1246 /* try to see if the filter can be applied */
1247 err = replace_preds(call, filter, ps, filter_string, true);
1248 if (err)
1249 continue;
1250
1251 /* really apply the filter */
1252 filter_disable_preds(call);
1253 err = replace_preds(call, filter, ps, filter_string, false);
1254 if (err)
1255 filter_disable_preds(call);
1256 else {
1257 call->filter_active = 1;
1258 replace_filter_string(filter, filter_string);
1259 }
1260 fail = false;
1261 }
1262
1263 if (fail) {
1264 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1265 return -EINVAL;
1266 }
1267 return 0;
1268}
1269
1270int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1271{
1272 int err;
1118 struct filter_parse_state *ps; 1273 struct filter_parse_state *ps;
1119 1274
1120 mutex_lock(&event_mutex); 1275 mutex_lock(&event_mutex);
@@ -1126,8 +1281,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1126 if (!strcmp(strstrip(filter_string), "0")) { 1281 if (!strcmp(strstrip(filter_string), "0")) {
1127 filter_disable_preds(call); 1282 filter_disable_preds(call);
1128 remove_filter_string(call->filter); 1283 remove_filter_string(call->filter);
1129 mutex_unlock(&event_mutex); 1284 goto out_unlock;
1130 return 0;
1131 } 1285 }
1132 1286
1133 err = -ENOMEM; 1287 err = -ENOMEM;
@@ -1145,10 +1299,11 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1145 goto out; 1299 goto out;
1146 } 1300 }
1147 1301
1148 err = replace_preds(NULL, call, ps, filter_string, false); 1302 err = replace_preds(call, call->filter, ps, filter_string, false);
1149 if (err) 1303 if (err)
1150 append_filter_err(ps, call->filter); 1304 append_filter_err(ps, call->filter);
1151 1305 else
1306 call->filter_active = 1;
1152out: 1307out:
1153 filter_opstack_clear(ps); 1308 filter_opstack_clear(ps);
1154 postfix_clear(ps); 1309 postfix_clear(ps);
@@ -1163,7 +1318,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1163 char *filter_string) 1318 char *filter_string)
1164{ 1319{
1165 int err; 1320 int err;
1166
1167 struct filter_parse_state *ps; 1321 struct filter_parse_state *ps;
1168 1322
1169 mutex_lock(&event_mutex); 1323 mutex_lock(&event_mutex);
@@ -1173,10 +1327,9 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1173 goto out_unlock; 1327 goto out_unlock;
1174 1328
1175 if (!strcmp(strstrip(filter_string), "0")) { 1329 if (!strcmp(strstrip(filter_string), "0")) {
1176 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL); 1330 filter_free_subsystem_preds(system);
1177 remove_filter_string(system->filter); 1331 remove_filter_string(system->filter);
1178 mutex_unlock(&event_mutex); 1332 goto out_unlock;
1179 return 0;
1180 } 1333 }
1181 1334
1182 err = -ENOMEM; 1335 err = -ENOMEM;
@@ -1193,31 +1346,87 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1193 goto out; 1346 goto out;
1194 } 1347 }
1195 1348
1196 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET); 1349 err = replace_system_preds(system, ps, filter_string);
1197 1350 if (err)
1198 /* try to see the filter can be applied to which events */
1199 err = replace_preds(system, NULL, ps, filter_string, true);
1200 if (err) {
1201 append_filter_err(ps, system->filter); 1351 append_filter_err(ps, system->filter);
1202 goto out; 1352
1353out:
1354 filter_opstack_clear(ps);
1355 postfix_clear(ps);
1356 kfree(ps);
1357out_unlock:
1358 mutex_unlock(&event_mutex);
1359
1360 return err;
1361}
1362
1363#ifdef CONFIG_EVENT_PROFILE
1364
1365void ftrace_profile_free_filter(struct perf_event *event)
1366{
1367 struct event_filter *filter = event->filter;
1368
1369 event->filter = NULL;
1370 __free_preds(filter);
1371}
1372
1373int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1374 char *filter_str)
1375{
1376 int err;
1377 struct event_filter *filter;
1378 struct filter_parse_state *ps;
1379 struct ftrace_event_call *call = NULL;
1380
1381 mutex_lock(&event_mutex);
1382
1383 list_for_each_entry(call, &ftrace_events, list) {
1384 if (call->id == event_id)
1385 break;
1203 } 1386 }
1204 1387
1205 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET); 1388 err = -EINVAL;
1389 if (!call)
1390 goto out_unlock;
1391
1392 err = -EEXIST;
1393 if (event->filter)
1394 goto out_unlock;
1206 1395
1207 /* really apply the filter to the events */ 1396 filter = __alloc_preds();
1208 err = replace_preds(system, NULL, ps, filter_string, false); 1397 if (IS_ERR(filter)) {
1209 if (err) { 1398 err = PTR_ERR(filter);
1210 append_filter_err(ps, system->filter); 1399 goto out_unlock;
1211 filter_free_subsystem_preds(system, 2);
1212 } 1400 }
1213 1401
1214out: 1402 err = -ENOMEM;
1403 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1404 if (!ps)
1405 goto free_preds;
1406
1407 parse_init(ps, filter_ops, filter_str);
1408 err = filter_parse(ps);
1409 if (err)
1410 goto free_ps;
1411
1412 err = replace_preds(call, filter, ps, filter_str, false);
1413 if (!err)
1414 event->filter = filter;
1415
1416free_ps:
1215 filter_opstack_clear(ps); 1417 filter_opstack_clear(ps);
1216 postfix_clear(ps); 1418 postfix_clear(ps);
1217 kfree(ps); 1419 kfree(ps);
1420
1421free_preds:
1422 if (err)
1423 __free_preds(filter);
1424
1218out_unlock: 1425out_unlock:
1219 mutex_unlock(&event_mutex); 1426 mutex_unlock(&event_mutex);
1220 1427
1221 return err; 1428 return err;
1222} 1429}
1223 1430
1431#endif /* CONFIG_EVENT_PROFILE */
1432
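
Note on the trace_events_filter.c changes above: string predicates now go through a small glob engine. filter_ops gains a '~' (OP_GLOB) operator, filter_parse_regex() classifies a pattern as MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY or MATCH_END_ONLY, and filter_build_regex() selects the corresponding callback (regex_match_full/front/middle/end). The sketch below only illustrates that classification; the helper function, its placement, and the example filter string are assumptions for illustration, built on the declarations shown in this hunk.

/* Sketch only -- assumes the filter_parse_regex()/enum regex_type declarations above. */
static void filter_glob_example(void)
{
	/* e.g. from: echo 'name ~ "*lock*"' > events/<subsys>/<event>/filter */
	char pattern[] = "*lock*";	/* modified in place by the parser */
	char *search;
	int not;
	enum regex_type type;

	type = filter_parse_regex(pattern, strlen(pattern), &search, &not);

	/*
	 * "lock"   -> MATCH_FULL          compare the whole string
	 * "lock*"  -> MATCH_FRONT_ONLY    match at the front
	 * "*lock"  -> MATCH_END_ONLY      match at the end
	 * "*lock*" -> MATCH_MIDDLE_ONLY   match anywhere (strstr)
	 *
	 * Here type == MATCH_MIDDLE_ONLY; search points at "lock" because
	 * the leading '*' is skipped and the trailing '*' is cut in place,
	 * and not == 0 since the pattern did not begin with '!'.
	 */
}
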
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index df1bf6e48bb9..31da218ee10f 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,146 +15,128 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
 55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item), is_signed_type(type)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
35 84
36#undef TRACE_FIELD_SPECIAL 85#undef __array
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 86#define __array(type, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 89 offsetof(typeof(field), item), \
41 (unsigned int)sizeof(field.item)); \ 90 sizeof(field.item), is_signed_type(type)); \
42 if (!ret) \ 91 if (!ret) \
43 return 0; 92 return 0;
44 93
45#undef TRACE_FIELD_ZERO_CHAR 94#undef __array_desc
46#define TRACE_FIELD_ZERO_CHAR(item) \ 95#define __array_desc(type, container, item, len) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
48 "offset:%u;\tsize:0;\n", \ 97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
50 if (!ret) \ 101 if (!ret) \
51 return 0; 102 return 0;
52 103
53#undef TRACE_FIELD_SIGN 104#undef __dynamic_array
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 105#define __dynamic_array(type, item) \
55 TRACE_FIELD(type, item, assign) 106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
56 112
57#undef TP_RAW_FMT 113#undef F_printk
58#define TP_RAW_FMT(args...) args 114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
59 115
60#undef TRACE_EVENT_FORMAT 116#undef __entry
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 117#define __entry REC
62static int \
63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
65{ \
66 struct args field; \
67 int ret; \
68 \
69 tstruct; \
70 \
71 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
72 \
73 return ret; \
74}
75 118
76#undef TRACE_EVENT_FORMAT_NOFILTER 119#undef FTRACE_ENTRY
77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
78 tpfmt) \
79static int \ 121static int \
80ftrace_format_##call(struct ftrace_event_call *unused, \ 122ftrace_format_##name(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \ 123 struct trace_seq *s) \
82{ \ 124{ \
83 struct args field; \ 125 struct struct_name field __attribute__((unused)); \
84 int ret; \ 126 int ret = 0; \
85 \ 127 \
86 tstruct; \ 128 tstruct; \
87 \ 129 \
88 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 130 trace_seq_printf(s, "\nprint fmt: " print); \
89 \ 131 \
90 return ret; \ 132 return ret; \
91} 133}
92 134
93#include "trace_event_types.h" 135#include "trace_entries.h"
94
95#undef TRACE_ZERO_CHAR
96#define TRACE_ZERO_CHAR(arg)
97
98#undef TRACE_FIELD
99#define TRACE_FIELD(type, item, assign)\
100 entry->item = assign;
101
102#undef TRACE_FIELD
103#define TRACE_FIELD(type, item, assign)\
104 entry->item = assign;
105
106#undef TRACE_FIELD_SIGN
107#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
108 TRACE_FIELD(type, item, assign)
109
110#undef TP_CMD
111#define TP_CMD(cmd...) cmd
112
113#undef TRACE_ENTRY
114#define TRACE_ENTRY entry
115
116#undef TRACE_FIELD_SPECIAL
117#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
118 cmd;
119
120#undef TRACE_EVENT_FORMAT
121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
123static int ftrace_raw_init_event_##call(void); \
124 \
125struct ftrace_event_call __used \
126__attribute__((__aligned__(4))) \
127__attribute__((section("_ftrace_events"))) event_##call = { \
128 .name = #call, \
129 .id = proto, \
130 .system = __stringify(TRACE_SYSTEM), \
131 .raw_init = ftrace_raw_init_event_##call, \
132 .show_format = ftrace_format_##call, \
133 .define_fields = ftrace_define_fields_##call, \
134}; \
135static int ftrace_raw_init_event_##call(void) \
136{ \
137 INIT_LIST_HEAD(&event_##call.fields); \
138 return 0; \
139} \
140
141#undef TRACE_EVENT_FORMAT_NOFILTER
142#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
143 tpfmt) \
144 \
145struct ftrace_event_call __used \
146__attribute__((__aligned__(4))) \
147__attribute__((section("_ftrace_events"))) event_##call = { \
148 .name = #call, \
149 .id = proto, \
150 .system = __stringify(TRACE_SYSTEM), \
151 .show_format = ftrace_format_##call, \
152};
153 136
154#include "trace_event_types.h"
155 137
156#undef TRACE_FIELD 138#undef __field
157#define TRACE_FIELD(type, item, assign) \ 139#define __field(type, item) \
158 ret = trace_define_field(event_call, #type, #item, \ 140 ret = trace_define_field(event_call, #type, #item, \
159 offsetof(typeof(field), item), \ 141 offsetof(typeof(field), item), \
160 sizeof(field.item), \ 142 sizeof(field.item), \
@@ -162,32 +144,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
162 if (ret) \ 144 if (ret) \
163 return ret; 145 return ret;
164 146
165#undef TRACE_FIELD_SPECIAL 147#undef __field_desc
166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 148#define __field_desc(type, container, item) \
149 ret = trace_define_field(event_call, #type, #item, \
150 offsetof(typeof(field), \
151 container.item), \
152 sizeof(field.container.item), \
153 is_signed_type(type), FILTER_OTHER); \
154 if (ret) \
155 return ret;
156
157#undef __array
158#define __array(type, item, len) \
159 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 160 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
168 offsetof(typeof(field), item), \ 161 offsetof(typeof(field), item), \
169 sizeof(field.item), 0, FILTER_OTHER); \ 162 sizeof(field.item), 0, FILTER_OTHER); \
170 if (ret) \ 163 if (ret) \
171 return ret; 164 return ret;
172 165
173#undef TRACE_FIELD_SIGN 166#undef __array_desc
174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 167#define __array_desc(type, container, item, len) \
175 ret = trace_define_field(event_call, #type, #item, \ 168 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
176 offsetof(typeof(field), item), \ 169 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
177 sizeof(field.item), is_signed, \ 170 offsetof(typeof(field), \
171 container.item), \
172 sizeof(field.container.item), 0, \
178 FILTER_OTHER); \ 173 FILTER_OTHER); \
179 if (ret) \ 174 if (ret) \
180 return ret; 175 return ret;
181 176
182#undef TRACE_FIELD_ZERO_CHAR 177#undef __dynamic_array
183#define TRACE_FIELD_ZERO_CHAR(item) 178#define __dynamic_array(type, item)
184 179
185#undef TRACE_EVENT_FORMAT 180#undef FTRACE_ENTRY
186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 181#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
187int \ 182int \
188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ 183ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
189{ \ 184{ \
190 struct args field; \ 185 struct struct_name field; \
191 int ret; \ 186 int ret; \
192 \ 187 \
193 ret = trace_define_common_fields(event_call); \ 188 ret = trace_define_common_fields(event_call); \
@@ -199,8 +194,42 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
199 return ret; \ 194 return ret; \
200} 195}
201 196
202#undef TRACE_EVENT_FORMAT_NOFILTER 197#include "trace_entries.h"
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 198
204 tpfmt) 199
200#undef __field
201#define __field(type, item)
202
203#undef __field_desc
204#define __field_desc(type, container, item)
205
206#undef __array
207#define __array(type, item, len)
208
209#undef __array_desc
210#define __array_desc(type, container, item, len)
211
212#undef __dynamic_array
213#define __dynamic_array(type, item)
214
215#undef FTRACE_ENTRY
216#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
217static int ftrace_raw_init_event_##call(void); \
218 \
219struct ftrace_event_call __used \
220__attribute__((__aligned__(4))) \
221__attribute__((section("_ftrace_events"))) event_##call = { \
222 .name = #call, \
223 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event_##call, \
226 .show_format = ftrace_format_##call, \
227 .define_fields = ftrace_define_fields_##call, \
228}; \
229static int ftrace_raw_init_event_##call(void) \
230{ \
231 INIT_LIST_HEAD(&event_##call.fields); \
232 return 0; \
233} \
205 234
206#include "trace_event_types.h" 235#include "trace_entries.h"
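
Note on the trace_export.c rewrite above: the per-event boilerplate is now generated from trace_entries.h through FTRACE_ENTRY()/F_STRUCT()/F_printk(), with __field, __field_desc, __array, __array_desc and __dynamic_array describing the layout, replacing the TRACE_EVENT_FORMAT() entries listed at the start of this diff. The real trace_entries.h is not part of this hunk, so the following is only a sketch of how the old 'function' entry (fields ip and parent_ip, raw format " %lx <-- %lx") would be expressed in the new form:

/* Illustration only -- derived from the old TRACE_EVENT_FORMAT(function, ...) shown above. */
FTRACE_ENTRY(function, ftrace_entry,

	TRACE_FN,

	F_STRUCT(
		__field(	unsigned long,	ip		)
		__field(	unsigned long,	parent_ip	)
	),

	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
);

trace_export.c then includes trace_entries.h several times with different definitions of these macros, emitting in turn the compile-time printk check, ftrace_format_*(), ftrace_define_fields_*() and the event_* entries placed in the _ftrace_events section.
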
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b01b94518fc..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
290{ 290{
291 long count = (long)data; 291 long count = (long)data;
292 292
293 seq_printf(m, "%pf:", (void *)ip); 293 seq_printf(m, "%ps:", (void *)ip);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b3749a2c3132..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
364} 364}
365 365
366 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
367/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
368static enum print_line_t 377static enum print_line_t
369verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
521 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
522 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
523 } 532 }
533
524 /* Proc */ 534 /* Proc */
525 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
526 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
659 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
660 } 670 }
661 671
662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
663 if (!ret) 673 if (!ret)
664 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
665 675
@@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
702 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
703 } 713 }
704 714
705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
706 if (!ret) 716 if (!ret)
707 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
708 718
@@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
758 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
759 } 769 }
760 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
761 return 0; 778 return 0;
762} 779}
763 780
@@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
952 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
953} 970}
954 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
955static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
956{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
957 /* 1st line */ 1001 /* 1st line */
958 seq_printf(s, "# "); 1002 seq_printf(s, "#");
959 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
960 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
961 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
962 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
963 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
964 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
965 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
966 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
967 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
968 1014
969 /* 2nd line */ 1015 /* 2nd line */
970 seq_printf(s, "# "); 1016 seq_printf(s, "#");
971 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
972 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
973 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
974 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
975 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
976 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
977 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
978 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
979 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ca7d7c4d0c2a..69543a905cd5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
155 seq_print_ip_sym(seq, it->from, symflags) && 155 seq_print_ip_sym(seq, it->from, symflags) &&
156 trace_seq_printf(seq, "\n")) 156 trace_seq_printf(seq, "\n"))
157 return TRACE_TYPE_HANDLED; 157 return TRACE_TYPE_HANDLED;
158 return TRACE_TYPE_PARTIAL_LINE;; 158 return TRACE_TYPE_PARTIAL_LINE;
159 } 159 }
160 return TRACE_TYPE_UNHANDLED; 160 return TRACE_TYPE_UNHANDLED;
161} 161}
@@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to)
165 struct ftrace_event_call *call = &event_hw_branch; 165 struct ftrace_event_call *call = &event_hw_branch;
166 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
167 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
168 struct ring_buffer *buf;
168 struct hw_branch_entry *entry; 169 struct hw_branch_entry *entry;
169 unsigned long irq1; 170 unsigned long irq1;
170 int cpu; 171 int cpu;
@@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to)
180 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 181 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
181 goto out; 182 goto out;
182 183
183 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, 184 buf = tr->buffer;
185 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
184 sizeof(*entry), 0, 0); 186 sizeof(*entry), 0, 0);
185 if (!event) 187 if (!event)
186 goto out; 188 goto out;
@@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to)
189 entry->ent.type = TRACE_HW_BRANCHES; 191 entry->ent.type = TRACE_HW_BRANCHES;
190 entry->from = from; 192 entry->from = from;
191 entry->to = to; 193 entry->to = to;
192 if (!filter_check_discard(call, entry, tr->buffer, event)) 194 if (!filter_check_discard(call, entry, buf, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0); 195 trace_buffer_unlock_commit(buf, event, 0, 0);
194 196
195 out: 197 out:
196 atomic_dec(&tr->data[cpu]->disabled); 198 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d12..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d3..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
310 struct ring_buffer *buffer = tr->buffer; 311 struct ring_buffer *buffer = tr->buffer;
311 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
312 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
320 } 321 }
321 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
322 entry->rw = *rw; 323 entry->rw = *rw;
323 trace_buffer_unlock_commit(buffer, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
324} 327}
325 328
326void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
334 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
335 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
336{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
337 struct ring_buffer *buffer = tr->buffer; 341 struct ring_buffer *buffer = tr->buffer;
338 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
339 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
347 } 351 }
348 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
349 entry->map = *map; 353 entry->map = *map;
350 trace_buffer_unlock_commit(buffer, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
351} 357}
352 358
353void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..ed17565826b0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 407 * since individual threads might have already quit!
408 */ 408 */
409 rcu_read_lock(); 409 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
411 if (task) 411 if (task)
412 mm = get_task_mm(task); 412 mm = get_task_mm(task);
413 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 460 return ret;
461} 461}
462 462
463static int 463/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 472{
466 int hardirq, softirq; 473 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 474 int ret;
468 475
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 478
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 482 'X' : '.',
@@ -482,8 +487,31 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
482 return 0; 487 return 0;
483 488
484 if (entry->preempt_count) 489 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 490 ret = trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 491 else
492 ret = trace_seq_putc(s, '.');
493
494 if (!ret)
495 return 0;
496
497 if (entry->lock_depth < 0)
498 return trace_seq_putc(s, '.');
499
500 return trace_seq_printf(s, "%d", entry->lock_depth);
501}
502
503static int
504lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
505{
506 char comm[TASK_COMM_LEN];
507
508 trace_find_cmdline(entry->pid, comm);
509
510 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
511 comm, entry->pid, cpu))
512 return 0;
513
514 return trace_print_lat_fmt(s, entry);
487} 515}
488 516
489static unsigned long preempt_mark_thresh = 100; 517static unsigned long preempt_mark_thresh = 100;
@@ -857,7 +885,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
857 trace_assign_type(field, iter->ent); 885 trace_assign_type(field, iter->ent);
858 886
859 if (!S) 887 if (!S)
860 task_state_char(field->prev_state); 888 S = task_state_char(field->prev_state);
861 T = task_state_char(field->next_state); 889 T = task_state_char(field->next_state);
862 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 890 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
863 field->prev_pid, 891 field->prev_pid,
@@ -892,7 +920,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
892 trace_assign_type(field, iter->ent); 920 trace_assign_type(field, iter->ent);
893 921
894 if (!S) 922 if (!S)
895 task_state_char(field->prev_state); 923 S = task_state_char(field->prev_state);
896 T = task_state_char(field->next_state); 924 T = task_state_char(field->next_state);
897 925
898 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 926 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
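trace_print_lat_fmt(), split out above, renders the latency-format columns whose header print_lat_header() prints at the top of this diff: irqs-off, need-resched, hardirq/softirq, preempt-depth and the new lock-depth. A rough userspace sketch of the column logic visible in this hunk follows; struct fake_entry and FAKE_FLAG_IRQS_OFF are invented stand-ins for the real trace_entry fields, and the real helper also emits the need-resched and hardirq/softirq characters that fall between the hunks shown here.

#include <stdio.h>

/* Invented stand-ins for the trace_entry fields used by trace_print_lat_fmt(). */
struct fake_entry {
	unsigned char	flags;
	unsigned char	preempt_count;
	int		lock_depth;	/* negative means "no lock depth recorded" */
};

#define FAKE_FLAG_IRQS_OFF	0x01	/* assumed bit value, for the demo only */

static void print_lat_cols(const struct fake_entry *e)
{
	/* irqs-off column: 'd' when interrupts are disabled, '.' otherwise */
	putchar(e->flags & FAKE_FLAG_IRQS_OFF ? 'd' : '.');

	/* preempt-depth column: hex count, or '.' when preemption is enabled */
	if (e->preempt_count)
		printf("%x", e->preempt_count);
	else
		putchar('.');

	/* lock-depth column: the depth, or '.' when none is recorded */
	if (e->lock_depth < 0)
		putchar('.');
	else
		printf("%d", e->lock_depth);

	putchar('\n');
}

int main(void)
{
	struct fake_entry e = {
		.flags = FAKE_FLAG_IRQS_OFF,
		.preempt_count = 1,
		.lock_depth = -1,
	};

	print_lat_cols(&e);	/* prints "d1." */
	return 0;
}

Running the sketch prints "d1.": interrupts off, preempt count 1, no lock depth recorded, matching the column order of the header lines added to print_lat_header().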
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
42 struct trace_power *entry;
43 struct trace_array_cpu *data;
44 struct trace_array *tr = power_trace;
45
46 if (!trace_power_enabled)
47 return;
48
49 buffer = tr->buffer;
50
51 preempt_disable();
52 it->end = ktime_get();
53 data = tr->data[smp_processor_id()];
54
55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
56 sizeof(*entry), 0, 0);
57 if (!event)
58 goto out;
59 entry = ring_buffer_event_data(event);
60 entry->state_data = *it;
61 if (!filter_check_discard(call, entry, buffer, event))
62 trace_buffer_unlock_commit(buffer, event, 0, 0);
63 out:
64 preempt_enable();
65}
66
67static void probe_power_mark(struct power_trace *it, unsigned int type,
68 unsigned int level)
69{
70 struct ftrace_event_call *call = &event_power;
71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
73 struct trace_power *entry;
74 struct trace_array_cpu *data;
75 struct trace_array *tr = power_trace;
76
77 if (!trace_power_enabled)
78 return;
79
80 buffer = tr->buffer;
81
82 memset(it, 0, sizeof(struct power_trace));
83 it->state = level;
84 it->type = type;
85 it->stamp = ktime_get();
86 preempt_disable();
87 it->end = it->stamp;
88 data = tr->data[smp_processor_id()];
89
90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
91 sizeof(*entry), 0, 0);
92 if (!event)
93 goto out;
94 entry = ring_buffer_event_data(event);
95 entry->state_data = *it;
96 if (!filter_check_discard(call, entry, buffer, event))
97 trace_buffer_unlock_commit(buffer, event, 0, 0);
98 out:
99 preempt_enable();
100}
101
102static int tracing_power_register(void)
103{
104 int ret;
105
106 ret = register_trace_power_start(probe_power_start);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_start\n");
110 return ret;
111 }
112 ret = register_trace_power_end(probe_power_end);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_end\n");
116 goto fail_start;
117 }
118 ret = register_trace_power_mark(probe_power_mark);
119 if (ret) {
120 pr_info("power trace: Couldn't activate tracepoint"
121 " probe to trace_power_mark\n");
122 goto fail_end;
123 }
124 return ret;
125fail_end:
126 unregister_trace_power_end(probe_power_end);
127fail_start:
128 unregister_trace_power_start(probe_power_start);
129 return ret;
130}
131
132static void start_power_trace(struct trace_array *tr)
133{
134 trace_power_enabled = 1;
135}
136
137static void stop_power_trace(struct trace_array *tr)
138{
139 trace_power_enabled = 0;
140}
141
142static void power_trace_reset(struct trace_array *tr)
143{
144 trace_power_enabled = 0;
145 unregister_trace_power_start(probe_power_start);
146 unregister_trace_power_end(probe_power_end);
147 unregister_trace_power_mark(probe_power_mark);
148}
149
150
151static int power_trace_init(struct trace_array *tr)
152{
153 power_trace = tr;
154
155 trace_power_enabled = 1;
156 tracing_power_register();
157
158 tracing_reset_online_cpus(tr);
159 return 0;
160}
161
162static enum print_line_t power_print_line(struct trace_iterator *iter)
163{
164 int ret = 0;
165 struct trace_entry *entry = iter->ent;
166 struct trace_power *field ;
167 struct power_trace *it;
168 struct trace_seq *s = &iter->seq;
169 struct timespec stamp;
170 struct timespec duration;
171
172 trace_assign_type(field, entry);
173 it = &field->state_data;
174 stamp = ktime_to_timespec(it->stamp);
175 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
176
177 if (entry->type == TRACE_POWER) {
178 if (it->type == POWER_CSTATE)
179 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
180 stamp.tv_sec,
181 stamp.tv_nsec,
182 it->state, iter->cpu,
183 duration.tv_sec,
184 duration.tv_nsec);
185 if (it->type == POWER_PSTATE)
186 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
187 stamp.tv_sec,
188 stamp.tv_nsec,
189 it->state, iter->cpu);
190 if (!ret)
191 return TRACE_TYPE_PARTIAL_LINE;
192 return TRACE_TYPE_HANDLED;
193 }
194 return TRACE_TYPE_UNHANDLED;
195}
196
197static void power_print_header(struct seq_file *s)
198{
199 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
200 seq_puts(s, "# | | |\n");
201}
202
203static struct tracer power_tracer __read_mostly =
204{
205 .name = "power",
206 .init = power_trace_init,
207 .start = start_power_trace,
208 .stop = stop_power_trace,
209 .reset = power_trace_reset,
210 .print_line = power_print_line,
211 .print_header = power_print_header,
212};
213
214static int init_power_trace(void)
215{
216 return register_tracer(&power_tracer);
217}
218device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c6..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
244 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
245 236
246 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
247 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
248 240
249 wakeup_task = p; 241 wakeup_task = p;
@@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
293 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
294 } 286 }
295 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
296 wakeup_reset(tr); 295 wakeup_reset(tr);
297 296
298 /* 297 /*
@@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
328} 328}
329 329
330static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
296 296
297int 297int
298stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
299 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
300 loff_t *ppos) 300 loff_t *ppos)
301{ 301{
302 int ret; 302 int ret;
303 303
304 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
305 305
306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
307 307
308 if (ret || !write || 308 if (ret || !write ||
309 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..d00d1a8f1f26 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h> 4#include <linux/ftrace.h>
5#include <linux/perf_counter.h> 5#include <linux/perf_event.h>
6#include <asm/syscall.h> 6#include <asm/syscall.h>
7 7
8#include "trace_output.h" 8#include "trace_output.h"
@@ -14,6 +14,69 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 16
17extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[];
19
20static struct syscall_metadata **syscalls_metadata;
21
22static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
23{
24 struct syscall_metadata *start;
25 struct syscall_metadata *stop;
26 char str[KSYM_SYMBOL_LEN];
27
28
29 start = (struct syscall_metadata *)__start_syscalls_metadata;
30 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
31 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
32
33 for ( ; start < stop; start++) {
34 /*
35 * Only compare after the "sys" prefix. Archs that use
36 * syscall wrappers may have syscalls symbols aliases prefixed
37 * with "SyS" instead of "sys", leading to an unwanted
38 * mismatch.
39 */
40 if (start->name && !strcmp(start->name + 3, str + 3))
41 return start;
42 }
43 return NULL;
44}
45
46static struct syscall_metadata *syscall_nr_to_meta(int nr)
47{
48 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
49 return NULL;
50
51 return syscalls_metadata[nr];
52}
53
54int syscall_name_to_nr(char *name)
55{
56 int i;
57
58 if (!syscalls_metadata)
59 return -1;
60
61 for (i = 0; i < NR_syscalls; i++) {
62 if (syscalls_metadata[i]) {
63 if (!strcmp(syscalls_metadata[i]->name, name))
64 return i;
65 }
66 }
67 return -1;
68}
69
70void set_syscall_enter_id(int num, int id)
71{
72 syscalls_metadata[num]->enter_id = id;
73}
74
75void set_syscall_exit_id(int num, int id)
76{
77 syscalls_metadata[num]->exit_id = id;
78}
79
17enum print_line_t 80enum print_line_t
18print_syscall_enter(struct trace_iterator *iter, int flags) 81print_syscall_enter(struct trace_iterator *iter, int flags)
19{ 82{
@@ -103,7 +166,8 @@ extern char *__bad_type_size(void);
103#define SYSCALL_FIELD(type, name) \ 166#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \ 167 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \ 168 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name) 169 #type, #name, offsetof(typeof(trace), name), \
170 sizeof(trace.name), is_signed_type(type)
107 171
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 172int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
109{ 173{
@@ -120,7 +184,8 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
120 if (!entry) 184 if (!entry)
121 return 0; 185 return 0;
122 186
123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 187 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
188 "\tsigned:%u;\n",
124 SYSCALL_FIELD(int, nr)); 189 SYSCALL_FIELD(int, nr));
125 if (!ret) 190 if (!ret)
126 return 0; 191 return 0;
@@ -130,8 +195,10 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
130 entry->args[i]); 195 entry->args[i]);
131 if (!ret) 196 if (!ret)
132 return 0; 197 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset, 198 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
134 sizeof(unsigned long)); 199 "\tsigned:%u;\n", offset,
200 sizeof(unsigned long),
201 is_signed_type(unsigned long));
135 if (!ret) 202 if (!ret)
136 return 0; 203 return 0;
137 offset += sizeof(unsigned long); 204 offset += sizeof(unsigned long);
@@ -163,10 +230,12 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
163 struct syscall_trace_exit trace; 230 struct syscall_trace_exit trace;
164 231
165 ret = trace_seq_printf(s, 232 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 233 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 234 "\tsigned:%u;\n"
235 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
236 "\tsigned:%u;\n",
168 SYSCALL_FIELD(int, nr), 237 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret)); 238 SYSCALL_FIELD(long, ret));
170 if (!ret) 239 if (!ret)
171 return 0; 240 return 0;
172 241
@@ -212,7 +281,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
212 if (ret) 281 if (ret)
213 return ret; 282 return ret;
214 283
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, 284 ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
216 FILTER_OTHER); 285 FILTER_OTHER);
217 286
218 return ret; 287 return ret;
@@ -375,6 +444,29 @@ struct trace_event event_syscall_exit = {
375 .trace = print_syscall_exit, 444 .trace = print_syscall_exit,
376}; 445};
377 446
447int __init init_ftrace_syscalls(void)
448{
449 struct syscall_metadata *meta;
450 unsigned long addr;
451 int i;
452
453 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
454 NR_syscalls, GFP_KERNEL);
455 if (!syscalls_metadata) {
456 WARN_ON(1);
457 return -ENOMEM;
458 }
459
460 for (i = 0; i < NR_syscalls; i++) {
461 addr = arch_syscall_addr(i);
462 meta = find_syscall_meta(addr);
463 syscalls_metadata[i] = meta;
464 }
465
466 return 0;
467}
468core_initcall(init_ftrace_syscalls);
469
378#ifdef CONFIG_EVENT_PROFILE 470#ifdef CONFIG_EVENT_PROFILE
379 471
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 472static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
@@ -384,10 +476,13 @@ static int sys_prof_refcount_exit;
384 476
385static void prof_syscall_enter(struct pt_regs *regs, long id) 477static void prof_syscall_enter(struct pt_regs *regs, long id)
386{ 478{
387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data; 479 struct syscall_metadata *sys_data;
480 struct syscall_trace_enter *rec;
481 unsigned long flags;
482 char *raw_data;
389 int syscall_nr; 483 int syscall_nr;
390 int size; 484 int size;
485 int cpu;
391 486
392 syscall_nr = syscall_get_nr(current, regs); 487 syscall_nr = syscall_get_nr(current, regs);
393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 488 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +497,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
402 size = ALIGN(size + sizeof(u32), sizeof(u64)); 497 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32); 498 size -= sizeof(u32);
404 499
405 do { 500 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
406 char raw_data[size]; 501 "profile buffer not large enough"))
502 return;
503
504 /* Protect the per cpu buffer, begin the rcu read side */
505 local_irq_save(flags);
407 506
408 /* zero the dead bytes from align to not leak stack to user */ 507 cpu = smp_processor_id();
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
410 508
411 rec = (struct syscall_trace_enter *) raw_data; 509 if (in_nmi())
412 tracing_generic_entry_update(&rec->ent, 0, 0); 510 raw_data = rcu_dereference(trace_profile_buf_nmi);
413 rec->ent.type = sys_data->enter_id; 511 else
414 rec->nr = syscall_nr; 512 raw_data = rcu_dereference(trace_profile_buf);
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 513
416 (unsigned long *)&rec->args); 514 if (!raw_data)
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); 515 goto end;
418 } while(0); 516
517 raw_data = per_cpu_ptr(raw_data, cpu);
518
519 /* zero the dead bytes from align to not leak stack to user */
520 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
521
522 rec = (struct syscall_trace_enter *) raw_data;
523 tracing_generic_entry_update(&rec->ent, 0, 0);
524 rec->ent.type = sys_data->enter_id;
525 rec->nr = syscall_nr;
526 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
527 (unsigned long *)&rec->args);
528 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
529
530end:
531 local_irq_restore(flags);
419} 532}
420 533
421int reg_prof_syscall_enter(char *name) 534int reg_prof_syscall_enter(char *name)
@@ -460,8 +573,12 @@ void unreg_prof_syscall_enter(char *name)
460static void prof_syscall_exit(struct pt_regs *regs, long ret) 573static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{ 574{
462 struct syscall_metadata *sys_data; 575 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec; 576 struct syscall_trace_exit *rec;
577 unsigned long flags;
464 int syscall_nr; 578 int syscall_nr;
579 char *raw_data;
580 int size;
581 int cpu;
465 582
466 syscall_nr = syscall_get_nr(current, regs); 583 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 584 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +588,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
471 if (!sys_data) 588 if (!sys_data)
472 return; 589 return;
473 590
474 tracing_generic_entry_update(&rec.ent, 0, 0); 591 /* We can probably do that at build time */
475 rec.ent.type = sys_data->exit_id; 592 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
476 rec.nr = syscall_nr; 593 size -= sizeof(u32);
477 rec.ret = syscall_get_return_value(current, regs); 594
595 /*
596 * Impossible, but be paranoid with the future
597 * How to put this check outside runtime?
598 */
599 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
600 "exit event has grown above profile buffer size"))
601 return;
602
603 /* Protect the per cpu buffer, begin the rcu read side */
604 local_irq_save(flags);
605 cpu = smp_processor_id();
606
607 if (in_nmi())
608 raw_data = rcu_dereference(trace_profile_buf_nmi);
609 else
610 raw_data = rcu_dereference(trace_profile_buf);
611
612 if (!raw_data)
613 goto end;
614
615 raw_data = per_cpu_ptr(raw_data, cpu);
616
617 /* zero the dead bytes from align to not leak stack to user */
618 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
619
620 rec = (struct syscall_trace_exit *)raw_data;
478 621
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); 622 tracing_generic_entry_update(&rec->ent, 0, 0);
623 rec->ent.type = sys_data->exit_id;
624 rec->nr = syscall_nr;
625 rec->ret = syscall_get_return_value(current, regs);
626
627 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
628
629end:
630 local_irq_restore(flags);
480} 631}
481 632
482int reg_prof_syscall_exit(char *name) 633int reg_prof_syscall_exit(char *name)
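The prof_syscall_enter() and prof_syscall_exit() rework above moves the records from an on-stack VLA into the shared per-cpu profile buffers, while keeping the same sizing trick: round the payload plus a u32 up to a u64 boundary, subtract the u32 again (apparently reserved for a header the perf output path adds), and zero the last u64 so alignment padding never leaks stale data. A minimal userspace sketch of just that arithmetic is below; struct demo_record and the buffer size are made up for illustration, and memset() stands in for the patch's direct u64 store.

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

/* Hypothetical record, standing in for struct syscall_trace_enter. */
struct demo_record {
	uint16_t	type;
	int		nr;
	unsigned long	args[3];
};

int main(void)
{
	/* Same arithmetic as the patch: align (payload + u32) to u64, then
	 * drop the u32 so the final event, header included, stays aligned. */
	size_t size = ALIGN_UP(sizeof(struct demo_record) + sizeof(uint32_t),
			       sizeof(uint64_t));
	size -= sizeof(uint32_t);

	alignas(uint64_t) char raw_data[128];		/* stands in for the per-cpu buffer */
	memset(raw_data, 0xaa, sizeof(raw_data));	/* pretend it holds stale data */

	/* zero the dead bytes from the alignment so they cannot leak */
	memset(&raw_data[size - sizeof(uint64_t)], 0, sizeof(uint64_t));

	struct demo_record *rec = (struct demo_record *)raw_data;
	rec->type = 1;
	rec->nr = 42;

	printf("record payload %zu bytes, aligned size %zu, padding zeroed at offset %zu\n",
	       sizeof(*rec), size, size - sizeof(uint64_t));
	return 0;
}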
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9489a0a9b1be..cc89be5bc0f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
48 48
49/* 49/*
50 * Note about RCU : 50 * Note about RCU :
51 * It is used to to delay the free of multiple probes array until a quiescent 51 * It is used to delay the free of multiple probes array until a quiescent
52 * state is reached. 52 * state is reached.
53 * Tracepoint entries modifications are protected by the tracepoints_mutex. 53 * Tracepoint entries modifications are protected by the tracepoints_mutex.
54 */ 54 */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h> 7#include <linux/mman.h>
9#include <linux/notifier.h> 8#include <linux/notifier.h>
10#include <linux/reboot.h> 9#include <linux/reboot.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 45static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{ 47{
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55} 55}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..ccefe574dcf7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
640EXPORT_SYMBOL(schedule_delayed_work); 640EXPORT_SYMBOL(schedule_delayed_work);
641 641
642/** 642/**
643 * flush_delayed_work - block until a dwork_struct's callback has terminated
644 * @dwork: the delayed work which is to be flushed
645 *
646 * Any timeout is cancelled, and any pending work is run immediately.
647 */
648void flush_delayed_work(struct delayed_work *dwork)
649{
650 if (del_timer(&dwork->timer)) {
651 struct cpu_workqueue_struct *cwq;
652 cwq = wq_per_cpu(keventd_wq, get_cpu());
653 __queue_work(cwq, &dwork->work);
654 put_cpu();
655 }
656 flush_work(&dwork->work);
657}
658EXPORT_SYMBOL(flush_delayed_work);
659
660/**
643 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 661 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
644 * @cpu: cpu to use 662 * @cpu: cpu to use
645 * @dwork: job to be done 663 * @dwork: job to be done