aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit.c18
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/cgroup.c1128
-rw-r--r--kernel/cgroup_debug.c105
-rw-r--r--kernel/cgroup_freezer.c15
-rw-r--r--kernel/cpuset.c66
-rw-r--r--kernel/cred.c19
-rw-r--r--kernel/delayacct.c1
-rw-r--r--kernel/dma-coherent.c176
-rw-r--r--kernel/exit.c166
-rw-r--r--kernel/fork.c85
-rw-r--r--kernel/futex.c140
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c150
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/handle.c1
-rw-r--r--kernel/itimer.c169
-rw-r--r--kernel/kallsyms.c3
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/lockdep.c23
-rw-r--r--kernel/lockdep_proc.c2
-rw-r--r--kernel/marker.c930
-rw-r--r--kernel/module.c184
-rw-r--r--kernel/mutex-debug.c1
-rw-r--r--kernel/ns_cgroup.c16
-rw-r--r--kernel/panic.c5
-rw-r--r--kernel/params.c7
-rw-r--r--kernel/perf_counter.c4963
-rw-r--r--kernel/perf_event.c5108
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/posix-cpu-timers.c155
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/console.c63
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/swap.c1
-rw-r--r--kernel/printk.c27
-rw-r--r--kernel/profile.c45
-rw-r--r--kernel/ptrace.c11
-rw-r--r--kernel/rcupdate.c188
-rw-r--r--kernel/rcutorture.c47
-rw-r--r--kernel/rcutree.c431
-rw-r--r--kernel/rcutree.h88
-rw-r--r--kernel/rcutree_plugin.h191
-rw-r--r--kernel/rcutree_trace.c16
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/resource.c23
-rw-r--r--kernel/sched.c583
-rw-r--r--kernel/sched_clock.c122
-rw-r--r--kernel/sched_debug.c1
-rw-r--r--kernel/sched_fair.c468
-rw-r--r--kernel/sched_features.h122
-rw-r--r--kernel/sched_idletask.c11
-rw-r--r--kernel/sched_rt.c20
-rw-r--r--kernel/signal.c168
-rw-r--r--kernel/slow-work.c12
-rw-r--r--kernel/smp.c36
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/sys.c46
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c153
-rw-r--r--kernel/time.c9
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clocksource.c529
-rw-r--r--kernel/time/jiffies.c6
-rw-r--r--kernel/time/ntp.c7
-rw-r--r--kernel/time/tick-sched.c9
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/time/timekeeping.c536
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/timer.c64
-rw-r--r--kernel/trace/Kconfig30
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c39
-rw-r--r--kernel/trace/ftrace.c233
-rw-r--r--kernel/trace/kmemtrace.c2
-rw-r--r--kernel/trace/power-traces.c20
-rw-r--r--kernel/trace/ring_buffer.c19
-rw-r--r--kernel/trace/trace.c197
-rw-r--r--kernel/trace/trace.h279
-rw-r--r--kernel/trace/trace_boot.c8
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_clock.c24
-rw-r--r--kernel/trace/trace_entries.h366
-rw-r--r--kernel/trace/trace_event_profile.c92
-rw-r--r--kernel/trace/trace_event_types.h178
-rw-r--r--kernel/trace/trace_events.c133
-rw-r--r--kernel/trace/trace_events_filter.c44
-rw-r--r--kernel/trace/trace_export.c284
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c66
-rw-r--r--kernel/trace/trace_hw_branches.c10
-rw-r--r--kernel/trace/trace_irqsoff.c16
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_output.c50
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_power.c218
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_sched_wakeup.c52
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_syscalls.c103
-rw-r--r--kernel/tracepoint.c2
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/utsname_sysctl.c4
-rw-r--r--kernel/workqueue.c18
112 files changed, 10597 insertions, 9818 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 961379caf666..b8d4cd8ac0b9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
59obj-$(CONFIG_COMPAT) += compat.o 59obj-$(CONFIG_COMPAT) += compat.o
60obj-$(CONFIG_CGROUPS) += cgroup.o 60obj-$(CONFIG_CGROUPS) += cgroup.o
61obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 61obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
63obj-$(CONFIG_CPUSETS) += cpuset.o 62obj-$(CONFIG_CPUSETS) += cpuset.o
64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 63obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -87,17 +86,15 @@ obj-$(CONFIG_RELAY) += relay.o
87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
89obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 88obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
90obj-$(CONFIG_MARKERS) += marker.o
91obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 89obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
92obj-$(CONFIG_LATENCYTOP) += latencytop.o 90obj-$(CONFIG_LATENCYTOP) += latencytop.o
93obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
94obj-$(CONFIG_FUNCTION_TRACER) += trace/ 91obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 92obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_X86_DS) += trace/ 93obj-$(CONFIG_X86_DS) += trace/
97obj-$(CONFIG_RING_BUFFER) += trace/ 94obj-$(CONFIG_RING_BUFFER) += trace/
98obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
99obj-$(CONFIG_SLOW_WORK) += slow-work.o 96obj-$(CONFIG_SLOW_WORK) += slow-work.o
100obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 97obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101 98
102ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
103# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3b..5feed232be9d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
855 break; 855 break;
856 } 856 }
857 case AUDIT_SIGNAL_INFO: 857 case AUDIT_SIGNAL_INFO:
858 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); 858 len = 0;
859 if (err) 859 if (audit_sig_sid) {
860 return err; 860 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
861 if (err)
862 return err;
863 }
861 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 864 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
862 if (!sig_data) { 865 if (!sig_data) {
863 security_release_secctx(ctx, len); 866 if (audit_sig_sid)
867 security_release_secctx(ctx, len);
864 return -ENOMEM; 868 return -ENOMEM;
865 } 869 }
866 sig_data->uid = audit_sig_uid; 870 sig_data->uid = audit_sig_uid;
867 sig_data->pid = audit_sig_pid; 871 sig_data->pid = audit_sig_pid;
868 memcpy(sig_data->ctx, ctx, len); 872 if (audit_sig_sid) {
869 security_release_secctx(ctx, len); 873 memcpy(sig_data->ctx, ctx, len);
874 security_release_secctx(ctx, len);
875 }
870 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 876 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
871 0, 0, sig_data, sizeof(*sig_data) + len); 877 0, 0, sig_data, sizeof(*sig_data) + len);
872 kfree(sig_data); 878 kfree(sig_data);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea9..cc7e87936cbc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
45 45
46struct audit_watch { 46struct audit_watch {
47 atomic_t count; /* reference count */ 47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */ 48 dev_t dev; /* associated superblock device */
49 char *path; /* insertion path */
50 unsigned long ino; /* associated inode number */ 50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */ 51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */ 52 struct list_head wlist; /* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd6..267e484f0198 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
168 int in_syscall; /* 1 if task is in a syscall */ 168 int in_syscall; /* 1 if task is in a syscall */
169 enum audit_state state, current_state; 169 enum audit_state state, current_state;
170 unsigned int serial; /* serial number for record */ 170 unsigned int serial; /* serial number for record */
171 struct timespec ctime; /* time of syscall entry */
172 int major; /* syscall number */ 171 int major; /* syscall number */
172 struct timespec ctime; /* time of syscall entry */
173 unsigned long argv[4]; /* syscall arguments */ 173 unsigned long argv[4]; /* syscall arguments */
174 int return_valid; /* return code is valid */
175 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
176 u64 prio; 175 u64 prio;
176 int return_valid; /* return code is valid */
177 int name_count; 177 int name_count;
178 struct audit_names names[AUDIT_NAMES]; 178 struct audit_names names[AUDIT_NAMES];
179 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
198 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count;
202 struct list_head killed_trees; 201 struct list_head killed_trees;
202 int tree_count;
203 203
204 int type; 204 int type;
205 union { 205 union {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c7ece8f027f2..ca83b73fba19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
364 * different cgroups in heirarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -596,8 +702,8 @@ void cgroup_unlock(void)
596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 702static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 703static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 704static int cgroup_populate_dir(struct cgroup *cgrp);
599static struct inode_operations cgroup_dir_inode_operations; 705static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 706static const struct file_operations proc_cgroupstats_operations;
601 707
602static struct backing_dev_info cgroup_backing_dev_info = { 708static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup", 709 .name = "cgroup",
@@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
677 */ 783 */
678 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
679 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
680 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
681 } 793 }
682 iput(inode); 794 iput(inode);
@@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
841 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
842 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
843 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
844 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
845 return 0; 959 return 0;
846} 960}
@@ -849,6 +963,12 @@ struct cgroup_sb_opts {
849 unsigned long subsys_bits; 963 unsigned long subsys_bits;
850 unsigned long flags; 964 unsigned long flags;
851 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
852}; 972};
853 973
854/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
863 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
864#endif 984#endif
865 985
866 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
867 opts->flags = 0;
868 opts->release_agent = NULL;
869 987
870 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
871 if (!*token) 989 if (!*token)
@@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
879 if (!ss->disabled) 997 if (!ss->disabled)
880 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
881 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
882 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
883 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
884 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
885 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
886 if (opts->release_agent) 1007 if (opts->release_agent)
887 return -EINVAL; 1008 return -EINVAL;
888 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
889 if (!opts->release_agent) 1011 if (!opts->release_agent)
890 return -ENOMEM; 1012 return -ENOMEM;
891 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
892 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
893 } else { 1036 } else {
894 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
895 int i; 1038 int i;
@@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
906 } 1049 }
907 } 1050 }
908 1051
1052 /* Consistency checks */
1053
909 /* 1054 /*
910 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
911 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
915 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
916 return -EINVAL; 1061 return -EINVAL;
917 1062
918 /* We can't have an empty hierarchy */ 1063
919 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
920 return -EINVAL; 1073 return -EINVAL;
921 1074
922 return 0; 1075 return 0;
@@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
944 goto out_unlock; 1097 goto out_unlock;
945 } 1098 }
946 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
947 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
948 if (ret) 1107 if (ret)
949 goto out_unlock; 1108 goto out_unlock;
@@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
955 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
956 out_unlock: 1115 out_unlock:
957 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
958 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
959 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
960 unlock_kernel(); 1120 unlock_kernel();
961 return ret; 1121 return ret;
962} 1122}
963 1123
964static struct super_operations cgroup_ops = { 1124static const struct super_operations cgroup_ops = {
965 .statfs = simple_statfs, 1125 .statfs = simple_statfs,
966 .drop_inode = generic_delete_inode, 1126 .drop_inode = generic_delete_inode,
967 .show_options = cgroup_show_options, 1127 .show_options = cgroup_show_options,
@@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
974 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
975 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
976 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
978 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
979} 1139}
1140
980static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
981{ 1142{
982 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
988 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
989} 1150}
990 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
991static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
992{ 1178{
993 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
994 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
995 1181
996 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
997 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
998 return 0; 1184 return 0;
999 1185
1000 /* Next check flags */ 1186 /*
1001 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1002 return 0; 1192 return 0;
1003 1193
1004 return 1; 1194 return 1;
1005} 1195}
1006 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1007static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1008{ 1236{
1009 int ret; 1237 int ret;
1010 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1011 1245
1012 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1013 if (ret) 1247 if (ret)
1014 return ret; 1248 return ret;
1015 1249
1016 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1017 root->sb = sb; 1251 opts->new_root->sb = sb;
1018 1252
1019 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1020 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1051 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1052{ 1286{
1053 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1054 int ret = 0; 1289 int ret = 0;
1055 struct super_block *sb; 1290 struct super_block *sb;
1056 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1057 struct list_head tmp_cg_links;
1058 1292
1059 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1060 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1061 if (ret) { 1295 if (ret)
1062 kfree(opts.release_agent); 1296 goto out_err;
1063 return ret;
1064 }
1065
1066 root = kzalloc(sizeof(*root), GFP_KERNEL);
1067 if (!root) {
1068 kfree(opts.release_agent);
1069 return -ENOMEM;
1070 }
1071 1297
1072 init_cgroup_root(root); 1298 /*
1073 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1074 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1075 if (opts.release_agent) { 1301 */
1076 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1077 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1078 } 1306 }
1307 opts.new_root = new_root;
1079 1308
1080 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1081 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1082 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1083 kfree(root); 1312 ret = PTR_ERR(sb);
1084 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1085 } 1315 }
1086 1316
1087 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1088 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1089 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1090 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1091 root = NULL; 1321 struct list_head tmp_cg_links;
1092 } else {
1093 /* New superblock */
1094 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1095 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1096 int i; 1325 int i;
1097 1326
1098 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1105 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1106 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1107 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1108 /* 1349 /*
1109 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1110 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1123 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1124 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1125 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1126 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1127 } 1369 }
1128 1370
1129 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1155 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1156 1398
1157 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1158 mutex_unlock(&inode->i_mutex);
1159 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1160 } 1408 }
1161 1409
1162 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1163 return 0; 1413 return 0;
1164 1414
1165 free_cg_links:
1166 free_cg_links(&tmp_cg_links);
1167 drop_new_super: 1415 drop_new_super:
1168 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1169 return ret; 1421 return ret;
1170} 1422}
1171 1423
@@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1211 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1212 1464
1213 kill_litter_super(sb); 1465 kill_litter_super(sb);
1214 kfree(root); 1466 cgroup_drop_root(root);
1215} 1467}
1216 1468
1217static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1276 return 0; 1528 return 0;
1277} 1529}
1278 1530
1279/*
1280 * Return the first subsystem attached to a cgroup's hierarchy, and
1281 * its subsystem id.
1282 */
1283
1284static void get_first_subsys(const struct cgroup *cgrp,
1285 struct cgroup_subsys_state **css, int *subsys_id)
1286{
1287 const struct cgroupfs_root *root = cgrp->root;
1288 const struct cgroup_subsys *test_ss;
1289 BUG_ON(list_empty(&root->subsys_list));
1290 test_ss = list_entry(root->subsys_list.next,
1291 struct cgroup_subsys, sibling);
1292 if (css) {
1293 *css = cgrp->subsys[test_ss->subsys_id];
1294 BUG_ON(!*css);
1295 }
1296 if (subsys_id)
1297 *subsys_id = test_ss->subsys_id;
1298}
1299
1300/** 1531/**
1301 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1302 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1313 struct css_set *cg; 1544 struct css_set *cg;
1314 struct css_set *newcg; 1545 struct css_set *newcg;
1315 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1316 int subsys_id;
1317
1318 get_first_subsys(cgrp, NULL, &subsys_id);
1319 1547
1320 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1321 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1322 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1323 return 0; 1551 return 0;
1324 1552
1325 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1326 if (ss->can_attach) { 1554 if (ss->can_attach) {
1327 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1328 if (retval) 1556 if (retval)
1329 return retval; 1557 return retval;
1330 } 1558 }
@@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1362 1590
1363 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1364 if (ss->attach) 1592 if (ss->attach)
1365 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1366 } 1594 }
1367 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1368 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1423 return ret; 1651 return ret;
1424} 1652}
1425 1653
1426/* The various types of files and directories in a cgroup file system */
1427enum cgroup_filetype {
1428 FILE_ROOT,
1429 FILE_DIR,
1430 FILE_TASKLIST,
1431 FILE_NOTIFY_ON_RELEASE,
1432 FILE_RELEASE_AGENT,
1433};
1434
1435/** 1654/**
1436 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1437 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1644,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1644 return single_release(inode, file); 1863 return single_release(inode, file);
1645} 1864}
1646 1865
1647static struct file_operations cgroup_seqfile_operations = { 1866static const struct file_operations cgroup_seqfile_operations = {
1648 .read = seq_read, 1867 .read = seq_read,
1649 .write = cgroup_file_write, 1868 .write = cgroup_file_write,
1650 .llseek = seq_lseek, 1869 .llseek = seq_lseek,
@@ -1703,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1703 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1922 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1704} 1923}
1705 1924
1706static struct file_operations cgroup_file_operations = { 1925static const struct file_operations cgroup_file_operations = {
1707 .read = cgroup_file_read, 1926 .read = cgroup_file_read,
1708 .write = cgroup_file_write, 1927 .write = cgroup_file_write,
1709 .llseek = generic_file_llseek, 1928 .llseek = generic_file_llseek,
@@ -1711,7 +1930,7 @@ static struct file_operations cgroup_file_operations = {
1711 .release = cgroup_file_release, 1930 .release = cgroup_file_release,
1712}; 1931};
1713 1932
1714static struct inode_operations cgroup_dir_inode_operations = { 1933static const struct inode_operations cgroup_dir_inode_operations = {
1715 .lookup = simple_lookup, 1934 .lookup = simple_lookup,
1716 .mkdir = cgroup_mkdir, 1935 .mkdir = cgroup_mkdir,
1717 .rmdir = cgroup_rmdir, 1936 .rmdir = cgroup_rmdir,
@@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1876 * the start of a css_set 2095 * the start of a css_set
1877 */ 2096 */
1878static void cgroup_advance_iter(struct cgroup *cgrp, 2097static void cgroup_advance_iter(struct cgroup *cgrp,
1879 struct cgroup_iter *it) 2098 struct cgroup_iter *it)
1880{ 2099{
1881 struct list_head *l = it->cg_link; 2100 struct list_head *l = it->cg_link;
1882 struct cg_cgroup_link *link; 2101 struct cg_cgroup_link *link;
@@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2129} 2348}
2130 2349
2131/* 2350/*
2132 * Stuff for reading the 'tasks' file. 2351 * Stuff for reading the 'tasks'/'procs' files.
2133 * 2352 *
2134 * Reading this file can return large amounts of data if a cgroup has 2353 * Reading this file can return large amounts of data if a cgroup has
2135 * *lots* of attached tasks. So it may need several calls to read(), 2354 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2139 */ 2358 */
2140 2359
2141/* 2360/*
2142 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2361 * The following two functions "fix" the issue where there are more pids
2143 * 'cgrp'. Return actual number of pids loaded. No need to 2362 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2144 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2363 * TODO: replace with a kernel-wide solution to this problem
2145 * read section, so the css_set can't go away, and is 2364 */
2146 * immutable after creation. 2365#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2366static void *pidlist_allocate(int count)
2367{
2368 if (PIDLIST_TOO_LARGE(count))
2369 return vmalloc(count * sizeof(pid_t));
2370 else
2371 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2372}
2373static void pidlist_free(void *p)
2374{
2375 if (is_vmalloc_addr(p))
2376 vfree(p);
2377 else
2378 kfree(p);
2379}
2380static void *pidlist_resize(void *p, int newcount)
2381{
2382 void *newlist;
2383 /* note: if new alloc fails, old p will still be valid either way */
2384 if (is_vmalloc_addr(p)) {
2385 newlist = vmalloc(newcount * sizeof(pid_t));
2386 if (!newlist)
2387 return NULL;
2388 memcpy(newlist, p, newcount * sizeof(pid_t));
2389 vfree(p);
2390 } else {
2391 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2392 }
2393 return newlist;
2394}
2395
2396/*
2397 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2398 * If the new stripped list is sufficiently smaller and there's enough memory
2399 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2400 * number of unique elements.
2401 */
2402/* is the size difference enough that we should re-allocate the array? */
2403#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2404static int pidlist_uniq(pid_t **p, int length)
2405{
2406 int src, dest = 1;
2407 pid_t *list = *p;
2408 pid_t *newlist;
2409
2410 /*
2411 * we presume the 0th element is unique, so i starts at 1. trivial
2412 * edge cases first; no work needs to be done for either
2413 */
2414 if (length == 0 || length == 1)
2415 return length;
2416 /* src and dest walk down the list; dest counts unique elements */
2417 for (src = 1; src < length; src++) {
2418 /* find next unique element */
2419 while (list[src] == list[src-1]) {
2420 src++;
2421 if (src == length)
2422 goto after;
2423 }
2424 /* dest always points to where the next unique element goes */
2425 list[dest] = list[src];
2426 dest++;
2427 }
2428after:
2429 /*
2430 * if the length difference is large enough, we want to allocate a
2431 * smaller buffer to save memory. if this fails due to out of memory,
2432 * we'll just stay with what we've got.
2433 */
2434 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2435 newlist = pidlist_resize(list, dest);
2436 if (newlist)
2437 *p = newlist;
2438 }
2439 return dest;
2440}
2441
2442static int cmppid(const void *a, const void *b)
2443{
2444 return *(pid_t *)a - *(pid_t *)b;
2445}
2446
2447/*
2448 * find the appropriate pidlist for our purpose (given procs vs tasks)
2449 * returns with the lock on that pidlist already held, and takes care
2450 * of the use count, or returns NULL with no locks held if we're out of
2451 * memory.
2147 */ 2452 */
2148static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2453static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2454 enum cgroup_filetype type)
2149{ 2455{
2150 int n = 0, pid; 2456 struct cgroup_pidlist *l;
2457 /* don't need task_nsproxy() if we're looking at ourself */
2458 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2459 /*
2460 * We can't drop the pidlist_mutex before taking the l->mutex in case
2461 * the last ref-holder is trying to remove l from the list at the same
2462 * time. Holding the pidlist_mutex precludes somebody taking whichever
2463 * list we find out from under us - compare release_pid_array().
2464 */
2465 mutex_lock(&cgrp->pidlist_mutex);
2466 list_for_each_entry(l, &cgrp->pidlists, links) {
2467 if (l->key.type == type && l->key.ns == ns) {
2468 /* found a matching list - drop the extra refcount */
2469 put_pid_ns(ns);
2470 /* make sure l doesn't vanish out from under us */
2471 down_write(&l->mutex);
2472 mutex_unlock(&cgrp->pidlist_mutex);
2473 l->use_count++;
2474 return l;
2475 }
2476 }
2477 /* entry not found; create a new one */
2478 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2479 if (!l) {
2480 mutex_unlock(&cgrp->pidlist_mutex);
2481 put_pid_ns(ns);
2482 return l;
2483 }
2484 init_rwsem(&l->mutex);
2485 down_write(&l->mutex);
2486 l->key.type = type;
2487 l->key.ns = ns;
2488 l->use_count = 0; /* don't increment here */
2489 l->list = NULL;
2490 l->owner = cgrp;
2491 list_add(&l->links, &cgrp->pidlists);
2492 mutex_unlock(&cgrp->pidlist_mutex);
2493 return l;
2494}
2495
2496/*
2497 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2498 */
2499static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2500 struct cgroup_pidlist **lp)
2501{
2502 pid_t *array;
2503 int length;
2504 int pid, n = 0; /* used for populating the array */
2151 struct cgroup_iter it; 2505 struct cgroup_iter it;
2152 struct task_struct *tsk; 2506 struct task_struct *tsk;
2507 struct cgroup_pidlist *l;
2508
2509 /*
2510 * If cgroup gets more users after we read count, we won't have
2511 * enough space - tough. This race is indistinguishable to the
2512 * caller from the case that the additional cgroup users didn't
2513 * show up until sometime later on.
2514 */
2515 length = cgroup_task_count(cgrp);
2516 array = pidlist_allocate(length);
2517 if (!array)
2518 return -ENOMEM;
2519 /* now, populate the array */
2153 cgroup_iter_start(cgrp, &it); 2520 cgroup_iter_start(cgrp, &it);
2154 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2521 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2155 if (unlikely(n == npids)) 2522 if (unlikely(n == length))
2156 break; 2523 break;
2157 pid = task_pid_vnr(tsk); 2524 /* get tgid or pid for procs or tasks file respectively */
2158 if (pid > 0) 2525 if (type == CGROUP_FILE_PROCS)
2159 pidarray[n++] = pid; 2526 pid = task_tgid_vnr(tsk);
2527 else
2528 pid = task_pid_vnr(tsk);
2529 if (pid > 0) /* make sure to only use valid results */
2530 array[n++] = pid;
2160 } 2531 }
2161 cgroup_iter_end(cgrp, &it); 2532 cgroup_iter_end(cgrp, &it);
2162 return n; 2533 length = n;
2534 /* now sort & (if procs) strip out duplicates */
2535 sort(array, length, sizeof(pid_t), cmppid, NULL);
2536 if (type == CGROUP_FILE_PROCS)
2537 length = pidlist_uniq(&array, length);
2538 l = cgroup_pidlist_find(cgrp, type);
2539 if (!l) {
2540 pidlist_free(array);
2541 return -ENOMEM;
2542 }
2543 /* store array, freeing old if necessary - lock already held */
2544 pidlist_free(l->list);
2545 l->list = array;
2546 l->length = length;
2547 l->use_count++;
2548 up_write(&l->mutex);
2549 *lp = l;
2550 return 0;
2163} 2551}
2164 2552
2165/** 2553/**
@@ -2216,37 +2604,14 @@ err:
2216 return ret; 2604 return ret;
2217} 2605}
2218 2606
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228 /* The namepsace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232 /* How many files are using the this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2238static int cmppid(const void *a, const void *b)
2239{
2240 return *(pid_t *)a - *(pid_t *)b;
2241}
2242 2607
2243/* 2608/*
2244 * seq_file methods for the "tasks" file. The seq_file position is the 2609 * seq_file methods for the tasks/procs files. The seq_file position is the
2245 * next pid to display; the seq_file iterator is a pointer to the pid 2610 * next pid to display; the seq_file iterator is a pointer to the pid
2246 * in the cgroup->tasks_pids array. 2611 * in the cgroup->l->list array.
2247 */ 2612 */
2248 2613
2249static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2614static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2250{ 2615{
2251 /* 2616 /*
2252 * Initially we receive a position value that corresponds to 2617 * Initially we receive a position value that corresponds to
@@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2254 * after a seek to the start). Use a binary-search to find the 2619 * after a seek to the start). Use a binary-search to find the
2255 * next pid to display, if any 2620 * next pid to display, if any
2256 */ 2621 */
2257 struct cgroup_pids *cp = s->private; 2622 struct cgroup_pidlist *l = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2259 int index = 0, pid = *pos; 2623 int index = 0, pid = *pos;
2260 int *iter; 2624 int *iter;
2261 2625
2262 down_read(&cgrp->pids_mutex); 2626 down_read(&l->mutex);
2263 if (pid) { 2627 if (pid) {
2264 int end = cp->length; 2628 int end = l->length;
2265 2629
2266 while (index < end) { 2630 while (index < end) {
2267 int mid = (index + end) / 2; 2631 int mid = (index + end) / 2;
2268 if (cp->tasks_pids[mid] == pid) { 2632 if (l->list[mid] == pid) {
2269 index = mid; 2633 index = mid;
2270 break; 2634 break;
2271 } else if (cp->tasks_pids[mid] <= pid) 2635 } else if (l->list[mid] <= pid)
2272 index = mid + 1; 2636 index = mid + 1;
2273 else 2637 else
2274 end = mid; 2638 end = mid;
2275 } 2639 }
2276 } 2640 }
2277 /* If we're off the end of the array, we're done */ 2641 /* If we're off the end of the array, we're done */
2278 if (index >= cp->length) 2642 if (index >= l->length)
2279 return NULL; 2643 return NULL;
2280 /* Update the abstract position to be the actual pid that we found */ 2644 /* Update the abstract position to be the actual pid that we found */
2281 iter = cp->tasks_pids + index; 2645 iter = l->list + index;
2282 *pos = *iter; 2646 *pos = *iter;
2283 return iter; 2647 return iter;
2284} 2648}
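
A standalone sketch of the resume-by-binary-search logic in cgroup_pidlist_start(): given a sorted pid array and the position value saved by seq_file, find the first entry at or after that position. The helper name and array contents are illustrative only:

#include <stdio.h>
#include <sys/types.h>

static const pid_t *pidlist_resume(const pid_t *list, int length, pid_t pos)
{
	int index = 0, end = length;

	if (pos) {
		while (index < end) {
			int mid = (index + end) / 2;

			if (list[mid] == pos) {
				index = mid;
				break;
			} else if (list[mid] <= pos)
				index = mid + 1;
			else
				end = mid;
		}
	}
	if (index >= length)
		return NULL;		/* off the end: iteration is finished */
	return list + index;
}

int main(void)
{
	const pid_t pids[] = { 3, 9, 27, 81 };
	const pid_t *p = pidlist_resume(pids, 4, 10);

	if (p)
		printf("resume at %d\n", (int)*p);	/* prints "resume at 27" */
	return 0;
}
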
2285 2649
2286static void cgroup_tasks_stop(struct seq_file *s, void *v) 2650static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2287{ 2651{
2288 struct cgroup_pids *cp = s->private; 2652 struct cgroup_pidlist *l = s->private;
2289 struct cgroup *cgrp = cp->cgrp; 2653 up_read(&l->mutex);
2290 up_read(&cgrp->pids_mutex);
2291} 2654}
2292 2655
2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2656static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2294{ 2657{
2295 struct cgroup_pids *cp = s->private; 2658 struct cgroup_pidlist *l = s->private;
2296 int *p = v; 2659 pid_t *p = v;
2297 int *end = cp->tasks_pids + cp->length; 2660 pid_t *end = l->list + l->length;
2298
2299 /* 2661 /*
2300 * Advance to the next pid in the array. If this goes off the 2662 * Advance to the next pid in the array. If this goes off the
2301 * end, we're done 2663 * end, we're done
@@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2309 } 2671 }
2310} 2672}
2311 2673
2312static int cgroup_tasks_show(struct seq_file *s, void *v) 2674static int cgroup_pidlist_show(struct seq_file *s, void *v)
2313{ 2675{
2314 return seq_printf(s, "%d\n", *(int *)v); 2676 return seq_printf(s, "%d\n", *(int *)v);
2315} 2677}
2316 2678
2317static struct seq_operations cgroup_tasks_seq_operations = { 2679/*
2318 .start = cgroup_tasks_start, 2680 * seq_operations functions for iterating on pidlists through seq_file -
2319 .stop = cgroup_tasks_stop, 2681 * independent of whether it's tasks or procs
2320 .next = cgroup_tasks_next, 2682 */
2321 .show = cgroup_tasks_show, 2683static const struct seq_operations cgroup_pidlist_seq_operations = {
2684 .start = cgroup_pidlist_start,
2685 .stop = cgroup_pidlist_stop,
2686 .next = cgroup_pidlist_next,
2687 .show = cgroup_pidlist_show,
2322}; 2688};
2323 2689
2324static void release_cgroup_pid_array(struct cgroup_pids *cp) 2690static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2325{ 2691{
2326 struct cgroup *cgrp = cp->cgrp; 2692 /*
2327 2693 * the case where we're the last user of this particular pidlist will
2328 down_write(&cgrp->pids_mutex); 2694 * have us remove it from the cgroup's list, which entails taking the
2329 BUG_ON(!cp->use_count); 2695 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2330 if (!--cp->use_count) { 2696 * pidlist_mutex, we have to take pidlist_mutex first.
2331 list_del(&cp->list); 2697 */
2332 put_pid_ns(cp->ns); 2698 mutex_lock(&l->owner->pidlist_mutex);
2333 kfree(cp->tasks_pids); 2699 down_write(&l->mutex);
2334 kfree(cp); 2700 BUG_ON(!l->use_count);
2701 if (!--l->use_count) {
2702 /* we're the last user if refcount is 0; remove and free */
2703 list_del(&l->links);
2704 mutex_unlock(&l->owner->pidlist_mutex);
2705 pidlist_free(l->list);
2706 put_pid_ns(l->key.ns);
2707 up_write(&l->mutex);
2708 kfree(l);
2709 return;
2335 } 2710 }
2336 up_write(&cgrp->pids_mutex); 2711 mutex_unlock(&l->owner->pidlist_mutex);
2712 up_write(&l->mutex);
2337} 2713}
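
The comment above is about lock ordering: the release path may have to unlink the pidlist from its owning cgroup, so it takes the cgroup-level pidlist_mutex before the per-list lock, the same order the lookup path uses. A rough pthread analogy; the struct names and the single-element list are invented for illustration, not the kernel's types:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct object;

struct owner {
	pthread_mutex_t list_mutex;	/* protects the list of objects */
	struct object *head;
};

struct object {
	pthread_mutex_t lock;		/* protects use_count */
	int use_count;
	struct object *next;
	struct owner *owner;
};

static void object_put(struct object *obj)
{
	struct owner *o = obj->owner;
	int last;

	/* same order as the lookup path: owner's mutex first, then the lock */
	pthread_mutex_lock(&o->list_mutex);
	pthread_mutex_lock(&obj->lock);
	last = (--obj->use_count == 0);
	if (last)
		o->head = obj->next;	/* unlink while both locks are held */
	pthread_mutex_unlock(&obj->lock);
	pthread_mutex_unlock(&o->list_mutex);
	if (last)
		free(obj);
}

int main(void)
{
	struct owner o = { PTHREAD_MUTEX_INITIALIZER, NULL };
	struct object *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return 1;
	pthread_mutex_init(&obj->lock, NULL);
	obj->use_count = 1;
	obj->owner = &o;
	o.head = obj;
	object_put(obj);		/* last reference: unlinks and frees */
	printf("list is now %s\n", o.head ? "non-empty" : "empty");
	return 0;
}
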
2338 2714
2339static int cgroup_tasks_release(struct inode *inode, struct file *file) 2715static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2340{ 2716{
2341 struct seq_file *seq; 2717 struct cgroup_pidlist *l;
2342 struct cgroup_pids *cp;
2343
2344 if (!(file->f_mode & FMODE_READ)) 2718 if (!(file->f_mode & FMODE_READ))
2345 return 0; 2719 return 0;
2346 2720 /*
2347 seq = file->private_data; 2721 * the seq_file will only be initialized if the file was opened for
2348 cp = seq->private; 2722 * reading; hence we check if it's not null only in that case.
2349 2723 */
2350 release_cgroup_pid_array(cp); 2724 l = ((struct seq_file *)file->private_data)->private;
2725 cgroup_release_pid_array(l);
2351 return seq_release(inode, file); 2726 return seq_release(inode, file);
2352} 2727}
2353 2728
2354static struct file_operations cgroup_tasks_operations = { 2729static const struct file_operations cgroup_pidlist_operations = {
2355 .read = seq_read, 2730 .read = seq_read,
2356 .llseek = seq_lseek, 2731 .llseek = seq_lseek,
2357 .write = cgroup_file_write, 2732 .write = cgroup_file_write,
2358 .release = cgroup_tasks_release, 2733 .release = cgroup_pidlist_release,
2359}; 2734};
2360 2735
2361/* 2736/*
2362 * Handle an open on 'tasks' file. Prepare an array containing the 2737 * The following functions handle opens on a file that displays a pidlist
2363 * process id's of tasks currently attached to the cgroup being opened. 2738 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2739 * in the cgroup.
2364 */ 2740 */
2365 2741/* helper function for the two below it */
2366static int cgroup_tasks_open(struct inode *unused, struct file *file) 2742static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2367{ 2743{
2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2744 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns; 2745 struct cgroup_pidlist *l;
2370 struct cgroup_pids *cp;
2371 pid_t *pidarray;
2372 int npids;
2373 int retval; 2746 int retval;
2374 2747
2375 /* Nothing to do for write-only files */ 2748 /* Nothing to do for write-only files */
2376 if (!(file->f_mode & FMODE_READ)) 2749 if (!(file->f_mode & FMODE_READ))
2377 return 0; 2750 return 0;
2378 2751
2379 /* 2752 /* have the array populated */
2380 * If cgroup gets more users after we read count, we won't have 2753 retval = pidlist_array_load(cgrp, type, &l);
2381 * enough space - tough. This race is indistinguishable to the 2754 if (retval)
2382 * caller from the case that the additional cgroup users didn't 2755 return retval;
2383 * show up until sometime later on. 2756 /* configure file information */
2384 */ 2757 file->f_op = &cgroup_pidlist_operations;
2385 npids = cgroup_task_count(cgrp);
2386 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2387 if (!pidarray)
2388 return -ENOMEM;
2389 npids = pid_array_load(pidarray, npids, cgrp);
2390 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2391
2392 /*
2393 * Store the array in the cgroup, freeing the old
2394 * array if necessary
2395 */
2396 down_write(&cgrp->pids_mutex);
2397
2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2399 if (ns == cp->ns)
2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2418 up_write(&cgrp->pids_mutex);
2419
2420 file->f_op = &cgroup_tasks_operations;
2421 2758
2422 retval = seq_open(file, &cgroup_tasks_seq_operations); 2759 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2423 if (retval) { 2760 if (retval) {
2424 release_cgroup_pid_array(cp); 2761 cgroup_release_pid_array(l);
2425 return retval; 2762 return retval;
2426 } 2763 }
2427 ((struct seq_file *)file->private_data)->private = cp; 2764 ((struct seq_file *)file->private_data)->private = l;
2428 return 0; 2765 return 0;
2429} 2766}
2767static int cgroup_tasks_open(struct inode *unused, struct file *file)
2768{
2769 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2770}
2771static int cgroup_procs_open(struct inode *unused, struct file *file)
2772{
2773 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2774}
2430 2775
2431static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2776static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2432 struct cftype *cft) 2777 struct cftype *cft)
@@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2449/* 2794/*
2450 * for the common functions, 'private' gives the type of file 2795 * for the common functions, 'private' gives the type of file
2451 */ 2796 */
2797/* for hysterical raisins, we can't put this on the older files */
2798#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2452static struct cftype files[] = { 2799static struct cftype files[] = {
2453 { 2800 {
2454 .name = "tasks", 2801 .name = "tasks",
2455 .open = cgroup_tasks_open, 2802 .open = cgroup_tasks_open,
2456 .write_u64 = cgroup_tasks_write, 2803 .write_u64 = cgroup_tasks_write,
2457 .release = cgroup_tasks_release, 2804 .release = cgroup_pidlist_release,
2458 .private = FILE_TASKLIST,
2459 .mode = S_IRUGO | S_IWUSR, 2805 .mode = S_IRUGO | S_IWUSR,
2460 }, 2806 },
2461 2807 {
2808 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2809 .open = cgroup_procs_open,
2810 /* .write_u64 = cgroup_procs_write, TODO */
2811 .release = cgroup_pidlist_release,
2812 .mode = S_IRUGO,
2813 },
2462 { 2814 {
2463 .name = "notify_on_release", 2815 .name = "notify_on_release",
2464 .read_u64 = cgroup_read_notify_on_release, 2816 .read_u64 = cgroup_read_notify_on_release,
2465 .write_u64 = cgroup_write_notify_on_release, 2817 .write_u64 = cgroup_write_notify_on_release,
2466 .private = FILE_NOTIFY_ON_RELEASE,
2467 }, 2818 },
2468}; 2819};
2469 2820
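
Both entries are read through the same pidlist machinery; "tasks" lists thread ids while "cgroup.procs" lists each thread-group id once. A small reader sketch; the mount point below is an assumption, since the cgroup filesystem can be mounted anywhere:

#include <stdio.h>

int main(void)
{
	/* "tasks" lists TIDs; "cgroup.procs" lists unique TGIDs */
	const char *path = "/sys/fs/cgroup/cgroup.procs";	/* assumed mount */
	char line[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		printf("member: %s", line);
	fclose(f);
	return 0;
}
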
@@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = {
2472 .read_seq_string = cgroup_release_agent_show, 2823 .read_seq_string = cgroup_release_agent_show,
2473 .write_string = cgroup_release_agent_write, 2824 .write_string = cgroup_release_agent_write,
2474 .max_write_len = PATH_MAX, 2825 .max_write_len = PATH_MAX,
2475 .private = FILE_RELEASE_AGENT,
2476}; 2826};
2477 2827
2478static int cgroup_populate_dir(struct cgroup *cgrp) 2828static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void)
2879 init_task.cgroups = &init_css_set; 3229 init_task.cgroups = &init_css_set;
2880 3230
2881 init_css_set_link.cg = &init_css_set; 3231 init_css_set_link.cg = &init_css_set;
3232 init_css_set_link.cgrp = dummytop;
2882 list_add(&init_css_set_link.cgrp_link_list, 3233 list_add(&init_css_set_link.cgrp_link_list,
2883 &rootnode.top_cgroup.css_sets); 3234 &rootnode.top_cgroup.css_sets);
2884 list_add(&init_css_set_link.cg_link_list, 3235 list_add(&init_css_set_link.cg_link_list,
@@ -2933,7 +3284,7 @@ int __init cgroup_init(void)
2933 /* Add init_css_set to the hash table */ 3284 /* Add init_css_set to the hash table */
2934 hhead = css_set_hash(init_css_set.subsys); 3285 hhead = css_set_hash(init_css_set.subsys);
2935 hlist_add_head(&init_css_set.hlist, hhead); 3286 hlist_add_head(&init_css_set.hlist, hhead);
2936 3287 BUG_ON(!init_root_id(&rootnode));
2937 err = register_filesystem(&cgroup_fs_type); 3288 err = register_filesystem(&cgroup_fs_type);
2938 if (err < 0) 3289 if (err < 0)
2939 goto out; 3290 goto out;
@@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2986 for_each_active_root(root) { 3337 for_each_active_root(root) {
2987 struct cgroup_subsys *ss; 3338 struct cgroup_subsys *ss;
2988 struct cgroup *cgrp; 3339 struct cgroup *cgrp;
2989 int subsys_id;
2990 int count = 0; 3340 int count = 0;
2991 3341
2992 seq_printf(m, "%lu:", root->subsys_bits); 3342 seq_printf(m, "%d:", root->hierarchy_id);
2993 for_each_subsys(root, ss) 3343 for_each_subsys(root, ss)
2994 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3344 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3345 if (strlen(root->name))
3346 seq_printf(m, "%sname=%s", count ? "," : "",
3347 root->name);
2995 seq_putc(m, ':'); 3348 seq_putc(m, ':');
2996 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3349 cgrp = task_cgroup_from_root(tsk, root);
2997 cgrp = task_cgroup(tsk, subsys_id);
2998 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3350 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2999 if (retval < 0) 3351 if (retval < 0)
3000 goto out_unlock; 3352 goto out_unlock;
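
With this change each /proc/<pid>/cgroup line carries the hierarchy id, the comma-separated subsystem names (plus an optional name= tag), and the cgroup path. A sketch of splitting such a line; the sample text is invented:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[] = "4:cpuset,cpu,cpuacct:/my/group\n";
	char *id = strtok(line, ":");
	char *controllers = strtok(NULL, ":");
	char *path = strtok(NULL, "\n");

	printf("hierarchy %s, controllers '%s', path '%s'\n",
	       id, controllers, path);
	return 0;
}
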
@@ -3017,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
3017 return single_open(file, proc_cgroup_show, pid); 3369 return single_open(file, proc_cgroup_show, pid);
3018} 3370}
3019 3371
3020struct file_operations proc_cgroup_operations = { 3372const struct file_operations proc_cgroup_operations = {
3021 .open = cgroup_open, 3373 .open = cgroup_open,
3022 .read = seq_read, 3374 .read = seq_read,
3023 .llseek = seq_lseek, 3375 .llseek = seq_lseek,
@@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3033 mutex_lock(&cgroup_mutex); 3385 mutex_lock(&cgroup_mutex);
3034 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3035 struct cgroup_subsys *ss = subsys[i]; 3387 struct cgroup_subsys *ss = subsys[i];
3036 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3388 seq_printf(m, "%s\t%d\t%d\t%d\n",
3037 ss->name, ss->root->subsys_bits, 3389 ss->name, ss->root->hierarchy_id,
3038 ss->root->number_of_cgroups, !ss->disabled); 3390 ss->root->number_of_cgroups, !ss->disabled);
3039 } 3391 }
3040 mutex_unlock(&cgroup_mutex); 3392 mutex_unlock(&cgroup_mutex);
@@ -3046,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
3046 return single_open(file, proc_cgroupstats_show, NULL); 3398 return single_open(file, proc_cgroupstats_show, NULL);
3047} 3399}
3048 3400
3049static struct file_operations proc_cgroupstats_operations = { 3401static const struct file_operations proc_cgroupstats_operations = {
3050 .open = cgroupstats_open, 3402 .open = cgroupstats_open,
3051 .read = seq_read, 3403 .read = seq_read,
3052 .llseek = seq_lseek, 3404 .llseek = seq_lseek,
@@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3320{ 3672{
3321 int ret; 3673 int ret;
3322 struct cgroup *target; 3674 struct cgroup *target;
3323 int subsys_id;
3324 3675
3325 if (cgrp == dummytop) 3676 if (cgrp == dummytop)
3326 return 1; 3677 return 1;
3327 3678
3328 get_first_subsys(cgrp, NULL, &subsys_id); 3679 target = task_cgroup_from_root(task, cgrp->root);
3329 target = task_cgroup(task, subsys_id);
3330 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3680 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3331 cgrp = cgrp->parent; 3681 cgrp = cgrp->parent;
3332 ret = (cgrp == target); 3682 ret = (cgrp == target);
@@ -3358,8 +3708,10 @@ static void check_for_release(struct cgroup *cgrp)
3358void __css_put(struct cgroup_subsys_state *css) 3708void __css_put(struct cgroup_subsys_state *css)
3359{ 3709{
3360 struct cgroup *cgrp = css->cgroup; 3710 struct cgroup *cgrp = css->cgroup;
3711 int val;
3361 rcu_read_lock(); 3712 rcu_read_lock();
3362 if (atomic_dec_return(&css->refcnt) == 1) { 3713 val = atomic_dec_return(&css->refcnt);
3714 if (val == 1) {
3363 if (notify_on_release(cgrp)) { 3715 if (notify_on_release(cgrp)) {
3364 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3716 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3365 check_for_release(cgrp); 3717 check_for_release(cgrp);
@@ -3367,6 +3719,7 @@ void __css_put(struct cgroup_subsys_state *css)
3367 cgroup_wakeup_rmdir_waiter(cgrp); 3719 cgroup_wakeup_rmdir_waiter(cgrp);
3368 } 3720 }
3369 rcu_read_unlock(); 3721 rcu_read_unlock();
3722 WARN_ON_ONCE(val < 1);
3370} 3723}
3371 3724
3372/* 3725/*
@@ -3693,3 +4046,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3693 return ret; 4046 return ret;
3694} 4047}
3695 4048
4049#ifdef CONFIG_CGROUP_DEBUG
4050static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4051 struct cgroup *cont)
4052{
4053 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4054
4055 if (!css)
4056 return ERR_PTR(-ENOMEM);
4057
4058 return css;
4059}
4060
4061static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4062{
4063 kfree(cont->subsys[debug_subsys_id]);
4064}
4065
4066static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4067{
4068 return atomic_read(&cont->count);
4069}
4070
4071static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4072{
4073 return cgroup_task_count(cont);
4074}
4075
4076static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4077{
4078 return (u64)(unsigned long)current->cgroups;
4079}
4080
4081static u64 current_css_set_refcount_read(struct cgroup *cont,
4082 struct cftype *cft)
4083{
4084 u64 count;
4085
4086 rcu_read_lock();
4087 count = atomic_read(&current->cgroups->refcount);
4088 rcu_read_unlock();
4089 return count;
4090}
4091
4092static int current_css_set_cg_links_read(struct cgroup *cont,
4093 struct cftype *cft,
4094 struct seq_file *seq)
4095{
4096 struct cg_cgroup_link *link;
4097 struct css_set *cg;
4098
4099 read_lock(&css_set_lock);
4100 rcu_read_lock();
4101 cg = rcu_dereference(current->cgroups);
4102 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4103 struct cgroup *c = link->cgrp;
4104 const char *name;
4105
4106 if (c->dentry)
4107 name = c->dentry->d_name.name;
4108 else
4109 name = "?";
4110 seq_printf(seq, "Root %d group %s\n",
4111 c->root->hierarchy_id, name);
4112 }
4113 rcu_read_unlock();
4114 read_unlock(&css_set_lock);
4115 return 0;
4116}
4117
4118#define MAX_TASKS_SHOWN_PER_CSS 25
4119static int cgroup_css_links_read(struct cgroup *cont,
4120 struct cftype *cft,
4121 struct seq_file *seq)
4122{
4123 struct cg_cgroup_link *link;
4124
4125 read_lock(&css_set_lock);
4126 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4127 struct css_set *cg = link->cg;
4128 struct task_struct *task;
4129 int count = 0;
4130 seq_printf(seq, "css_set %p\n", cg);
4131 list_for_each_entry(task, &cg->tasks, cg_list) {
4132 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4133 seq_puts(seq, " ...\n");
4134 break;
4135 } else {
4136 seq_printf(seq, " task %d\n",
4137 task_pid_vnr(task));
4138 }
4139 }
4140 }
4141 read_unlock(&css_set_lock);
4142 return 0;
4143}
4144
4145static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4146{
4147 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4148}
4149
4150static struct cftype debug_files[] = {
4151 {
4152 .name = "cgroup_refcount",
4153 .read_u64 = cgroup_refcount_read,
4154 },
4155 {
4156 .name = "taskcount",
4157 .read_u64 = debug_taskcount_read,
4158 },
4159
4160 {
4161 .name = "current_css_set",
4162 .read_u64 = current_css_set_read,
4163 },
4164
4165 {
4166 .name = "current_css_set_refcount",
4167 .read_u64 = current_css_set_refcount_read,
4168 },
4169
4170 {
4171 .name = "current_css_set_cg_links",
4172 .read_seq_string = current_css_set_cg_links_read,
4173 },
4174
4175 {
4176 .name = "cgroup_css_links",
4177 .read_seq_string = cgroup_css_links_read,
4178 },
4179
4180 {
4181 .name = "releasable",
4182 .read_u64 = releasable_read,
4183 },
4184};
4185
4186static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4187{
4188 return cgroup_add_files(cont, ss, debug_files,
4189 ARRAY_SIZE(debug_files));
4190}
4191
4192struct cgroup_subsys debug_subsys = {
4193 .name = "debug",
4194 .create = debug_create,
4195 .destroy = debug_destroy,
4196 .populate = debug_populate,
4197 .subsys_id = debug_subsys_id,
4198};
4199#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..b5cb469d2545 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1324static cpumask_var_t cpus_attach;
1325 1325
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1327static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1328 struct task_struct *tsk, bool threadgroup)
1329{ 1329{
1330 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1331 struct cpuset *cs = cgroup_cs(cont);
1331 1332
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1333 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1344 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1345 return -EINVAL;
1345 1346
1346 return security_task_setscheduler(tsk, 0, NULL); 1347 ret = security_task_setscheduler(tsk, 0, NULL);
1348 if (ret)
1349 return ret;
1350 if (threadgroup) {
1351 struct task_struct *c;
1352
1353 rcu_read_lock();
1354 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1355 ret = security_task_setscheduler(c, 0, NULL);
1356 if (ret) {
1357 rcu_read_unlock();
1358 return ret;
1359 }
1360 }
1361 rcu_read_unlock();
1362 }
1363 return 0;
1364}
1365
1366static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1367 struct cpuset *cs)
1368{
1369 int err;
1370 /*
1371 * can_attach beforehand should guarantee that this doesn't fail.
1372 * TODO: have a better way to handle failure here
1373 */
1374 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1375 WARN_ON_ONCE(err);
1376
1377 task_lock(tsk);
1378 cpuset_change_task_nodemask(tsk, to);
1379 task_unlock(tsk);
1380 cpuset_update_task_spread_flag(cs, tsk);
1381
1347} 1382}
1348 1383
1349static void cpuset_attach(struct cgroup_subsys *ss, 1384static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1350 struct cgroup *cont, struct cgroup *oldcont, 1385 struct cgroup *oldcont, struct task_struct *tsk,
1351 struct task_struct *tsk) 1386 bool threadgroup)
1352{ 1387{
1353 nodemask_t from, to; 1388 nodemask_t from, to;
1354 struct mm_struct *mm; 1389 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1390 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1391 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1392
1359 if (cs == &top_cpuset) { 1393 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1394 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1397 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1398 guarantee_online_mems(cs, &to);
1365 } 1399 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1400
1370 task_lock(tsk); 1401 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1402 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1403 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1404 struct task_struct *c;
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 cpuset_attach_task(c, &to, cs);
1408 }
1409 rcu_read_unlock();
1410 }
1374 1411
1412 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1413 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1414 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1415 mm = get_task_mm(tsk);
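
can_attach() and attach() now take a threadgroup flag and walk tsk->thread_group under rcu_read_lock(), checking every thread before any of them is moved. The same check-everything-then-apply-everything shape, reduced to a userspace sketch with plain integers standing in for tasks:

#include <stdio.h>

static int can_attach_one(int tid)
{
	return tid > 0 ? 0 : -1;	/* stand-in permission check */
}

static void attach_one(int tid)
{
	printf("attached %d\n", tid);
}

static int attach_group(const int *tids, int n)
{
	int i;

	for (i = 0; i < n; i++)		/* phase 1: may still refuse */
		if (can_attach_one(tids[i]))
			return -1;
	for (i = 0; i < n; i++)		/* phase 2: must not fail */
		attach_one(tids[i]);
	return 0;
}

int main(void)
{
	const int group[] = { 101, 102, 103 };

	return attach_group(group, 3) ? 1 : 0;
}
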
diff --git a/kernel/cred.c b/kernel/cred.c
index d7f7a01082eb..dd76cfe5f5b0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as);
782 782
783#ifdef CONFIG_DEBUG_CREDENTIALS 783#ifdef CONFIG_DEBUG_CREDENTIALS
784 784
785bool creds_are_invalid(const struct cred *cred)
786{
787 if (cred->magic != CRED_MAGIC)
788 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE)
794 return true;
795 if ((*(u32 *)cred->security & 0xffffff00) ==
796 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
797 return true;
798 }
799#endif
800 return false;
801}
802EXPORT_SYMBOL(creds_are_invalid);
803
785/* 804/*
786 * dump invalid credentials 805 * dump invalid credentials
787 */ 806 */
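
creds_are_invalid() combines a magic-number check, a usage-versus-subscribers sanity check, and a scan for the slab poison pattern in the security blob. A userspace analogue; the magic and poison constants here are illustrative stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OBJ_MAGIC   0x43736564u
#define POISON_BYTE 0x6bu	/* stand-in poison byte */

struct obj {
	uint32_t magic;
	int usage;
	int subscribers;
	uint32_t security;
};

static bool obj_is_invalid(const struct obj *o)
{
	uint32_t poison = POISON_BYTE << 24 | POISON_BYTE << 16 |
			  POISON_BYTE << 8;

	if (o->magic != OBJ_MAGIC)
		return true;
	if (o->usage < o->subscribers)
		return true;
	if ((o->security & 0xffffff00u) == poison)
		return true;
	return false;
}

int main(void)
{
	struct obj good = { OBJ_MAGIC, 2, 1, 0x1234 };

	printf("invalid? %d\n", obj_is_invalid(&good));	/* prints 0 */
	return 0;
}
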
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index ae5d8660ddff..e61891f80123 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,7 @@
47#include <linux/tracehook.h> 47#include <linux/tracehook.h>
48#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_counter.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 154{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 156
157#ifdef CONFIG_PERF_COUNTERS 157#ifdef CONFIG_PERF_EVENTS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp); 158 WARN_ON_ONCE(tsk->perf_event_ctxp);
159#endif 159#endif
160 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
161 put_task_struct(tsk); 161 put_task_struct(tsk);
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
359{ 359{
360 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
361 361
362 if (task_session(curr) != pid) 362 if (task_session(curr) != pid) {
363 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
364 366
365 if (task_pgrp(curr) != pid) 367 if (task_pgrp(curr) != pid)
366 change_pid(curr, PIDTYPE_PGID, pid); 368 change_pid(curr, PIDTYPE_PGID, pid);
@@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code)
945 if (group_dead) { 947 if (group_dead) {
946 hrtimer_cancel(&tsk->signal->real_timer); 948 hrtimer_cancel(&tsk->signal->real_timer);
947 exit_itimers(tsk->signal); 949 exit_itimers(tsk->signal);
950 if (tsk->mm)
951 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
948 } 952 }
949 acct_collect(code, group_dead); 953 acct_collect(code, group_dead);
950 if (group_dead) 954 if (group_dead)
@@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code)
972 disassociate_ctty(1); 976 disassociate_ctty(1);
973 977
974 module_put(task_thread_info(tsk)->exec_domain->module); 978 module_put(task_thread_info(tsk)->exec_domain->module);
975 if (tsk->binfmt)
976 module_put(tsk->binfmt->module);
977 979
978 proc_exit_connector(tsk); 980 proc_exit_connector(tsk);
979 981
@@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code)
981 * Flush inherited counters to the parent - before the parent 983 * Flush inherited counters to the parent - before the parent
982 * gets woken up by child-exit notifications. 984 * gets woken up by child-exit notifications.
983 */ 985 */
984 perf_counter_exit_task(tsk); 986 perf_event_exit_task(tsk);
985 987
986 exit_notify(tsk, group_dead); 988 exit_notify(tsk, group_dead);
987#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
@@ -989,8 +991,6 @@ NORET_TYPE void do_exit(long code)
989 tsk->mempolicy = NULL; 991 tsk->mempolicy = NULL;
990#endif 992#endif
991#ifdef CONFIG_FUTEX 993#ifdef CONFIG_FUTEX
992 if (unlikely(!list_empty(&tsk->pi_state_list)))
993 exit_pi_state_list(tsk);
994 if (unlikely(current->pi_state_cache)) 994 if (unlikely(current->pi_state_cache))
995 kfree(current->pi_state_cache); 995 kfree(current->pi_state_cache);
996#endif 996#endif
@@ -1093,28 +1093,28 @@ struct wait_opts {
1093 int __user *wo_stat; 1093 int __user *wo_stat;
1094 struct rusage __user *wo_rusage; 1094 struct rusage __user *wo_rusage;
1095 1095
1096 wait_queue_t child_wait;
1096 int notask_error; 1097 int notask_error;
1097}; 1098};
1098 1099
1099static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1100static inline
1101struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1100{ 1102{
1101 struct pid *pid = NULL; 1103 if (type != PIDTYPE_PID)
1102 if (type == PIDTYPE_PID) 1104 task = task->group_leader;
1103 pid = task->pids[type].pid; 1105 return task->pids[type].pid;
1104 else if (type < PIDTYPE_MAX)
1105 pid = task->group_leader->pids[type].pid;
1106 return pid;
1107} 1106}
1108 1107
1109static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1108static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1110{ 1109{
1111 int err; 1110 return wo->wo_type == PIDTYPE_MAX ||
1112 1111 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1113 if (wo->wo_type < PIDTYPE_MAX) { 1112}
1114 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1115 return 0;
1116 }
1117 1113
1114static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1115{
1116 if (!eligible_pid(wo, p))
1117 return 0;
1118 /* Wait for all children (clone and not) if __WALL is set; 1118 /* Wait for all children (clone and not) if __WALL is set;
1119 * otherwise, wait for clone children *only* if __WCLONE is 1119 * otherwise, wait for clone children *only* if __WCLONE is
1120 * set; otherwise, wait for non-clone children *only*. (Note: 1120 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1124,10 +1124,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1124 && !(wo->wo_flags & __WALL)) 1124 && !(wo->wo_flags & __WALL))
1125 return 0; 1125 return 0;
1126 1126
1127 err = security_task_wait(p);
1128 if (err)
1129 return err;
1130
1131 return 1; 1127 return 1;
1132} 1128}
1133 1129
@@ -1140,18 +1136,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1140 1136
1141 put_task_struct(p); 1137 put_task_struct(p);
1142 infop = wo->wo_info; 1138 infop = wo->wo_info;
1143 if (!retval) 1139 if (infop) {
1144 retval = put_user(SIGCHLD, &infop->si_signo); 1140 if (!retval)
1145 if (!retval) 1141 retval = put_user(SIGCHLD, &infop->si_signo);
1146 retval = put_user(0, &infop->si_errno); 1142 if (!retval)
1147 if (!retval) 1143 retval = put_user(0, &infop->si_errno);
1148 retval = put_user((short)why, &infop->si_code); 1144 if (!retval)
1149 if (!retval) 1145 retval = put_user((short)why, &infop->si_code);
1150 retval = put_user(pid, &infop->si_pid); 1146 if (!retval)
1151 if (!retval) 1147 retval = put_user(pid, &infop->si_pid);
1152 retval = put_user(uid, &infop->si_uid); 1148 if (!retval)
1153 if (!retval) 1149 retval = put_user(uid, &infop->si_uid);
1154 retval = put_user(status, &infop->si_status); 1150 if (!retval)
1151 retval = put_user(status, &infop->si_status);
1152 }
1155 if (!retval) 1153 if (!retval)
1156 retval = pid; 1154 retval = pid;
1157 return retval; 1155 return retval;
@@ -1208,6 +1206,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1208 if (likely(!traced) && likely(!task_detached(p))) { 1206 if (likely(!traced) && likely(!task_detached(p))) {
1209 struct signal_struct *psig; 1207 struct signal_struct *psig;
1210 struct signal_struct *sig; 1208 struct signal_struct *sig;
1209 unsigned long maxrss;
1211 1210
1212 /* 1211 /*
1213 * The resource counters for the group leader are in its 1212 * The resource counters for the group leader are in its
@@ -1256,6 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1256 psig->coublock += 1255 psig->coublock +=
1257 task_io_get_oublock(p) + 1256 task_io_get_oublock(p) +
1258 sig->oublock + sig->coublock; 1257 sig->oublock + sig->coublock;
1258 maxrss = max(sig->maxrss, sig->cmaxrss);
1259 if (psig->cmaxrss < maxrss)
1260 psig->cmaxrss = maxrss;
1259 task_io_accounting_add(&psig->ioac, &p->ioac); 1261 task_io_accounting_add(&psig->ioac, &p->ioac);
1260 task_io_accounting_add(&psig->ioac, &sig->ioac); 1262 task_io_accounting_add(&psig->ioac, &sig->ioac);
1261 spin_unlock_irq(&p->real_parent->sighand->siglock); 1263 spin_unlock_irq(&p->real_parent->sighand->siglock);
@@ -1477,13 +1479,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1477 * then ->notask_error is 0 if @p is an eligible child, 1479 * then ->notask_error is 0 if @p is an eligible child,
1478 * or another error from security_task_wait(), or still -ECHILD. 1480 * or another error from security_task_wait(), or still -ECHILD.
1479 */ 1481 */
1480static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1482static int wait_consider_task(struct wait_opts *wo, int ptrace,
1481 int ptrace, struct task_struct *p) 1483 struct task_struct *p)
1482{ 1484{
1483 int ret = eligible_child(wo, p); 1485 int ret = eligible_child(wo, p);
1484 if (!ret) 1486 if (!ret)
1485 return ret; 1487 return ret;
1486 1488
1489 ret = security_task_wait(p);
1487 if (unlikely(ret < 0)) { 1490 if (unlikely(ret < 0)) {
1488 /* 1491 /*
1489 * If we have not yet seen any eligible child, 1492 * If we have not yet seen any eligible child,
@@ -1545,7 +1548,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1545 * Do not consider detached threads. 1548 * Do not consider detached threads.
1546 */ 1549 */
1547 if (!task_detached(p)) { 1550 if (!task_detached(p)) {
1548 int ret = wait_consider_task(wo, tsk, 0, p); 1551 int ret = wait_consider_task(wo, 0, p);
1549 if (ret) 1552 if (ret)
1550 return ret; 1553 return ret;
1551 } 1554 }
@@ -1559,7 +1562,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1559 struct task_struct *p; 1562 struct task_struct *p;
1560 1563
1561 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1564 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1562 int ret = wait_consider_task(wo, tsk, 1, p); 1565 int ret = wait_consider_task(wo, 1, p);
1563 if (ret) 1566 if (ret)
1564 return ret; 1567 return ret;
1565 } 1568 }
@@ -1567,15 +1570,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1567 return 0; 1570 return 0;
1568} 1571}
1569 1572
1573static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1574 int sync, void *key)
1575{
1576 struct wait_opts *wo = container_of(wait, struct wait_opts,
1577 child_wait);
1578 struct task_struct *p = key;
1579
1580 if (!eligible_pid(wo, p))
1581 return 0;
1582
1583 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1584 return 0;
1585
1586 return default_wake_function(wait, mode, sync, key);
1587}
1588
1589void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1590{
1591 __wake_up_sync_key(&parent->signal->wait_chldexit,
1592 TASK_INTERRUPTIBLE, 1, p);
1593}
1594
1570static long do_wait(struct wait_opts *wo) 1595static long do_wait(struct wait_opts *wo)
1571{ 1596{
1572 DECLARE_WAITQUEUE(wait, current);
1573 struct task_struct *tsk; 1597 struct task_struct *tsk;
1574 int retval; 1598 int retval;
1575 1599
1576 trace_sched_process_wait(wo->wo_pid); 1600 trace_sched_process_wait(wo->wo_pid);
1577 1601
1578 add_wait_queue(&current->signal->wait_chldexit,&wait); 1602 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1603 wo->child_wait.private = current;
1604 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1579repeat: 1605repeat:
1580 /* 1606 /*
1581 * If there is nothing that can match our criteria just get out. 1607
@@ -1616,32 +1642,7 @@ notask:
1616 } 1642 }
1617end: 1643end:
1618 __set_current_state(TASK_RUNNING); 1644 __set_current_state(TASK_RUNNING);
1619 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1645 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1620 if (wo->wo_info) {
1621 struct siginfo __user *infop = wo->wo_info;
1622
1623 if (retval > 0)
1624 retval = 0;
1625 else {
1626 /*
1627 * For a WNOHANG return, clear out all the fields
1628 * we would set so the user can easily tell the
1629 * difference.
1630 */
1631 if (!retval)
1632 retval = put_user(0, &infop->si_signo);
1633 if (!retval)
1634 retval = put_user(0, &infop->si_errno);
1635 if (!retval)
1636 retval = put_user(0, &infop->si_code);
1637 if (!retval)
1638 retval = put_user(0, &infop->si_pid);
1639 if (!retval)
1640 retval = put_user(0, &infop->si_uid);
1641 if (!retval)
1642 retval = put_user(0, &infop->si_status);
1643 }
1644 }
1645 return retval; 1646 return retval;
1646} 1647}
1647 1648
@@ -1686,6 +1687,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1686 wo.wo_stat = NULL; 1687 wo.wo_stat = NULL;
1687 wo.wo_rusage = ru; 1688 wo.wo_rusage = ru;
1688 ret = do_wait(&wo); 1689 ret = do_wait(&wo);
1690
1691 if (ret > 0) {
1692 ret = 0;
1693 } else if (infop) {
1694 /*
1695 * For a WNOHANG return, clear out all the fields
1696 * we would set so the user can easily tell the
1697 * difference.
1698 */
1699 if (!ret)
1700 ret = put_user(0, &infop->si_signo);
1701 if (!ret)
1702 ret = put_user(0, &infop->si_errno);
1703 if (!ret)
1704 ret = put_user(0, &infop->si_code);
1705 if (!ret)
1706 ret = put_user(0, &infop->si_pid);
1707 if (!ret)
1708 ret = put_user(0, &infop->si_uid);
1709 if (!ret)
1710 ret = put_user(0, &infop->si_status);
1711 }
1712
1689 put_pid(pid); 1713 put_pid(pid);
1690 1714
1691 /* avoid REGPARM breakage on x86: */ 1715 /* avoid REGPARM breakage on x86: */
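
The field clearing moved here keeps the documented waitid() contract: a WNOHANG call that finds no exited child returns 0 with si_pid (and the related fields) zeroed, so callers can distinguish that from a real reap; zeroing the struct before the call keeps the check portable. A small userspace illustration:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;

	if (fork() == 0) {
		sleep(1);		/* child still running at first check */
		_exit(0);
	}

	memset(&info, 0, sizeof(info));
	if (waitid(P_ALL, 0, &info, WEXITED | WNOHANG) == 0) {
		if (info.si_pid == 0)
			printf("no child has exited yet\n");
		else
			printf("reaped pid %d\n", (int)info.si_pid);
	}

	if (waitid(P_ALL, 0, &info, WEXITED) == 0)	/* blocking wait */
		printf("reaped pid %d\n", (int)info.si_pid);
	return 0;
}
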
diff --git a/kernel/fork.c b/kernel/fork.c
index bfee931ee3fb..4c20fff8c13a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/ftrace.h> 49#include <linux/ftrace.h>
50#include <linux/profile.h> 50#include <linux/profile.h>
51#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
53#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
54#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -61,7 +62,8 @@
61#include <linux/blkdev.h> 62#include <linux/blkdev.h>
62#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
63#include <linux/magic.h> 64#include <linux/magic.h>
64#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h>
65 67
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
136/* SLAB cache for mm_struct structures (tsk->mm) */ 138/* SLAB cache for mm_struct structures (tsk->mm) */
137static struct kmem_cache *mm_cachep; 139static struct kmem_cache *mm_cachep;
138 140
141static void account_kernel_stack(struct thread_info *ti, int account)
142{
143 struct zone *zone = page_zone(virt_to_page(ti));
144
145 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
146}
147
139void free_task(struct task_struct *tsk) 148void free_task(struct task_struct *tsk)
140{ 149{
141 prop_local_destroy_single(&tsk->dirties); 150 prop_local_destroy_single(&tsk->dirties);
151 account_kernel_stack(tsk->stack, -1);
142 free_thread_info(tsk->stack); 152 free_thread_info(tsk->stack);
143 rt_mutex_debug_task_free(tsk); 153 rt_mutex_debug_task_free(tsk);
144 ftrace_graph_exit_task(tsk); 154 ftrace_graph_exit_task(tsk);
@@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
253 tsk->btrace_seq = 0; 263 tsk->btrace_seq = 0;
254#endif 264#endif
255 tsk->splice_pipe = NULL; 265 tsk->splice_pipe = NULL;
266
267 account_kernel_stack(ti, 1);
268
256 return tsk; 269 return tsk;
257 270
258out: 271out:
@@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
288 rb_link = &mm->mm_rb.rb_node; 301 rb_link = &mm->mm_rb.rb_node;
289 rb_parent = NULL; 302 rb_parent = NULL;
290 pprev = &mm->mmap; 303 pprev = &mm->mmap;
304 retval = ksm_fork(mm, oldmm);
305 if (retval)
306 goto out;
291 307
292 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 308 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
293 struct file *file; 309 struct file *file;
@@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
418 434
419#include <linux/init_task.h> 435#include <linux/init_task.h>
420 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
421static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
422{ 446{
423 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
424 atomic_set(&mm->mm_count, 1); 448 atomic_set(&mm->mm_count, 1);
425 init_rwsem(&mm->mmap_sem); 449 init_rwsem(&mm->mmap_sem);
426 INIT_LIST_HEAD(&mm->mmlist); 450 INIT_LIST_HEAD(&mm->mmlist);
427 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 451 mm->flags = (current->mm) ?
452 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
428 mm->core_state = NULL; 453 mm->core_state = NULL;
429 mm->nr_ptes = 0; 454 mm->nr_ptes = 0;
430 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
431 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
432 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
433 spin_lock_init(&mm->ioctx_lock);
434 INIT_HLIST_HEAD(&mm->ioctx_list);
435 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
436 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
437 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
438 462
439 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm)
485 509
486 if (atomic_dec_and_test(&mm->mm_users)) { 510 if (atomic_dec_and_test(&mm->mm_users)) {
487 exit_aio(mm); 511 exit_aio(mm);
512 ksm_exit(mm);
488 exit_mmap(mm); 513 exit_mmap(mm);
489 set_mm_exe_file(mm, NULL); 514 set_mm_exe_file(mm, NULL);
490 if (!list_empty(&mm->mmlist)) { 515 if (!list_empty(&mm->mmlist)) {
@@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm)
493 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
494 } 519 }
495 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
496 mmdrop(mm); 523 mmdrop(mm);
497 } 524 }
498} 525}
@@ -543,12 +570,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
543 570
544 /* Get rid of any futexes when releasing the mm */ 571 /* Get rid of any futexes when releasing the mm */
545#ifdef CONFIG_FUTEX 572#ifdef CONFIG_FUTEX
546 if (unlikely(tsk->robust_list)) 573 if (unlikely(tsk->robust_list)) {
547 exit_robust_list(tsk); 574 exit_robust_list(tsk);
575 tsk->robust_list = NULL;
576 }
548#ifdef CONFIG_COMPAT 577#ifdef CONFIG_COMPAT
549 if (unlikely(tsk->compat_robust_list)) 578 if (unlikely(tsk->compat_robust_list)) {
550 compat_exit_robust_list(tsk); 579 compat_exit_robust_list(tsk);
580 tsk->compat_robust_list = NULL;
581 }
551#endif 582#endif
583 if (unlikely(!list_empty(&tsk->pi_state_list)))
584 exit_pi_state_list(tsk);
552#endif 585#endif
553 586
554 /* Get rid of any cached register state */ 587 /* Get rid of any cached register state */
@@ -618,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
618 mm->hiwater_rss = get_mm_rss(mm); 651 mm->hiwater_rss = get_mm_rss(mm);
619 mm->hiwater_vm = mm->total_vm; 652 mm->hiwater_vm = mm->total_vm;
620 653
654 if (mm->binfmt && !try_module_get(mm->binfmt->module))
655 goto free_pt;
656
621 return mm; 657 return mm;
622 658
623free_pt: 659free_pt:
660 /* don't put binfmt in mmput, we haven't got module yet */
661 mm->binfmt = NULL;
624 mmput(mm); 662 mmput(mm);
625 663
626fail_nomem: 664fail_nomem:
@@ -788,10 +826,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
788 thread_group_cputime_init(sig); 826 thread_group_cputime_init(sig);
789 827
790 /* Expiration times and increments. */ 828 /* Expiration times and increments. */
791 sig->it_virt_expires = cputime_zero; 829 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
792 sig->it_virt_incr = cputime_zero; 830 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
793 sig->it_prof_expires = cputime_zero; 831 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
794 sig->it_prof_incr = cputime_zero; 832 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
795 833
796 /* Cached expiration times. */ 834 /* Cached expiration times. */
797 sig->cputime_expires.prof_exp = cputime_zero; 835 sig->cputime_expires.prof_exp = cputime_zero;
@@ -849,6 +887,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
849 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 887 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
850 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 888 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
851 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 889 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
890 sig->maxrss = sig->cmaxrss = 0;
852 task_io_accounting_init(&sig->ioac); 891 task_io_accounting_init(&sig->ioac);
853 sig->sum_sched_runtime = 0; 892 sig->sum_sched_runtime = 0;
854 taskstats_tgid_init(sig); 893 taskstats_tgid_init(sig);
@@ -863,6 +902,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
863 902
864 tty_audit_fork(sig); 903 tty_audit_fork(sig);
865 904
905 sig->oom_adj = current->signal->oom_adj;
906
866 return 0; 907 return 0;
867} 908}
868 909
@@ -958,6 +999,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
958 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 999 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
959 return ERR_PTR(-EINVAL); 1000 return ERR_PTR(-EINVAL);
960 1001
1002 /*
1003 * Siblings of global init remain as zombies on exit since they are
1004 * not reaped by their parent (swapper). To solve this and to avoid
1005 * multi-rooted process trees, prevent global and container-inits
1006 * from creating siblings.
1007 */
1008 if ((clone_flags & CLONE_PARENT) &&
1009 current->signal->flags & SIGNAL_UNKILLABLE)
1010 return ERR_PTR(-EINVAL);
1011
961 retval = security_task_create(clone_flags); 1012 retval = security_task_create(clone_flags);
962 if (retval) 1013 if (retval)
963 goto fork_out; 1014 goto fork_out;
@@ -999,9 +1050,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
999 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1050 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1000 goto bad_fork_cleanup_count; 1051 goto bad_fork_cleanup_count;
1001 1052
1002 if (p->binfmt && !try_module_get(p->binfmt->module))
1003 goto bad_fork_cleanup_put_domain;
1004
1005 p->did_exec = 0; 1053 p->did_exec = 0;
1006 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1054 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1007 copy_flags(clone_flags, p); 1055 copy_flags(clone_flags, p);
@@ -1075,10 +1123,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1075 1123
1076 p->bts = NULL; 1124 p->bts = NULL;
1077 1125
1126 p->stack_start = stack_start;
1127
1078 /* Perform scheduler related setup. Assign this task to a CPU. */ 1128 /* Perform scheduler related setup. Assign this task to a CPU. */
1079 sched_fork(p, clone_flags); 1129 sched_fork(p, clone_flags);
1080 1130
1081 retval = perf_counter_init_task(p); 1131 retval = perf_event_init_task(p);
1082 if (retval) 1132 if (retval)
1083 goto bad_fork_cleanup_policy; 1133 goto bad_fork_cleanup_policy;
1084 1134
@@ -1253,7 +1303,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1253 write_unlock_irq(&tasklist_lock); 1303 write_unlock_irq(&tasklist_lock);
1254 proc_fork_connector(p); 1304 proc_fork_connector(p);
1255 cgroup_post_fork(p); 1305 cgroup_post_fork(p);
1256 perf_counter_fork(p); 1306 perf_event_fork(p);
1257 return p; 1307 return p;
1258 1308
1259bad_fork_free_pid: 1309bad_fork_free_pid:
@@ -1280,16 +1330,13 @@ bad_fork_cleanup_semundo:
1280bad_fork_cleanup_audit: 1330bad_fork_cleanup_audit:
1281 audit_free(p); 1331 audit_free(p);
1282bad_fork_cleanup_policy: 1332bad_fork_cleanup_policy:
1283 perf_counter_free_task(p); 1333 perf_event_free_task(p);
1284#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
1285 mpol_put(p->mempolicy); 1335 mpol_put(p->mempolicy);
1286bad_fork_cleanup_cgroup: 1336bad_fork_cleanup_cgroup:
1287#endif 1337#endif
1288 cgroup_exit(p, cgroup_callbacks_done); 1338 cgroup_exit(p, cgroup_callbacks_done);
1289 delayacct_tsk_free(p); 1339 delayacct_tsk_free(p);
1290 if (p->binfmt)
1291 module_put(p->binfmt->module);
1292bad_fork_cleanup_put_domain:
1293 module_put(task_thread_info(p)->exec_domain->module); 1340 module_put(task_thread_info(p)->exec_domain->module);
1294bad_fork_cleanup_count: 1341bad_fork_cleanup_count:
1295 atomic_dec(&p->cred->user->processes); 1342 atomic_dec(&p->cred->user->processes);
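
The new CLONE_PARENT guard in copy_process() above is easiest to see in isolation: an init-like caller (SIGNAL_UNKILLABLE set on its signal struct) may no longer create siblings of itself. A minimal userspace model of just that check; CLONE_PARENT matches the clone(2) uapi value, while SIGNAL_UNKILLABLE here is an illustrative stand-in for the kernel-internal flag.

    #include <errno.h>
    #include <stdio.h>

    #define CLONE_PARENT      0x00008000UL  /* same value as the clone(2) uapi flag */
    #define SIGNAL_UNKILLABLE 0x00000040U   /* stand-in for the kernel-internal flag */

    /* Models the guard added to copy_process(): refuse to create a sibling
     * of a global or container init, which nobody would ever reap. */
    static int clone_parent_allowed(unsigned long clone_flags,
                                    unsigned int signal_flags)
    {
            if ((clone_flags & CLONE_PARENT) &&
                (signal_flags & SIGNAL_UNKILLABLE))
                    return -EINVAL;
            return 0;
    }

    int main(void)
    {
            printf("init-like caller: %d\n",
                   clone_parent_allowed(CLONE_PARENT, SIGNAL_UNKILLABLE)); /* -22 */
            printf("ordinary caller:  %d\n",
                   clone_parent_allowed(CLONE_PARENT, 0));                 /* 0 */
            return 0;
    }
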
diff --git a/kernel/futex.c b/kernel/futex.c
index 248dd119a86e..4949d336d88d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,36 +89,36 @@ struct futex_pi_state {
89 union futex_key key; 89 union futex_key key;
90}; 90};
91 91
92/* 92/**
93 * We use this hashed waitqueue instead of a normal wait_queue_t, so 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on
97 * @pi_state: optional priority inheritance state
98 * @rt_waiter: rt_waiter storage for use with requeue_pi
99 * @requeue_pi_key: the requeue_pi target futex key
100 * @bitset: bitset for the optional bitmasked wakeup
101 *
102 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
94 * we can wake only the relevant ones (hashed queues may be shared). 103 * we can wake only the relevant ones (hashed queues may be shared).
95 * 104 *
96 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 105 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
97 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
98 * The order of wakup is always to make the first condition true, then 107 * The order of wakup is always to make the first condition true, then
99 * wake up q->waiter, then make the second condition true. 108 * the second.
109 *
110 * PI futexes are typically woken before they are removed from the hash list via
111 * the rt_mutex code. See unqueue_me_pi().
100 */ 112 */
101struct futex_q { 113struct futex_q {
102 struct plist_node list; 114 struct plist_node list;
103 /* Waiter reference */
104 struct task_struct *task;
105 115
106 /* Which hash list lock to use: */ 116 struct task_struct *task;
107 spinlock_t *lock_ptr; 117 spinlock_t *lock_ptr;
108
109 /* Key which the futex is hashed on: */
110 union futex_key key; 118 union futex_key key;
111
112 /* Optional priority inheritance state: */
113 struct futex_pi_state *pi_state; 119 struct futex_pi_state *pi_state;
114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 120 struct rt_mutex_waiter *rt_waiter;
117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key; 121 union futex_key *requeue_pi_key;
120
121 /* Bitset for the optional bitmasked wakeup */
122 u32 bitset; 122 u32 bitset;
123}; 123};
124 124
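
The kernel-doc block above spells out the two-step woken test for a futex_q. A toy, compilable model of that predicate (plain C stand-ins, not the kernel's plist or spinlock types):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_futex_q {
            bool  on_hash_list;   /* models !plist_node_empty(&q->list) */
            void *lock_ptr;       /* models q->lock_ptr */
    };

    /* Woken when off the hash list OR lock_ptr has been cleared; wakers
     * make the first condition true before the second, as documented. */
    static bool toy_woken(const struct toy_futex_q *q)
    {
            return !q->on_hash_list || q->lock_ptr == NULL;
    }

    int main(void)
    {
            struct toy_futex_q q = { .on_hash_list = true, .lock_ptr = &q };

            printf("queued   -> woken=%d\n", toy_woken(&q));  /* 0 */
            q.on_hash_list = false;                           /* wake, step one */
            printf("dequeued -> woken=%d\n", toy_woken(&q));  /* 1 */
            return 0;
    }
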
@@ -198,11 +198,12 @@ static void drop_futex_key_refs(union futex_key *key)
198} 198}
199 199
200/** 200/**
201 * get_futex_key - Get parameters which are the keys for a futex. 201 * get_futex_key() - Get parameters which are the keys for a futex
202 * @uaddr: virtual address of the futex 202 * @uaddr: virtual address of the futex
203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
204 * @key: address where result is stored. 204 * @key: address where result is stored.
205 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) 205 * @rw: mapping needs to be read/write (values: VERIFY_READ,
206 * VERIFY_WRITE)
206 * 207 *
207 * Returns a negative error code or 0 208 * Returns a negative error code or 0
208 * The key words are stored in *key on success. 209 * The key words are stored in *key on success.
@@ -288,8 +289,8 @@ void put_futex_key(int fshared, union futex_key *key)
288 drop_futex_key_refs(key); 289 drop_futex_key_refs(key);
289} 290}
290 291
291/* 292/**
292 * fault_in_user_writeable - fault in user address and verify RW access 293 * fault_in_user_writeable() - Fault in user address and verify RW access
293 * @uaddr: pointer to faulting user space address 294 * @uaddr: pointer to faulting user space address
294 * 295 *
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
@@ -309,8 +310,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
309 310
310/** 311/**
311 * futex_top_waiter() - Return the highest priority waiter on a futex 312 * futex_top_waiter() - Return the highest priority waiter on a futex
312 * @hb: the hash bucket the futex_q's reside in 313 * @hb: the hash bucket the futex_q's reside in
313 * @key: the futex key (to distinguish it from other futex futex_q's) 314 * @key: the futex key (to distinguish it from other futex futex_q's)
314 * 315 *
315 * Must be called with the hb lock held. 316 * Must be called with the hb lock held.
316 */ 317 */
@@ -588,7 +589,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
588} 589}
589 590
590/** 591/**
591 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex 592 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
592 * @uaddr: the pi futex user address 593 * @uaddr: the pi futex user address
593 * @hb: the pi futex hash bucket 594 * @hb: the pi futex hash bucket
594 * @key: the futex key associated with uaddr and hb 595 * @key: the futex key associated with uaddr and hb
@@ -915,8 +916,8 @@ retry:
915 hb1 = hash_futex(&key1); 916 hb1 = hash_futex(&key1);
916 hb2 = hash_futex(&key2); 917 hb2 = hash_futex(&key2);
917 918
918 double_lock_hb(hb1, hb2);
919retry_private: 919retry_private:
920 double_lock_hb(hb1, hb2);
920 op_ret = futex_atomic_op_inuser(op, uaddr2); 921 op_ret = futex_atomic_op_inuser(op, uaddr2);
921 if (unlikely(op_ret < 0)) { 922 if (unlikely(op_ret < 0)) {
922 923
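
Moving double_lock_hb() below the retry_private label matters because the fault-handling path drops both bucket locks before touching user memory and then jumps back; after this hunk the retry re-acquires them. A stubbed, compilable sketch of that control flow, with pthread mutexes standing in for the hash-bucket locks and simulated futex_atomic_op_inuser()/fault_in_user_writeable() calls:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t hb1_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t hb2_lock = PTHREAD_MUTEX_INITIALIZER;
    static int attempts;

    static int fake_atomic_op(void)   /* stands in for futex_atomic_op_inuser() */
    {
            return ++attempts < 2 ? -1 : 0;   /* "fault" once, then succeed */
    }

    static int fake_fault_in(void)    /* stands in for fault_in_user_writeable() */
    {
            return 0;
    }

    int main(void)
    {
    retry_private:
            pthread_mutex_lock(&hb1_lock);    /* double_lock_hb() now sits here */
            pthread_mutex_lock(&hb2_lock);

            if (fake_atomic_op() < 0) {
                    pthread_mutex_unlock(&hb2_lock);
                    pthread_mutex_unlock(&hb1_lock);
                    if (fake_fault_in())
                            return 1;
                    goto retry_private;       /* both locks retaken on retry */
            }

            pthread_mutex_unlock(&hb2_lock);
            pthread_mutex_unlock(&hb1_lock);
            printf("done after %d attempt(s)\n", attempts);
            return 0;
    }
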
@@ -1011,9 +1012,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1011 1012
1012/** 1013/**
1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1014 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1014 * q: the futex_q 1015 * @q: the futex_q
1015 * key: the key of the requeue target futex 1016 * @key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex 1017 * @hb: the hash_bucket of the requeue target futex
1017 * 1018 *
1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1019 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1020 * target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1350,6 +1351,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1350 return hb; 1351 return hb;
1351} 1352}
1352 1353
1354static inline void
1355queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1356{
1357 spin_unlock(&hb->lock);
1358 drop_futex_key_refs(&q->key);
1359}
1360
1361/**
1362 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1363 * @q: The futex_q to enqueue
1364 * @hb: The destination hash bucket
1365 *
1366 * The hb->lock must be held by the caller, and is released here. A call to
1367 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1368 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1369 * or nothing if the unqueue is done as part of the wake process and the unqueue
1370 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1371 * an example).
1372 */
1353static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1373static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1354{ 1374{
1355 int prio; 1375 int prio;
@@ -1373,19 +1393,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1373 spin_unlock(&hb->lock); 1393 spin_unlock(&hb->lock);
1374} 1394}
1375 1395
1376static inline void 1396/**
1377queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1397 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1378{ 1398 * @q: The futex_q to unqueue
1379 spin_unlock(&hb->lock); 1399 *
1380 drop_futex_key_refs(&q->key); 1400 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1381} 1401 * be paired with exactly one earlier call to queue_me().
1382 1402 *
1383/* 1403 * Returns:
1384 * queue_me and unqueue_me must be called as a pair, each 1404 * 1 - if the futex_q was still queued (and we unqueued it)
1385 * exactly once. They are called with the hashed spinlock held. 1405 * 0 - if the futex_q was already removed by the waking thread
1386 */ 1406 */
1387
1388/* Return 1 if we were still queued (ie. 0 means we were woken) */
1389static int unqueue_me(struct futex_q *q) 1407static int unqueue_me(struct futex_q *q)
1390{ 1408{
1391 spinlock_t *lock_ptr; 1409 spinlock_t *lock_ptr;
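
The reworked comments above pin down the queue_me()/unqueue_me() contract: queue_me() is entered with the bucket lock held and releases it, and unqueue_me() returns 1 only if the waiter had to remove itself. A toy userspace model of that pairing and return convention (a flag and a pthread mutex stand in for the plist and the hash-bucket lock):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
    static int queued;                    /* models membership of the hash list */

    static void toy_queue_me(void)        /* caller holds bucket_lock; released here */
    {
            queued = 1;
            pthread_mutex_unlock(&bucket_lock);
    }

    static int toy_unqueue_me(void)       /* 1: we removed it, 0: waker already did */
    {
            int was_queued;

            pthread_mutex_lock(&bucket_lock);
            was_queued = queued;
            queued = 0;
            pthread_mutex_unlock(&bucket_lock);
            return was_queued;
    }

    int main(void)
    {
            pthread_mutex_lock(&bucket_lock); /* as queue_lock() would have done */
            toy_queue_me();
            printf("unqueue_me -> %d (still queued)\n", toy_unqueue_me());
            return 0;
    }
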
@@ -1638,17 +1656,14 @@ out:
1638static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1656static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1639 struct hrtimer_sleeper *timeout) 1657 struct hrtimer_sleeper *timeout)
1640{ 1658{
1641 queue_me(q, hb);
1642
1643 /* 1659 /*
1644 * There might have been scheduling since the queue_me(), as we 1660 * The task state is guaranteed to be set before another task can
1645 * cannot hold a spinlock across the get_user() in case it 1661 * wake it. set_current_state() is implemented using set_mb() and
1646 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1662 * queue_me() calls spin_unlock() upon completion, both serializing
1647 * queueing ourselves into the futex hash. This code thus has to 1663 * access to the hash list and forcing another memory barrier.
1648 * rely on the futex_wake() code removing us from hash when it
1649 * wakes us up.
1650 */ 1664 */
1651 set_current_state(TASK_INTERRUPTIBLE); 1665 set_current_state(TASK_INTERRUPTIBLE);
1666 queue_me(q, hb);
1652 1667
1653 /* Arm the timer */ 1668 /* Arm the timer */
1654 if (timeout) { 1669 if (timeout) {
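
The hunk above reorders futex_wait_queue_me() so the task marks itself TASK_INTERRUPTIBLE before it becomes visible on the hash list, relying on the full barrier in set_mb() plus the unlock inside queue_me(). The same no-lost-wakeup shape, written with C11 atomics purely as a userspace illustration of the ordering argument, not the kernel's primitives:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool sleeping;   /* models the task state */
    static atomic_bool on_queue;   /* models visibility on the hash list */

    static void waiter_prepare(void)
    {
            /* like set_current_state(): store with a full barrier (set_mb) */
            atomic_store_explicit(&sleeping, true, memory_order_seq_cst);
            /* like queue_me(): publish; the spin_unlock acts as a release  */
            atomic_store_explicit(&on_queue, true, memory_order_release);
            /* only now would the waiter re-check the futex value and sleep */
    }

    static void waker(void)
    {
            /* a waker that sees the waiter on the queue ...                */
            if (atomic_load_explicit(&on_queue, memory_order_acquire))
                    /* ... also sees sleeping == true, so no wakeup is lost */
                    atomic_store_explicit(&sleeping, false,
                                          memory_order_seq_cst);
    }

    int main(void)
    {
            waiter_prepare();
            waker();
            printf("sleeping=%d on_queue=%d\n",
                   atomic_load(&sleeping), atomic_load(&on_queue));
            return 0;
    }
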
@@ -1658,8 +1673,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1658 } 1673 }
1659 1674
1660 /* 1675 /*
1661 * !plist_node_empty() is safe here without any lock. 1676 * If we have been removed from the hash list, then another task
1662 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1677 * has tried to wake us, and we can skip the call to schedule().
1663 */ 1678 */
1664 if (likely(!plist_node_empty(&q->list))) { 1679 if (likely(!plist_node_empty(&q->list))) {
1665 /* 1680 /*
@@ -2102,7 +2117,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2102 * Unqueue the futex_q and determine which it was. 2117 * Unqueue the futex_q and determine which it was.
2103 */ 2118 */
2104 plist_del(&q->list, &q->list.plist); 2119 plist_del(&q->list, &q->list.plist);
2105 drop_futex_key_refs(&q->key);
2106 2120
2107 if (timeout && !timeout->task) 2121 if (timeout && !timeout->task)
2108 ret = -ETIMEDOUT; 2122 ret = -ETIMEDOUT;
@@ -2114,12 +2128,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2114 2128
2115/** 2129/**
2116 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2130 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2117 * @uaddr: the futex we initialyl wait on (non-pi) 2131 * @uaddr: the futex we initially wait on (non-pi)
2118 * @fshared: whether the futexes are shared (1) or not (0). They must be 2132 * @fshared: whether the futexes are shared (1) or not (0). They must be
2119 * the same type, no requeueing from private to shared, etc. 2133 * the same type, no requeueing from private to shared, etc.
2120 * @val: the expected value of uaddr 2134 * @val: the expected value of uaddr
2121 * @abs_time: absolute timeout 2135 * @abs_time: absolute timeout
2122 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. 2136 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2123 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) 2137 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2124 * @uaddr2: the pi futex we will take prior to returning to user-space 2138 * @uaddr2: the pi futex we will take prior to returning to user-space
2125 * 2139 *
@@ -2246,7 +2260,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2246 res = fixup_owner(uaddr2, fshared, &q, !ret); 2260 res = fixup_owner(uaddr2, fshared, &q, !ret);
2247 /* 2261 /*
2248 * If fixup_owner() returned an error, propagate that. If it 2262 * If fixup_owner() returned an error, propagate that. If it
2249 * acquired the lock, clear our -ETIMEDOUT or -EINTR. 2263 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2250 */ 2264 */
2251 if (res) 2265 if (res)
2252 ret = (res < 0) ? res : 0; 2266 ret = (res < 0) ? res : 0;
@@ -2302,9 +2316,9 @@ out:
2302 */ 2316 */
2303 2317
2304/** 2318/**
2305 * sys_set_robust_list - set the robust-futex list head of a task 2319 * sys_set_robust_list() - Set the robust-futex list head of a task
2306 * @head: pointer to the list-head 2320 * @head: pointer to the list-head
2307 * @len: length of the list-head, as userspace expects 2321 * @len: length of the list-head, as userspace expects
2308 */ 2322 */
2309SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2323SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2310 size_t, len) 2324 size_t, len)
@@ -2323,10 +2337,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2323} 2337}
2324 2338
2325/** 2339/**
2326 * sys_get_robust_list - get the robust-futex list head of a task 2340 * sys_get_robust_list() - Get the robust-futex list head of a task
2327 * @pid: pid of the process [zero for current task] 2341 * @pid: pid of the process [zero for current task]
2328 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2342 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2329 * @len_ptr: pointer to a length field, the kernel fills in the header size 2343 * @len_ptr: pointer to a length field, the kernel fills in the header size
2330 */ 2344 */
2331SYSCALL_DEFINE3(get_robust_list, int, pid, 2345SYSCALL_DEFINE3(get_robust_list, int, pid,
2332 struct robust_list_head __user * __user *, head_ptr, 2346 struct robust_list_head __user * __user *, head_ptr,
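
The two syscalls documented above are normally issued by the C library at thread start, but they can be exercised directly. A small userspace example using raw syscall(2); note that registering your own head replaces the one glibc installed for this thread, so treat this as an experiment only:

    #include <linux/futex.h>       /* struct robust_list_head */
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            static struct robust_list_head head;
            struct robust_list_head *cur = NULL;
            size_t len = 0;

            memset(&head, 0, sizeof(head));
            head.list.next = &head.list;           /* empty circular list */

            if (syscall(SYS_set_robust_list, &head, sizeof(head)) != 0) {
                    perror("set_robust_list");
                    return 1;
            }
            if (syscall(SYS_get_robust_list, 0 /* current */, &cur, &len) != 0) {
                    perror("get_robust_list");
                    return 1;
            }
            printf("robust list head=%p len=%zu\n", (void *)cur, len);
            return 0;
    }
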
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 654efd09f6a9..70a298d6da71 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 || (PPC && EXPERIMENTAL) 37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This option activates profiling for the entire kernel. 40 This option activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 05071bf6a37b..3e1c36e7998f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,36 +48,7 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/** 51#include <trace/events/timer.h>
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81 52
82/* 53/*
83 * The timer bases: 54 * The timer bases:
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 77 }
107}; 78};
108 79
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 80/*
135 * Get the coarse grained time at the softirq based on xtime and 81 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 82 * wall_to_monotonic.
@@ -498,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
498static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 444static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
499#endif 445#endif
500 446
447static inline void
448debug_init(struct hrtimer *timer, clockid_t clockid,
449 enum hrtimer_mode mode)
450{
451 debug_hrtimer_init(timer);
452 trace_hrtimer_init(timer, clockid, mode);
453}
454
455static inline void debug_activate(struct hrtimer *timer)
456{
457 debug_hrtimer_activate(timer);
458 trace_hrtimer_start(timer);
459}
460
461static inline void debug_deactivate(struct hrtimer *timer)
462{
463 debug_hrtimer_deactivate(timer);
464 trace_hrtimer_cancel(timer);
465}
466
501/* High resolution timer related functions */ 467/* High resolution timer related functions */
502#ifdef CONFIG_HIGH_RES_TIMERS 468#ifdef CONFIG_HIGH_RES_TIMERS
503 469
@@ -543,13 +509,14 @@ static inline int hrtimer_hres_active(void)
543 * next event 509 * next event
544 * Called with interrupts disabled and base->lock held 510 * Called with interrupts disabled and base->lock held
545 */ 511 */
546static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) 512static void
513hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
547{ 514{
548 int i; 515 int i;
549 struct hrtimer_clock_base *base = cpu_base->clock_base; 516 struct hrtimer_clock_base *base = cpu_base->clock_base;
550 ktime_t expires; 517 ktime_t expires, expires_next;
551 518
552 cpu_base->expires_next.tv64 = KTIME_MAX; 519 expires_next.tv64 = KTIME_MAX;
553 520
554 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 521 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
555 struct hrtimer *timer; 522 struct hrtimer *timer;
@@ -565,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
565 */ 532 */
566 if (expires.tv64 < 0) 533 if (expires.tv64 < 0)
567 expires.tv64 = 0; 534 expires.tv64 = 0;
568 if (expires.tv64 < cpu_base->expires_next.tv64) 535 if (expires.tv64 < expires_next.tv64)
569 cpu_base->expires_next = expires; 536 expires_next = expires;
570 } 537 }
571 538
539 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
540 return;
541
542 cpu_base->expires_next.tv64 = expires_next.tv64;
543
572 if (cpu_base->expires_next.tv64 != KTIME_MAX) 544 if (cpu_base->expires_next.tv64 != KTIME_MAX)
573 tick_program_event(cpu_base->expires_next, 1); 545 tick_program_event(cpu_base->expires_next, 1);
574} 546}
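
hrtimer_force_reprogram() now collects the earliest pending expiry into a local expires_next and, when skip_equal is set, returns early if it matches the cached cpu_base->expires_next, so the clock event device is not reprogrammed for nothing. A simplified, compilable model of that flow (plain int64_t values instead of ktime_t, and a counter instead of tick_program_event()):

    #include <stdint.h>
    #include <stdio.h>

    #define TOY_KTIME_MAX INT64_MAX

    static int64_t cached_expires_next = 1000;   /* models cpu_base->expires_next */
    static int device_programs;                  /* counts tick_program_event()   */

    static void toy_force_reprogram(const int64_t *expiry, int n, int skip_equal)
    {
            int64_t expires_next = TOY_KTIME_MAX;
            int i;

            for (i = 0; i < n; i++)              /* earliest timer over all bases */
                    if (expiry[i] < expires_next)
                            expires_next = expiry[i];

            if (skip_equal && expires_next == cached_expires_next)
                    return;                      /* nothing changed: skip it */

            cached_expires_next = expires_next;
            if (cached_expires_next != TOY_KTIME_MAX)
                    device_programs++;           /* would call tick_program_event() */
    }

    int main(void)
    {
            int64_t expiry[] = { 4000, 1000, 2500 };

            toy_force_reprogram(expiry, 3, 1);   /* earliest still 1000: skipped   */
            expiry[1] = 5000;
            toy_force_reprogram(expiry, 3, 1);   /* earliest now 2500: programmed  */
            printf("device reprogrammed %d time(s)\n", device_programs);  /* 1 */
            return 0;
    }
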
@@ -651,7 +623,7 @@ static void retrigger_next_event(void *arg)
651 base->clock_base[CLOCK_REALTIME].offset = 623 base->clock_base[CLOCK_REALTIME].offset =
652 timespec_to_ktime(realtime_offset); 624 timespec_to_ktime(realtime_offset);
653 625
654 hrtimer_force_reprogram(base); 626 hrtimer_force_reprogram(base, 0);
655 spin_unlock(&base->lock); 627 spin_unlock(&base->lock);
656} 628}
657 629
@@ -754,8 +726,6 @@ static int hrtimer_switch_to_hres(void)
754 /* "Retrigger" the interrupt to get things going */ 726 /* "Retrigger" the interrupt to get things going */
755 retrigger_next_event(NULL); 727 retrigger_next_event(NULL);
756 local_irq_restore(flags); 728 local_irq_restore(flags);
757 printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
758 smp_processor_id());
759 return 1; 729 return 1;
760} 730}
761 731
@@ -764,7 +734,8 @@ static int hrtimer_switch_to_hres(void)
764static inline int hrtimer_hres_active(void) { return 0; } 734static inline int hrtimer_hres_active(void) { return 0; }
765static inline int hrtimer_is_hres_enabled(void) { return 0; } 735static inline int hrtimer_is_hres_enabled(void) { return 0; }
766static inline int hrtimer_switch_to_hres(void) { return 0; } 736static inline int hrtimer_switch_to_hres(void) { return 0; }
767static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 737static inline void
738hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
768static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 739static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
769 struct hrtimer_clock_base *base, 740 struct hrtimer_clock_base *base,
770 int wakeup) 741 int wakeup)
@@ -854,7 +825,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
854 struct hrtimer *entry; 825 struct hrtimer *entry;
855 int leftmost = 1; 826 int leftmost = 1;
856 827
857 debug_hrtimer_activate(timer); 828 debug_activate(timer);
858 829
859 /* 830 /*
860 * Find the right place in the rbtree: 831 * Find the right place in the rbtree:
@@ -907,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
907 struct hrtimer_clock_base *base, 878 struct hrtimer_clock_base *base,
908 unsigned long newstate, int reprogram) 879 unsigned long newstate, int reprogram)
909{ 880{
910 if (timer->state & HRTIMER_STATE_ENQUEUED) { 881 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
911 /* 882 goto out;
912 * Remove the timer from the rbtree and replace the 883
913 * first entry pointer if necessary. 884 /*
914 */ 885 * Remove the timer from the rbtree and replace the first
915 if (base->first == &timer->node) { 886 * entry pointer if necessary.
916 base->first = rb_next(&timer->node); 887 */
917 /* Reprogram the clock event device. if enabled */ 888 if (base->first == &timer->node) {
918 if (reprogram && hrtimer_hres_active()) 889 base->first = rb_next(&timer->node);
919 hrtimer_force_reprogram(base->cpu_base); 890#ifdef CONFIG_HIGH_RES_TIMERS
891 /* Reprogram the clock event device. if enabled */
892 if (reprogram && hrtimer_hres_active()) {
893 ktime_t expires;
894
895 expires = ktime_sub(hrtimer_get_expires(timer),
896 base->offset);
897 if (base->cpu_base->expires_next.tv64 == expires.tv64)
898 hrtimer_force_reprogram(base->cpu_base, 1);
920 } 899 }
921 rb_erase(&timer->node, &base->active); 900#endif
922 } 901 }
902 rb_erase(&timer->node, &base->active);
903out:
923 timer->state = newstate; 904 timer->state = newstate;
924} 905}
925 906
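
The rewritten __remove_hrtimer() only forces a reprogram when the timer being removed is the one currently programmed into the device (its expiry equals cpu_base->expires_next), and it passes skip_equal=1 so an unchanged earliest expiry is left alone. A toy model of just that decision, with int64_t expiries and a counter in place of the kernel types:

    #include <stdint.h>
    #include <stdio.h>

    static int64_t expires_next = 1000;     /* models cpu_base->expires_next */
    static int recomputes;                  /* counts hrtimer_force_reprogram() */

    /* Models the new removal path: only removing the timer that is actually
     * programmed into the clock event device makes a recompute necessary. */
    static void toy_remove_timer(int64_t timer_expiry)
    {
            if (timer_expiry == expires_next)
                    recomputes++;           /* hrtimer_force_reprogram(base, 1) */
    }

    int main(void)
    {
            toy_remove_timer(2500);         /* not the programmed one: no work */
            toy_remove_timer(1000);         /* first to expire: recompute      */
            printf("forced reprograms: %d\n", recomputes);   /* 1 */
            return 0;
    }
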
@@ -940,7 +921,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
940 * reprogramming happens in the interrupt handler. This is a 921 * reprogramming happens in the interrupt handler. This is a
941 * rare case and less expensive than a smp call. 922 * rare case and less expensive than a smp call.
942 */ 923 */
943 debug_hrtimer_deactivate(timer); 924 debug_deactivate(timer);
944 timer_stats_hrtimer_clear_start_info(timer); 925 timer_stats_hrtimer_clear_start_info(timer);
945 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 926 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
946 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 927 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -1155,7 +1136,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1155 clock_id = CLOCK_MONOTONIC; 1136 clock_id = CLOCK_MONOTONIC;
1156 1137
1157 timer->base = &cpu_base->clock_base[clock_id]; 1138 timer->base = &cpu_base->clock_base[clock_id];
1158 INIT_LIST_HEAD(&timer->cb_entry);
1159 hrtimer_init_timer_hres(timer); 1139 hrtimer_init_timer_hres(timer);
1160 1140
1161#ifdef CONFIG_TIMER_STATS 1141#ifdef CONFIG_TIMER_STATS
@@ -1174,7 +1154,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1174void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1154void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1175 enum hrtimer_mode mode) 1155 enum hrtimer_mode mode)
1176{ 1156{
1177 debug_hrtimer_init(timer); 1157 debug_init(timer, clock_id, mode);
1178 __hrtimer_init(timer, clock_id, mode); 1158 __hrtimer_init(timer, clock_id, mode);
1179} 1159}
1180EXPORT_SYMBOL_GPL(hrtimer_init); 1160EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -1198,7 +1178,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1198} 1178}
1199EXPORT_SYMBOL_GPL(hrtimer_get_res); 1179EXPORT_SYMBOL_GPL(hrtimer_get_res);
1200 1180
1201static void __run_hrtimer(struct hrtimer *timer) 1181static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1202{ 1182{
1203 struct hrtimer_clock_base *base = timer->base; 1183 struct hrtimer_clock_base *base = timer->base;
1204 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1184 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
@@ -1207,7 +1187,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1207 1187
1208 WARN_ON(!irqs_disabled()); 1188 WARN_ON(!irqs_disabled());
1209 1189
1210 debug_hrtimer_deactivate(timer); 1190 debug_deactivate(timer);
1211 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1191 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1212 timer_stats_account_hrtimer(timer); 1192 timer_stats_account_hrtimer(timer);
1213 fn = timer->function; 1193 fn = timer->function;
@@ -1218,7 +1198,9 @@ static void __run_hrtimer(struct hrtimer *timer)
1218 * the timer base. 1198 * the timer base.
1219 */ 1199 */
1220 spin_unlock(&cpu_base->lock); 1200 spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now);
1221 restart = fn(timer); 1202 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer);
1222 spin_lock(&cpu_base->lock); 1204 spin_lock(&cpu_base->lock);
1223 1205
1224 /* 1206 /*
@@ -1329,7 +1311,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1329 break; 1311 break;
1330 } 1312 }
1331 1313
1332 __run_hrtimer(timer); 1314 __run_hrtimer(timer, &basenow);
1333 } 1315 }
1334 base++; 1316 base++;
1335 } 1317 }
@@ -1451,7 +1433,7 @@ void hrtimer_run_queues(void)
1451 hrtimer_get_expires_tv64(timer)) 1433 hrtimer_get_expires_tv64(timer))
1452 break; 1434 break;
1453 1435
1454 __run_hrtimer(timer); 1436 __run_hrtimer(timer, &base->softirq_time);
1455 } 1437 }
1456 spin_unlock(&cpu_base->lock); 1438 spin_unlock(&cpu_base->lock);
1457 } 1439 }
@@ -1628,7 +1610,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1628 while ((node = rb_first(&old_base->active))) { 1610 while ((node = rb_first(&old_base->active))) {
1629 timer = rb_entry(node, struct hrtimer, node); 1611 timer = rb_entry(node, struct hrtimer, node);
1630 BUG_ON(hrtimer_callback_running(timer)); 1612 BUG_ON(hrtimer_callback_running(timer));
1631 debug_hrtimer_deactivate(timer); 1613 debug_deactivate(timer);
1632 1614
1633 /* 1615 /*
1634 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1616 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..d4e841747400 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
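
The signature change above (dropping the struct file * argument) is part of a tree-wide sysctl cleanup; the knob itself is still reached through procfs. A userspace snippet that reads the sysctl this handler serves, assuming the usual /proc/sys/kernel/hung_task_timeout_secs path:

    #include <stdio.h>

    int main(void)
    {
            /* path assumed from the hung_task sysctl name; adjust if needed */
            const char *path = "/proc/sys/kernel/hung_task_timeout_secs";
            unsigned long secs = 0;
            FILE *f = fopen(path, "r");

            if (!f) {
                    perror(path);
                    return 1;
            }
            if (fscanf(f, "%lu", &secs) == 1)
                    printf("hung task timeout: %lu seconds\n", secs);
            fclose(f);
            return 0;
    }
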
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a81cf80554db..17c71bb565c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/module.h> 16#include <linux/module.h>
16#include <linux/random.h> 17#include <linux/random.h>
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..b03451ede528 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/posix-timers.h> 13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
41 return ktime_to_timeval(rem); 42 return ktime_to_timeval(rem);
42} 43}
43 44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime);
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cputime_le(cval, t))
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cputime_sub(cval, t);
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
44int do_getitimer(int which, struct itimerval *value) 79int do_getitimer(int which, struct itimerval *value)
45{ 80{
46 struct task_struct *tsk = current; 81 struct task_struct *tsk = current;
47 cputime_t cinterval, cval;
48 82
49 switch (which) { 83 switch (which) {
50 case ITIMER_REAL: 84 case ITIMER_REAL:
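
The new get_cpu_itimer() helper above folds the old ITIMER_VIRTUAL and ITIMER_PROF branches into one: take the per-clock expiry, subtract the CPU time the thread group has already consumed, and clamp to one jiffy when the timer is about to fire so userspace never sees zero for an armed timer. A toy model of that remaining-time computation, using plain integers instead of cputime_t:

    #include <stdio.h>

    #define TOY_ONE_JIFFY 1   /* models cputime_one_jiffy */

    /* expires == 0 means "not armed", mirroring the cputime_zero check. */
    static long toy_itimer_remaining(long expires, long consumed)
    {
            if (expires == 0)
                    return 0;
            if (expires <= consumed)
                    return TOY_ONE_JIFFY;          /* about to fire */
            return expires - consumed;
    }

    int main(void)
    {
            printf("disarmed:      %ld\n", toy_itimer_remaining(0, 50));    /* 0  */
            printf("still running: %ld\n", toy_itimer_remaining(100, 40));  /* 60 */
            printf("about to fire: %ld\n", toy_itimer_remaining(100, 120)); /* 1  */
            return 0;
    }
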
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 89 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 90 break;
57 case ITIMER_VIRTUAL: 91 case ITIMER_VIRTUAL:
58 spin_lock_irq(&tsk->sighand->siglock); 92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
59 cval = tsk->signal->it_virt_expires;
60 cinterval = tsk->signal->it_virt_incr;
61 if (!cputime_eq(cval, cputime_zero)) {
62 struct task_cputime cputime;
63 cputime_t utime;
64
65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1);
69 } else {
70 cval = cputime_sub(cval, utime);
71 }
72 }
73 spin_unlock_irq(&tsk->sighand->siglock);
74 cputime_to_timeval(cval, &value->it_value);
75 cputime_to_timeval(cinterval, &value->it_interval);
76 break; 93 break;
77 case ITIMER_PROF: 94 case ITIMER_PROF:
78 spin_lock_irq(&tsk->sighand->siglock); 95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
79 cval = tsk->signal->it_prof_expires;
80 cinterval = tsk->signal->it_prof_incr;
81 if (!cputime_eq(cval, cputime_zero)) {
82 struct task_cputime times;
83 cputime_t ptime;
84
85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1);
89 } else {
90 cval = cputime_sub(cval, ptime);
91 }
92 }
93 spin_unlock_irq(&tsk->sighand->siglock);
94 cputime_to_timeval(cval, &value->it_value);
95 cputime_to_timeval(cinterval, &value->it_interval);
96 break; 96 break;
97 default: 97 default:
98 return(-EINVAL); 98 return(-EINVAL);
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
123 struct signal_struct *sig = 123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer); 124 container_of(timer, struct signal_struct, real_timer);
125 125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
126 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); 127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
127 128
128 return HRTIMER_NORESTART; 129 return HRTIMER_NORESTART;
129} 130}
130 131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150
151 nval = timeval_to_cputime(&value->it_value);
152 ns_nval = timeval_to_ns(&value->it_value);
153 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval);
155
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
157 it->error = cputime_sub_ns(nval, ns_nval);
158
159 spin_lock_irq(&tsk->sighand->siglock);
160
161 cval = it->expires;
162 cinterval = it->incr;
163 if (!cputime_eq(cval, cputime_zero) ||
164 !cputime_eq(nval, cputime_zero)) {
165 if (cputime_gt(nval, cputime_zero))
166 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173
174 spin_unlock_irq(&tsk->sighand->siglock);
175
176 if (ovalue) {
177 cputime_to_timeval(cval, &ovalue->it_value);
178 cputime_to_timeval(cinterval, &ovalue->it_interval);
179 }
180}
181
131/* 182/*
132 * Returns true if the timeval is in canonical form 183 * Returns true if the timeval is in canonical form
133 */ 184 */
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
139 struct task_struct *tsk = current; 190 struct task_struct *tsk = current;
140 struct hrtimer *timer; 191 struct hrtimer *timer;
141 ktime_t expires; 192 ktime_t expires;
142 cputime_t cval, cinterval, nval, ninterval;
143 193
144 /* 194 /*
145 * Validate the timevals in value. 195 * Validate the timevals in value.
@@ -171,51 +221,14 @@ again:
171 } else 221 } else
172 tsk->signal->it_real_incr.tv64 = 0; 222 tsk->signal->it_real_incr.tv64 = 0;
173 223
224 trace_itimer_state(ITIMER_REAL, value, 0);
174 spin_unlock_irq(&tsk->sighand->siglock); 225 spin_unlock_irq(&tsk->sighand->siglock);
175 break; 226 break;
176 case ITIMER_VIRTUAL: 227 case ITIMER_VIRTUAL:
177 nval = timeval_to_cputime(&value->it_value); 228 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
178 ninterval = timeval_to_cputime(&value->it_interval);
179 spin_lock_irq(&tsk->sighand->siglock);
180 cval = tsk->signal->it_virt_expires;
181 cinterval = tsk->signal->it_virt_incr;
182 if (!cputime_eq(cval, cputime_zero) ||
183 !cputime_eq(nval, cputime_zero)) {
184 if (cputime_gt(nval, cputime_zero))
185 nval = cputime_add(nval,
186 jiffies_to_cputime(1));
187 set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
188 &nval, &cval);
189 }
190 tsk->signal->it_virt_expires = nval;
191 tsk->signal->it_virt_incr = ninterval;
192 spin_unlock_irq(&tsk->sighand->siglock);
193 if (ovalue) {
194 cputime_to_timeval(cval, &ovalue->it_value);
195 cputime_to_timeval(cinterval, &ovalue->it_interval);
196 }
197 break; 229 break;
198 case ITIMER_PROF: 230 case ITIMER_PROF:
199 nval = timeval_to_cputime(&value->it_value); 231 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
200 ninterval = timeval_to_cputime(&value->it_interval);
201 spin_lock_irq(&tsk->sighand->siglock);
202 cval = tsk->signal->it_prof_expires;
203 cinterval = tsk->signal->it_prof_incr;
204 if (!cputime_eq(cval, cputime_zero) ||
205 !cputime_eq(nval, cputime_zero)) {
206 if (cputime_gt(nval, cputime_zero))
207 nval = cputime_add(nval,
208 jiffies_to_cputime(1));
209 set_process_cpu_timer(tsk, CPUCLOCK_PROF,
210 &nval, &cval);
211 }
212 tsk->signal->it_prof_expires = nval;
213 tsk->signal->it_prof_incr = ninterval;
214 spin_unlock_irq(&tsk->sighand->siglock);
215 if (ovalue) {
216 cputime_to_timeval(cval, &ovalue->it_value);
217 cputime_to_timeval(cinterval, &ovalue->it_interval);
218 }
219 break; 232 break;
220 default: 233 default:
221 return -EINVAL; 234 return -EINVAL;
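
From userspace nothing changes: setitimer(2)/getitimer(2) behave as before, now routed through the common set_cpu_itimer()/get_cpu_itimer() helpers. A short example arming a profiling interval timer and reading it back:

    #include <signal.h>
    #include <stdio.h>
    #include <sys/time.h>

    static volatile sig_atomic_t fired;

    static void on_sigprof(int sig)
    {
            (void)sig;
            fired = 1;
    }

    int main(void)
    {
            struct itimerval val = {
                    .it_interval = { .tv_sec = 0, .tv_usec = 0 },
                    .it_value    = { .tv_sec = 0, .tv_usec = 100000 }, /* 100 ms */
            };
            struct itimerval back;

            signal(SIGPROF, on_sigprof);
            if (setitimer(ITIMER_PROF, &val, NULL) != 0) {
                    perror("setitimer");
                    return 1;
            }
            if (getitimer(ITIMER_PROF, &back) != 0) {
                    perror("getitimer");
                    return 1;
            }
            printf("remaining: %ld.%06ld s\n",
                   (long)back.it_value.tv_sec, (long)back.it_value.tv_usec);

            while (!fired)       /* burn CPU so ITIMER_PROF actually counts down */
                    ;
            printf("SIGPROF delivered\n");
            return 0;
    }
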
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3a29dbe7898e..8b6b8b697c68 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
59 59
60static inline int is_kernel_text(unsigned long addr) 60static inline int is_kernel_text(unsigned long addr)
61{ 61{
62 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 62 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
63 arch_is_kernel_text(addr))
63 return 1; 64 return 1;
64 return in_gate_area_no_task(addr); 65 return in_gate_area_no_task(addr);
65} 66}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
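
Constifying the __kfifo_put() buffer argument just states what a FIFO producer already guarantees: the put side only reads from the caller's buffer. A toy byte-FIFO put with the same const contract (not the kernel's kfifo, simply an illustration of why const is the right qualifier):

    #include <stdio.h>

    #define TOY_FIFO_SIZE 16

    static unsigned char fifo_buf[TOY_FIFO_SIZE];
    static unsigned int fifo_in;   /* total bytes ever written */

    /* The producer only reads from 'buffer', so it can take const data
     * (string literals, read-only pages) without a cast. */
    static unsigned int toy_fifo_put(const unsigned char *buffer, unsigned int len)
    {
            unsigned int i;

            if (len > TOY_FIFO_SIZE)
                    len = TOY_FIFO_SIZE;
            for (i = 0; i < len; i++)
                    fifo_buf[(fifo_in + i) % TOY_FIFO_SIZE] = buffer[i];
            fifo_in += len;
            return len;
    }

    int main(void)
    {
            unsigned int n = toy_fifo_put((const unsigned char *)"hello", 5);

            printf("queued %u byte(s): %.5s\n", n, (const char *)fifo_buf);
            return 0;
    }
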
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ef177d653b2c..5240d75f4c60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1321 return 0; 1321 return 0;
1322} 1322}
1323 1323
1324static struct seq_operations kprobes_seq_ops = { 1324static const struct seq_operations kprobes_seq_ops = {
1325 .start = kprobe_seq_start, 1325 .start = kprobe_seq_start,
1326 .next = kprobe_seq_next, 1326 .next = kprobe_seq_next,
1327 .stop = kprobe_seq_stop, 1327 .stop = kprobe_seq_stop,
@@ -1333,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
1333 return seq_open(filp, &kprobes_seq_ops); 1333 return seq_open(filp, &kprobes_seq_ops);
1334} 1334}
1335 1335
1336static struct file_operations debugfs_kprobes_operations = { 1336static const struct file_operations debugfs_kprobes_operations = {
1337 .open = kprobes_open, 1337 .open = kprobes_open,
1338 .read = seq_read, 1338 .read = seq_read,
1339 .llseek = seq_lseek, 1339 .llseek = seq_lseek,
@@ -1515,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1515 return count; 1515 return count;
1516} 1516}
1517 1517
1518static struct file_operations fops_kp = { 1518static const struct file_operations fops_kp = {
1519 .read = read_enabled_file_bool, 1519 .read = read_enabled_file_bool,
1520 .write = write_enabled_file_bool, 1520 .write = write_enabled_file_bool,
1521}; 1521};
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f74d2d7aa605..9af56723c096 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
144 144
145static inline u64 lockstat_clock(void)
146{
147 return cpu_clock(smp_processor_id());
148}
149
145static int lock_point(unsigned long points[], unsigned long ip) 150static int lock_point(unsigned long points[], unsigned long ip)
146{ 151{
147 int i; 152 int i;
@@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip)
158 return i; 163 return i;
159} 164}
160 165
161static void lock_time_inc(struct lock_time *lt, s64 time) 166static void lock_time_inc(struct lock_time *lt, u64 time)
162{ 167{
163 if (time > lt->max) 168 if (time > lt->max)
164 lt->max = time; 169 lt->max = time;
@@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats)
234static void lock_release_holdtime(struct held_lock *hlock) 239static void lock_release_holdtime(struct held_lock *hlock)
235{ 240{
236 struct lock_class_stats *stats; 241 struct lock_class_stats *stats;
237 s64 holdtime; 242 u64 holdtime;
238 243
239 if (!lock_stat) 244 if (!lock_stat)
240 return; 245 return;
241 246
242 holdtime = sched_clock() - hlock->holdtime_stamp; 247 holdtime = lockstat_clock() - hlock->holdtime_stamp;
243 248
244 stats = get_lock_stats(hlock_class(hlock)); 249 stats = get_lock_stats(hlock_class(hlock));
245 if (hlock->read) 250 if (hlock->read)
@@ -578,6 +583,9 @@ static int static_obj(void *obj)
578 if ((addr >= start) && (addr < end)) 583 if ((addr >= start) && (addr < end))
579 return 1; 584 return 1;
580 585
586 if (arch_is_kernel_data(addr))
587 return 1;
588
581#ifdef CONFIG_SMP 589#ifdef CONFIG_SMP
582 /* 590 /*
583 * percpu var? 591 * percpu var?
@@ -2789,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2789 hlock->references = references; 2797 hlock->references = references;
2790#ifdef CONFIG_LOCK_STAT 2798#ifdef CONFIG_LOCK_STAT
2791 hlock->waittime_stamp = 0; 2799 hlock->waittime_stamp = 0;
2792 hlock->holdtime_stamp = sched_clock(); 2800 hlock->holdtime_stamp = lockstat_clock();
2793#endif 2801#endif
2794 2802
2795 if (check == 2 && !mark_irqflags(curr, hlock)) 2803 if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3319,7 +3327,7 @@ found_it:
3319 if (hlock->instance != lock) 3327 if (hlock->instance != lock)
3320 return; 3328 return;
3321 3329
3322 hlock->waittime_stamp = sched_clock(); 3330 hlock->waittime_stamp = lockstat_clock();
3323 3331
3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3332 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3325 contending_point = lock_point(hlock_class(hlock)->contending_point, 3333 contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3342,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3342 struct held_lock *hlock, *prev_hlock; 3350 struct held_lock *hlock, *prev_hlock;
3343 struct lock_class_stats *stats; 3351 struct lock_class_stats *stats;
3344 unsigned int depth; 3352 unsigned int depth;
3345 u64 now; 3353 u64 now, waittime = 0;
3346 s64 waittime = 0;
3347 int i, cpu; 3354 int i, cpu;
3348 3355
3349 depth = curr->lockdep_depth; 3356 depth = curr->lockdep_depth;
@@ -3371,7 +3378,7 @@ found_it:
3371 3378
3372 cpu = smp_processor_id(); 3379 cpu = smp_processor_id();
3373 if (hlock->waittime_stamp) { 3380 if (hlock->waittime_stamp) {
3374 now = sched_clock(); 3381 now = lockstat_clock();
3375 waittime = now - hlock->waittime_stamp; 3382 waittime = now - hlock->waittime_stamp;
3376 hlock->holdtime_stamp = now; 3383 hlock->holdtime_stamp = now;
3377 } 3384 }
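
The lockstat changes above boil down to two things: time stamps now come from a local cpu_clock() wrapper instead of raw sched_clock(), and hold/wait deltas are carried as u64 rather than s64. A toy model of the hold-time accounting such a clock feeds (the fields here are simplified stand-ins for the kernel's struct lock_time):

    #include <stdint.h>
    #include <stdio.h>

    struct toy_lock_time {          /* simplified stand-in for struct lock_time */
            uint64_t min, max, total;
            unsigned long nr;
    };

    static uint64_t fake_lockstat_clock(void)      /* stands in for cpu_clock(cpu) */
    {
            static uint64_t ns;
            return ns += 1500;                     /* monotonically increasing */
    }

    static void toy_lock_time_inc(struct toy_lock_time *lt, uint64_t t)
    {
            if (t > lt->max)
                    lt->max = t;
            if (t < lt->min || !lt->nr)
                    lt->min = t;
            lt->total += t;
            lt->nr++;
    }

    int main(void)
    {
            struct toy_lock_time hold = { 0, 0, 0, 0 };
            uint64_t stamp = fake_lockstat_clock();         /* lock acquired */
            uint64_t delta = fake_lockstat_clock() - stamp; /* lock released */

            toy_lock_time_inc(&hold, delta);
            printf("holds=%lu total=%llu ns\n",
                   hold.nr, (unsigned long long)hold.total);
            return 0;
    }
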
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4b3dbc79fdb..d4aba4f3584c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
594 return 0; 594 return 0;
595} 595}
596 596
597static struct seq_operations lockstat_ops = { 597static const struct seq_operations lockstat_ops = {
598 .start = ls_start, 598 .start = ls_start,
599 .next = ls_next, 599 .next = ls_next,
600 .stop = ls_stop, 600 .stop = ls_stop,
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by module_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * rcu_read_lock_sched does two things : disabling preemption to make
108 * sure the teardown of the callbacks can be done correctly when they
109 * are in modules and they insure RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137 * we must insure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171 /* Must read the ptype before ptr. They are not data dependant,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176 * dependant, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191 * we must insure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
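For illustration, the kmalloc in add_marker() above packs the marker name and the optional format string behind the fixed-size struct; assuming the trailing char name[0] member of struct marker_entry, the layout of one hash-table element is roughly (sketch, not part of the file):

/*
 *   +---------------------+----------+------------+
 *   | struct marker_entry | "name\0" | "format\0" |   (format part optional)
 *   +---------------------+----------+------------+
 *                          ^ e->name  ^ e->format == &e->name[name_len]
 */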
426
427/*
428 * Remove the marker from the marker hash table. Must be called with
429 * markers_mutex held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509 * Sanity check :
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: only waiting an RCU period after setting elem->call to the empty
573 * function ensures that the original callback is not used anymore. This is
574 * ensured by rcu_read_lock_sched around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * Internal callback only changed before the first probe is connected to it.
640 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomic callback/private data updates safe.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Side effect: marker_set_format may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * private data must be a valid allocated memory address, or NULL.
670 * Returns 0 if ok, error value on error.
671 * The probe address must at least be aligned on the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
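For context, a minimal usage sketch of this API follows (not part of the file above). The probe signature and the trace_mark()/marker_probe_register() calls match the markers interface used here; the marker name subsys_event, its format string, and the printk are purely illustrative assumptions.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/marker.h>

/*
 * Somewhere in instrumented code (hypothetical marker and arguments):
 *	trace_mark(subsys_event, "value %d name %s", val, name);
 */

static void probe_subsys_event(void *probe_private, void *call_private,
			       const char *fmt, va_list *args)
{
	/* Pull the arguments back out in the order given by the format. */
	int value = va_arg(*args, int);
	const char *name = va_arg(*args, const char *);

	printk(KERN_INFO "subsys_event: value %d name %s\n", value, name);
}

static int __init subsys_probe_init(void)
{
	/* Format string must match the one used at the trace_mark() site. */
	return marker_probe_register("subsys_event", "value %d name %s",
				     probe_subsys_event, NULL);
}

static void __exit subsys_probe_exit(void)
{
	marker_probe_unregister("subsys_event", probe_subsys_event, NULL);
}

module_init(subsys_probe_init);
module_exit(subsys_probe_exit);
MODULE_LICENSE("GPL");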
724
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns 0 on success or an error value.
732 * We do not need to call a synchronize_sched to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which ensures that every preempt-disabled section
735 * has finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776	unsigned int i, j;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791				for (j = 0; closure[j].func; j++) {
792					if (closure[j].func == probe &&
793						closure[j].probe_private
794 == probe_private)
795 return entry;
796 }
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first marker found in hash table.
810 * Return 0 on success or error value.
811 * We do not need to call a synchronize_sched to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which ensures that every preempt-disabled section
814 * has finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
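Continuing the hypothetical example from the register sketch above, a caller that registered probe_subsys_event several times with different private data could drop one instance by pointer, roughly:

static void drop_probe_instance(void *my_data)
{
	int err;

	/* Removes the first probe registered with exactly this private data
	 * (my_data is the hypothetical pointer passed at registration time). */
	err = marker_probe_unregister_private_data(probe_subsys_event, my_data);
	if (err)
		printk(KERN_WARNING "no marker probe registered with %p\n",
		       my_data);
}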
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) matching @probe, or
859 * an ERR_PTR() on error.
860 *
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
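As the comment above says, the usual purpose is ownership confirmation; continuing the same hypothetical example, a caller could check that it owns the first matching probe like this (sketch only):

#include <linux/err.h>

static bool owns_first_probe(void *my_data)
{
	void *priv;

	/* num == 0: private data of the first probe matching this function
	 * (marker name and probe are the hypothetical ones used earlier). */
	priv = marker_get_private_data("subsys_event", probe_subsys_event, 0);
	if (IS_ERR(priv))
		return false;
	return priv == my_data;
}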
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 05ce49ced8f6..8b7d8805819d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
47#include <linux/rculist.h> 47#include <linux/rculist.h>
48#include <asm/uaccess.h> 48#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/mmu_context.h>
50#include <linux/license.h> 51#include <linux/license.h>
51#include <asm/sections.h> 52#include <asm/sections.h>
52#include <linux/tracepoint.h> 53#include <linux/tracepoint.h>
@@ -1535,6 +1536,10 @@ static void free_module(struct module *mod)
1535 1536
1536 /* Finally, free the core (containing the module structure) */ 1537 /* Finally, free the core (containing the module structure) */
1537 module_free(mod, mod->module_core); 1538 module_free(mod, mod->module_core);
1539
1540#ifdef CONFIG_MPU
1541 update_protections(current->mm);
1542#endif
1538} 1543}
1539 1544
1540void *__symbol_get(const char *symbol) 1545void *__symbol_get(const char *symbol)
@@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1792 } 1797 }
1793} 1798}
1794 1799
1800static void free_modinfo(struct module *mod)
1801{
1802 struct module_attribute *attr;
1803 int i;
1804
1805 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1806 if (attr->free)
1807 attr->free(mod);
1808 }
1809}
1810
1795#ifdef CONFIG_KALLSYMS 1811#ifdef CONFIG_KALLSYMS
1796 1812
1797/* lookup symbol in given range of kernel_symbols */ 1813/* lookup symbol in given range of kernel_symbols */
@@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym,
1857 return '?'; 1873 return '?';
1858} 1874}
1859 1875
1876static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1877 unsigned int shnum)
1878{
1879 const Elf_Shdr *sec;
1880
1881 if (src->st_shndx == SHN_UNDEF
1882 || src->st_shndx >= shnum
1883 || !src->st_name)
1884 return false;
1885
1886 sec = sechdrs + src->st_shndx;
1887 if (!(sec->sh_flags & SHF_ALLOC)
1888#ifndef CONFIG_KALLSYMS_ALL
1889 || !(sec->sh_flags & SHF_EXECINSTR)
1890#endif
1891 || (sec->sh_entsize & INIT_OFFSET_MASK))
1892 return false;
1893
1894 return true;
1895}
1896
1897static unsigned long layout_symtab(struct module *mod,
1898 Elf_Shdr *sechdrs,
1899 unsigned int symindex,
1900 unsigned int strindex,
1901 const Elf_Ehdr *hdr,
1902 const char *secstrings,
1903 unsigned long *pstroffs,
1904 unsigned long *strmap)
1905{
1906 unsigned long symoffs;
1907 Elf_Shdr *symsect = sechdrs + symindex;
1908 Elf_Shdr *strsect = sechdrs + strindex;
1909 const Elf_Sym *src;
1910 const char *strtab;
1911 unsigned int i, nsrc, ndst;
1912
1913 /* Put symbol section at end of init part of module. */
1914 symsect->sh_flags |= SHF_ALLOC;
1915 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1916 symindex) | INIT_OFFSET_MASK;
1917 DEBUGP("\t%s\n", secstrings + symsect->sh_name);
1918
1919 src = (void *)hdr + symsect->sh_offset;
1920 nsrc = symsect->sh_size / sizeof(*src);
1921 strtab = (void *)hdr + strsect->sh_offset;
1922 for (ndst = i = 1; i < nsrc; ++i, ++src)
1923 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
1924 unsigned int j = src->st_name;
1925
1926 while(!__test_and_set_bit(j, strmap) && strtab[j])
1927 ++j;
1928 ++ndst;
1929 }
1930
1931 /* Append room for core symbols at end of core part. */
1932 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1933 mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
1934
1935 /* Put string table section at end of init part of module. */
1936 strsect->sh_flags |= SHF_ALLOC;
1937 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1938 strindex) | INIT_OFFSET_MASK;
1939 DEBUGP("\t%s\n", secstrings + strsect->sh_name);
1940
1941 /* Append room for core symbols' strings at end of core part. */
1942 *pstroffs = mod->core_size;
1943 __set_bit(0, strmap);
1944 mod->core_size += bitmap_weight(strmap, strsect->sh_size);
1945
1946 return symoffs;
1947}
1948
1860static void add_kallsyms(struct module *mod, 1949static void add_kallsyms(struct module *mod,
1861 Elf_Shdr *sechdrs, 1950 Elf_Shdr *sechdrs,
1951 unsigned int shnum,
1862 unsigned int symindex, 1952 unsigned int symindex,
1863 unsigned int strindex, 1953 unsigned int strindex,
1864 const char *secstrings) 1954 unsigned long symoffs,
1955 unsigned long stroffs,
1956 const char *secstrings,
1957 unsigned long *strmap)
1865{ 1958{
1866 unsigned int i; 1959 unsigned int i, ndst;
1960 const Elf_Sym *src;
1961 Elf_Sym *dst;
1962 char *s;
1867 1963
1868 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1964 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1869 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1965 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1873,13 +1969,46 @@ static void add_kallsyms(struct module *mod,
1873 for (i = 0; i < mod->num_symtab; i++) 1969 for (i = 0; i < mod->num_symtab; i++)
1874 mod->symtab[i].st_info 1970 mod->symtab[i].st_info
1875 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1971 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1972
1973 mod->core_symtab = dst = mod->module_core + symoffs;
1974 src = mod->symtab;
1975 *dst = *src;
1976 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1977 if (!is_core_symbol(src, sechdrs, shnum))
1978 continue;
1979 dst[ndst] = *src;
1980 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
1981 ++ndst;
1982 }
1983 mod->core_num_syms = ndst;
1984
1985 mod->core_strtab = s = mod->module_core + stroffs;
1986 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
1987 if (test_bit(i, strmap))
1988 *++s = mod->strtab[i];
1876} 1989}
1877#else 1990#else
1991static inline unsigned long layout_symtab(struct module *mod,
1992 Elf_Shdr *sechdrs,
1993 unsigned int symindex,
1994 unsigned int strindex,
1995 const Elf_Ehdr *hdr,
1996 const char *secstrings,
1997 unsigned long *pstroffs,
1998 unsigned long *strmap)
1999{
2000 return 0;
2001}
2002
1878static inline void add_kallsyms(struct module *mod, 2003static inline void add_kallsyms(struct module *mod,
1879 Elf_Shdr *sechdrs, 2004 Elf_Shdr *sechdrs,
2005 unsigned int shnum,
1880 unsigned int symindex, 2006 unsigned int symindex,
1881 unsigned int strindex, 2007 unsigned int strindex,
1882 const char *secstrings) 2008 unsigned long symoffs,
2009 unsigned long stroffs,
2010 const char *secstrings,
2011 const unsigned long *strmap)
1883{ 2012{
1884} 2013}
1885#endif /* CONFIG_KALLSYMS */ 2014#endif /* CONFIG_KALLSYMS */
@@ -1954,6 +2083,8 @@ static noinline struct module *load_module(void __user *umod,
1954 struct module *mod; 2083 struct module *mod;
1955 long err = 0; 2084 long err = 0;
1956 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2085 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
2086 unsigned long symoffs, stroffs, *strmap;
2087
1957 mm_segment_t old_fs; 2088 mm_segment_t old_fs;
1958 2089
1959 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2090 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2035,11 +2166,6 @@ static noinline struct module *load_module(void __user *umod,
2035 /* Don't keep modinfo and version sections. */ 2166 /* Don't keep modinfo and version sections. */
2036 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2167 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2037 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2168 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2038#ifdef CONFIG_KALLSYMS
2039 /* Keep symbol and string tables for decoding later. */
2040 sechdrs[symindex].sh_flags |= SHF_ALLOC;
2041 sechdrs[strindex].sh_flags |= SHF_ALLOC;
2042#endif
2043 2169
2044 /* Check module struct version now, before we try to use module. */ 2170 /* Check module struct version now, before we try to use module. */
2045 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2171 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2075,6 +2201,13 @@ static noinline struct module *load_module(void __user *umod,
2075 goto free_hdr; 2201 goto free_hdr;
2076 } 2202 }
2077 2203
2204 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
2205 * sizeof(long), GFP_KERNEL);
2206 if (!strmap) {
2207 err = -ENOMEM;
2208 goto free_mod;
2209 }
2210
2078 if (find_module(mod->name)) { 2211 if (find_module(mod->name)) {
2079 err = -EEXIST; 2212 err = -EEXIST;
2080 goto free_mod; 2213 goto free_mod;
@@ -2104,6 +2237,8 @@ static noinline struct module *load_module(void __user *umod,
2104 this is done generically; there doesn't appear to be any 2237 this is done generically; there doesn't appear to be any
2105 special cases for the architectures. */ 2238 special cases for the architectures. */
2106 layout_sections(mod, hdr, sechdrs, secstrings); 2239 layout_sections(mod, hdr, sechdrs, secstrings);
2240 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
2241 secstrings, &stroffs, strmap);
2107 2242
2108 /* Do the allocs. */ 2243 /* Do the allocs. */
2109 ptr = module_alloc_update_bounds(mod->core_size); 2244 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2237,10 +2372,6 @@ static noinline struct module *load_module(void __user *umod,
2237 sizeof(*mod->ctors), &mod->num_ctors); 2372 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif 2373#endif
2239 2374
2240#ifdef CONFIG_MARKERS
2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2242 sizeof(*mod->markers), &mod->num_markers);
2243#endif
2244#ifdef CONFIG_TRACEPOINTS 2375#ifdef CONFIG_TRACEPOINTS
2245 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2376 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2246 "__tracepoints", 2377 "__tracepoints",
@@ -2312,7 +2443,10 @@ static noinline struct module *load_module(void __user *umod,
2312 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2443 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
2313 sechdrs[pcpuindex].sh_size); 2444 sechdrs[pcpuindex].sh_size);
2314 2445
2315 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2446 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2447 symoffs, stroffs, secstrings, strmap);
2448 kfree(strmap);
2449 strmap = NULL;
2316 2450
2317 if (!mod->taints) { 2451 if (!mod->taints) {
2318 struct _ddebug *debug; 2452 struct _ddebug *debug;
@@ -2384,13 +2518,14 @@ static noinline struct module *load_module(void __user *umod,
2384 synchronize_sched(); 2518 synchronize_sched();
2385 module_arch_cleanup(mod); 2519 module_arch_cleanup(mod);
2386 cleanup: 2520 cleanup:
2521 free_modinfo(mod);
2387 kobject_del(&mod->mkobj.kobj); 2522 kobject_del(&mod->mkobj.kobj);
2388 kobject_put(&mod->mkobj.kobj); 2523 kobject_put(&mod->mkobj.kobj);
2389 free_unload: 2524 free_unload:
2390 module_unload_free(mod); 2525 module_unload_free(mod);
2391#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2526#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2392 free_init:
2393 percpu_modfree(mod->refptr); 2527 percpu_modfree(mod->refptr);
2528 free_init:
2394#endif 2529#endif
2395 module_free(mod, mod->module_init); 2530 module_free(mod, mod->module_init);
2396 free_core: 2531 free_core:
@@ -2401,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod,
2401 percpu_modfree(percpu); 2536 percpu_modfree(percpu);
2402 free_mod: 2537 free_mod:
2403 kfree(args); 2538 kfree(args);
2539 kfree(strmap);
2404 free_hdr: 2540 free_hdr:
2405 vfree(hdr); 2541 vfree(hdr);
2406 return ERR_PTR(err); 2542 return ERR_PTR(err);
@@ -2490,6 +2626,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2490 /* Drop initial reference. */ 2626 /* Drop initial reference. */
2491 module_put(mod); 2627 module_put(mod);
2492 trim_init_extable(mod); 2628 trim_init_extable(mod);
2629#ifdef CONFIG_KALLSYMS
2630 mod->num_symtab = mod->core_num_syms;
2631 mod->symtab = mod->core_symtab;
2632 mod->strtab = mod->core_strtab;
2633#endif
2493 module_free(mod, mod->module_init); 2634 module_free(mod, mod->module_init);
2494 mod->module_init = NULL; 2635 mod->module_init = NULL;
2495 mod->init_size = 0; 2636 mod->init_size = 0;
@@ -2951,27 +3092,12 @@ void module_layout(struct module *mod,
2951 struct modversion_info *ver, 3092 struct modversion_info *ver,
2952 struct kernel_param *kp, 3093 struct kernel_param *kp,
2953 struct kernel_symbol *ks, 3094 struct kernel_symbol *ks,
2954 struct marker *marker,
2955 struct tracepoint *tp) 3095 struct tracepoint *tp)
2956{ 3096{
2957} 3097}
2958EXPORT_SYMBOL(module_layout); 3098EXPORT_SYMBOL(module_layout);
2959#endif 3099#endif
2960 3100
2961#ifdef CONFIG_MARKERS
2962void module_update_markers(void)
2963{
2964 struct module *mod;
2965
2966 mutex_lock(&module_mutex);
2967 list_for_each_entry(mod, &modules, list)
2968 if (!mod->taints)
2969 marker_update_probe_range(mod->markers,
2970 mod->markers + mod->num_markers);
2971 mutex_unlock(&module_mutex);
2972}
2973#endif
2974
2975#ifdef CONFIG_TRACEPOINTS 3101#ifdef CONFIG_TRACEPOINTS
2976void module_update_tracepoints(void) 3102void module_update_tracepoints(void)
2977{ 3103{
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 50d022e5a560..ec815a960b5d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
diff --git a/kernel/panic.c b/kernel/panic.c
index 512ab73b0ca3..96b45d0b4ba5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...)
90 90
91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf); 91 atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
92 92
93 bust_spinlocks(0);
94
93 if (!panic_blink) 95 if (!panic_blink)
94 panic_blink = no_blink; 96 panic_blink = no_blink;
95 97
@@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...)
136 mdelay(1); 138 mdelay(1);
137 i++; 139 i++;
138 } 140 }
139 bust_spinlocks(0);
140} 141}
141 142
142EXPORT_SYMBOL(panic); 143EXPORT_SYMBOL(panic);
@@ -177,7 +178,7 @@ static const struct tnt tnts[] = {
177 * 'W' - Taint on warning. 178 * 'W' - Taint on warning.
178 * 'C' - modules from drivers/staging are loaded. 179 * 'C' - modules from drivers/staging are loaded.
179 * 180 *
180 * The string is overwritten by the next call to print_taint(). 181 * The string is overwritten by the next call to print_tainted().
181 */ 182 */
182const char *print_tainted(void) 183const char *print_tainted(void)
183{ 184{
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2ba..9da58eabdcb2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
87 } 88 }
88 89
89 for (i = 0; args[i]; i++) { 90 for (i = 0; args[i]; i++) {
90 if (args[i] == ' ' && !in_quote) 91 if (isspace(args[i]) && !in_quote)
91 break; 92 break;
92 if (equals == 0) { 93 if (equals == 0) {
93 if (args[i] == '=') 94 if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i; 122 next = args + i;
122 123
123 /* Chew up trailing spaces. */ 124 /* Chew up trailing spaces. */
124 while (*next == ' ') 125 while (isspace(*next))
125 next++; 126 next++;
126 return next; 127 return next;
127} 128}
@@ -138,7 +139,7 @@ int parse_args(const char *name,
138 DEBUGP("Parsing ARGS: %s\n", args); 139 DEBUGP("Parsing ARGS: %s\n", args);
139 140
140 /* Chew leading spaces */ 141 /* Chew leading spaces */
141 while (*args == ' ') 142 while (isspace(*args))
142 args++; 143 args++;
143 144
144 while (*args) { 145 while (*args) {
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index 8cb94a52d1bb..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,4963 +0,0 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * -1 - not paranoid at all
50 * 0 - disallow raw tracepoint access for unpriv
51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
53 */
54int sysctl_perf_counter_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
60
61static inline bool perf_paranoid_cpu(void)
62{
63 return sysctl_perf_counter_paranoid > 0;
64}
65
66static inline bool perf_paranoid_kernel(void)
67{
68 return sysctl_perf_counter_paranoid > 1;
69}
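These helpers gate privileged functionality elsewhere in this file; the typical call pattern looks roughly like the following sketch (illustrative, not a verbatim quote):

	/* e.g. before allowing a CPU-wide (rather than per-task) counter: */
	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EACCES);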
70
71int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
72
73/*
74 * max perf counter sample rate
75 */
76int sysctl_perf_counter_sample_rate __read_mostly = 100000;
77
78static atomic64_t perf_counter_id;
79
80/*
81 * Lock for (sysadmin-configurable) counter reservations:
82 */
83static DEFINE_SPINLOCK(perf_resource_lock);
84
85/*
86 * Architecture provided APIs - weak aliases:
87 */
88extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
89{
90 return NULL;
91}
92
93void __weak hw_perf_disable(void) { barrier(); }
94void __weak hw_perf_enable(void) { barrier(); }
95
96void __weak hw_perf_counter_setup(int cpu) { barrier(); }
97void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
98
99int __weak
100hw_perf_group_sched_in(struct perf_counter *group_leader,
101 struct perf_cpu_context *cpuctx,
102 struct perf_counter_context *ctx, int cpu)
103{
104 return 0;
105}
106
107void __weak perf_counter_print_debug(void) { }
108
109static DEFINE_PER_CPU(int, perf_disable_count);
110
111void __perf_disable(void)
112{
113 __get_cpu_var(perf_disable_count)++;
114}
115
116bool __perf_enable(void)
117{
118 return !--__get_cpu_var(perf_disable_count);
119}
120
121void perf_disable(void)
122{
123 __perf_disable();
124 hw_perf_disable();
125}
126
127void perf_enable(void)
128{
129 if (__perf_enable())
130 hw_perf_enable();
131}
132
133static void get_ctx(struct perf_counter_context *ctx)
134{
135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
136}
137
138static void free_ctx(struct rcu_head *head)
139{
140 struct perf_counter_context *ctx;
141
142 ctx = container_of(head, struct perf_counter_context, rcu_head);
143 kfree(ctx);
144}
145
146static void put_ctx(struct perf_counter_context *ctx)
147{
148 if (atomic_dec_and_test(&ctx->refcount)) {
149 if (ctx->parent_ctx)
150 put_ctx(ctx->parent_ctx);
151 if (ctx->task)
152 put_task_struct(ctx->task);
153 call_rcu(&ctx->rcu_head, free_ctx);
154 }
155}
156
157static void unclone_ctx(struct perf_counter_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit counters we want to return the parent counter id
167 * to userspace.
168 */
169static u64 primary_counter_id(struct perf_counter *counter)
170{
171 u64 id = counter->id;
172
173 if (counter->parent)
174 id = counter->parent->id;
175
176 return id;
177}
178
179/*
180 * Get the perf_counter_context for a task and lock it.
181 * This has to cope with the fact that until it is locked,
182 * the context could get moved to another task.
183 */
184static struct perf_counter_context *
185perf_lock_task_context(struct task_struct *task, unsigned long *flags)
186{
187 struct perf_counter_context *ctx;
188
189 rcu_read_lock();
190 retry:
191 ctx = rcu_dereference(task->perf_counter_ctxp);
192 if (ctx) {
193 /*
194 * If this context is a clone of another, it might
195 * get swapped for another underneath us by
196 * perf_counter_task_sched_out, though the
197 * rcu_read_lock() protects us from any context
198 * getting freed. Lock the context and check if it
199 * got swapped before we could get the lock, and retry
200 * if so. If we locked the right context, then it
201 * can't get swapped on us any more.
202 */
203 spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry;
207 }
208
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
213 }
214 rcu_read_unlock();
215 return ctx;
216}
217
218/*
219 * Get the context for a task and increment its pin_count so it
220 * can't get swapped to another task. This also increments its
221 * reference count so that the context can't get freed.
222 */
223static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
224{
225 struct perf_counter_context *ctx;
226 unsigned long flags;
227
228 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) {
230 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags);
232 }
233 return ctx;
234}
235
236static void perf_unpin_context(struct perf_counter_context *ctx)
237{
238 unsigned long flags;
239
240 spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx);
244}
245
246/*
247 * Add a counter to the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held.
249 */
250static void
251list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
252{
253 struct perf_counter *group_leader = counter->group_leader;
254
255 /*
256 * Depending on whether it is a standalone or sibling counter,
257 * add it straight to the context's counter list, or to the group
258 * leader's sibling list:
259 */
260 if (group_leader == counter)
261 list_add_tail(&counter->list_entry, &ctx->counter_list);
262 else {
263 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
264 group_leader->nr_siblings++;
265 }
266
267 list_add_rcu(&counter->event_entry, &ctx->event_list);
268 ctx->nr_counters++;
269 if (counter->attr.inherit_stat)
270 ctx->nr_stat++;
271}
272
273/*
274 * Remove a counter from the lists for its context.
275 * Must be called with ctx->mutex and ctx->lock held.
276 */
277static void
278list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
279{
280 struct perf_counter *sibling, *tmp;
281
282 if (list_empty(&counter->list_entry))
283 return;
284 ctx->nr_counters--;
285 if (counter->attr.inherit_stat)
286 ctx->nr_stat--;
287
288 list_del_init(&counter->list_entry);
289 list_del_rcu(&counter->event_entry);
290
291 if (counter->group_leader != counter)
292 counter->group_leader->nr_siblings--;
293
294 /*
295 * If this was a group counter with sibling counters then
296 * upgrade the siblings to singleton counters by adding them
297 * to the context list directly:
298 */
299 list_for_each_entry_safe(sibling, tmp,
300 &counter->sibling_list, list_entry) {
301
302 list_move_tail(&sibling->list_entry, &ctx->counter_list);
303 sibling->group_leader = sibling;
304 }
305}
306
307static void
308counter_sched_out(struct perf_counter *counter,
309 struct perf_cpu_context *cpuctx,
310 struct perf_counter_context *ctx)
311{
312 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
313 return;
314
315 counter->state = PERF_COUNTER_STATE_INACTIVE;
316 if (counter->pending_disable) {
317 counter->pending_disable = 0;
318 counter->state = PERF_COUNTER_STATE_OFF;
319 }
320 counter->tstamp_stopped = ctx->time;
321 counter->pmu->disable(counter);
322 counter->oncpu = -1;
323
324 if (!is_software_counter(counter))
325 cpuctx->active_oncpu--;
326 ctx->nr_active--;
327 if (counter->attr.exclusive || !cpuctx->active_oncpu)
328 cpuctx->exclusive = 0;
329}
330
331static void
332group_sched_out(struct perf_counter *group_counter,
333 struct perf_cpu_context *cpuctx,
334 struct perf_counter_context *ctx)
335{
336 struct perf_counter *counter;
337
338 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
339 return;
340
341 counter_sched_out(group_counter, cpuctx, ctx);
342
343 /*
344 * Schedule out siblings (if any):
345 */
346 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
347 counter_sched_out(counter, cpuctx, ctx);
348
349 if (group_counter->attr.exclusive)
350 cpuctx->exclusive = 0;
351}
352
353/*
354 * Cross CPU call to remove a performance counter
355 *
356 * We disable the counter on the hardware level first. After that we
357 * remove it from the context list.
358 */
359static void __perf_counter_remove_from_context(void *info)
360{
361 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
362 struct perf_counter *counter = info;
363 struct perf_counter_context *ctx = counter->ctx;
364
365 /*
366 * If this is a task context, we need to check whether it is
367 * the current task context of this cpu. If not it has been
368 * scheduled out before the smp call arrived.
369 */
370 if (ctx->task && cpuctx->task_ctx != ctx)
371 return;
372
373 spin_lock(&ctx->lock);
374 /*
375 * Protect the list operation against NMI by disabling the
376 * counters on a global level.
377 */
378 perf_disable();
379
380 counter_sched_out(counter, cpuctx, ctx);
381
382 list_del_counter(counter, ctx);
383
384 if (!ctx->task) {
385 /*
386 * Allow more per task counters with respect to the
387 * reservation:
388 */
389 cpuctx->max_pertask =
390 min(perf_max_counters - ctx->nr_counters,
391 perf_max_counters - perf_reserved_percpu);
392 }
393
394 perf_enable();
395 spin_unlock(&ctx->lock);
396}
397
398
399/*
400 * Remove the counter from a task's (or a CPU's) list of counters.
401 *
402 * Must be called with ctx->mutex held.
403 *
404 * CPU counters are removed with a smp call. For task counters we only
405 * call when the task is on a CPU.
406 *
407 * If counter->ctx is a cloned context, callers must make sure that
408 * every task struct that counter->ctx->task could possibly point to
409 * remains valid. This is OK when called from perf_release since
410 * that only calls us on the top-level context, which can't be a clone.
411 * When called from perf_counter_exit_task, it's OK because the
412 * context has been detached from its task.
413 */
414static void perf_counter_remove_from_context(struct perf_counter *counter)
415{
416 struct perf_counter_context *ctx = counter->ctx;
417 struct task_struct *task = ctx->task;
418
419 if (!task) {
420 /*
421 * Per cpu counters are removed via an smp call and
422 * the removal is always successful.
423 */
424 smp_call_function_single(counter->cpu,
425 __perf_counter_remove_from_context,
426 counter, 1);
427 return;
428 }
429
430retry:
431 task_oncpu_function_call(task, __perf_counter_remove_from_context,
432 counter);
433
434 spin_lock_irq(&ctx->lock);
435 /*
436 * If the context is active we need to retry the smp call.
437 */
438 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
439 spin_unlock_irq(&ctx->lock);
440 goto retry;
441 }
442
443 /*
444 * The lock prevents this context from being scheduled in, so we
445 * can remove the counter safely if the call above did not
446 * succeed.
447 */
448 if (!list_empty(&counter->list_entry)) {
449 list_del_counter(counter, ctx);
450 }
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_counter_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a counter.
472 */
473static void update_counter_times(struct perf_counter *counter)
474{
475 struct perf_counter_context *ctx = counter->ctx;
476 u64 run_end;
477
478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
480 return;
481
482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
483
484 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
485 run_end = counter->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 counter->total_time_running = run_end - counter->tstamp_running;
490}
491
492/*
493 * Update total_time_enabled and total_time_running for all counters in a group.
494 */
495static void update_group_times(struct perf_counter *leader)
496{
497 struct perf_counter *counter;
498
499 update_counter_times(leader);
500 list_for_each_entry(counter, &leader->sibling_list, list_entry)
501 update_counter_times(counter);
502}
503
504/*
505 * Cross CPU call to disable a performance counter
506 */
507static void __perf_counter_disable(void *info)
508{
509 struct perf_counter *counter = info;
510 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
511 struct perf_counter_context *ctx = counter->ctx;
512
513 /*
514 * If this is a per-task counter, need to check whether this
515 * counter's task is the current task on this cpu.
516 */
517 if (ctx->task && cpuctx->task_ctx != ctx)
518 return;
519
520 spin_lock(&ctx->lock);
521
522 /*
523 * If the counter is on, turn it off.
524 * If it is in error state, leave it in error state.
525 */
526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
527 update_context_time(ctx);
528 update_group_times(counter);
529 if (counter == counter->group_leader)
530 group_sched_out(counter, cpuctx, ctx);
531 else
532 counter_sched_out(counter, cpuctx, ctx);
533 counter->state = PERF_COUNTER_STATE_OFF;
534 }
535
536 spin_unlock(&ctx->lock);
537}
538
539/*
540 * Disable a counter.
541 *
542 * If counter->ctx is a cloned context, callers must make sure that
543 * every task struct that counter->ctx->task could possibly point to
544 * remains valid. This condition is satisfied when called through
545 * perf_counter_for_each_child or perf_counter_for_each because they
546 * hold the top-level counter's child_mutex, so any descendant that
547 * goes to exit will block in sync_child_counter.
548 * When called from perf_pending_counter it's OK because counter->ctx
549 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_counter_task_sched_out for this context.
551 */
552static void perf_counter_disable(struct perf_counter *counter)
553{
554 struct perf_counter_context *ctx = counter->ctx;
555 struct task_struct *task = ctx->task;
556
557 if (!task) {
558 /*
559 * Disable the counter on the cpu that it's on
560 */
561 smp_call_function_single(counter->cpu, __perf_counter_disable,
562 counter, 1);
563 return;
564 }
565
566 retry:
567 task_oncpu_function_call(task, __perf_counter_disable, counter);
568
569 spin_lock_irq(&ctx->lock);
570 /*
571 * If the counter is still active, we need to retry the cross-call.
572 */
573 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock);
575 goto retry;
576 }
577
578 /*
579 * Since we have the lock this context can't be scheduled
580 * in, so we can change the state safely.
581 */
582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
583 update_group_times(counter);
584 counter->state = PERF_COUNTER_STATE_OFF;
585 }
586
587 spin_unlock_irq(&ctx->lock);
588}
589
590static int
591counter_sched_in(struct perf_counter *counter,
592 struct perf_cpu_context *cpuctx,
593 struct perf_counter_context *ctx,
594 int cpu)
595{
596 if (counter->state <= PERF_COUNTER_STATE_OFF)
597 return 0;
598
599 counter->state = PERF_COUNTER_STATE_ACTIVE;
600 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
601 /*
602 * The new state must be visible before we turn it on in the hardware:
603 */
604 smp_wmb();
605
606 if (counter->pmu->enable(counter)) {
607 counter->state = PERF_COUNTER_STATE_INACTIVE;
608 counter->oncpu = -1;
609 return -EAGAIN;
610 }
611
612 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
613
614 if (!is_software_counter(counter))
615 cpuctx->active_oncpu++;
616 ctx->nr_active++;
617
618 if (counter->attr.exclusive)
619 cpuctx->exclusive = 1;
620
621 return 0;
622}
623
624static int
625group_sched_in(struct perf_counter *group_counter,
626 struct perf_cpu_context *cpuctx,
627 struct perf_counter_context *ctx,
628 int cpu)
629{
630 struct perf_counter *counter, *partial_group;
631 int ret;
632
633 if (group_counter->state == PERF_COUNTER_STATE_OFF)
634 return 0;
635
636 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
637 if (ret)
638 return ret < 0 ? ret : 0;
639
640 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
641 return -EAGAIN;
642
643 /*
644 * Schedule in siblings as one group (if any):
645 */
646 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
647 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
648 partial_group = counter;
649 goto group_error;
650 }
651 }
652
653 return 0;
654
655group_error:
656 /*
657 * Groups can be scheduled in as one unit only, so undo any
658 * partial group before returning:
659 */
660 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
661 if (counter == partial_group)
662 break;
663 counter_sched_out(counter, cpuctx, ctx);
664 }
665 counter_sched_out(group_counter, cpuctx, ctx);
666
667 return -EAGAIN;
668}
669
670/*
671 * Return 1 for a group consisting entirely of software counters,
672 * 0 if the group contains any hardware counters.
673 */
674static int is_software_only_group(struct perf_counter *leader)
675{
676 struct perf_counter *counter;
677
678 if (!is_software_counter(leader))
679 return 0;
680
681 list_for_each_entry(counter, &leader->sibling_list, list_entry)
682 if (!is_software_counter(counter))
683 return 0;
684
685 return 1;
686}
687
688/*
689 * Work out whether we can put this counter group on the CPU now.
690 */
691static int group_can_go_on(struct perf_counter *counter,
692 struct perf_cpu_context *cpuctx,
693 int can_add_hw)
694{
695 /*
696 * Groups consisting entirely of software counters can always go on.
697 */
698 if (is_software_only_group(counter))
699 return 1;
700 /*
701 * If an exclusive group is already on, no other hardware
702 * counters can go on.
703 */
704 if (cpuctx->exclusive)
705 return 0;
706 /*
707 * If this group is exclusive and there are already
708 * counters on the CPU, it can't go on.
709 */
710 if (counter->attr.exclusive && cpuctx->active_oncpu)
711 return 0;
712 /*
713 * Otherwise, try to add it if all previous groups were able
714 * to go on.
715 */
716 return can_add_hw;
717}
718
719static void add_counter_to_ctx(struct perf_counter *counter,
720 struct perf_counter_context *ctx)
721{
722 list_add_counter(counter, ctx);
723 counter->tstamp_enabled = ctx->time;
724 counter->tstamp_running = ctx->time;
725 counter->tstamp_stopped = ctx->time;
726}
727
728/*
729 * Cross CPU call to install and enable a performance counter
730 *
731 * Must be called with ctx->mutex held
732 */
733static void __perf_install_in_context(void *info)
734{
735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
736 struct perf_counter *counter = info;
737 struct perf_counter_context *ctx = counter->ctx;
738 struct perf_counter *leader = counter->group_leader;
739 int cpu = smp_processor_id();
740 int err;
741
742 /*
743 * If this is a task context, we need to check whether it is
744 * the current task context of this cpu. If not it has been
745 * scheduled out before the smp call arrived.
746 * Or possibly this is the right context but it isn't
747 * on this cpu because it had no counters.
748 */
749 if (ctx->task && cpuctx->task_ctx != ctx) {
750 if (cpuctx->task_ctx || ctx->task != current)
751 return;
752 cpuctx->task_ctx = ctx;
753 }
754
755 spin_lock(&ctx->lock);
756 ctx->is_active = 1;
757 update_context_time(ctx);
758
759 /*
760 * Protect the list operation against NMI by disabling the
761 * counters on a global level. NOP for non NMI based counters.
762 */
763 perf_disable();
764
765 add_counter_to_ctx(counter, ctx);
766
767 /*
768 * Don't put the counter on if it is disabled or if
769 * it is in a group and the group isn't on.
770 */
771 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
772 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
773 goto unlock;
774
775 /*
776 * An exclusive counter can't go on if there are already active
777 * hardware counters, and no hardware counter can go on if there
778 * is already an exclusive counter on.
779 */
780 if (!group_can_go_on(counter, cpuctx, 1))
781 err = -EEXIST;
782 else
783 err = counter_sched_in(counter, cpuctx, ctx, cpu);
784
785 if (err) {
786 /*
787 * This counter couldn't go on. If it is in a group
788 * then we have to pull the whole group off.
789 * If the counter group is pinned then put it in error state.
790 */
791 if (leader != counter)
792 group_sched_out(leader, cpuctx, ctx);
793 if (leader->attr.pinned) {
794 update_group_times(leader);
795 leader->state = PERF_COUNTER_STATE_ERROR;
796 }
797 }
798
799 if (!err && !ctx->task && cpuctx->max_pertask)
800 cpuctx->max_pertask--;
801
802 unlock:
803 perf_enable();
804
805 spin_unlock(&ctx->lock);
806}
807
808/*
809 * Attach a performance counter to a context
810 *
811 * First we add the counter to the list with the hardware enable bit
812 * in counter->hw_config cleared.
813 *
814 * If the counter is attached to a task which is on a CPU we use a smp
815 * call to enable it in the task context. The task might have been
816 * scheduled away, but we check this in the smp call again.
817 *
818 * Must be called with ctx->mutex held.
819 */
820static void
821perf_install_in_context(struct perf_counter_context *ctx,
822 struct perf_counter *counter,
823 int cpu)
824{
825 struct task_struct *task = ctx->task;
826
827 if (!task) {
828 /*
829 * Per cpu counters are installed via an smp call and
830 * the install is always successful.
831 */
832 smp_call_function_single(cpu, __perf_install_in_context,
833 counter, 1);
834 return;
835 }
836
837retry:
838 task_oncpu_function_call(task, __perf_install_in_context,
839 counter);
840
841 spin_lock_irq(&ctx->lock);
842 /*
843 * we need to retry the smp call.
844 */
845 if (ctx->is_active && list_empty(&counter->list_entry)) {
846 spin_unlock_irq(&ctx->lock);
847 goto retry;
848 }
849
850 /*
851 * The lock prevents this context from being scheduled in, so we
852 * can add the counter safely if the call above did not
853 * succeed.
854 */
855 if (list_empty(&counter->list_entry))
856 add_counter_to_ctx(counter, ctx);
857 spin_unlock_irq(&ctx->lock);
858}
859
860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
882 * Cross CPU call to enable a performance counter
883 */
884static void __perf_counter_enable(void *info)
885{
886 struct perf_counter *counter = info;
887 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
888 struct perf_counter_context *ctx = counter->ctx;
889 struct perf_counter *leader = counter->group_leader;
890 int err;
891
892 /*
893 * If this is a per-task counter, we need to check whether this
894 * counter's task is the current task on this cpu.
895 */
896 if (ctx->task && cpuctx->task_ctx != ctx) {
897 if (cpuctx->task_ctx || ctx->task != current)
898 return;
899 cpuctx->task_ctx = ctx;
900 }
901
902 spin_lock(&ctx->lock);
903 ctx->is_active = 1;
904 update_context_time(ctx);
905
906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
907 goto unlock;
908 __perf_counter_mark_enabled(counter, ctx);
909
910 /*
911 * If the counter is in a group and isn't the group leader,
912 * then don't put it on unless the group is on.
913 */
914 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
915 goto unlock;
916
917 if (!group_can_go_on(counter, cpuctx, 1)) {
918 err = -EEXIST;
919 } else {
920 perf_disable();
921 if (counter == leader)
922 err = group_sched_in(counter, cpuctx, ctx,
923 smp_processor_id());
924 else
925 err = counter_sched_in(counter, cpuctx, ctx,
926 smp_processor_id());
927 perf_enable();
928 }
929
930 if (err) {
931 /*
932 * If this counter can't go on and it's part of a
933 * group, then the whole group has to come off.
934 */
935 if (leader != counter)
936 group_sched_out(leader, cpuctx, ctx);
937 if (leader->attr.pinned) {
938 update_group_times(leader);
939 leader->state = PERF_COUNTER_STATE_ERROR;
940 }
941 }
942
943 unlock:
944 spin_unlock(&ctx->lock);
945}
946
947/*
948 * Enable a counter.
949 *
950 * If counter->ctx is a cloned context, callers must make sure that
951 * every task struct that counter->ctx->task could possibly point to
952 * remains valid. This condition is satisfied when called through
953 * perf_counter_for_each_child or perf_counter_for_each as described
954 * for perf_counter_disable.
955 */
956static void perf_counter_enable(struct perf_counter *counter)
957{
958 struct perf_counter_context *ctx = counter->ctx;
959 struct task_struct *task = ctx->task;
960
961 if (!task) {
962 /*
963 * Enable the counter on the cpu that it's on
964 */
965 smp_call_function_single(counter->cpu, __perf_counter_enable,
966 counter, 1);
967 return;
968 }
969
970 spin_lock_irq(&ctx->lock);
971 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
972 goto out;
973
974 /*
975 * If the counter is in error state, clear that first.
976 * That way, if we see the counter in error state below, we
977 * know that it has gone back into error state, as distinct
978 * from the task having been scheduled away before the
979 * cross-call arrived.
980 */
981 if (counter->state == PERF_COUNTER_STATE_ERROR)
982 counter->state = PERF_COUNTER_STATE_OFF;
983
984 retry:
985 spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_counter_enable, counter);
987
988 spin_lock_irq(&ctx->lock);
989
990 /*
991 * If the context is active and the counter is still off,
992 * we need to retry the cross-call.
993 */
994 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
995 goto retry;
996
997 /*
998 * Since we have the lock this context can't be scheduled
999 * in, so we can change the state safely.
1000 */
1001 if (counter->state == PERF_COUNTER_STATE_OFF)
1002 __perf_counter_mark_enabled(counter, ctx);
1003
1004 out:
1005 spin_unlock_irq(&ctx->lock);
1006}
1007
1008static int perf_counter_refresh(struct perf_counter *counter, int refresh)
1009{
1010 /*
1011 * not supported on inherited counters
1012 */
1013 if (counter->attr.inherit)
1014 return -EINVAL;
1015
1016 atomic_add(refresh, &counter->event_limit);
1017 perf_counter_enable(counter);
1018
1019 return 0;
1020}
1021
1022void __perf_counter_sched_out(struct perf_counter_context *ctx,
1023 struct perf_cpu_context *cpuctx)
1024{
1025 struct perf_counter *counter;
1026
1027 spin_lock(&ctx->lock);
1028 ctx->is_active = 0;
1029 if (likely(!ctx->nr_counters))
1030 goto out;
1031 update_context_time(ctx);
1032
1033 perf_disable();
1034 if (ctx->nr_active) {
1035 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1036 if (counter != counter->group_leader)
1037 counter_sched_out(counter, cpuctx, ctx);
1038 else
1039 group_sched_out(counter, cpuctx, ctx);
1040 }
1041 }
1042 perf_enable();
1043 out:
1044 spin_unlock(&ctx->lock);
1045}
1046
1047/*
1048 * Test whether two contexts are equivalent, i.e. whether they
1049 * have both been cloned from the same version of the same context
1050 * and they both have the same number of enabled counters.
1051 * If the number of enabled counters is the same, then the set
1052 * of enabled counters should be the same, because these are both
1053 * inherited contexts, therefore we can't access individual counters
1054 * in them directly with an fd; we can only enable/disable all
1055 * counters via prctl, or enable/disable all counters in a family
1056 * via ioctl, which will have the same effect on both contexts.
1057 */
1058static int context_equiv(struct perf_counter_context *ctx1,
1059 struct perf_counter_context *ctx2)
1060{
1061 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1062 && ctx1->parent_gen == ctx2->parent_gen
1063 && !ctx1->pin_count && !ctx2->pin_count;
1064}
1065
1066static void __perf_counter_read(void *counter);
1067
1068static void __perf_counter_sync_stat(struct perf_counter *counter,
1069 struct perf_counter *next_counter)
1070{
1071 u64 value;
1072
1073 if (!counter->attr.inherit_stat)
1074 return;
1075
1076 /*
1077 * Update the counter value. We cannot use perf_counter_read()
1078 * because we're in the middle of a context switch and have IRQs
1079 * disabled, which upsets smp_call_function_single(). However,
1080 * we know the counter must be on the current CPU, so we don't
1081 * need the cross-call and can read it directly.
1082 */
1083 switch (counter->state) {
1084 case PERF_COUNTER_STATE_ACTIVE:
1085 __perf_counter_read(counter);
1086 break;
1087
1088 case PERF_COUNTER_STATE_INACTIVE:
1089 update_counter_times(counter);
1090 break;
1091
1092 default:
1093 break;
1094 }
1095
1096 /*
1097 * In order to keep per-task stats reliable we need to flip the counter
1098 * values when we flip the contexts.
1099 */
1100 value = atomic64_read(&next_counter->count);
1101 value = atomic64_xchg(&counter->count, value);
1102 atomic64_set(&next_counter->count, value);
1103
1104 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1105 swap(counter->total_time_running, next_counter->total_time_running);
1106
1107 /*
1108 * Since we swizzled the values, update the user visible data too.
1109 */
1110 perf_counter_update_userpage(counter);
1111 perf_counter_update_userpage(next_counter);
1112}
1113
1114#define list_next_entry(pos, member) \
1115 list_entry(pos->member.next, typeof(*pos), member)
1116
1117static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1118 struct perf_counter_context *next_ctx)
1119{
1120 struct perf_counter *counter, *next_counter;
1121
1122 if (!ctx->nr_stat)
1123 return;
1124
1125 counter = list_first_entry(&ctx->event_list,
1126 struct perf_counter, event_entry);
1127
1128 next_counter = list_first_entry(&next_ctx->event_list,
1129 struct perf_counter, event_entry);
1130
1131 while (&counter->event_entry != &ctx->event_list &&
1132 &next_counter->event_entry != &next_ctx->event_list) {
1133
1134 __perf_counter_sync_stat(counter, next_counter);
1135
1136 counter = list_next_entry(counter, event_entry);
1137 next_counter = list_next_entry(next_counter, event_entry);
1138 }
1139}
1140
1141/*
1142 * Called from scheduler to remove the counters of the current task,
1143 * with interrupts disabled.
1144 *
1145 * We stop each counter and update the counter value in counter->count.
1146 *
1147 * This does not protect us against NMI, but disable()
1148 * sets the disabled bit in the control field of counter _before_
1149 * accessing the counter control register. If an NMI hits, then it will
1150 * not restart the counter.
1151 */
1152void perf_counter_task_sched_out(struct task_struct *task,
1153 struct task_struct *next, int cpu)
1154{
1155 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1156 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1157 struct perf_counter_context *next_ctx;
1158 struct perf_counter_context *parent;
1159 struct pt_regs *regs;
1160 int do_switch = 1;
1161
1162 regs = task_pt_regs(task);
1163 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1164
1165 if (likely(!ctx || !cpuctx->task_ctx))
1166 return;
1167
1168 update_context_time(ctx);
1169
1170 rcu_read_lock();
1171 parent = rcu_dereference(ctx->parent_ctx);
1172 next_ctx = next->perf_counter_ctxp;
1173 if (parent && next_ctx &&
1174 rcu_dereference(next_ctx->parent_ctx) == parent) {
1175 /*
1176 * Looks like the two contexts are clones, so we might be
1177 * able to optimize the context switch. We lock both
1178 * contexts and check that they are clones under the
1179 * lock (including re-checking that neither has been
1180 * uncloned in the meantime). It doesn't matter which
1181 * order we take the locks because no other cpu could
1182 * be trying to lock both of these tasks.
1183 */
1184 spin_lock(&ctx->lock);
1185 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1186 if (context_equiv(ctx, next_ctx)) {
1187 /*
1188 * XXX do we need a memory barrier of sorts
1189 * wrt rcu_dereference() of perf_counter_ctxp
1190 */
1191 task->perf_counter_ctxp = next_ctx;
1192 next->perf_counter_ctxp = ctx;
1193 ctx->task = next;
1194 next_ctx->task = task;
1195 do_switch = 0;
1196
1197 perf_counter_sync_stat(ctx, next_ctx);
1198 }
1199 spin_unlock(&next_ctx->lock);
1200 spin_unlock(&ctx->lock);
1201 }
1202 rcu_read_unlock();
1203
1204 if (do_switch) {
1205 __perf_counter_sched_out(ctx, cpuctx);
1206 cpuctx->task_ctx = NULL;
1207 }
1208}
1209
1210/*
1211 * Called with IRQs disabled
1212 */
1213static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1214{
1215 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1216
1217 if (!cpuctx->task_ctx)
1218 return;
1219
1220 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1221 return;
1222
1223 __perf_counter_sched_out(ctx, cpuctx);
1224 cpuctx->task_ctx = NULL;
1225}
1226
1227/*
1228 * Called with IRQs disabled
1229 */
1230static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1231{
1232 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1233}
1234
1235static void
1236__perf_counter_sched_in(struct perf_counter_context *ctx,
1237 struct perf_cpu_context *cpuctx, int cpu)
1238{
1239 struct perf_counter *counter;
1240 int can_add_hw = 1;
1241
1242 spin_lock(&ctx->lock);
1243 ctx->is_active = 1;
1244 if (likely(!ctx->nr_counters))
1245 goto out;
1246
1247 ctx->timestamp = perf_clock();
1248
1249 perf_disable();
1250
1251 /*
1252 * First go through the list and put on any pinned groups
1253 * in order to give them the best chance of going on.
1254 */
1255 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1256 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1257 !counter->attr.pinned)
1258 continue;
1259 if (counter->cpu != -1 && counter->cpu != cpu)
1260 continue;
1261
1262 if (counter != counter->group_leader)
1263 counter_sched_in(counter, cpuctx, ctx, cpu);
1264 else {
1265 if (group_can_go_on(counter, cpuctx, 1))
1266 group_sched_in(counter, cpuctx, ctx, cpu);
1267 }
1268
1269 /*
1270 * If this pinned group hasn't been scheduled,
1271 * put it in error state.
1272 */
1273 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1274 update_group_times(counter);
1275 counter->state = PERF_COUNTER_STATE_ERROR;
1276 }
1277 }
1278
1279 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1280 /*
1281 * Ignore counters in OFF or ERROR state, and
1282 * ignore pinned counters since we did them already.
1283 */
1284 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1285 counter->attr.pinned)
1286 continue;
1287
1288 /*
1289 * Listen to the 'cpu' scheduling filter constraint
1290 * of counters:
1291 */
1292 if (counter->cpu != -1 && counter->cpu != cpu)
1293 continue;
1294
1295 if (counter != counter->group_leader) {
1296 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1297 can_add_hw = 0;
1298 } else {
1299 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1300 if (group_sched_in(counter, cpuctx, ctx, cpu))
1301 can_add_hw = 0;
1302 }
1303 }
1304 }
1305 perf_enable();
1306 out:
1307 spin_unlock(&ctx->lock);
1308}
1309
1310/*
1311 * Called from scheduler to add the counters of the current task
1312 * with interrupts disabled.
1313 *
1314 * We restore the counter value and then enable it.
1315 *
1316 * This does not protect us against NMI, but enable()
1317 * sets the enabled bit in the control field of counter _before_
1318 * accessing the counter control register. If an NMI hits, then it will
1319 * keep the counter running.
1320 */
1321void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1322{
1323 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1324 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1325
1326 if (likely(!ctx))
1327 return;
1328 if (cpuctx->task_ctx == ctx)
1329 return;
1330 __perf_counter_sched_in(ctx, cpuctx, cpu);
1331 cpuctx->task_ctx = ctx;
1332}
1333
1334static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1335{
1336 struct perf_counter_context *ctx = &cpuctx->ctx;
1337
1338 __perf_counter_sched_in(ctx, cpuctx, cpu);
1339}
1340
1341#define MAX_INTERRUPTS (~0ULL)
1342
1343static void perf_log_throttle(struct perf_counter *counter, int enable);
1344
1345static void perf_adjust_period(struct perf_counter *counter, u64 events)
1346{
1347 struct hw_perf_counter *hwc = &counter->hw;
1348 u64 period, sample_period;
1349 s64 delta;
1350
1351 events *= hwc->sample_period;
1352 period = div64_u64(events, counter->attr.sample_freq);
1353
1354 delta = (s64)(period - hwc->sample_period);
1355 delta = (delta + 7) / 8; /* low pass filter */
1356
1357 sample_period = hwc->sample_period + delta;
1358
1359 if (!sample_period)
1360 sample_period = 1;
1361
1362 hwc->sample_period = sample_period;
1363}
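/*
 * Illustrative walk-through (editor's sketch, made-up numbers -- not part
 * of the original source): suppose attr.sample_freq is 1000 (we want
 * roughly 1000 samples/sec), hwc->sample_period is currently 100000 and
 * the counter took 2000 sample interrupts over the last second, so the
 * 'events' argument above is 2000:
 *
 *	events *= 100000           ->  200000000   events counted per second
 *	period  = 200000000 / 1000 ->  200000      period needed for 1000 Hz
 *	delta   = 200000 - 100000  ->  100000
 *	delta   = (100000 + 7) / 8 ->  12500       low-pass: 1/8th of the step
 *	sample_period              ->  112500
 *
 * The period therefore moves an eighth of the way towards its target on
 * each adjustment instead of jumping there at once, which damps
 * oscillation when the event rate is bursty.
 */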
1364
1365static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1366{
1367 struct perf_counter *counter;
1368 struct hw_perf_counter *hwc;
1369 u64 interrupts, freq;
1370
1371 spin_lock(&ctx->lock);
1372 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1373 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1374 continue;
1375
1376 hwc = &counter->hw;
1377
1378 interrupts = hwc->interrupts;
1379 hwc->interrupts = 0;
1380
1381 /*
1382 * unthrottle counters on the tick
1383 */
1384 if (interrupts == MAX_INTERRUPTS) {
1385 perf_log_throttle(counter, 1);
1386 counter->pmu->unthrottle(counter);
1387 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1388 }
1389
1390 if (!counter->attr.freq || !counter->attr.sample_freq)
1391 continue;
1392
1393 /*
1394 * if the specified freq < HZ then we need to skip ticks
1395 */
1396 if (counter->attr.sample_freq < HZ) {
1397 freq = counter->attr.sample_freq;
1398
1399 hwc->freq_count += freq;
1400 hwc->freq_interrupts += interrupts;
1401
1402 if (hwc->freq_count < HZ)
1403 continue;
1404
1405 interrupts = hwc->freq_interrupts;
1406 hwc->freq_interrupts = 0;
1407 hwc->freq_count -= HZ;
1408 } else
1409 freq = HZ;
1410
1411 perf_adjust_period(counter, freq * interrupts);
1412
1413 /*
1414 * In order to avoid being stalled by an (accidental) huge
1415 * sample period, force reset the sample period if we didn't
1416 * get any events in this freq period.
1417 */
1418 if (!interrupts) {
1419 perf_disable();
1420 counter->pmu->disable(counter);
1421 atomic64_set(&hwc->period_left, 0);
1422 counter->pmu->enable(counter);
1423 perf_enable();
1424 }
1425 }
1426 spin_unlock(&ctx->lock);
1427}
1428
1429/*
1430 * Round-robin a context's counters:
1431 */
1432static void rotate_ctx(struct perf_counter_context *ctx)
1433{
1434 struct perf_counter *counter;
1435
1436 if (!ctx->nr_counters)
1437 return;
1438
1439 spin_lock(&ctx->lock);
1440 /*
1441 * Rotate the first entry last (works just fine for group counters too):
1442 */
1443 perf_disable();
1444 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1445 list_move_tail(&counter->list_entry, &ctx->counter_list);
1446 break;
1447 }
1448 perf_enable();
1449
1450 spin_unlock(&ctx->lock);
1451}
1452
1453void perf_counter_task_tick(struct task_struct *curr, int cpu)
1454{
1455 struct perf_cpu_context *cpuctx;
1456 struct perf_counter_context *ctx;
1457
1458 if (!atomic_read(&nr_counters))
1459 return;
1460
1461 cpuctx = &per_cpu(perf_cpu_context, cpu);
1462 ctx = curr->perf_counter_ctxp;
1463
1464 perf_ctx_adjust_freq(&cpuctx->ctx);
1465 if (ctx)
1466 perf_ctx_adjust_freq(ctx);
1467
1468 perf_counter_cpu_sched_out(cpuctx);
1469 if (ctx)
1470 __perf_counter_task_sched_out(ctx);
1471
1472 rotate_ctx(&cpuctx->ctx);
1473 if (ctx)
1474 rotate_ctx(ctx);
1475
1476 perf_counter_cpu_sched_in(cpuctx, cpu);
1477 if (ctx)
1478 perf_counter_task_sched_in(curr, cpu);
1479}
1480
1481/*
1482 * Enable all of a task's counters that have been marked enable-on-exec.
1483 * This expects task == current.
1484 */
1485static void perf_counter_enable_on_exec(struct task_struct *task)
1486{
1487 struct perf_counter_context *ctx;
1488 struct perf_counter *counter;
1489 unsigned long flags;
1490 int enabled = 0;
1491
1492 local_irq_save(flags);
1493 ctx = task->perf_counter_ctxp;
1494 if (!ctx || !ctx->nr_counters)
1495 goto out;
1496
1497 __perf_counter_task_sched_out(ctx);
1498
1499 spin_lock(&ctx->lock);
1500
1501 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1502 if (!counter->attr.enable_on_exec)
1503 continue;
1504 counter->attr.enable_on_exec = 0;
1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1506 continue;
1507 __perf_counter_mark_enabled(counter, ctx);
1508 enabled = 1;
1509 }
1510
1511 /*
1512 * Unclone this context if we enabled any counter.
1513 */
1514 if (enabled)
1515 unclone_ctx(ctx);
1516
1517 spin_unlock(&ctx->lock);
1518
1519 perf_counter_task_sched_in(task, smp_processor_id());
1520 out:
1521 local_irq_restore(flags);
1522}
1523
1524/*
1525 * Cross CPU call to read the hardware counter
1526 */
1527static void __perf_counter_read(void *info)
1528{
1529 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1530 struct perf_counter *counter = info;
1531 struct perf_counter_context *ctx = counter->ctx;
1532 unsigned long flags;
1533
1534 /*
1535 * If this is a task context, we need to check whether it is
1536 * the current task context of this cpu. If not it has been
1537 * scheduled out before the smp call arrived. In that case
1538 * counter->count would have been updated to a recent sample
1539 * when the counter was scheduled out.
1540 */
1541 if (ctx->task && cpuctx->task_ctx != ctx)
1542 return;
1543
1544 local_irq_save(flags);
1545 if (ctx->is_active)
1546 update_context_time(ctx);
1547 counter->pmu->read(counter);
1548 update_counter_times(counter);
1549 local_irq_restore(flags);
1550}
1551
1552static u64 perf_counter_read(struct perf_counter *counter)
1553{
1554 /*
1555 * If counter is enabled and currently active on a CPU, update the
1556 * value in the counter structure:
1557 */
1558 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1559 smp_call_function_single(counter->oncpu,
1560 __perf_counter_read, counter, 1);
1561 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1562 update_counter_times(counter);
1563 }
1564
1565 return atomic64_read(&counter->count);
1566}
1567
1568/*
1569 * Initialize the perf_counter context in a task_struct:
1570 */
1571static void
1572__perf_counter_init_context(struct perf_counter_context *ctx,
1573 struct task_struct *task)
1574{
1575 memset(ctx, 0, sizeof(*ctx));
1576 spin_lock_init(&ctx->lock);
1577 mutex_init(&ctx->mutex);
1578 INIT_LIST_HEAD(&ctx->counter_list);
1579 INIT_LIST_HEAD(&ctx->event_list);
1580 atomic_set(&ctx->refcount, 1);
1581 ctx->task = task;
1582}
1583
1584static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1585{
1586 struct perf_counter_context *ctx;
1587 struct perf_cpu_context *cpuctx;
1588 struct task_struct *task;
1589 unsigned long flags;
1590 int err;
1591
1592 /*
1593 * If cpu is not a wildcard then this is a percpu counter:
1594 */
1595 if (cpu != -1) {
1596 /* Must be root to operate on a CPU counter: */
1597 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1598 return ERR_PTR(-EACCES);
1599
1600 if (cpu < 0 || cpu > num_possible_cpus())
1601 return ERR_PTR(-EINVAL);
1602
1603 /*
1604 * We could be clever and allow attaching a counter to an
1605 * offline CPU and activate it when the CPU comes up, but
1606 * that's for later.
1607 */
1608 if (!cpu_isset(cpu, cpu_online_map))
1609 return ERR_PTR(-ENODEV);
1610
1611 cpuctx = &per_cpu(perf_cpu_context, cpu);
1612 ctx = &cpuctx->ctx;
1613 get_ctx(ctx);
1614
1615 return ctx;
1616 }
1617
1618 rcu_read_lock();
1619 if (!pid)
1620 task = current;
1621 else
1622 task = find_task_by_vpid(pid);
1623 if (task)
1624 get_task_struct(task);
1625 rcu_read_unlock();
1626
1627 if (!task)
1628 return ERR_PTR(-ESRCH);
1629
1630 /*
1631 * Can't attach counters to a dying task.
1632 */
1633 err = -ESRCH;
1634 if (task->flags & PF_EXITING)
1635 goto errout;
1636
1637 /* Reuse ptrace permission checks for now. */
1638 err = -EACCES;
1639 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1640 goto errout;
1641
1642 retry:
1643 ctx = perf_lock_task_context(task, &flags);
1644 if (ctx) {
1645 unclone_ctx(ctx);
1646 spin_unlock_irqrestore(&ctx->lock, flags);
1647 }
1648
1649 if (!ctx) {
1650 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1651 err = -ENOMEM;
1652 if (!ctx)
1653 goto errout;
1654 __perf_counter_init_context(ctx, task);
1655 get_ctx(ctx);
1656 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1657 /*
1658 * We raced with some other task; use
1659 * the context they set.
1660 */
1661 kfree(ctx);
1662 goto retry;
1663 }
1664 get_task_struct(task);
1665 }
1666
1667 put_task_struct(task);
1668 return ctx;
1669
1670 errout:
1671 put_task_struct(task);
1672 return ERR_PTR(err);
1673}
1674
1675static void free_counter_rcu(struct rcu_head *head)
1676{
1677 struct perf_counter *counter;
1678
1679 counter = container_of(head, struct perf_counter, rcu_head);
1680 if (counter->ns)
1681 put_pid_ns(counter->ns);
1682 kfree(counter);
1683}
1684
1685static void perf_pending_sync(struct perf_counter *counter);
1686
1687static void free_counter(struct perf_counter *counter)
1688{
1689 perf_pending_sync(counter);
1690
1691 if (!counter->parent) {
1692 atomic_dec(&nr_counters);
1693 if (counter->attr.mmap)
1694 atomic_dec(&nr_mmap_counters);
1695 if (counter->attr.comm)
1696 atomic_dec(&nr_comm_counters);
1697 if (counter->attr.task)
1698 atomic_dec(&nr_task_counters);
1699 }
1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1705
1706 if (counter->destroy)
1707 counter->destroy(counter);
1708
1709 put_ctx(counter->ctx);
1710 call_rcu(&counter->rcu_head, free_counter_rcu);
1711}
1712
1713/*
1714 * Called when the last reference to the file is gone.
1715 */
1716static int perf_release(struct inode *inode, struct file *file)
1717{
1718 struct perf_counter *counter = file->private_data;
1719 struct perf_counter_context *ctx = counter->ctx;
1720
1721 file->private_data = NULL;
1722
1723 WARN_ON_ONCE(ctx->parent_ctx);
1724 mutex_lock(&ctx->mutex);
1725 perf_counter_remove_from_context(counter);
1726 mutex_unlock(&ctx->mutex);
1727
1728 mutex_lock(&counter->owner->perf_counter_mutex);
1729 list_del_init(&counter->owner_entry);
1730 mutex_unlock(&counter->owner->perf_counter_mutex);
1731 put_task_struct(counter->owner);
1732
1733 free_counter(counter);
1734
1735 return 0;
1736}
1737
1738static int perf_counter_read_size(struct perf_counter *counter)
1739{
1740 int entry = sizeof(u64); /* value */
1741 int size = 0;
1742 int nr = 1;
1743
1744 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1745 size += sizeof(u64);
1746
1747 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1748 size += sizeof(u64);
1749
1750 if (counter->attr.read_format & PERF_FORMAT_ID)
1751 entry += sizeof(u64);
1752
1753 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1754 nr += counter->group_leader->nr_siblings;
1755 size += sizeof(u64);
1756 }
1757
1758 size += entry * nr;
1759
1760 return size;
1761}
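/*
 * Illustrative walk-through (editor's sketch, not part of the original
 * source): for read_format == PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 * PERF_FORMAT_TOTAL_TIME_ENABLED on a group leader with 2 siblings the
 * function above computes
 *
 *	entry = 8 + 8         = 16   bytes per counter (value + id)
 *	size  = 8 + 8         = 16   time_enabled + the group's nr field
 *	nr    = 1 + 2         = 3    leader plus siblings
 *	size += 16 * 3             -> 64 bytes total
 *
 * which matches the buffer layout later written by
 * perf_counter_read_group().
 */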
1762
1763static u64 perf_counter_read_value(struct perf_counter *counter)
1764{
1765 struct perf_counter *child;
1766 u64 total = 0;
1767
1768 total += perf_counter_read(counter);
1769 list_for_each_entry(child, &counter->child_list, child_list)
1770 total += perf_counter_read(child);
1771
1772 return total;
1773}
1774
1775static int perf_counter_read_entry(struct perf_counter *counter,
1776 u64 read_format, char __user *buf)
1777{
1778 int n = 0, count = 0;
1779 u64 values[2];
1780
1781 values[n++] = perf_counter_read_value(counter);
1782 if (read_format & PERF_FORMAT_ID)
1783 values[n++] = primary_counter_id(counter);
1784
1785 count = n * sizeof(u64);
1786
1787 if (copy_to_user(buf, values, count))
1788 return -EFAULT;
1789
1790 return count;
1791}
1792
1793static int perf_counter_read_group(struct perf_counter *counter,
1794 u64 read_format, char __user *buf)
1795{
1796 struct perf_counter *leader = counter->group_leader, *sub;
1797 int n = 0, size = 0, err = -EFAULT;
1798 u64 values[3];
1799
1800 values[n++] = 1 + leader->nr_siblings;
1801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1802 values[n++] = leader->total_time_enabled +
1803 atomic64_read(&leader->child_total_time_enabled);
1804 }
1805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1806 values[n++] = leader->total_time_running +
1807 atomic64_read(&leader->child_total_time_running);
1808 }
1809
1810 size = n * sizeof(u64);
1811
1812 if (copy_to_user(buf, values, size))
1813 return -EFAULT;
1814
1815 err = perf_counter_read_entry(leader, read_format, buf + size);
1816 if (err < 0)
1817 return err;
1818
1819 size += err;
1820
1821 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1822 err = perf_counter_read_entry(sub, read_format,
1823 buf + size);
1824 if (err < 0)
1825 return err;
1826
1827 size += err;
1828 }
1829
1830 return size;
1831}
1832
1833static int perf_counter_read_one(struct perf_counter *counter,
1834 u64 read_format, char __user *buf)
1835{
1836 u64 values[4];
1837 int n = 0;
1838
1839 values[n++] = perf_counter_read_value(counter);
1840 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1841 values[n++] = counter->total_time_enabled +
1842 atomic64_read(&counter->child_total_time_enabled);
1843 }
1844 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1845 values[n++] = counter->total_time_running +
1846 atomic64_read(&counter->child_total_time_running);
1847 }
1848 if (read_format & PERF_FORMAT_ID)
1849 values[n++] = primary_counter_id(counter);
1850
1851 if (copy_to_user(buf, values, n * sizeof(u64)))
1852 return -EFAULT;
1853
1854 return n * sizeof(u64);
1855}
1856
1857/*
1858 * Read the performance counter - simple non-blocking version for now
1859 */
1860static ssize_t
1861perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1862{
1863 u64 read_format = counter->attr.read_format;
1864 int ret;
1865
1866 /*
1867 * Return end-of-file for a read on a counter that is in
1868 * error state (i.e. because it was pinned but it couldn't be
1869 * scheduled onto the CPU at some point).
1870 */
1871 if (counter->state == PERF_COUNTER_STATE_ERROR)
1872 return 0;
1873
1874 if (count < perf_counter_read_size(counter))
1875 return -ENOSPC;
1876
1877 WARN_ON_ONCE(counter->ctx->parent_ctx);
1878 mutex_lock(&counter->child_mutex);
1879 if (read_format & PERF_FORMAT_GROUP)
1880 ret = perf_counter_read_group(counter, read_format, buf);
1881 else
1882 ret = perf_counter_read_one(counter, read_format, buf);
1883 mutex_unlock(&counter->child_mutex);
1884
1885 return ret;
1886}
1887
1888static ssize_t
1889perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1890{
1891 struct perf_counter *counter = file->private_data;
1892
1893 return perf_read_hw(counter, buf, count);
1894}
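/*
 * Editor's sketch of the matching user-space side (hypothetical helper,
 * never compiled as part of this file): with read_format == 0 a read()
 * on the counter fd returns exactly one u64, and a 0-byte read means the
 * counter went into ERROR state (see the EOF case above).
 */
#if 0	/* example only -- user-space code */
#include <stdint.h>
#include <unistd.h>

static int read_counter_value(int fd, uint64_t *value)
{
	ssize_t ret = read(fd, value, sizeof(*value));

	if (ret == 0)
		return -1;	/* counter is in ERROR state */
	return ret == sizeof(*value) ? 0 : -1;
}
#endif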
1895
1896static unsigned int perf_poll(struct file *file, poll_table *wait)
1897{
1898 struct perf_counter *counter = file->private_data;
1899 struct perf_mmap_data *data;
1900 unsigned int events = POLLHUP;
1901
1902 rcu_read_lock();
1903 data = rcu_dereference(counter->data);
1904 if (data)
1905 events = atomic_xchg(&data->poll, 0);
1906 rcu_read_unlock();
1907
1908 poll_wait(file, &counter->waitq, wait);
1909
1910 return events;
1911}
1912
1913static void perf_counter_reset(struct perf_counter *counter)
1914{
1915 (void)perf_counter_read(counter);
1916 atomic64_set(&counter->count, 0);
1917 perf_counter_update_userpage(counter);
1918}
1919
1920/*
1921 * Holding the top-level counter's child_mutex means that any
1922 * descendant process that has inherited this counter will block
1923 * in sync_child_counter if it goes to exit, thus satisfying the
1924 * task existence requirements of perf_counter_enable/disable.
1925 */
1926static void perf_counter_for_each_child(struct perf_counter *counter,
1927 void (*func)(struct perf_counter *))
1928{
1929 struct perf_counter *child;
1930
1931 WARN_ON_ONCE(counter->ctx->parent_ctx);
1932 mutex_lock(&counter->child_mutex);
1933 func(counter);
1934 list_for_each_entry(child, &counter->child_list, child_list)
1935 func(child);
1936 mutex_unlock(&counter->child_mutex);
1937}
1938
1939static void perf_counter_for_each(struct perf_counter *counter,
1940 void (*func)(struct perf_counter *))
1941{
1942 struct perf_counter_context *ctx = counter->ctx;
1943 struct perf_counter *sibling;
1944
1945 WARN_ON_ONCE(ctx->parent_ctx);
1946 mutex_lock(&ctx->mutex);
1947 counter = counter->group_leader;
1948
1949 perf_counter_for_each_child(counter, func);
1950 func(counter);
1951 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1952 perf_counter_for_each_child(sibling, func);
1953 mutex_unlock(&ctx->mutex);
1954}
1955
1956static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1957{
1958 struct perf_counter_context *ctx = counter->ctx;
1959 unsigned long size;
1960 int ret = 0;
1961 u64 value;
1962
1963 if (!counter->attr.sample_period)
1964 return -EINVAL;
1965
1966 size = copy_from_user(&value, arg, sizeof(value));
1967 if (size != sizeof(value))
1968 return -EFAULT;
1969
1970 if (!value)
1971 return -EINVAL;
1972
1973 spin_lock_irq(&ctx->lock);
1974 if (counter->attr.freq) {
1975 if (value > sysctl_perf_counter_sample_rate) {
1976 ret = -EINVAL;
1977 goto unlock;
1978 }
1979
1980 counter->attr.sample_freq = value;
1981 } else {
1982 counter->attr.sample_period = value;
1983 counter->hw.sample_period = value;
1984 }
1985unlock:
1986 spin_unlock_irq(&ctx->lock);
1987
1988 return ret;
1989}
1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1994{
1995 struct perf_counter *counter = file->private_data;
1996 void (*func)(struct perf_counter *);
1997 u32 flags = arg;
1998
1999 switch (cmd) {
2000 case PERF_COUNTER_IOC_ENABLE:
2001 func = perf_counter_enable;
2002 break;
2003 case PERF_COUNTER_IOC_DISABLE:
2004 func = perf_counter_disable;
2005 break;
2006 case PERF_COUNTER_IOC_RESET:
2007 func = perf_counter_reset;
2008 break;
2009
2010 case PERF_COUNTER_IOC_REFRESH:
2011 return perf_counter_refresh(counter, arg);
2012
2013 case PERF_COUNTER_IOC_PERIOD:
2014 return perf_counter_period(counter, (u64 __user *)arg);
2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
2019 default:
2020 return -ENOTTY;
2021 }
2022
2023 if (flags & PERF_IOC_FLAG_GROUP)
2024 perf_counter_for_each(counter, func);
2025 else
2026 perf_counter_for_each_child(counter, func);
2027
2028 return 0;
2029}
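/*
 * Editor's sketch of typical user-space usage of the ioctls handled
 * above (hypothetical helper, never compiled as part of this file).
 * The PERF_COUNTER_IOC_* and PERF_IOC_FLAG_GROUP constants come from
 * the exported perf_counter ABI header.
 */
#if 0	/* example only -- user-space code */
#include <stdint.h>
#include <sys/ioctl.h>

static int retune_and_restart_group(int fd, uint64_t new_period)
{
	/* PERF_COUNTER_IOC_PERIOD takes a pointer to the new u64 period. */
	if (ioctl(fd, PERF_COUNTER_IOC_PERIOD, &new_period))
		return -1;

	/* The flag applies the operation to the whole counter group. */
	return ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
#endif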
2030
2031int perf_counter_task_enable(void)
2032{
2033 struct perf_counter *counter;
2034
2035 mutex_lock(&current->perf_counter_mutex);
2036 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2037 perf_counter_for_each_child(counter, perf_counter_enable);
2038 mutex_unlock(&current->perf_counter_mutex);
2039
2040 return 0;
2041}
2042
2043int perf_counter_task_disable(void)
2044{
2045 struct perf_counter *counter;
2046
2047 mutex_lock(&current->perf_counter_mutex);
2048 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2049 perf_counter_for_each_child(counter, perf_counter_disable);
2050 mutex_unlock(&current->perf_counter_mutex);
2051
2052 return 0;
2053}
2054
2055#ifndef PERF_COUNTER_INDEX_OFFSET
2056# define PERF_COUNTER_INDEX_OFFSET 0
2057#endif
2058
2059static int perf_counter_index(struct perf_counter *counter)
2060{
2061 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2062 return 0;
2063
2064 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2065}
2066
2067/*
2068 * Callers need to ensure there can be no nesting of this function, otherwise
2069 * the seqlock logic goes bad. We cannot serialize this because the arch
2070 * code calls this from NMI context.
2071 */
2072void perf_counter_update_userpage(struct perf_counter *counter)
2073{
2074 struct perf_counter_mmap_page *userpg;
2075 struct perf_mmap_data *data;
2076
2077 rcu_read_lock();
2078 data = rcu_dereference(counter->data);
2079 if (!data)
2080 goto unlock;
2081
2082 userpg = data->user_page;
2083
2084 /*
2085 * Disable preemption so as to not let the corresponding user-space
2086 * spin too long if we get preempted.
2087 */
2088 preempt_disable();
2089 ++userpg->lock;
2090 barrier();
2091 userpg->index = perf_counter_index(counter);
2092 userpg->offset = atomic64_read(&counter->count);
2093 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2094 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2095
2096 userpg->time_enabled = counter->total_time_enabled +
2097 atomic64_read(&counter->child_total_time_enabled);
2098
2099 userpg->time_running = counter->total_time_running +
2100 atomic64_read(&counter->child_total_time_running);
2101
2102 barrier();
2103 ++userpg->lock;
2104 preempt_enable();
2105unlock:
2106 rcu_read_unlock();
2107}
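/*
 * Editor's sketch of a user-space reader of the page updated above
 * (hypothetical helper, never compiled as part of this file).  ->lock
 * is bumped before and after the update, so it is odd while an update
 * is in flight and a torn read is detected by re-checking it -- a
 * seqcount in all but name.
 */
#if 0	/* example only -- user-space code */
#include <stdint.h>

static int64_t read_counter_offset(volatile struct perf_counter_mmap_page *pg)
{
	uint32_t seq;
	int64_t offset;

	do {
		seq = pg->lock;
		__sync_synchronize();	/* pairs with the barrier()s above */
		offset = pg->offset;
		__sync_synchronize();
	} while (pg->lock != seq || (seq & 1));

	return offset;
}
#endif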
2108
2109static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2110{
2111 struct perf_counter *counter = vma->vm_file->private_data;
2112 struct perf_mmap_data *data;
2113 int ret = VM_FAULT_SIGBUS;
2114
2115 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2116 if (vmf->pgoff == 0)
2117 ret = 0;
2118 return ret;
2119 }
2120
2121 rcu_read_lock();
2122 data = rcu_dereference(counter->data);
2123 if (!data)
2124 goto unlock;
2125
2126 if (vmf->pgoff == 0) {
2127 vmf->page = virt_to_page(data->user_page);
2128 } else {
2129 int nr = vmf->pgoff - 1;
2130
2131 if ((unsigned)nr > data->nr_pages)
2132 goto unlock;
2133
2134 if (vmf->flags & FAULT_FLAG_WRITE)
2135 goto unlock;
2136
2137 vmf->page = virt_to_page(data->data_pages[nr]);
2138 }
2139
2140 get_page(vmf->page);
2141 vmf->page->mapping = vma->vm_file->f_mapping;
2142 vmf->page->index = vmf->pgoff;
2143
2144 ret = 0;
2145unlock:
2146 rcu_read_unlock();
2147
2148 return ret;
2149}
2150
2151static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2152{
2153 struct perf_mmap_data *data;
2154 unsigned long size;
2155 int i;
2156
2157 WARN_ON(atomic_read(&counter->mmap_count));
2158
2159 size = sizeof(struct perf_mmap_data);
2160 size += nr_pages * sizeof(void *);
2161
2162 data = kzalloc(size, GFP_KERNEL);
2163 if (!data)
2164 goto fail;
2165
2166 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2167 if (!data->user_page)
2168 goto fail_user_page;
2169
2170 for (i = 0; i < nr_pages; i++) {
2171 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2172 if (!data->data_pages[i])
2173 goto fail_data_pages;
2174 }
2175
2176 data->nr_pages = nr_pages;
2177 atomic_set(&data->lock, -1);
2178
2179 rcu_assign_pointer(counter->data, data);
2180
2181 return 0;
2182
2183fail_data_pages:
2184 for (i--; i >= 0; i--)
2185 free_page((unsigned long)data->data_pages[i]);
2186
2187 free_page((unsigned long)data->user_page);
2188
2189fail_user_page:
2190 kfree(data);
2191
2192fail:
2193 return -ENOMEM;
2194}
2195
2196static void perf_mmap_free_page(unsigned long addr)
2197{
2198 struct page *page = virt_to_page((void *)addr);
2199
2200 page->mapping = NULL;
2201 __free_page(page);
2202}
2203
2204static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2205{
2206 struct perf_mmap_data *data;
2207 int i;
2208
2209 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2210
2211 perf_mmap_free_page((unsigned long)data->user_page);
2212 for (i = 0; i < data->nr_pages; i++)
2213 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2214
2215 kfree(data);
2216}
2217
2218static void perf_mmap_data_free(struct perf_counter *counter)
2219{
2220 struct perf_mmap_data *data = counter->data;
2221
2222 WARN_ON(atomic_read(&counter->mmap_count));
2223
2224 rcu_assign_pointer(counter->data, NULL);
2225 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2226}
2227
2228static void perf_mmap_open(struct vm_area_struct *vma)
2229{
2230 struct perf_counter *counter = vma->vm_file->private_data;
2231
2232 atomic_inc(&counter->mmap_count);
2233}
2234
2235static void perf_mmap_close(struct vm_area_struct *vma)
2236{
2237 struct perf_counter *counter = vma->vm_file->private_data;
2238
2239 WARN_ON_ONCE(counter->ctx->parent_ctx);
2240 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2241 struct user_struct *user = current_user();
2242
2243 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2244 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2245 perf_mmap_data_free(counter);
2246 mutex_unlock(&counter->mmap_mutex);
2247 }
2248}
2249
2250static struct vm_operations_struct perf_mmap_vmops = {
2251 .open = perf_mmap_open,
2252 .close = perf_mmap_close,
2253 .fault = perf_mmap_fault,
2254 .page_mkwrite = perf_mmap_fault,
2255};
2256
2257static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2258{
2259 struct perf_counter *counter = file->private_data;
2260 unsigned long user_locked, user_lock_limit;
2261 struct user_struct *user = current_user();
2262 unsigned long locked, lock_limit;
2263 unsigned long vma_size;
2264 unsigned long nr_pages;
2265 long user_extra, extra;
2266 int ret = 0;
2267
2268 if (!(vma->vm_flags & VM_SHARED))
2269 return -EINVAL;
2270
2271 vma_size = vma->vm_end - vma->vm_start;
2272 nr_pages = (vma_size / PAGE_SIZE) - 1;
2273
2274 /*
2275 * If we have data pages ensure they're a power-of-two number, so we
2276 * can do bitmasks instead of modulo.
2277 */
2278 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2279 return -EINVAL;
2280
2281 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2282 return -EINVAL;
2283
2284 if (vma->vm_pgoff != 0)
2285 return -EINVAL;
2286
2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2295 if (nr_pages != counter->data->nr_pages)
2296 ret = -EINVAL;
2297 goto unlock;
2298 }
2299
2300 user_extra = nr_pages + 1;
2301 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2302
2303 /*
2304 * Increase the limit linearly with more CPUs:
2305 */
2306 user_lock_limit *= num_online_cpus();
2307
2308 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2309
2310 extra = 0;
2311 if (user_locked > user_lock_limit)
2312 extra = user_locked - user_lock_limit;
2313
2314 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2315 lock_limit >>= PAGE_SHIFT;
2316 locked = vma->vm_mm->locked_vm + extra;
2317
2318 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2319 ret = -EPERM;
2320 goto unlock;
2321 }
2322
2323 WARN_ON(counter->data);
2324 ret = perf_mmap_data_alloc(counter, nr_pages);
2325 if (ret)
2326 goto unlock;
2327
2328 atomic_set(&counter->mmap_count, 1);
2329 atomic_long_add(user_extra, &user->locked_vm);
2330 vma->vm_mm->locked_vm += extra;
2331 counter->data->nr_locked = extra;
2332 if (vma->vm_flags & VM_WRITE)
2333 counter->data->writable = 1;
2334
2335unlock:
2336 mutex_unlock(&counter->mmap_mutex);
2337
2338 vma->vm_flags |= VM_RESERVED;
2339 vma->vm_ops = &perf_mmap_vmops;
2340
2341 return ret;
2342}
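/*
 * Editor's sketch of the matching user-space mmap() call (hypothetical
 * helper, never compiled as part of this file).  It mirrors the checks
 * above: MAP_SHARED, offset 0, and one control page plus a power-of-two
 * number of data pages.
 */
#if 0	/* example only -- user-space code */
#include <sys/mman.h>
#include <unistd.h>

static void *map_counter_buffer(int fd, unsigned int data_pages)
{
	/* data_pages must be 0 or a power of two, e.g. 128 */
	size_t len = (size_t)(data_pages + 1) * (size_t)getpagesize();
	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_SHARED, fd, 0);

	return base == MAP_FAILED ? NULL : base;
}
#endif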
2343
2344static int perf_fasync(int fd, struct file *filp, int on)
2345{
2346 struct inode *inode = filp->f_path.dentry->d_inode;
2347 struct perf_counter *counter = filp->private_data;
2348 int retval;
2349
2350 mutex_lock(&inode->i_mutex);
2351 retval = fasync_helper(fd, filp, on, &counter->fasync);
2352 mutex_unlock(&inode->i_mutex);
2353
2354 if (retval < 0)
2355 return retval;
2356
2357 return 0;
2358}
2359
2360static const struct file_operations perf_fops = {
2361 .release = perf_release,
2362 .read = perf_read,
2363 .poll = perf_poll,
2364 .unlocked_ioctl = perf_ioctl,
2365 .compat_ioctl = perf_ioctl,
2366 .mmap = perf_mmap,
2367 .fasync = perf_fasync,
2368};
2369
2370/*
2371 * Perf counter wakeup
2372 *
2373 * If there's data, ensure we set the poll() state and publish everything
2374 * to user-space before waking everybody up.
2375 */
2376
2377void perf_counter_wakeup(struct perf_counter *counter)
2378{
2379 wake_up_all(&counter->waitq);
2380
2381 if (counter->pending_kill) {
2382 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2383 counter->pending_kill = 0;
2384 }
2385}
2386
2387/*
2388 * Pending wakeups
2389 *
2390 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2391 *
2392 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2393 * singly linked list and use cmpxchg() to add entries locklessly.
2394 */
2395
2396static void perf_pending_counter(struct perf_pending_entry *entry)
2397{
2398 struct perf_counter *counter = container_of(entry,
2399 struct perf_counter, pending);
2400
2401 if (counter->pending_disable) {
2402 counter->pending_disable = 0;
2403 __perf_counter_disable(counter);
2404 }
2405
2406 if (counter->pending_wakeup) {
2407 counter->pending_wakeup = 0;
2408 perf_counter_wakeup(counter);
2409 }
2410}
2411
2412#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2413
2414static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2415 PENDING_TAIL,
2416};
2417
2418static void perf_pending_queue(struct perf_pending_entry *entry,
2419 void (*func)(struct perf_pending_entry *))
2420{
2421 struct perf_pending_entry **head;
2422
2423 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2424 return;
2425
2426 entry->func = func;
2427
2428 head = &get_cpu_var(perf_pending_head);
2429
2430 do {
2431 entry->next = *head;
2432 } while (cmpxchg(head, entry->next, entry) != entry->next);
2433
2434 set_perf_counter_pending();
2435
2436 put_cpu_var(perf_pending_head);
2437}
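/*
 * Editor's sketch of how a hypothetical NMI-context caller would use the
 * facility above (modelled on perf_output_wakeup() further down; not part
 * of the original source): no locks may be taken, so the wakeup is queued
 * on the lock-free list and run later from perf_counter_do_pending().
 */
#if 0	/* example only */
static void example_nmi_wakeup(struct perf_counter *counter)
{
	counter->pending_wakeup = 1;
	perf_pending_queue(&counter->pending, perf_pending_counter);
}
#endif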
2438
2439static int __perf_pending_run(void)
2440{
2441 struct perf_pending_entry *list;
2442 int nr = 0;
2443
2444 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2445 while (list != PENDING_TAIL) {
2446 void (*func)(struct perf_pending_entry *);
2447 struct perf_pending_entry *entry = list;
2448
2449 list = list->next;
2450
2451 func = entry->func;
2452 entry->next = NULL;
2453 /*
2454 * Ensure we observe the unqueue before we issue the wakeup,
2455 * so that we won't be waiting forever.
2456 * -- see perf_not_pending().
2457 */
2458 smp_wmb();
2459
2460 func(entry);
2461 nr++;
2462 }
2463
2464 return nr;
2465}
2466
2467static inline int perf_not_pending(struct perf_counter *counter)
2468{
2469 /*
2470 * If we flush on whatever cpu we run, there is a chance we don't
2471 * need to wait.
2472 */
2473 get_cpu();
2474 __perf_pending_run();
2475 put_cpu();
2476
2477 /*
2478 * Ensure we see the proper queue state before going to sleep
2479 * so that we do not miss the wakeup. -- see perf_pending_queue()
2480 */
2481 smp_rmb();
2482 return counter->pending.next == NULL;
2483}
2484
2485static void perf_pending_sync(struct perf_counter *counter)
2486{
2487 wait_event(counter->waitq, perf_not_pending(counter));
2488}
2489
2490void perf_counter_do_pending(void)
2491{
2492 __perf_pending_run();
2493}
2494
2495/*
2496 * Callchain support -- arch specific
2497 */
2498
2499__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2500{
2501 return NULL;
2502}
2503
2504/*
2505 * Output
2506 */
2507
2508struct perf_output_handle {
2509 struct perf_counter *counter;
2510 struct perf_mmap_data *data;
2511 unsigned long head;
2512 unsigned long offset;
2513 int nmi;
2514 int sample;
2515 int locked;
2516 unsigned long flags;
2517};
2518
2519static bool perf_output_space(struct perf_mmap_data *data,
2520 unsigned int offset, unsigned int head)
2521{
2522 unsigned long tail;
2523 unsigned long mask;
2524
2525 if (!data->writable)
2526 return true;
2527
2528 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2529 /*
2530 * Userspace could choose to issue an mb() before updating the tail
2531 * pointer, so that all reads are completed before the write is
2532 * issued.
2533 */
2534 tail = ACCESS_ONCE(data->user_page->data_tail);
2535 smp_rmb();
2536
2537 offset = (offset - tail) & mask;
2538 head = (head - tail) & mask;
2539
2540 if ((int)(head - offset) < 0)
2541 return false;
2542
2543 return true;
2544}
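/*
 * Illustrative walk-through (editor's sketch, made-up numbers): with 4
 * data pages of 4096 bytes (mask == 16383) and a consumer that has
 * published data_tail == 2000, a writer at offset == 16000 asking for
 * head == 17000 sees
 *
 *	offset = (16000 - 2000) & 16383 = 14000
 *	head   = (17000 - 2000) & 16383 = 15000   head - offset >= 0: OK
 *
 * whereas with data_tail still at 0 the same request gives
 *
 *	offset = 16000,  head = 17000 & 16383 = 616   head - offset < 0
 *
 * and the reservation is refused rather than overwriting data the
 * consumer has not read yet.
 */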
2545
2546static void perf_output_wakeup(struct perf_output_handle *handle)
2547{
2548 atomic_set(&handle->data->poll, POLL_IN);
2549
2550 if (handle->nmi) {
2551 handle->counter->pending_wakeup = 1;
2552 perf_pending_queue(&handle->counter->pending,
2553 perf_pending_counter);
2554 } else
2555 perf_counter_wakeup(handle->counter);
2556}
2557
2558/*
2559 * Curious locking construct.
2560 *
2561 * We need to ensure a later event doesn't publish a head when a former
2563 * event isn't done writing. However, since we need to deal with NMIs, we
2563 * cannot fully serialize things.
2564 *
2565 * What we do is serialize between CPUs so we only have to deal with NMI
2566 * nesting on a single CPU.
2567 *
2568 * We only publish the head (and generate a wakeup) when the outer-most
2569 * event completes.
2570 */
2571static void perf_output_lock(struct perf_output_handle *handle)
2572{
2573 struct perf_mmap_data *data = handle->data;
2574 int cpu;
2575
2576 handle->locked = 0;
2577
2578 local_irq_save(handle->flags);
2579 cpu = smp_processor_id();
2580
2581 if (in_nmi() && atomic_read(&data->lock) == cpu)
2582 return;
2583
2584 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2585 cpu_relax();
2586
2587 handle->locked = 1;
2588}
2589
2590static void perf_output_unlock(struct perf_output_handle *handle)
2591{
2592 struct perf_mmap_data *data = handle->data;
2593 unsigned long head;
2594 int cpu;
2595
2596 data->done_head = data->head;
2597
2598 if (!handle->locked)
2599 goto out;
2600
2601again:
2602 /*
2603 * The xchg implies a full barrier that ensures all writes are done
2604 * before we publish the new head, matched by a rmb() in userspace when
2605 * reading this position.
2606 */
2607 while ((head = atomic_long_xchg(&data->done_head, 0)))
2608 data->user_page->data_head = head;
2609
2610 /*
2611 * NMI can happen here, which means we can miss a done_head update.
2612 */
2613
2614 cpu = atomic_xchg(&data->lock, -1);
2615 WARN_ON_ONCE(cpu != smp_processor_id());
2616
2617 /*
2618 * Therefore we have to check that we did not indeed miss one.
2619 */
2620 if (unlikely(atomic_long_read(&data->done_head))) {
2621 /*
2622 * Since we had it locked, we can lock it again.
2623 */
2624 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2625 cpu_relax();
2626
2627 goto again;
2628 }
2629
2630 if (atomic_xchg(&data->wakeup, 0))
2631 perf_output_wakeup(handle);
2632out:
2633 local_irq_restore(handle->flags);
2634}
2635
2636static void perf_output_copy(struct perf_output_handle *handle,
2637 const void *buf, unsigned int len)
2638{
2639 unsigned int pages_mask;
2640 unsigned int offset;
2641 unsigned int size;
2642 void **pages;
2643
2644 offset = handle->offset;
2645 pages_mask = handle->data->nr_pages - 1;
2646 pages = handle->data->data_pages;
2647
2648 do {
2649 unsigned int page_offset;
2650 int nr;
2651
2652 nr = (offset >> PAGE_SHIFT) & pages_mask;
2653 page_offset = offset & (PAGE_SIZE - 1);
2654 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2655
2656 memcpy(pages[nr] + page_offset, buf, size);
2657
2658 len -= size;
2659 buf += size;
2660 offset += size;
2661 } while (len);
2662
2663 handle->offset = offset;
2664
2665 /*
2666 * Check we didn't copy past our reservation window, taking the
2667 * possible unsigned int wrap into account.
2668 */
2669 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2670}
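/*
 * Illustrative walk-through (editor's sketch, made-up numbers): with
 * 4096-byte pages, 4 data pages and handle->offset == 5000 the loop
 * above resolves the destination as
 *
 *	nr          = (5000 >> PAGE_SHIFT) & 3 = 1    second data page
 *	page_offset = 5000 & 4095              = 904
 *	size        = min(4096 - 904, len)
 *
 * so a record crossing a page boundary is split into per-page memcpy()
 * calls, wrapping back to page 0 via pages_mask.
 */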
2671
2672#define perf_output_put(handle, x) \
2673 perf_output_copy((handle), &(x), sizeof(x))
2674
2675static int perf_output_begin(struct perf_output_handle *handle,
2676 struct perf_counter *counter, unsigned int size,
2677 int nmi, int sample)
2678{
2679 struct perf_counter *output_counter;
2680 struct perf_mmap_data *data;
2681 unsigned int offset, head;
2682 int have_lost;
2683 struct {
2684 struct perf_event_header header;
2685 u64 id;
2686 u64 lost;
2687 } lost_event;
2688
2689 rcu_read_lock();
2690 /*
2691 * For inherited counters we send all the output towards the parent.
2692 */
2693 if (counter->parent)
2694 counter = counter->parent;
2695
2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2700 data = rcu_dereference(counter->data);
2701 if (!data)
2702 goto out;
2703
2704 handle->data = data;
2705 handle->counter = counter;
2706 handle->nmi = nmi;
2707 handle->sample = sample;
2708
2709 if (!data->nr_pages)
2710 goto fail;
2711
2712 have_lost = atomic_read(&data->lost);
2713 if (have_lost)
2714 size += sizeof(lost_event);
2715
2716 perf_output_lock(handle);
2717
2718 do {
2719 offset = head = atomic_long_read(&data->head);
2720 head += size;
2721 if (unlikely(!perf_output_space(data, offset, head)))
2722 goto fail;
2723 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2724
2725 handle->offset = offset;
2726 handle->head = head;
2727
2728 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2729 atomic_set(&data->wakeup, 1);
2730
2731 if (have_lost) {
2732 lost_event.header.type = PERF_EVENT_LOST;
2733 lost_event.header.misc = 0;
2734 lost_event.header.size = sizeof(lost_event);
2735 lost_event.id = counter->id;
2736 lost_event.lost = atomic_xchg(&data->lost, 0);
2737
2738 perf_output_put(handle, lost_event);
2739 }
2740
2741 return 0;
2742
2743fail:
2744 atomic_inc(&data->lost);
2745 perf_output_unlock(handle);
2746out:
2747 rcu_read_unlock();
2748
2749 return -ENOSPC;
2750}
2751
2752static void perf_output_end(struct perf_output_handle *handle)
2753{
2754 struct perf_counter *counter = handle->counter;
2755 struct perf_mmap_data *data = handle->data;
2756
2757 int wakeup_events = counter->attr.wakeup_events;
2758
2759 if (handle->sample && wakeup_events) {
2760 int events = atomic_inc_return(&data->events);
2761 if (events >= wakeup_events) {
2762 atomic_sub(wakeup_events, &data->events);
2763 atomic_set(&data->wakeup, 1);
2764 }
2765 }
2766
2767 perf_output_unlock(handle);
2768 rcu_read_unlock();
2769}
2770
2771static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2772{
2773 /*
2774 * only top level counters have the pid namespace they were created in
2775 */
2776 if (counter->parent)
2777 counter = counter->parent;
2778
2779 return task_tgid_nr_ns(p, counter->ns);
2780}
2781
2782static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2783{
2784 /*
2785 * only top level counters have the pid namespace they were created in
2786 */
2787 if (counter->parent)
2788 counter = counter->parent;
2789
2790 return task_pid_nr_ns(p, counter->ns);
2791}
2792
2793static void perf_output_read_one(struct perf_output_handle *handle,
2794 struct perf_counter *counter)
2795{
2796 u64 read_format = counter->attr.read_format;
2797 u64 values[4];
2798 int n = 0;
2799
2800 values[n++] = atomic64_read(&counter->count);
2801 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2802 values[n++] = counter->total_time_enabled +
2803 atomic64_read(&counter->child_total_time_enabled);
2804 }
2805 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2806 values[n++] = counter->total_time_running +
2807 atomic64_read(&counter->child_total_time_running);
2808 }
2809 if (read_format & PERF_FORMAT_ID)
2810 values[n++] = primary_counter_id(counter);
2811
2812 perf_output_copy(handle, values, n * sizeof(u64));
2813}
2814
2815/*
2816 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2817 */
2818static void perf_output_read_group(struct perf_output_handle *handle,
2819 struct perf_counter *counter)
2820{
2821 struct perf_counter *leader = counter->group_leader, *sub;
2822 u64 read_format = counter->attr.read_format;
2823 u64 values[5];
2824 int n = 0;
2825
2826 values[n++] = 1 + leader->nr_siblings;
2827
2828 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2829 values[n++] = leader->total_time_enabled;
2830
2831 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2832 values[n++] = leader->total_time_running;
2833
2834 if (leader != counter)
2835 leader->pmu->read(leader);
2836
2837 values[n++] = atomic64_read(&leader->count);
2838 if (read_format & PERF_FORMAT_ID)
2839 values[n++] = primary_counter_id(leader);
2840
2841 perf_output_copy(handle, values, n * sizeof(u64));
2842
2843 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2844 n = 0;
2845
2846 if (sub != counter)
2847 sub->pmu->read(sub);
2848
2849 values[n++] = atomic64_read(&sub->count);
2850 if (read_format & PERF_FORMAT_ID)
2851 values[n++] = primary_counter_id(sub);
2852
2853 perf_output_copy(handle, values, n * sizeof(u64));
2854 }
2855}
2856
2857static void perf_output_read(struct perf_output_handle *handle,
2858 struct perf_counter *counter)
2859{
2860 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2861 perf_output_read_group(handle, counter);
2862 else
2863 perf_output_read_one(handle, counter);
2864}
2865
2866void perf_counter_output(struct perf_counter *counter, int nmi,
2867 struct perf_sample_data *data)
2868{
2869 int ret;
2870 u64 sample_type = counter->attr.sample_type;
2871 struct perf_output_handle handle;
2872 struct perf_event_header header;
2873 u64 ip;
2874 struct {
2875 u32 pid, tid;
2876 } tid_entry;
2877 struct perf_callchain_entry *callchain = NULL;
2878 int callchain_size = 0;
2879 u64 time;
2880 struct {
2881 u32 cpu, reserved;
2882 } cpu_entry;
2883
2884 header.type = PERF_EVENT_SAMPLE;
2885 header.size = sizeof(header);
2886
2887 header.misc = 0;
2888 header.misc |= perf_misc_flags(data->regs);
2889
2890 if (sample_type & PERF_SAMPLE_IP) {
2891 ip = perf_instruction_pointer(data->regs);
2892 header.size += sizeof(ip);
2893 }
2894
2895 if (sample_type & PERF_SAMPLE_TID) {
2896 /* namespace issues */
2897 tid_entry.pid = perf_counter_pid(counter, current);
2898 tid_entry.tid = perf_counter_tid(counter, current);
2899
2900 header.size += sizeof(tid_entry);
2901 }
2902
2903 if (sample_type & PERF_SAMPLE_TIME) {
2904 /*
2905 * Maybe do better on x86 and provide cpu_clock_nmi()
2906 */
2907 time = sched_clock();
2908
2909 header.size += sizeof(u64);
2910 }
2911
2912 if (sample_type & PERF_SAMPLE_ADDR)
2913 header.size += sizeof(u64);
2914
2915 if (sample_type & PERF_SAMPLE_ID)
2916 header.size += sizeof(u64);
2917
2918 if (sample_type & PERF_SAMPLE_STREAM_ID)
2919 header.size += sizeof(u64);
2920
2921 if (sample_type & PERF_SAMPLE_CPU) {
2922 header.size += sizeof(cpu_entry);
2923
2924 cpu_entry.cpu = raw_smp_processor_id();
2925 cpu_entry.reserved = 0;
2926 }
2927
2928 if (sample_type & PERF_SAMPLE_PERIOD)
2929 header.size += sizeof(u64);
2930
2931 if (sample_type & PERF_SAMPLE_READ)
2932 header.size += perf_counter_read_size(counter);
2933
2934 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2935 callchain = perf_callchain(data->regs);
2936
2937 if (callchain) {
2938 callchain_size = (1 + callchain->nr) * sizeof(u64);
2939 header.size += callchain_size;
2940 } else
2941 header.size += sizeof(u64);
2942 }
2943
2944 if (sample_type & PERF_SAMPLE_RAW) {
2945 int size = sizeof(u32);
2946
2947 if (data->raw)
2948 size += data->raw->size;
2949 else
2950 size += sizeof(u32);
2951
2952 WARN_ON_ONCE(size & (sizeof(u64)-1));
2953 header.size += size;
2954 }
2955
2956 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2957 if (ret)
2958 return;
2959
2960 perf_output_put(&handle, header);
2961
2962 if (sample_type & PERF_SAMPLE_IP)
2963 perf_output_put(&handle, ip);
2964
2965 if (sample_type & PERF_SAMPLE_TID)
2966 perf_output_put(&handle, tid_entry);
2967
2968 if (sample_type & PERF_SAMPLE_TIME)
2969 perf_output_put(&handle, time);
2970
2971 if (sample_type & PERF_SAMPLE_ADDR)
2972 perf_output_put(&handle, data->addr);
2973
2974 if (sample_type & PERF_SAMPLE_ID) {
2975 u64 id = primary_counter_id(counter);
2976
2977 perf_output_put(&handle, id);
2978 }
2979
2980 if (sample_type & PERF_SAMPLE_STREAM_ID)
2981 perf_output_put(&handle, counter->id);
2982
2983 if (sample_type & PERF_SAMPLE_CPU)
2984 perf_output_put(&handle, cpu_entry);
2985
2986 if (sample_type & PERF_SAMPLE_PERIOD)
2987 perf_output_put(&handle, data->period);
2988
2989 if (sample_type & PERF_SAMPLE_READ)
2990 perf_output_read(&handle, counter);
2991
2992 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2993 if (callchain)
2994 perf_output_copy(&handle, callchain, callchain_size);
2995 else {
2996 u64 nr = 0;
2997 perf_output_put(&handle, nr);
2998 }
2999 }
3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 if (data->raw) {
3003 perf_output_put(&handle, data->raw->size);
3004 perf_output_copy(&handle, data->raw->data, data->raw->size);
3005 } else {
3006 struct {
3007 u32 size;
3008 u32 data;
3009 } raw = {
3010 .size = sizeof(u32),
3011 .data = 0,
3012 };
3013 perf_output_put(&handle, raw);
3014 }
3015 }
3016
3017 perf_output_end(&handle);
3018}
3019
3020/*
3021 * read event
3022 */
3023
3024struct perf_read_event {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 tid;
3029};
3030
3031static void
3032perf_counter_read_event(struct perf_counter *counter,
3033 struct task_struct *task)
3034{
3035 struct perf_output_handle handle;
3036 struct perf_read_event event = {
3037 .header = {
3038 .type = PERF_EVENT_READ,
3039 .misc = 0,
3040 .size = sizeof(event) + perf_counter_read_size(counter),
3041 },
3042 .pid = perf_counter_pid(counter, task),
3043 .tid = perf_counter_tid(counter, task),
3044 };
3045 int ret;
3046
3047 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3048 if (ret)
3049 return;
3050
3051 perf_output_put(&handle, event);
3052 perf_output_read(&handle, counter);
3053
3054 perf_output_end(&handle);
3055}
3056
3057/*
3058 * task tracking -- fork/exit
3059 *
3060 * enabled by: attr.comm | attr.mmap | attr.task
3061 */
3062
3063struct perf_task_event {
3064 struct task_struct *task;
3065 struct perf_counter_context *task_ctx;
3066
3067 struct {
3068 struct perf_event_header header;
3069
3070 u32 pid;
3071 u32 ppid;
3072 u32 tid;
3073 u32 ptid;
3074 } event;
3075};
3076
3077static void perf_counter_task_output(struct perf_counter *counter,
3078 struct perf_task_event *task_event)
3079{
3080 struct perf_output_handle handle;
3081 int size = task_event->event.header.size;
3082 struct task_struct *task = task_event->task;
3083 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3084
3085 if (ret)
3086 return;
3087
3088 task_event->event.pid = perf_counter_pid(counter, task);
3089 task_event->event.ppid = perf_counter_pid(counter, current);
3090
3091 task_event->event.tid = perf_counter_tid(counter, task);
3092 task_event->event.ptid = perf_counter_tid(counter, current);
3093
3094 perf_output_put(&handle, task_event->event);
3095 perf_output_end(&handle);
3096}
3097
3098static int perf_counter_task_match(struct perf_counter *counter)
3099{
3100 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3101 return 1;
3102
3103 return 0;
3104}
3105
3106static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3107 struct perf_task_event *task_event)
3108{
3109 struct perf_counter *counter;
3110
3111 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3112 return;
3113
3114 rcu_read_lock();
3115 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3116 if (perf_counter_task_match(counter))
3117 perf_counter_task_output(counter, task_event);
3118 }
3119 rcu_read_unlock();
3120}
3121
3122static void perf_counter_task_event(struct perf_task_event *task_event)
3123{
3124 struct perf_cpu_context *cpuctx;
3125 struct perf_counter_context *ctx = task_event->task_ctx;
3126
3127 cpuctx = &get_cpu_var(perf_cpu_context);
3128 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3129 put_cpu_var(perf_cpu_context);
3130
3131 rcu_read_lock();
3132 if (!ctx)
3133 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3134 if (ctx)
3135 perf_counter_task_ctx(ctx, task_event);
3136 rcu_read_unlock();
3137}
3138
3139static void perf_counter_task(struct task_struct *task,
3140 struct perf_counter_context *task_ctx,
3141 int new)
3142{
3143 struct perf_task_event task_event;
3144
3145 if (!atomic_read(&nr_comm_counters) &&
3146 !atomic_read(&nr_mmap_counters) &&
3147 !atomic_read(&nr_task_counters))
3148 return;
3149
3150 task_event = (struct perf_task_event){
3151 .task = task,
3152 .task_ctx = task_ctx,
3153 .event = {
3154 .header = {
3155 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3156 .misc = 0,
3157 .size = sizeof(task_event.event),
3158 },
3159 /* .pid */
3160 /* .ppid */
3161 /* .tid */
3162 /* .ptid */
3163 },
3164 };
3165
3166 perf_counter_task_event(&task_event);
3167}
3168
3169void perf_counter_fork(struct task_struct *task)
3170{
3171 perf_counter_task(task, NULL, 1);
3172}
3173
3174/*
3175 * comm tracking
3176 */
3177
3178struct perf_comm_event {
3179 struct task_struct *task;
3180 char *comm;
3181 int comm_size;
3182
3183 struct {
3184 struct perf_event_header header;
3185
3186 u32 pid;
3187 u32 tid;
3188 } event;
3189};
3190
3191static void perf_counter_comm_output(struct perf_counter *counter,
3192 struct perf_comm_event *comm_event)
3193{
3194 struct perf_output_handle handle;
3195 int size = comm_event->event.header.size;
3196 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3197
3198 if (ret)
3199 return;
3200
3201 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3202 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3203
3204 perf_output_put(&handle, comm_event->event);
3205 perf_output_copy(&handle, comm_event->comm,
3206 comm_event->comm_size);
3207 perf_output_end(&handle);
3208}
3209
3210static int perf_counter_comm_match(struct perf_counter *counter)
3211{
3212 if (counter->attr.comm)
3213 return 1;
3214
3215 return 0;
3216}
3217
3218static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3219 struct perf_comm_event *comm_event)
3220{
3221 struct perf_counter *counter;
3222
3223 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3224 return;
3225
3226 rcu_read_lock();
3227 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3228 if (perf_counter_comm_match(counter))
3229 perf_counter_comm_output(counter, comm_event);
3230 }
3231 rcu_read_unlock();
3232}
3233
3234static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3235{
3236 struct perf_cpu_context *cpuctx;
3237 struct perf_counter_context *ctx;
3238 unsigned int size;
3239 char comm[TASK_COMM_LEN];
3240
3241 memset(comm, 0, sizeof(comm));
3242 strncpy(comm, comm_event->task->comm, sizeof(comm));
3243 size = ALIGN(strlen(comm)+1, sizeof(u64));
3244
3245 comm_event->comm = comm;
3246 comm_event->comm_size = size;
3247
3248 comm_event->event.header.size = sizeof(comm_event->event) + size;
3249
3250 cpuctx = &get_cpu_var(perf_cpu_context);
3251 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3252 put_cpu_var(perf_cpu_context);
3253
3254 rcu_read_lock();
3255 /*
3256 * it doesn't really matter which of the child contexts the
3257 * event ends up in.
3258 */
3259 ctx = rcu_dereference(current->perf_counter_ctxp);
3260 if (ctx)
3261 perf_counter_comm_ctx(ctx, comm_event);
3262 rcu_read_unlock();
3263}
3264
3265void perf_counter_comm(struct task_struct *task)
3266{
3267 struct perf_comm_event comm_event;
3268
3269 if (task->perf_counter_ctxp)
3270 perf_counter_enable_on_exec(task);
3271
3272 if (!atomic_read(&nr_comm_counters))
3273 return;
3274
3275 comm_event = (struct perf_comm_event){
3276 .task = task,
3277 /* .comm */
3278 /* .comm_size */
3279 .event = {
3280 .header = {
3281 .type = PERF_EVENT_COMM,
3282 .misc = 0,
3283 /* .size */
3284 },
3285 /* .pid */
3286 /* .tid */
3287 },
3288 };
3289
3290 perf_counter_comm_event(&comm_event);
3291}
3292
3293/*
3294 * mmap tracking
3295 */
3296
3297struct perf_mmap_event {
3298 struct vm_area_struct *vma;
3299
3300 const char *file_name;
3301 int file_size;
3302
3303 struct {
3304 struct perf_event_header header;
3305
3306 u32 pid;
3307 u32 tid;
3308 u64 start;
3309 u64 len;
3310 u64 pgoff;
3311 } event;
3312};
3313
3314static void perf_counter_mmap_output(struct perf_counter *counter,
3315 struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_output_handle handle;
3318 int size = mmap_event->event.header.size;
3319 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3320
3321 if (ret)
3322 return;
3323
3324 mmap_event->event.pid = perf_counter_pid(counter, current);
3325 mmap_event->event.tid = perf_counter_tid(counter, current);
3326
3327 perf_output_put(&handle, mmap_event->event);
3328 perf_output_copy(&handle, mmap_event->file_name,
3329 mmap_event->file_size);
3330 perf_output_end(&handle);
3331}
3332
3333static int perf_counter_mmap_match(struct perf_counter *counter,
3334 struct perf_mmap_event *mmap_event)
3335{
3336 if (counter->attr.mmap)
3337 return 1;
3338
3339 return 0;
3340}
3341
3342static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3343 struct perf_mmap_event *mmap_event)
3344{
3345 struct perf_counter *counter;
3346
3347 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3348 return;
3349
3350 rcu_read_lock();
3351 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3352 if (perf_counter_mmap_match(counter, mmap_event))
3353 perf_counter_mmap_output(counter, mmap_event);
3354 }
3355 rcu_read_unlock();
3356}
3357
3358static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3359{
3360 struct perf_cpu_context *cpuctx;
3361 struct perf_counter_context *ctx;
3362 struct vm_area_struct *vma = mmap_event->vma;
3363 struct file *file = vma->vm_file;
3364 unsigned int size;
3365 char tmp[16];
3366 char *buf = NULL;
3367 const char *name;
3368
3369 memset(tmp, 0, sizeof(tmp));
3370
3371 if (file) {
3372 /*
3373 * d_path works from the end of the buffer backwards, so we
3374 * need to add enough zero bytes after the string to handle
3375 * the 64bit alignment we do later.
3376 */
3377 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3378 if (!buf) {
3379 name = strncpy(tmp, "//enomem", sizeof(tmp));
3380 goto got_name;
3381 }
3382 name = d_path(&file->f_path, buf, PATH_MAX);
3383 if (IS_ERR(name)) {
3384 name = strncpy(tmp, "//toolong", sizeof(tmp));
3385 goto got_name;
3386 }
3387 } else {
3388 if (arch_vma_name(mmap_event->vma)) {
3389 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3390 sizeof(tmp));
3391 goto got_name;
3392 }
3393
3394 if (!vma->vm_mm) {
3395 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3396 goto got_name;
3397 }
3398
3399 name = strncpy(tmp, "//anon", sizeof(tmp));
3400 goto got_name;
3401 }
3402
3403got_name:
3404 size = ALIGN(strlen(name)+1, sizeof(u64));
3405
3406 mmap_event->file_name = name;
3407 mmap_event->file_size = size;
3408
3409 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3410
3411 cpuctx = &get_cpu_var(perf_cpu_context);
3412 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3413 put_cpu_var(perf_cpu_context);
3414
3415 rcu_read_lock();
3416 /*
3417 * it doesn't really matter which of the child contexts the
3418 * event ends up in.
3419 */
3420 ctx = rcu_dereference(current->perf_counter_ctxp);
3421 if (ctx)
3422 perf_counter_mmap_ctx(ctx, mmap_event);
3423 rcu_read_unlock();
3424
3425 kfree(buf);
3426}
3427
3428void __perf_counter_mmap(struct vm_area_struct *vma)
3429{
3430 struct perf_mmap_event mmap_event;
3431
3432 if (!atomic_read(&nr_mmap_counters))
3433 return;
3434
3435 mmap_event = (struct perf_mmap_event){
3436 .vma = vma,
3437 /* .file_name */
3438 /* .file_size */
3439 .event = {
3440 .header = {
3441 .type = PERF_EVENT_MMAP,
3442 .misc = 0,
3443 /* .size */
3444 },
3445 /* .pid */
3446 /* .tid */
3447 .start = vma->vm_start,
3448 .len = vma->vm_end - vma->vm_start,
3449 .pgoff = vma->vm_pgoff,
3450 },
3451 };
3452
3453 perf_counter_mmap_event(&mmap_event);
3454}
3455
3456/*
3457 * IRQ throttle logging
3458 */
3459
3460static void perf_log_throttle(struct perf_counter *counter, int enable)
3461{
3462 struct perf_output_handle handle;
3463 int ret;
3464
3465 struct {
3466 struct perf_event_header header;
3467 u64 time;
3468 u64 id;
3469 u64 stream_id;
3470 } throttle_event = {
3471 .header = {
3472 .type = PERF_EVENT_THROTTLE,
3473 .misc = 0,
3474 .size = sizeof(throttle_event),
3475 },
3476 .time = sched_clock(),
3477 .id = primary_counter_id(counter),
3478 .stream_id = counter->id,
3479 };
3480
3481 if (enable)
3482 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3483
3484 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3485 if (ret)
3486 return;
3487
3488 perf_output_put(&handle, throttle_event);
3489 perf_output_end(&handle);
3490}
3491
3492/*
3493 * Generic counter overflow handling, sampling.
3494 */
3495
3496int perf_counter_overflow(struct perf_counter *counter, int nmi,
3497 struct perf_sample_data *data)
3498{
3499 int events = atomic_read(&counter->event_limit);
3500 int throttle = counter->pmu->unthrottle != NULL;
3501 struct hw_perf_counter *hwc = &counter->hw;
3502 int ret = 0;
3503
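	/*
	 * Throttling: hwc->interrupts counts interrupts since it was last
	 * reset (on tick/unthrottle, elsewhere in this file); once that
	 * count, extrapolated to a full second via HZ, exceeds the sysctl
	 * sample rate the counter is marked MAX_INTERRUPTS and stopped.
	 */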
3504 if (!throttle) {
3505 hwc->interrupts++;
3506 } else {
3507 if (hwc->interrupts != MAX_INTERRUPTS) {
3508 hwc->interrupts++;
3509 if (HZ * hwc->interrupts >
3510 (u64)sysctl_perf_counter_sample_rate) {
3511 hwc->interrupts = MAX_INTERRUPTS;
3512 perf_log_throttle(counter, 0);
3513 ret = 1;
3514 }
3515 } else {
3516 /*
3517 * Keep re-disabling the counter even though we disabled it
3518 * on the previous pass - just in case we raced with a
3519 * sched-in and it got enabled again:
3520 */
3521 ret = 1;
3522 }
3523 }
3524
3525 if (counter->attr.freq) {
3526 u64 now = sched_clock();
3527 s64 delta = now - hwc->freq_stamp;
3528
3529 hwc->freq_stamp = now;
3530
3531 if (delta > 0 && delta < TICK_NSEC)
3532 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3533 }
3534
3535 /*
3536 * XXX event_limit might not quite work as expected on inherited
3537 * counters
3538 */
3539
3540 counter->pending_kill = POLL_IN;
3541 if (events && atomic_dec_and_test(&counter->event_limit)) {
3542 ret = 1;
3543 counter->pending_kill = POLL_HUP;
3544 if (nmi) {
3545 counter->pending_disable = 1;
3546 perf_pending_queue(&counter->pending,
3547 perf_pending_counter);
3548 } else
3549 perf_counter_disable(counter);
3550 }
3551
3552 perf_counter_output(counter, nmi, data);
3553 return ret;
3554}
3555
3556/*
3557 * Generic software counter infrastructure
3558 */
3559
3560/*
3561 * We directly increment counter->count and keep a second value in
3562 * counter->hw.period_left to count intervals. This period counter
3563 * is kept in the range [-sample_period, 0] so that we can use the
3564 * sign as trigger.
3565 */
3566
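/*
 * Worked example (made-up numbers): with sample_period == 100 and
 * period_left at -30, adding 50 events in perf_swcounter_add() leaves
 * period_left at +20 (non-negative, so we overflow); the code below
 * then computes nr = (100 + 20) / 100 = 1 overflow and rebases
 * period_left to 20 - 100 = -80 for the next interval.
 */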
3567static u64 perf_swcounter_set_period(struct perf_counter *counter)
3568{
3569 struct hw_perf_counter *hwc = &counter->hw;
3570 u64 period = hwc->last_period;
3571 u64 nr, offset;
3572 s64 old, val;
3573
3574 hwc->last_period = hwc->sample_period;
3575
3576again:
3577 old = val = atomic64_read(&hwc->period_left);
3578 if (val < 0)
3579 return 0;
3580
3581 nr = div64_u64(period + val, period);
3582 offset = nr * period;
3583 val -= offset;
3584 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3585 goto again;
3586
3587 return nr;
3588}
3589
3590static void perf_swcounter_overflow(struct perf_counter *counter,
3591 int nmi, struct perf_sample_data *data)
3592{
3593 struct hw_perf_counter *hwc = &counter->hw;
3594 u64 overflow;
3595
3596 data->period = counter->hw.last_period;
3597 overflow = perf_swcounter_set_period(counter);
3598
3599 if (hwc->interrupts == MAX_INTERRUPTS)
3600 return;
3601
3602 for (; overflow; overflow--) {
3603 if (perf_counter_overflow(counter, nmi, data)) {
3604 /*
3605 * We inhibit the overflow from happening when
3606 * hwc->interrupts == MAX_INTERRUPTS.
3607 */
3608 break;
3609 }
3610 }
3611}
3612
3613static void perf_swcounter_unthrottle(struct perf_counter *counter)
3614{
3615 /*
3616 * Nothing to do, we already reset hwc->interrupts.
3617 */
3618}
3619
3620static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3621 int nmi, struct perf_sample_data *data)
3622{
3623 struct hw_perf_counter *hwc = &counter->hw;
3624
3625 atomic64_add(nr, &counter->count);
3626
3627 if (!hwc->sample_period)
3628 return;
3629
3630 if (!data->regs)
3631 return;
3632
3633 if (!atomic64_add_negative(nr, &hwc->period_left))
3634 perf_swcounter_overflow(counter, nmi, data);
3635}
3636
3637static int perf_swcounter_is_counting(struct perf_counter *counter)
3638{
3639 /*
3640 * The counter is active, we're good!
3641 */
3642 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3643 return 1;
3644
3645 /*
3646 * The counter is off/error, not counting.
3647 */
3648 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3649 return 0;
3650
3651 /*
3652 * The counter is inactive; if the context is active, we are
3653 * part of a group that didn't make it onto the 'pmu', so
3654 * we're not counting.
3655 */
3656 if (counter->ctx->is_active)
3657 return 0;
3658
3659 /*
3660 * We're inactive and the context is too; this means the
3661 * task is scheduled out and we're counting events that
3662 * happen to us anyway, like migration events.
3663 */
3664 return 1;
3665}
3666
3667static int perf_swcounter_match(struct perf_counter *counter,
3668 enum perf_type_id type,
3669 u32 event, struct pt_regs *regs)
3670{
3671 if (!perf_swcounter_is_counting(counter))
3672 return 0;
3673
3674 if (counter->attr.type != type)
3675 return 0;
3676 if (counter->attr.config != event)
3677 return 0;
3678
3679 if (regs) {
3680 if (counter->attr.exclude_user && user_mode(regs))
3681 return 0;
3682
3683 if (counter->attr.exclude_kernel && !user_mode(regs))
3684 return 0;
3685 }
3686
3687 return 1;
3688}
3689
3690static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3691 enum perf_type_id type,
3692 u32 event, u64 nr, int nmi,
3693 struct perf_sample_data *data)
3694{
3695 struct perf_counter *counter;
3696
3697 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3698 return;
3699
3700 rcu_read_lock();
3701 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3702 if (perf_swcounter_match(counter, type, event, data->regs))
3703 perf_swcounter_add(counter, nr, nmi, data);
3704 }
3705 rcu_read_unlock();
3706}
3707
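/*
 * One recursion flag per execution context (task, softirq, hardirq,
 * NMI): a software event raised while we are already processing one
 * at the same level is dropped, while nesting across levels works.
 */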
3708static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3709{
3710 if (in_nmi())
3711 return &cpuctx->recursion[3];
3712
3713 if (in_irq())
3714 return &cpuctx->recursion[2];
3715
3716 if (in_softirq())
3717 return &cpuctx->recursion[1];
3718
3719 return &cpuctx->recursion[0];
3720}
3721
3722static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3723 u64 nr, int nmi,
3724 struct perf_sample_data *data)
3725{
3726 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3727 int *recursion = perf_swcounter_recursion_context(cpuctx);
3728 struct perf_counter_context *ctx;
3729
3730 if (*recursion)
3731 goto out;
3732
3733 (*recursion)++;
3734 barrier();
3735
3736 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3737 nr, nmi, data);
3738 rcu_read_lock();
3739 /*
3740 * it doesn't really matter which of the child contexts the
3741 * event ends up in.
3742 */
3743 ctx = rcu_dereference(current->perf_counter_ctxp);
3744 if (ctx)
3745 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3746 rcu_read_unlock();
3747
3748 barrier();
3749 (*recursion)--;
3750
3751out:
3752 put_cpu_var(perf_cpu_context);
3753}
3754
3755void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3756 struct pt_regs *regs, u64 addr)
3757{
3758 struct perf_sample_data data = {
3759 .regs = regs,
3760 .addr = addr,
3761 };
3762
3763 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3764}
3765
3766static void perf_swcounter_read(struct perf_counter *counter)
3767{
3768}
3769
3770static int perf_swcounter_enable(struct perf_counter *counter)
3771{
3772 struct hw_perf_counter *hwc = &counter->hw;
3773
3774 if (hwc->sample_period) {
3775 hwc->last_period = hwc->sample_period;
3776 perf_swcounter_set_period(counter);
3777 }
3778 return 0;
3779}
3780
3781static void perf_swcounter_disable(struct perf_counter *counter)
3782{
3783}
3784
3785static const struct pmu perf_ops_generic = {
3786 .enable = perf_swcounter_enable,
3787 .disable = perf_swcounter_disable,
3788 .read = perf_swcounter_read,
3789 .unthrottle = perf_swcounter_unthrottle,
3790};
3791
3792/*
3793 * hrtimer based swcounter callback
3794 */
3795
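/*
 * Note that the timer period is clamped to at least 10000ns (10us)
 * here and in the clock-counter enable paths, so a tiny sample_period
 * cannot turn the hrtimer into a busy loop.
 */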
3796static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3797{
3798 enum hrtimer_restart ret = HRTIMER_RESTART;
3799 struct perf_sample_data data;
3800 struct perf_counter *counter;
3801 u64 period;
3802
3803 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3804 counter->pmu->read(counter);
3805
3806 data.addr = 0;
3807 data.regs = get_irq_regs();
3808 /*
3809 * In case we exclude kernel IPs or are somehow not in interrupt
3810 * context, provide the next best thing, the user IP.
3811 */
3812 if ((counter->attr.exclude_kernel || !data.regs) &&
3813 !counter->attr.exclude_user)
3814 data.regs = task_pt_regs(current);
3815
3816 if (data.regs) {
3817 if (perf_counter_overflow(counter, 0, &data))
3818 ret = HRTIMER_NORESTART;
3819 }
3820
3821 period = max_t(u64, 10000, counter->hw.sample_period);
3822 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3823
3824 return ret;
3825}
3826
3827/*
3828 * Software counter: cpu wall time clock
3829 */
3830
3831static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3832{
3833 int cpu = raw_smp_processor_id();
3834 s64 prev;
3835 u64 now;
3836
3837 now = cpu_clock(cpu);
3838 prev = atomic64_read(&counter->hw.prev_count);
3839 atomic64_set(&counter->hw.prev_count, now);
3840 atomic64_add(now - prev, &counter->count);
3841}
3842
3843static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3844{
3845 struct hw_perf_counter *hwc = &counter->hw;
3846 int cpu = raw_smp_processor_id();
3847
3848 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3849 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3850 hwc->hrtimer.function = perf_swcounter_hrtimer;
3851 if (hwc->sample_period) {
3852 u64 period = max_t(u64, 10000, hwc->sample_period);
3853 __hrtimer_start_range_ns(&hwc->hrtimer,
3854 ns_to_ktime(period), 0,
3855 HRTIMER_MODE_REL, 0);
3856 }
3857
3858 return 0;
3859}
3860
3861static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3862{
3863 if (counter->hw.sample_period)
3864 hrtimer_cancel(&counter->hw.hrtimer);
3865 cpu_clock_perf_counter_update(counter);
3866}
3867
3868static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3869{
3870 cpu_clock_perf_counter_update(counter);
3871}
3872
3873static const struct pmu perf_ops_cpu_clock = {
3874 .enable = cpu_clock_perf_counter_enable,
3875 .disable = cpu_clock_perf_counter_disable,
3876 .read = cpu_clock_perf_counter_read,
3877};
3878
3879/*
3880 * Software counter: task time clock
3881 */
3882
3883static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3884{
3885 u64 prev;
3886 s64 delta;
3887
3888 prev = atomic64_xchg(&counter->hw.prev_count, now);
3889 delta = now - prev;
3890 atomic64_add(delta, &counter->count);
3891}
3892
3893static int task_clock_perf_counter_enable(struct perf_counter *counter)
3894{
3895 struct hw_perf_counter *hwc = &counter->hw;
3896 u64 now;
3897
3898 now = counter->ctx->time;
3899
3900 atomic64_set(&hwc->prev_count, now);
3901 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3902 hwc->hrtimer.function = perf_swcounter_hrtimer;
3903 if (hwc->sample_period) {
3904 u64 period = max_t(u64, 10000, hwc->sample_period);
3905 __hrtimer_start_range_ns(&hwc->hrtimer,
3906 ns_to_ktime(period), 0,
3907 HRTIMER_MODE_REL, 0);
3908 }
3909
3910 return 0;
3911}
3912
3913static void task_clock_perf_counter_disable(struct perf_counter *counter)
3914{
3915 if (counter->hw.sample_period)
3916 hrtimer_cancel(&counter->hw.hrtimer);
3917 task_clock_perf_counter_update(counter, counter->ctx->time);
3918
3919}
3920
3921static void task_clock_perf_counter_read(struct perf_counter *counter)
3922{
3923 u64 time;
3924
3925 if (!in_nmi()) {
3926 update_context_time(counter->ctx);
3927 time = counter->ctx->time;
3928 } else {
3929 u64 now = perf_clock();
3930 u64 delta = now - counter->ctx->timestamp;
3931 time = counter->ctx->time + delta;
3932 }
3933
3934 task_clock_perf_counter_update(counter, time);
3935}
3936
3937static const struct pmu perf_ops_task_clock = {
3938 .enable = task_clock_perf_counter_enable,
3939 .disable = task_clock_perf_counter_disable,
3940 .read = task_clock_perf_counter_read,
3941};
3942
3943#ifdef CONFIG_EVENT_PROFILE
3944void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3945 int entry_size)
3946{
3947 struct perf_raw_record raw = {
3948 .size = entry_size,
3949 .data = record,
3950 };
3951
3952 struct perf_sample_data data = {
3953 .regs = get_irq_regs(),
3954 .addr = addr,
3955 .raw = &raw,
3956 };
3957
3958 if (!data.regs)
3959 data.regs = task_pt_regs(current);
3960
3961 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3962}
3963EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3964
3965extern int ftrace_profile_enable(int);
3966extern void ftrace_profile_disable(int);
3967
3968static void tp_perf_counter_destroy(struct perf_counter *counter)
3969{
3970 ftrace_profile_disable(counter->attr.config);
3971}
3972
3973static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3974{
3975 /*
3976 * Raw tracepoint data is a severe data leak; only allow root
3977 * to have it.
3978 */
3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3981 !capable(CAP_SYS_ADMIN))
3982 return ERR_PTR(-EPERM);
3983
3984 if (ftrace_profile_enable(counter->attr.config))
3985 return NULL;
3986
3987 counter->destroy = tp_perf_counter_destroy;
3988
3989 return &perf_ops_generic;
3990}
3991#else
3992static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3993{
3994 return NULL;
3995}
3996#endif
3997
3998atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3999
4000static void sw_perf_counter_destroy(struct perf_counter *counter)
4001{
4002 u64 event = counter->attr.config;
4003
4004 WARN_ON(counter->parent);
4005
4006 atomic_dec(&perf_swcounter_enabled[event]);
4007}
4008
4009static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
4010{
4011 const struct pmu *pmu = NULL;
4012 u64 event = counter->attr.config;
4013
4014 /*
4015 * Software counters (currently) can't in general distinguish
4016 * between user, kernel and hypervisor events.
4017 * However, context switches and cpu migrations are considered
4018 * to be kernel events, and page faults are never hypervisor
4019 * events.
4020 */
4021 switch (event) {
4022 case PERF_COUNT_SW_CPU_CLOCK:
4023 pmu = &perf_ops_cpu_clock;
4024
4025 break;
4026 case PERF_COUNT_SW_TASK_CLOCK:
4027 /*
4028 * If the user instantiates this as a per-cpu counter,
4029 * use the cpu_clock counter instead.
4030 */
4031 if (counter->ctx->task)
4032 pmu = &perf_ops_task_clock;
4033 else
4034 pmu = &perf_ops_cpu_clock;
4035
4036 break;
4037 case PERF_COUNT_SW_PAGE_FAULTS:
4038 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4039 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4040 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4041 case PERF_COUNT_SW_CPU_MIGRATIONS:
4042 if (!counter->parent) {
4043 atomic_inc(&perf_swcounter_enabled[event]);
4044 counter->destroy = sw_perf_counter_destroy;
4045 }
4046 pmu = &perf_ops_generic;
4047 break;
4048 }
4049
4050 return pmu;
4051}
4052
4053/*
4054 * Allocate and initialize a counter structure
4055 */
4056static struct perf_counter *
4057perf_counter_alloc(struct perf_counter_attr *attr,
4058 int cpu,
4059 struct perf_counter_context *ctx,
4060 struct perf_counter *group_leader,
4061 struct perf_counter *parent_counter,
4062 gfp_t gfpflags)
4063{
4064 const struct pmu *pmu;
4065 struct perf_counter *counter;
4066 struct hw_perf_counter *hwc;
4067 long err;
4068
4069 counter = kzalloc(sizeof(*counter), gfpflags);
4070 if (!counter)
4071 return ERR_PTR(-ENOMEM);
4072
4073 /*
4074 * Single counters are their own group leaders, with an
4075 * empty sibling list:
4076 */
4077 if (!group_leader)
4078 group_leader = counter;
4079
4080 mutex_init(&counter->child_mutex);
4081 INIT_LIST_HEAD(&counter->child_list);
4082
4083 INIT_LIST_HEAD(&counter->list_entry);
4084 INIT_LIST_HEAD(&counter->event_entry);
4085 INIT_LIST_HEAD(&counter->sibling_list);
4086 init_waitqueue_head(&counter->waitq);
4087
4088 mutex_init(&counter->mmap_mutex);
4089
4090 counter->cpu = cpu;
4091 counter->attr = *attr;
4092 counter->group_leader = group_leader;
4093 counter->pmu = NULL;
4094 counter->ctx = ctx;
4095 counter->oncpu = -1;
4096
4097 counter->parent = parent_counter;
4098
4099 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4100 counter->id = atomic64_inc_return(&perf_counter_id);
4101
4102 counter->state = PERF_COUNTER_STATE_INACTIVE;
4103
4104 if (attr->disabled)
4105 counter->state = PERF_COUNTER_STATE_OFF;
4106
4107 pmu = NULL;
4108
4109 hwc = &counter->hw;
4110 hwc->sample_period = attr->sample_period;
4111 if (attr->freq && attr->sample_freq)
4112 hwc->sample_period = 1;
4113 hwc->last_period = hwc->sample_period;
4114
4115 atomic64_set(&hwc->period_left, hwc->sample_period);
4116
4117 /*
4118 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4119 */
4120 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4121 goto done;
4122
4123 switch (attr->type) {
4124 case PERF_TYPE_RAW:
4125 case PERF_TYPE_HARDWARE:
4126 case PERF_TYPE_HW_CACHE:
4127 pmu = hw_perf_counter_init(counter);
4128 break;
4129
4130 case PERF_TYPE_SOFTWARE:
4131 pmu = sw_perf_counter_init(counter);
4132 break;
4133
4134 case PERF_TYPE_TRACEPOINT:
4135 pmu = tp_perf_counter_init(counter);
4136 break;
4137
4138 default:
4139 break;
4140 }
4141done:
4142 err = 0;
4143 if (!pmu)
4144 err = -EINVAL;
4145 else if (IS_ERR(pmu))
4146 err = PTR_ERR(pmu);
4147
4148 if (err) {
4149 if (counter->ns)
4150 put_pid_ns(counter->ns);
4151 kfree(counter);
4152 return ERR_PTR(err);
4153 }
4154
4155 counter->pmu = pmu;
4156
4157 if (!counter->parent) {
4158 atomic_inc(&nr_counters);
4159 if (counter->attr.mmap)
4160 atomic_inc(&nr_mmap_counters);
4161 if (counter->attr.comm)
4162 atomic_inc(&nr_comm_counters);
4163 if (counter->attr.task)
4164 atomic_inc(&nr_task_counters);
4165 }
4166
4167 return counter;
4168}
4169
4170static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4171 struct perf_counter_attr *attr)
4172{
4173 int ret;
4174 u32 size;
4175
4176 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4177 return -EFAULT;
4178
4179 /*
4180 * Zero the full structure, so that a short copy from user space leaves the trailing fields zeroed.
4181 */
4182 memset(attr, 0, sizeof(*attr));
4183
4184 ret = get_user(size, &uattr->size);
4185 if (ret)
4186 return ret;
4187
4188 if (size > PAGE_SIZE) /* silly large */
4189 goto err_size;
4190
4191 if (!size) /* abi compat */
4192 size = PERF_ATTR_SIZE_VER0;
4193
4194 if (size < PERF_ATTR_SIZE_VER0)
4195 goto err_size;
4196
4197 /*
4198 * If we're handed a bigger struct than we know of,
4199 * ensure all the unknown bits are 0.
4200 */
4201 if (size > sizeof(*attr)) {
4202 unsigned long val;
4203 unsigned long __user *addr;
4204 unsigned long __user *end;
4205
4206 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4207 sizeof(unsigned long));
4208 end = PTR_ALIGN((void __user *)uattr + size,
4209 sizeof(unsigned long));
4210
4211 for (; addr < end; addr += sizeof(unsigned long)) {
4212 ret = get_user(val, addr);
4213 if (ret)
4214 return ret;
4215 if (val)
4216 goto err_size;
4217 }
4218 size = sizeof(*attr);
4219 }
4220
4221 ret = copy_from_user(attr, uattr, size);
4222 if (ret)
4223 return -EFAULT;
4224
4225 /*
4226 * If the type exists, the corresponding creation will verify
4227 * the attr->config.
4228 */
4229 if (attr->type >= PERF_TYPE_MAX)
4230 return -EINVAL;
4231
4232 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4233 return -EINVAL;
4234
4235 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4236 return -EINVAL;
4237
4238 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4239 return -EINVAL;
4240
4241out:
4242 return ret;
4243
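	/*
	 * Report the attr size this kernel does support back to user
	 * space along with -E2BIG, so user space knows which size the
	 * kernel understands.
	 */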
4244err_size:
4245 put_user(sizeof(*attr), &uattr->size);
4246 ret = -E2BIG;
4247 goto out;
4248}
4249
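/*
 * Redirect this counter's output into another perf counter's buffer
 * (used for PERF_FLAG_FD_OUTPUT); output_fd == 0 removes an existing
 * redirection instead.
 */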
4250int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4251{
4252 struct perf_counter *output_counter = NULL;
4253 struct file *output_file = NULL;
4254 struct perf_counter *old_output;
4255 int fput_needed = 0;
4256 int ret = -EINVAL;
4257
4258 if (!output_fd)
4259 goto set;
4260
4261 output_file = fget_light(output_fd, &fput_needed);
4262 if (!output_file)
4263 return -EBADF;
4264
4265 if (output_file->f_op != &perf_fops)
4266 goto out;
4267
4268 output_counter = output_file->private_data;
4269
4270 /* Don't chain output fds */
4271 if (output_counter->output)
4272 goto out;
4273
4274 /* Don't set an output fd when we already have an output channel */
4275 if (counter->data)
4276 goto out;
4277
4278 atomic_long_inc(&output_file->f_count);
4279
4280set:
4281 mutex_lock(&counter->mmap_mutex);
4282 old_output = counter->output;
4283 rcu_assign_pointer(counter->output, output_counter);
4284 mutex_unlock(&counter->mmap_mutex);
4285
4286 if (old_output) {
4287 /*
4288 * we need to make sure no existing perf_output_*()
4289 * is still referencing this counter.
4290 */
4291 synchronize_rcu();
4292 fput(old_output->filp);
4293 }
4294
4295 ret = 0;
4296out:
4297 fput_light(output_file, fput_needed);
4298 return ret;
4299}
4300
4301/**
4302 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4303 *
4304 * @attr_uptr: event type attributes for monitoring/sampling
4305 * @pid: target pid
4306 * @cpu: target cpu
4307 * @group_fd: group leader counter fd
 * @flags: PERF_FLAG_* bits (PERF_FLAG_FD_NO_GROUP, PERF_FLAG_FD_OUTPUT)
4308 */
4309SYSCALL_DEFINE5(perf_counter_open,
4310 struct perf_counter_attr __user *, attr_uptr,
4311 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4312{
4313 struct perf_counter *counter, *group_leader;
4314 struct perf_counter_attr attr;
4315 struct perf_counter_context *ctx;
4316 struct file *counter_file = NULL;
4317 struct file *group_file = NULL;
4318 int fput_needed = 0;
4319 int fput_needed2 = 0;
4320 int err;
4321
4322 /* for future expandability... */
4323 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4324 return -EINVAL;
4325
4326 err = perf_copy_attr(attr_uptr, &attr);
4327 if (err)
4328 return err;
4329
4330 if (!attr.exclude_kernel) {
4331 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4332 return -EACCES;
4333 }
4334
4335 if (attr.freq) {
4336 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4337 return -EINVAL;
4338 }
4339
4340 /*
4341 * Get the target context (task or percpu):
4342 */
4343 ctx = find_get_context(pid, cpu);
4344 if (IS_ERR(ctx))
4345 return PTR_ERR(ctx);
4346
4347 /*
4348 * Look up the group leader (we will attach this counter to it):
4349 */
4350 group_leader = NULL;
4351 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4352 err = -EINVAL;
4353 group_file = fget_light(group_fd, &fput_needed);
4354 if (!group_file)
4355 goto err_put_context;
4356 if (group_file->f_op != &perf_fops)
4357 goto err_put_context;
4358
4359 group_leader = group_file->private_data;
4360 /*
4361 * Do not allow a recursive hierarchy (this new sibling
4362 * becoming part of another group-sibling):
4363 */
4364 if (group_leader->group_leader != group_leader)
4365 goto err_put_context;
4366 /*
4367 * Do not allow to attach to a group in a different
4368 * task or CPU context:
4369 */
4370 if (group_leader->ctx != ctx)
4371 goto err_put_context;
4372 /*
4373 * Only a group leader can be exclusive or pinned
4374 */
4375 if (attr.exclusive || attr.pinned)
4376 goto err_put_context;
4377 }
4378
4379 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4380 NULL, GFP_KERNEL);
4381 err = PTR_ERR(counter);
4382 if (IS_ERR(counter))
4383 goto err_put_context;
4384
4385 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4386 if (err < 0)
4387 goto err_free_put_context;
4388
4389 counter_file = fget_light(err, &fput_needed2);
4390 if (!counter_file)
4391 goto err_free_put_context;
4392
4393 if (flags & PERF_FLAG_FD_OUTPUT) {
4394 err = perf_counter_set_output(counter, group_fd);
4395 if (err)
4396 goto err_fput_free_put_context;
4397 }
4398
4399 counter->filp = counter_file;
4400 WARN_ON_ONCE(ctx->parent_ctx);
4401 mutex_lock(&ctx->mutex);
4402 perf_install_in_context(ctx, counter, cpu);
4403 ++ctx->generation;
4404 mutex_unlock(&ctx->mutex);
4405
4406 counter->owner = current;
4407 get_task_struct(current);
4408 mutex_lock(&current->perf_counter_mutex);
4409 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4410 mutex_unlock(&current->perf_counter_mutex);
4411
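	/*
	 * Note: the success path deliberately falls through the labels
	 * below; the err < 0 checks prevent any freeing, and the
	 * fput_light() calls only drop the temporary references.
	 */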
4412err_fput_free_put_context:
4413 fput_light(counter_file, fput_needed2);
4414
4415err_free_put_context:
4416 if (err < 0)
4417 kfree(counter);
4418
4419err_put_context:
4420 if (err < 0)
4421 put_ctx(ctx);
4422
4423 fput_light(group_file, fput_needed);
4424
4425 return err;
4426}
4427
4428/*
4429 * inherit a counter from parent task to child task:
4430 */
4431static struct perf_counter *
4432inherit_counter(struct perf_counter *parent_counter,
4433 struct task_struct *parent,
4434 struct perf_counter_context *parent_ctx,
4435 struct task_struct *child,
4436 struct perf_counter *group_leader,
4437 struct perf_counter_context *child_ctx)
4438{
4439 struct perf_counter *child_counter;
4440
4441 /*
4442 * Instead of creating recursive hierarchies of counters,
4443 * we link inherited counters back to the original parent,
4444 * which is guaranteed to have a filp that we use as the
4445 * reference count:
4446 */
4447 if (parent_counter->parent)
4448 parent_counter = parent_counter->parent;
4449
4450 child_counter = perf_counter_alloc(&parent_counter->attr,
4451 parent_counter->cpu, child_ctx,
4452 group_leader, parent_counter,
4453 GFP_KERNEL);
4454 if (IS_ERR(child_counter))
4455 return child_counter;
4456 get_ctx(child_ctx);
4457
4458 /*
4459 * Make the child state follow the state of the parent counter,
4460 * not its attr.disabled bit. We hold the parent's mutex,
4461 * so we won't race with perf_counter_{en, dis}able_family.
4462 */
4463 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4464 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4465 else
4466 child_counter->state = PERF_COUNTER_STATE_OFF;
4467
4468 if (parent_counter->attr.freq)
4469 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4470
4471 /*
4472 * Link it up in the child's context:
4473 */
4474 add_counter_to_ctx(child_counter, child_ctx);
4475
4476 /*
4477 * Get a reference to the parent filp - we will fput it
4478 * when the child counter exits. This is safe to do because
4479 * we are in the parent and we know that the filp still
4480 * exists and has a nonzero count:
4481 */
4482 atomic_long_inc(&parent_counter->filp->f_count);
4483
4484 /*
4485 * Link this into the parent counter's child list
4486 */
4487 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4488 mutex_lock(&parent_counter->child_mutex);
4489 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4490 mutex_unlock(&parent_counter->child_mutex);
4491
4492 return child_counter;
4493}
4494
4495static int inherit_group(struct perf_counter *parent_counter,
4496 struct task_struct *parent,
4497 struct perf_counter_context *parent_ctx,
4498 struct task_struct *child,
4499 struct perf_counter_context *child_ctx)
4500{
4501 struct perf_counter *leader;
4502 struct perf_counter *sub;
4503 struct perf_counter *child_ctr;
4504
4505 leader = inherit_counter(parent_counter, parent, parent_ctx,
4506 child, NULL, child_ctx);
4507 if (IS_ERR(leader))
4508 return PTR_ERR(leader);
4509 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4510 child_ctr = inherit_counter(sub, parent, parent_ctx,
4511 child, leader, child_ctx);
4512 if (IS_ERR(child_ctr))
4513 return PTR_ERR(child_ctr);
4514 }
4515 return 0;
4516}
4517
4518static void sync_child_counter(struct perf_counter *child_counter,
4519 struct task_struct *child)
4520{
4521 struct perf_counter *parent_counter = child_counter->parent;
4522 u64 child_val;
4523
4524 if (child_counter->attr.inherit_stat)
4525 perf_counter_read_event(child_counter, child);
4526
4527 child_val = atomic64_read(&child_counter->count);
4528
4529 /*
4530 * Add back the child's count to the parent's count:
4531 */
4532 atomic64_add(child_val, &parent_counter->count);
4533 atomic64_add(child_counter->total_time_enabled,
4534 &parent_counter->child_total_time_enabled);
4535 atomic64_add(child_counter->total_time_running,
4536 &parent_counter->child_total_time_running);
4537
4538 /*
4539 * Remove this counter from the parent's list
4540 */
4541 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4542 mutex_lock(&parent_counter->child_mutex);
4543 list_del_init(&child_counter->child_list);
4544 mutex_unlock(&parent_counter->child_mutex);
4545
4546 /*
4547 * Release the parent counter, if this was the last
4548 * reference to it.
4549 */
4550 fput(parent_counter->filp);
4551}
4552
4553static void
4554__perf_counter_exit_task(struct perf_counter *child_counter,
4555 struct perf_counter_context *child_ctx,
4556 struct task_struct *child)
4557{
4558 struct perf_counter *parent_counter;
4559
4560 update_counter_times(child_counter);
4561 perf_counter_remove_from_context(child_counter);
4562
4563 parent_counter = child_counter->parent;
4564 /*
4565 * It can happen that the parent exits first and still has
4566 * counters around due to the child reference. These counters
4567 * need to be zapped here, otherwise they would linger.
4568 */
4569 if (parent_counter) {
4570 sync_child_counter(child_counter, child);
4571 free_counter(child_counter);
4572 }
4573}
4574
4575/*
4576 * When a child task exits, feed back counter values to parent counters.
4577 */
4578void perf_counter_exit_task(struct task_struct *child)
4579{
4580 struct perf_counter *child_counter, *tmp;
4581 struct perf_counter_context *child_ctx;
4582 unsigned long flags;
4583
4584 if (likely(!child->perf_counter_ctxp)) {
4585 perf_counter_task(child, NULL, 0);
4586 return;
4587 }
4588
4589 local_irq_save(flags);
4590 /*
4591 * We can't reschedule here because interrupts are disabled,
4592 * and either the child is current or it is a task that can't be
4593 * scheduled, so a reschedule can no longer change our context
4594 * underneath us.
4595 */
4596 child_ctx = child->perf_counter_ctxp;
4597 __perf_counter_task_sched_out(child_ctx);
4598
4599 /*
4600 * Take the context lock here so that if find_get_context is
4601 * reading child->perf_counter_ctxp, we wait until it has
4602 * incremented the context's refcount before we do put_ctx below.
4603 */
4604 spin_lock(&child_ctx->lock);
4605 child->perf_counter_ctxp = NULL;
4606 /*
4607 * If this context is a clone, unclone it so it can't get
4608 * swapped to another process while we're removing all
4609 * the counters from it.
4610 */
4611 unclone_ctx(child_ctx);
4612 spin_unlock_irqrestore(&child_ctx->lock, flags);
4613
4614 /*
4615 * Report the task dead after unscheduling the counters so that we
4616 * won't get any samples after PERF_EVENT_EXIT. We can however still
4617 * get a few PERF_EVENT_READ events.
4618 */
4619 perf_counter_task(child, child_ctx, 0);
4620
4621 /*
4622 * We can recurse on the same lock type through:
4623 *
4624 * __perf_counter_exit_task()
4625 * sync_child_counter()
4626 * fput(parent_counter->filp)
4627 * perf_release()
4628 * mutex_lock(&ctx->mutex)
4629 *
4630 * But since it's the parent context it won't be the same instance.
4631 */
4632 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4633
4634again:
4635 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4636 list_entry)
4637 __perf_counter_exit_task(child_counter, child_ctx, child);
4638
4639 /*
4640 * If the last counter was a group counter, it will have appended all
4641 * its siblings to the list, but we obtained 'tmp' before that,
4642 * so it still points to the list head that terminates the iteration.
4643 */
4644 if (!list_empty(&child_ctx->counter_list))
4645 goto again;
4646
4647 mutex_unlock(&child_ctx->mutex);
4648
4649 put_ctx(child_ctx);
4650}
4651
4652/*
4653 * Free an unexposed, unused context as created by inheritance via
4654 * perf_counter_init_task() below; used by fork() in case of failure.
4655 */
4656void perf_counter_free_task(struct task_struct *task)
4657{
4658 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4659 struct perf_counter *counter, *tmp;
4660
4661 if (!ctx)
4662 return;
4663
4664 mutex_lock(&ctx->mutex);
4665again:
4666 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4667 struct perf_counter *parent = counter->parent;
4668
4669 if (WARN_ON_ONCE(!parent))
4670 continue;
4671
4672 mutex_lock(&parent->child_mutex);
4673 list_del_init(&counter->child_list);
4674 mutex_unlock(&parent->child_mutex);
4675
4676 fput(parent->filp);
4677
4678 list_del_counter(counter, ctx);
4679 free_counter(counter);
4680 }
4681
4682 if (!list_empty(&ctx->counter_list))
4683 goto again;
4684
4685 mutex_unlock(&ctx->mutex);
4686
4687 put_ctx(ctx);
4688}
4689
4690/*
4691 * Initialize the perf_counter context in task_struct
4692 */
4693int perf_counter_init_task(struct task_struct *child)
4694{
4695 struct perf_counter_context *child_ctx, *parent_ctx;
4696 struct perf_counter_context *cloned_ctx;
4697 struct perf_counter *counter;
4698 struct task_struct *parent = current;
4699 int inherited_all = 1;
4700 int ret = 0;
4701
4702 child->perf_counter_ctxp = NULL;
4703
4704 mutex_init(&child->perf_counter_mutex);
4705 INIT_LIST_HEAD(&child->perf_counter_list);
4706
4707 if (likely(!parent->perf_counter_ctxp))
4708 return 0;
4709
4710 /*
4711 * This is executed from the parent task context, so inherit
4712 * counters that have been marked for cloning.
4713 * First allocate and initialize a context for the child.
4714 */
4715
4716 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4717 if (!child_ctx)
4718 return -ENOMEM;
4719
4720 __perf_counter_init_context(child_ctx, child);
4721 child->perf_counter_ctxp = child_ctx;
4722 get_task_struct(child);
4723
4724 /*
4725 * If the parent's context is a clone, pin it so it won't get
4726 * swapped under us.
4727 */
4728 parent_ctx = perf_pin_task_context(parent);
4729
4730 /*
4731 * No need to check if parent_ctx != NULL here; since we saw
4732 * it non-NULL earlier, the only reason for it to become NULL
4733 * is if we exit, and since we're currently in the middle of
4734 * a fork we can't be exiting at the same time.
4735 */
4736
4737 /*
4738 * Lock the parent list. No need to lock the child - not PID
4739 * hashed yet and not running, so nobody can access it.
4740 */
4741 mutex_lock(&parent_ctx->mutex);
4742
4743 /*
4744 * We don't have to disable NMIs - we are only looking at
4745 * the list, not manipulating it:
4746 */
4747 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4748 if (counter != counter->group_leader)
4749 continue;
4750
4751 if (!counter->attr.inherit) {
4752 inherited_all = 0;
4753 continue;
4754 }
4755
4756 ret = inherit_group(counter, parent, parent_ctx,
4757 child, child_ctx);
4758 if (ret) {
4759 inherited_all = 0;
4760 break;
4761 }
4762 }
4763
4764 if (inherited_all) {
4765 /*
4766 * Mark the child context as a clone of the parent
4767 * context, or of whatever the parent is a clone of.
4768 * Note that if the parent is a clone, it could get
4769 * uncloned at any point, but that doesn't matter
4770 * because the list of counters and the generation
4771 * count can't have changed since we took the mutex.
4772 */
4773 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4774 if (cloned_ctx) {
4775 child_ctx->parent_ctx = cloned_ctx;
4776 child_ctx->parent_gen = parent_ctx->parent_gen;
4777 } else {
4778 child_ctx->parent_ctx = parent_ctx;
4779 child_ctx->parent_gen = parent_ctx->generation;
4780 }
4781 get_ctx(child_ctx->parent_ctx);
4782 }
4783
4784 mutex_unlock(&parent_ctx->mutex);
4785
4786 perf_unpin_context(parent_ctx);
4787
4788 return ret;
4789}
4790
4791static void __cpuinit perf_counter_init_cpu(int cpu)
4792{
4793 struct perf_cpu_context *cpuctx;
4794
4795 cpuctx = &per_cpu(perf_cpu_context, cpu);
4796 __perf_counter_init_context(&cpuctx->ctx, NULL);
4797
4798 spin_lock(&perf_resource_lock);
4799 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4800 spin_unlock(&perf_resource_lock);
4801
4802 hw_perf_counter_setup(cpu);
4803}
4804
4805#ifdef CONFIG_HOTPLUG_CPU
4806static void __perf_counter_exit_cpu(void *info)
4807{
4808 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4809 struct perf_counter_context *ctx = &cpuctx->ctx;
4810 struct perf_counter *counter, *tmp;
4811
4812 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4813 __perf_counter_remove_from_context(counter);
4814}
4815static void perf_counter_exit_cpu(int cpu)
4816{
4817 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4818 struct perf_counter_context *ctx = &cpuctx->ctx;
4819
4820 mutex_lock(&ctx->mutex);
4821 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4822 mutex_unlock(&ctx->mutex);
4823}
4824#else
4825static inline void perf_counter_exit_cpu(int cpu) { }
4826#endif
4827
4828static int __cpuinit
4829perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4830{
4831 unsigned int cpu = (long)hcpu;
4832
4833 switch (action) {
4834
4835 case CPU_UP_PREPARE:
4836 case CPU_UP_PREPARE_FROZEN:
4837 perf_counter_init_cpu(cpu);
4838 break;
4839
4840 case CPU_ONLINE:
4841 case CPU_ONLINE_FROZEN:
4842 hw_perf_counter_setup_online(cpu);
4843 break;
4844
4845 case CPU_DOWN_PREPARE:
4846 case CPU_DOWN_PREPARE_FROZEN:
4847 perf_counter_exit_cpu(cpu);
4848 break;
4849
4850 default:
4851 break;
4852 }
4853
4854 return NOTIFY_OK;
4855}
4856
4857/*
4858 * This has to have a higher priority than migration_notifier in sched.c.
4859 */
4860static struct notifier_block __cpuinitdata perf_cpu_nb = {
4861 .notifier_call = perf_cpu_notify,
4862 .priority = 20,
4863};
4864
4865void __init perf_counter_init(void)
4866{
4867 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4868 (void *)(long)smp_processor_id());
4869 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4870 (void *)(long)smp_processor_id());
4871 register_cpu_notifier(&perf_cpu_nb);
4872}
4873
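/*
 * sysfs knobs, registered below under the cpu sysdev class as the
 * "perf_counters" attribute group: reserve_percpu reserves that many
 * counters on each CPU (shrinking max_pertask accordingly), and
 * overcommit is a 0/1 toggle checked elsewhere in this file.
 */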
4874static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4875{
4876 return sprintf(buf, "%d\n", perf_reserved_percpu);
4877}
4878
4879static ssize_t
4880perf_set_reserve_percpu(struct sysdev_class *class,
4881 const char *buf,
4882 size_t count)
4883{
4884 struct perf_cpu_context *cpuctx;
4885 unsigned long val;
4886 int err, cpu, mpt;
4887
4888 err = strict_strtoul(buf, 10, &val);
4889 if (err)
4890 return err;
4891 if (val > perf_max_counters)
4892 return -EINVAL;
4893
4894 spin_lock(&perf_resource_lock);
4895 perf_reserved_percpu = val;
4896 for_each_online_cpu(cpu) {
4897 cpuctx = &per_cpu(perf_cpu_context, cpu);
4898 spin_lock_irq(&cpuctx->ctx.lock);
4899 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4900 perf_max_counters - perf_reserved_percpu);
4901 cpuctx->max_pertask = mpt;
4902 spin_unlock_irq(&cpuctx->ctx.lock);
4903 }
4904 spin_unlock(&perf_resource_lock);
4905
4906 return count;
4907}
4908
4909static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4910{
4911 return sprintf(buf, "%d\n", perf_overcommit);
4912}
4913
4914static ssize_t
4915perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4916{
4917 unsigned long val;
4918 int err;
4919
4920 err = strict_strtoul(buf, 10, &val);
4921 if (err)
4922 return err;
4923 if (val > 1)
4924 return -EINVAL;
4925
4926 spin_lock(&perf_resource_lock);
4927 perf_overcommit = val;
4928 spin_unlock(&perf_resource_lock);
4929
4930 return count;
4931}
4932
4933static SYSDEV_CLASS_ATTR(
4934 reserve_percpu,
4935 0644,
4936 perf_show_reserve_percpu,
4937 perf_set_reserve_percpu
4938 );
4939
4940static SYSDEV_CLASS_ATTR(
4941 overcommit,
4942 0644,
4943 perf_show_overcommit,
4944 perf_set_overcommit
4945 );
4946
4947static struct attribute *perfclass_attrs[] = {
4948 &attr_reserve_percpu.attr,
4949 &attr_overcommit.attr,
4950 NULL
4951};
4952
4953static struct attribute_group perfclass_attr_group = {
4954 .attrs = perfclass_attrs,
4955 .name = "perf_counters",
4956};
4957
4958static int __init perf_counter_sysfs_init(void)
4959{
4960 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4961 &perfclass_attr_group);
4962}
4963device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..9d0b5c665883
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5108 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/vmalloc.h>
24#include <linux/hardirq.h>
25#include <linux/rculist.h>
26#include <linux/uaccess.h>
27#include <linux/syscalls.h>
28#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h>
30#include <linux/perf_event.h>
31
32#include <asm/irq_regs.h>
33
34/*
35 * Each CPU has a list of per-CPU events:
36 */
37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38
39int perf_max_events __read_mostly = 1;
40static int perf_reserved_percpu __read_mostly;
41static int perf_overcommit __read_mostly = 1;
42
43static atomic_t nr_events __read_mostly;
44static atomic_t nr_mmap_events __read_mostly;
45static atomic_t nr_comm_events __read_mostly;
46static atomic_t nr_task_events __read_mostly;
47
48/*
49 * perf event paranoia level:
50 * -1 - not paranoid at all
51 * 0 - disallow raw tracepoint access for unpriv
52 * 1 - disallow cpu events for unpriv
53 * 2 - disallow kernel profiling for unpriv
54 */
55int sysctl_perf_event_paranoid __read_mostly = 1;
56
57static inline bool perf_paranoid_tracepoint_raw(void)
58{
59 return sysctl_perf_event_paranoid > -1;
60}
61
62static inline bool perf_paranoid_cpu(void)
63{
64 return sysctl_perf_event_paranoid > 0;
65}
66
67static inline bool perf_paranoid_kernel(void)
68{
69 return sysctl_perf_event_paranoid > 1;
70}
71
72int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
73
74/*
75 * max perf event sample rate
76 */
77int sysctl_perf_event_sample_rate __read_mostly = 100000;
78
79static atomic64_t perf_event_id;
80
81/*
82 * Lock for (sysadmin-configurable) event reservations:
83 */
84static DEFINE_SPINLOCK(perf_resource_lock);
85
86/*
87 * Architecture provided APIs - weak aliases:
88 */
89extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
90{
91 return NULL;
92}
93
94void __weak hw_perf_disable(void) { barrier(); }
95void __weak hw_perf_enable(void) { barrier(); }
96
97void __weak hw_perf_event_setup(int cpu) { barrier(); }
98void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
99
100int __weak
101hw_perf_group_sched_in(struct perf_event *group_leader,
102 struct perf_cpu_context *cpuctx,
103 struct perf_event_context *ctx, int cpu)
104{
105 return 0;
106}
107
108void __weak perf_event_print_debug(void) { }
109
110static DEFINE_PER_CPU(int, perf_disable_count);
111
112void __perf_disable(void)
113{
114 __get_cpu_var(perf_disable_count)++;
115}
116
117bool __perf_enable(void)
118{
119 return !--__get_cpu_var(perf_disable_count);
120}
121
122void perf_disable(void)
123{
124 __perf_disable();
125 hw_perf_disable();
126}
127
128void perf_enable(void)
129{
130 if (__perf_enable())
131 hw_perf_enable();
132}
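/*
 * Illustrative sketch: perf_disable()/perf_enable() nest via the per-CPU
 * perf_disable_count, so the PMU is only re-enabled by the outermost
 * perf_enable().  A hypothetical caller (not taken from this patch):
 *
 *	perf_disable();		// count 0 -> 1, hw_perf_disable()
 *	perf_disable();		// count 1 -> 2, hardware already off
 *	... manipulate event lists safely ...
 *	perf_enable();		// count 2 -> 1, hardware stays off
 *	perf_enable();		// count 1 -> 0, hw_perf_enable()
 */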
133
134static void get_ctx(struct perf_event_context *ctx)
135{
136 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
137}
138
139static void free_ctx(struct rcu_head *head)
140{
141 struct perf_event_context *ctx;
142
143 ctx = container_of(head, struct perf_event_context, rcu_head);
144 kfree(ctx);
145}
146
147static void put_ctx(struct perf_event_context *ctx)
148{
149 if (atomic_dec_and_test(&ctx->refcount)) {
150 if (ctx->parent_ctx)
151 put_ctx(ctx->parent_ctx);
152 if (ctx->task)
153 put_task_struct(ctx->task);
154 call_rcu(&ctx->rcu_head, free_ctx);
155 }
156}
157
158static void unclone_ctx(struct perf_event_context *ctx)
159{
160 if (ctx->parent_ctx) {
161 put_ctx(ctx->parent_ctx);
162 ctx->parent_ctx = NULL;
163 }
164}
165
166/*
167 * If we inherit events we want to return the parent event id
168 * to userspace.
169 */
170static u64 primary_event_id(struct perf_event *event)
171{
172 u64 id = event->id;
173
174 if (event->parent)
175 id = event->parent->id;
176
177 return id;
178}
179
180/*
181 * Get the perf_event_context for a task and lock it.
182 * This has to cope with the fact that until it is locked,
183 * the context could get moved to another task.
184 */
185static struct perf_event_context *
186perf_lock_task_context(struct task_struct *task, unsigned long *flags)
187{
188 struct perf_event_context *ctx;
189
190 rcu_read_lock();
191 retry:
192 ctx = rcu_dereference(task->perf_event_ctxp);
193 if (ctx) {
194 /*
195 * If this context is a clone of another, it might
196 * get swapped for another underneath us by
197 * perf_event_task_sched_out, though the
198 * rcu_read_lock() protects us from any context
199 * getting freed. Lock the context and check if it
200 * got swapped before we could get the lock, and retry
201 * if so. If we locked the right context, then it
202 * can't get swapped on us any more.
203 */
204 spin_lock_irqsave(&ctx->lock, *flags);
205 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
206 spin_unlock_irqrestore(&ctx->lock, *flags);
207 goto retry;
208 }
209
210 if (!atomic_inc_not_zero(&ctx->refcount)) {
211 spin_unlock_irqrestore(&ctx->lock, *flags);
212 ctx = NULL;
213 }
214 }
215 rcu_read_unlock();
216 return ctx;
217}
218
219/*
220 * Get the context for a task and increment its pin_count so it
221 * can't get swapped to another task. This also increments its
222 * reference count so that the context can't get freed.
223 */
224static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
225{
226 struct perf_event_context *ctx;
227 unsigned long flags;
228
229 ctx = perf_lock_task_context(task, &flags);
230 if (ctx) {
231 ++ctx->pin_count;
232 spin_unlock_irqrestore(&ctx->lock, flags);
233 }
234 return ctx;
235}
236
237static void perf_unpin_context(struct perf_event_context *ctx)
238{
239 unsigned long flags;
240
241 spin_lock_irqsave(&ctx->lock, flags);
242 --ctx->pin_count;
243 spin_unlock_irqrestore(&ctx->lock, flags);
244 put_ctx(ctx);
245}
246
247/*
248 * Add an event to the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held.
250 */
251static void
252list_add_event(struct perf_event *event, struct perf_event_context *ctx)
253{
254 struct perf_event *group_leader = event->group_leader;
255
256 /*
257 * Depending on whether it is a standalone or sibling event,
258 * add it straight to the context's event list, or to the group
259 * leader's sibling list:
260 */
261 if (group_leader == event)
262 list_add_tail(&event->group_entry, &ctx->group_list);
263 else {
264 list_add_tail(&event->group_entry, &group_leader->sibling_list);
265 group_leader->nr_siblings++;
266 }
267
268 list_add_rcu(&event->event_entry, &ctx->event_list);
269 ctx->nr_events++;
270 if (event->attr.inherit_stat)
271 ctx->nr_stat++;
272}
273
274/*
275 * Remove an event from the lists for its context.
276 * Must be called with ctx->mutex and ctx->lock held.
277 */
278static void
279list_del_event(struct perf_event *event, struct perf_event_context *ctx)
280{
281 struct perf_event *sibling, *tmp;
282
283 if (list_empty(&event->group_entry))
284 return;
285 ctx->nr_events--;
286 if (event->attr.inherit_stat)
287 ctx->nr_stat--;
288
289 list_del_init(&event->group_entry);
290 list_del_rcu(&event->event_entry);
291
292 if (event->group_leader != event)
293 event->group_leader->nr_siblings--;
294
295 /*
296 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them
298 * to the context list directly:
299 */
300 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
301
302 list_move_tail(&sibling->group_entry, &ctx->group_list);
303 sibling->group_leader = sibling;
304 }
305}
306
307static void
308event_sched_out(struct perf_event *event,
309 struct perf_cpu_context *cpuctx,
310 struct perf_event_context *ctx)
311{
312 if (event->state != PERF_EVENT_STATE_ACTIVE)
313 return;
314
315 event->state = PERF_EVENT_STATE_INACTIVE;
316 if (event->pending_disable) {
317 event->pending_disable = 0;
318 event->state = PERF_EVENT_STATE_OFF;
319 }
320 event->tstamp_stopped = ctx->time;
321 event->pmu->disable(event);
322 event->oncpu = -1;
323
324 if (!is_software_event(event))
325 cpuctx->active_oncpu--;
326 ctx->nr_active--;
327 if (event->attr.exclusive || !cpuctx->active_oncpu)
328 cpuctx->exclusive = 0;
329}
330
331static void
332group_sched_out(struct perf_event *group_event,
333 struct perf_cpu_context *cpuctx,
334 struct perf_event_context *ctx)
335{
336 struct perf_event *event;
337
338 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
339 return;
340
341 event_sched_out(group_event, cpuctx, ctx);
342
343 /*
344 * Schedule out siblings (if any):
345 */
346 list_for_each_entry(event, &group_event->sibling_list, group_entry)
347 event_sched_out(event, cpuctx, ctx);
348
349 if (group_event->attr.exclusive)
350 cpuctx->exclusive = 0;
351}
352
353/*
354 * Cross CPU call to remove a performance event
355 *
356 * We disable the event on the hardware level first. After that we
357 * remove it from the context list.
358 */
359static void __perf_event_remove_from_context(void *info)
360{
361 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
362 struct perf_event *event = info;
363 struct perf_event_context *ctx = event->ctx;
364
365 /*
366 * If this is a task context, we need to check whether it is
367 * the current task context of this cpu. If not it has been
368 * scheduled out before the smp call arrived.
369 */
370 if (ctx->task && cpuctx->task_ctx != ctx)
371 return;
372
373 spin_lock(&ctx->lock);
374 /*
375 * Protect the list operation against NMI by disabling the
376 * events on a global level.
377 */
378 perf_disable();
379
380 event_sched_out(event, cpuctx, ctx);
381
382 list_del_event(event, ctx);
383
384 if (!ctx->task) {
385 /*
386 * Allow more per task events with respect to the
387 * reservation:
388 */
389 cpuctx->max_pertask =
390 min(perf_max_events - ctx->nr_events,
391 perf_max_events - perf_reserved_percpu);
392 }
393
394 perf_enable();
395 spin_unlock(&ctx->lock);
396}
397
398
399/*
400 * Remove the event from a task's (or a CPU's) list of events.
401 *
402 * Must be called with ctx->mutex held.
403 *
404 * CPU events are removed with an smp call. For task events we only
405 * call when the task is on a CPU.
406 *
407 * If event->ctx is a cloned context, callers must make sure that
408 * every task struct that event->ctx->task could possibly point to
409 * remains valid. This is OK when called from perf_release since
410 * that only calls us on the top-level context, which can't be a clone.
411 * When called from perf_event_exit_task, it's OK because the
412 * context has been detached from its task.
413 */
414static void perf_event_remove_from_context(struct perf_event *event)
415{
416 struct perf_event_context *ctx = event->ctx;
417 struct task_struct *task = ctx->task;
418
419 if (!task) {
420 /*
421 * Per cpu events are removed via an smp call and
422 * the removal is always successful.
423 */
424 smp_call_function_single(event->cpu,
425 __perf_event_remove_from_context,
426 event, 1);
427 return;
428 }
429
430retry:
431 task_oncpu_function_call(task, __perf_event_remove_from_context,
432 event);
433
434 spin_lock_irq(&ctx->lock);
435 /*
436 * If the context is active we need to retry the smp call.
437 */
438 if (ctx->nr_active && !list_empty(&event->group_entry)) {
439 spin_unlock_irq(&ctx->lock);
440 goto retry;
441 }
442
443 /*
444 * The lock prevents that this context is scheduled in so we
445 * can remove the event safely, if the call above did not
446 * succeed.
447 */
448 if (!list_empty(&event->group_entry)) {
449 list_del_event(event, ctx);
450 }
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for an event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490}
491
492/*
493 * Update total_time_enabled and total_time_running for all events in a group.
494 */
495static void update_group_times(struct perf_event *leader)
496{
497 struct perf_event *event;
498
499 update_event_times(leader);
500 list_for_each_entry(event, &leader->sibling_list, group_entry)
501 update_event_times(event);
502}
503
504/*
505 * Cross CPU call to disable a performance event
506 */
507static void __perf_event_disable(void *info)
508{
509 struct perf_event *event = info;
510 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
511 struct perf_event_context *ctx = event->ctx;
512
513 /*
514 * If this is a per-task event, need to check whether this
515 * event's task is the current task on this cpu.
516 */
517 if (ctx->task && cpuctx->task_ctx != ctx)
518 return;
519
520 spin_lock(&ctx->lock);
521
522 /*
523 * If the event is on, turn it off.
524 * If it is in error state, leave it in error state.
525 */
526 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
527 update_context_time(ctx);
528 update_group_times(event);
529 if (event == event->group_leader)
530 group_sched_out(event, cpuctx, ctx);
531 else
532 event_sched_out(event, cpuctx, ctx);
533 event->state = PERF_EVENT_STATE_OFF;
534 }
535
536 spin_unlock(&ctx->lock);
537}
538
539/*
540 * Disable an event.
541 *
542 * If event->ctx is a cloned context, callers must make sure that
543 * every task struct that event->ctx->task could possibly point to
544 * remains valid. This condition is satisfied when called through
545 * perf_event_for_each_child or perf_event_for_each because they
546 * hold the top-level event's child_mutex, so any descendant that
547 * goes to exit will block in sync_child_event.
548 * When called from perf_pending_event it's OK because event->ctx
549 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_event_task_sched_out for this context.
551 */
552static void perf_event_disable(struct perf_event *event)
553{
554 struct perf_event_context *ctx = event->ctx;
555 struct task_struct *task = ctx->task;
556
557 if (!task) {
558 /*
559 * Disable the event on the cpu that it's on
560 */
561 smp_call_function_single(event->cpu, __perf_event_disable,
562 event, 1);
563 return;
564 }
565
566 retry:
567 task_oncpu_function_call(task, __perf_event_disable, event);
568
569 spin_lock_irq(&ctx->lock);
570 /*
571 * If the event is still active, we need to retry the cross-call.
572 */
573 if (event->state == PERF_EVENT_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock);
575 goto retry;
576 }
577
578 /*
579 * Since we have the lock this context can't be scheduled
580 * in, so we can change the state safely.
581 */
582 if (event->state == PERF_EVENT_STATE_INACTIVE) {
583 update_group_times(event);
584 event->state = PERF_EVENT_STATE_OFF;
585 }
586
587 spin_unlock_irq(&ctx->lock);
588}
589
590static int
591event_sched_in(struct perf_event *event,
592 struct perf_cpu_context *cpuctx,
593 struct perf_event_context *ctx,
594 int cpu)
595{
596 if (event->state <= PERF_EVENT_STATE_OFF)
597 return 0;
598
599 event->state = PERF_EVENT_STATE_ACTIVE;
600 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
601 /*
602 * The new state must be visible before we turn it on in the hardware:
603 */
604 smp_wmb();
605
606 if (event->pmu->enable(event)) {
607 event->state = PERF_EVENT_STATE_INACTIVE;
608 event->oncpu = -1;
609 return -EAGAIN;
610 }
611
612 event->tstamp_running += ctx->time - event->tstamp_stopped;
613
614 if (!is_software_event(event))
615 cpuctx->active_oncpu++;
616 ctx->nr_active++;
617
618 if (event->attr.exclusive)
619 cpuctx->exclusive = 1;
620
621 return 0;
622}
623
624static int
625group_sched_in(struct perf_event *group_event,
626 struct perf_cpu_context *cpuctx,
627 struct perf_event_context *ctx,
628 int cpu)
629{
630 struct perf_event *event, *partial_group;
631 int ret;
632
633 if (group_event->state == PERF_EVENT_STATE_OFF)
634 return 0;
635
636 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
637 if (ret)
638 return ret < 0 ? ret : 0;
639
640 if (event_sched_in(group_event, cpuctx, ctx, cpu))
641 return -EAGAIN;
642
643 /*
644 * Schedule in siblings as one group (if any):
645 */
646 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
647 if (event_sched_in(event, cpuctx, ctx, cpu)) {
648 partial_group = event;
649 goto group_error;
650 }
651 }
652
653 return 0;
654
655group_error:
656 /*
657 * Groups can be scheduled in as one unit only, so undo any
658 * partial group before returning:
659 */
660 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
661 if (event == partial_group)
662 break;
663 event_sched_out(event, cpuctx, ctx);
664 }
665 event_sched_out(group_event, cpuctx, ctx);
666
667 return -EAGAIN;
668}
669
670/*
671 * Return 1 for a group consisting entirely of software events,
672 * 0 if the group contains any hardware events.
673 */
674static int is_software_only_group(struct perf_event *leader)
675{
676 struct perf_event *event;
677
678 if (!is_software_event(leader))
679 return 0;
680
681 list_for_each_entry(event, &leader->sibling_list, group_entry)
682 if (!is_software_event(event))
683 return 0;
684
685 return 1;
686}
687
688/*
689 * Work out whether we can put this event group on the CPU now.
690 */
691static int group_can_go_on(struct perf_event *event,
692 struct perf_cpu_context *cpuctx,
693 int can_add_hw)
694{
695 /*
696 * Groups consisting entirely of software events can always go on.
697 */
698 if (is_software_only_group(event))
699 return 1;
700 /*
701 * If an exclusive group is already on, no other hardware
702 * events can go on.
703 */
704 if (cpuctx->exclusive)
705 return 0;
706 /*
707 * If this group is exclusive and there are already
708 * events on the CPU, it can't go on.
709 */
710 if (event->attr.exclusive && cpuctx->active_oncpu)
711 return 0;
712 /*
713 * Otherwise, try to add it if all previous groups were able
714 * to go on.
715 */
716 return can_add_hw;
717}
718
719static void add_event_to_ctx(struct perf_event *event,
720 struct perf_event_context *ctx)
721{
722 list_add_event(event, ctx);
723 event->tstamp_enabled = ctx->time;
724 event->tstamp_running = ctx->time;
725 event->tstamp_stopped = ctx->time;
726}
727
728/*
729 * Cross CPU call to install and enable a performance event
730 *
731 * Must be called with ctx->mutex held
732 */
733static void __perf_install_in_context(void *info)
734{
735 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
736 struct perf_event *event = info;
737 struct perf_event_context *ctx = event->ctx;
738 struct perf_event *leader = event->group_leader;
739 int cpu = smp_processor_id();
740 int err;
741
742 /*
743 * If this is a task context, we need to check whether it is
744 * the current task context of this cpu. If not it has been
745 * scheduled out before the smp call arrived.
746 * Or possibly this is the right context but it isn't
747 * on this cpu because it had no events.
748 */
749 if (ctx->task && cpuctx->task_ctx != ctx) {
750 if (cpuctx->task_ctx || ctx->task != current)
751 return;
752 cpuctx->task_ctx = ctx;
753 }
754
755 spin_lock(&ctx->lock);
756 ctx->is_active = 1;
757 update_context_time(ctx);
758
759 /*
760 * Protect the list operation against NMI by disabling the
761 * events on a global level. NOP for non NMI based events.
762 */
763 perf_disable();
764
765 add_event_to_ctx(event, ctx);
766
767 /*
768 * Don't put the event on if it is disabled or if
769 * it is in a group and the group isn't on.
770 */
771 if (event->state != PERF_EVENT_STATE_INACTIVE ||
772 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
773 goto unlock;
774
775 /*
776 * An exclusive event can't go on if there are already active
777 * hardware events, and no hardware event can go on if there
778 * is already an exclusive event on.
779 */
780 if (!group_can_go_on(event, cpuctx, 1))
781 err = -EEXIST;
782 else
783 err = event_sched_in(event, cpuctx, ctx, cpu);
784
785 if (err) {
786 /*
787 * This event couldn't go on. If it is in a group
788 * then we have to pull the whole group off.
789 * If the event group is pinned then put it in error state.
790 */
791 if (leader != event)
792 group_sched_out(leader, cpuctx, ctx);
793 if (leader->attr.pinned) {
794 update_group_times(leader);
795 leader->state = PERF_EVENT_STATE_ERROR;
796 }
797 }
798
799 if (!err && !ctx->task && cpuctx->max_pertask)
800 cpuctx->max_pertask--;
801
802 unlock:
803 perf_enable();
804
805 spin_unlock(&ctx->lock);
806}
807
808/*
809 * Attach a performance event to a context
810 *
811 * First we add the event to the list with the hardware enable bit
812 * in event->hw_config cleared.
813 *
814 * If the event is attached to a task which is on a CPU we use an smp
815 * call to enable it in the task context. The task might have been
816 * scheduled away, but we check this in the smp call again.
817 *
818 * Must be called with ctx->mutex held.
819 */
820static void
821perf_install_in_context(struct perf_event_context *ctx,
822 struct perf_event *event,
823 int cpu)
824{
825 struct task_struct *task = ctx->task;
826
827 if (!task) {
828 /*
829 * Per cpu events are installed via an smp call and
830 * the install is always successful.
831 */
832 smp_call_function_single(cpu, __perf_install_in_context,
833 event, 1);
834 return;
835 }
836
837retry:
838 task_oncpu_function_call(task, __perf_install_in_context,
839 event);
840
841 spin_lock_irq(&ctx->lock);
842 /*
843 * If the context is active we need to retry the smp call.
844 */
845 if (ctx->is_active && list_empty(&event->group_entry)) {
846 spin_unlock_irq(&ctx->lock);
847 goto retry;
848 }
849
850 /*
851 * The lock prevents that this context is scheduled in so we
852 * can add the event safely, if the call above did not
853 * succeed.
854 */
855 if (list_empty(&event->group_entry))
856 add_event_to_ctx(event, ctx);
857 spin_unlock_irq(&ctx->lock);
858}
859
860/*
861 * Put an event into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_event_mark_enabled(struct perf_event *event,
869 struct perf_event_context *ctx)
870{
871 struct perf_event *sub;
872
873 event->state = PERF_EVENT_STATE_INACTIVE;
874 event->tstamp_enabled = ctx->time - event->total_time_enabled;
875 list_for_each_entry(sub, &event->sibling_list, group_entry)
876 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
882 * Cross CPU call to enable a performance event
883 */
884static void __perf_event_enable(void *info)
885{
886 struct perf_event *event = info;
887 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
888 struct perf_event_context *ctx = event->ctx;
889 struct perf_event *leader = event->group_leader;
890 int err;
891
892 /*
893 * If this is a per-task event, need to check whether this
894 * event's task is the current task on this cpu.
895 */
896 if (ctx->task && cpuctx->task_ctx != ctx) {
897 if (cpuctx->task_ctx || ctx->task != current)
898 return;
899 cpuctx->task_ctx = ctx;
900 }
901
902 spin_lock(&ctx->lock);
903 ctx->is_active = 1;
904 update_context_time(ctx);
905
906 if (event->state >= PERF_EVENT_STATE_INACTIVE)
907 goto unlock;
908 __perf_event_mark_enabled(event, ctx);
909
910 /*
911 * If the event is in a group and isn't the group leader,
912 * then don't put it on unless the group is on.
913 */
914 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
915 goto unlock;
916
917 if (!group_can_go_on(event, cpuctx, 1)) {
918 err = -EEXIST;
919 } else {
920 perf_disable();
921 if (event == leader)
922 err = group_sched_in(event, cpuctx, ctx,
923 smp_processor_id());
924 else
925 err = event_sched_in(event, cpuctx, ctx,
926 smp_processor_id());
927 perf_enable();
928 }
929
930 if (err) {
931 /*
932 * If this event can't go on and it's part of a
933 * group, then the whole group has to come off.
934 */
935 if (leader != event)
936 group_sched_out(leader, cpuctx, ctx);
937 if (leader->attr.pinned) {
938 update_group_times(leader);
939 leader->state = PERF_EVENT_STATE_ERROR;
940 }
941 }
942
943 unlock:
944 spin_unlock(&ctx->lock);
945}
946
947/*
948 * Enable an event.
949 *
950 * If event->ctx is a cloned context, callers must make sure that
951 * every task struct that event->ctx->task could possibly point to
952 * remains valid. This condition is satisfied when called through
953 * perf_event_for_each_child or perf_event_for_each as described
954 * for perf_event_disable.
955 */
956static void perf_event_enable(struct perf_event *event)
957{
958 struct perf_event_context *ctx = event->ctx;
959 struct task_struct *task = ctx->task;
960
961 if (!task) {
962 /*
963 * Enable the event on the cpu that it's on
964 */
965 smp_call_function_single(event->cpu, __perf_event_enable,
966 event, 1);
967 return;
968 }
969
970 spin_lock_irq(&ctx->lock);
971 if (event->state >= PERF_EVENT_STATE_INACTIVE)
972 goto out;
973
974 /*
975 * If the event is in error state, clear that first.
976 * That way, if we see the event in error state below, we
977 * know that it has gone back into error state, as distinct
978 * from the task having been scheduled away before the
979 * cross-call arrived.
980 */
981 if (event->state == PERF_EVENT_STATE_ERROR)
982 event->state = PERF_EVENT_STATE_OFF;
983
984 retry:
985 spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_event_enable, event);
987
988 spin_lock_irq(&ctx->lock);
989
990 /*
991 * If the context is active and the event is still off,
992 * we need to retry the cross-call.
993 */
994 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
995 goto retry;
996
997 /*
998 * Since we have the lock this context can't be scheduled
999 * in, so we can change the state safely.
1000 */
1001 if (event->state == PERF_EVENT_STATE_OFF)
1002 __perf_event_mark_enabled(event, ctx);
1003
1004 out:
1005 spin_unlock_irq(&ctx->lock);
1006}
1007
1008static int perf_event_refresh(struct perf_event *event, int refresh)
1009{
1010 /*
1011 * not supported on inherited events
1012 */
1013 if (event->attr.inherit)
1014 return -EINVAL;
1015
1016 atomic_add(refresh, &event->event_limit);
1017 perf_event_enable(event);
1018
1019 return 0;
1020}
1021
1022void __perf_event_sched_out(struct perf_event_context *ctx,
1023 struct perf_cpu_context *cpuctx)
1024{
1025 struct perf_event *event;
1026
1027 spin_lock(&ctx->lock);
1028 ctx->is_active = 0;
1029 if (likely(!ctx->nr_events))
1030 goto out;
1031 update_context_time(ctx);
1032
1033 perf_disable();
1034 if (ctx->nr_active)
1035 list_for_each_entry(event, &ctx->group_list, group_entry)
1036 group_sched_out(event, cpuctx, ctx);
1037
1038 perf_enable();
1039 out:
1040 spin_unlock(&ctx->lock);
1041}
1042
1043/*
1044 * Test whether two contexts are equivalent, i.e. whether they
1045 * have both been cloned from the same version of the same context
1046 * and they both have the same number of enabled events.
1047 * If the number of enabled events is the same, then the set
1048 * of enabled events should be the same, because these are both
1049 * inherited contexts, therefore we can't access individual events
1050 * in them directly with an fd; we can only enable/disable all
1051 * events via prctl, or enable/disable all events in a family
1052 * via ioctl, which will have the same effect on both contexts.
1053 */
1054static int context_equiv(struct perf_event_context *ctx1,
1055 struct perf_event_context *ctx2)
1056{
1057 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1058 && ctx1->parent_gen == ctx2->parent_gen
1059 && !ctx1->pin_count && !ctx2->pin_count;
1060}
1061
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event)
1066{
1067 u64 value;
1068
1069 if (!event->attr.inherit_stat)
1070 return;
1071
1072 /*
1073 * Update the event value, we cannot use perf_event_read()
1074 * because we're in the middle of a context switch and have IRQs
1075 * disabled, which upsets smp_call_function_single(), however
1076 * we know the event must be on the current CPU, therefore we
1077 * don't need to use it.
1078 */
1079 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event);
1082 break;
1083
1084 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event);
1086 break;
1087
1088 default:
1089 break;
1090 }
1091
1092 /*
1093 * In order to keep per-task stats reliable we need to flip the event
1094 * values when we flip the contexts.
1095 */
1096 value = atomic64_read(&next_event->count);
1097 value = atomic64_xchg(&event->count, value);
1098 atomic64_set(&next_event->count, value);
1099
1100 swap(event->total_time_enabled, next_event->total_time_enabled);
1101 swap(event->total_time_running, next_event->total_time_running);
1102
1103 /*
1104 * Since we swizzled the values, update the user visible data too.
1105 */
1106 perf_event_update_userpage(event);
1107 perf_event_update_userpage(next_event);
1108}
1109
1110#define list_next_entry(pos, member) \
1111 list_entry(pos->member.next, typeof(*pos), member)
1112
1113static void perf_event_sync_stat(struct perf_event_context *ctx,
1114 struct perf_event_context *next_ctx)
1115{
1116 struct perf_event *event, *next_event;
1117
1118 if (!ctx->nr_stat)
1119 return;
1120
1121 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry);
1123
1124 next_event = list_first_entry(&next_ctx->event_list,
1125 struct perf_event, event_entry);
1126
1127 while (&event->event_entry != &ctx->event_list &&
1128 &next_event->event_entry != &next_ctx->event_list) {
1129
1130 __perf_event_sync_stat(event, next_event);
1131
1132 event = list_next_entry(event, event_entry);
1133 next_event = list_next_entry(next_event, event_entry);
1134 }
1135}
1136
1137/*
1138 * Called from scheduler to remove the events of the current task,
1139 * with interrupts disabled.
1140 *
1141 * We stop each event and update the event value in event->count.
1142 *
1143 * This does not protect us against NMI, but disable()
1144 * sets the disabled bit in the control field of event _before_
1145 * accessing the event control register. If an NMI hits, then it will
1146 * not restart the event.
1147 */
1148void perf_event_task_sched_out(struct task_struct *task,
1149 struct task_struct *next, int cpu)
1150{
1151 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1152 struct perf_event_context *ctx = task->perf_event_ctxp;
1153 struct perf_event_context *next_ctx;
1154 struct perf_event_context *parent;
1155 struct pt_regs *regs;
1156 int do_switch = 1;
1157
1158 regs = task_pt_regs(task);
1159 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1160
1161 if (likely(!ctx || !cpuctx->task_ctx))
1162 return;
1163
1164 update_context_time(ctx);
1165
1166 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp;
1169 if (parent && next_ctx &&
1170 rcu_dereference(next_ctx->parent_ctx) == parent) {
1171 /*
1172 * Looks like the two contexts are clones, so we might be
1173 * able to optimize the context switch. We lock both
1174 * contexts and check that they are clones under the
1175 * lock (including re-checking that neither has been
1176 * uncloned in the meantime). It doesn't matter which
1177 * order we take the locks because no other cpu could
1178 * be trying to lock both of these tasks.
1179 */
1180 spin_lock(&ctx->lock);
1181 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1182 if (context_equiv(ctx, next_ctx)) {
1183 /*
1184 * XXX do we need a memory barrier of sorts
1185 * wrt to rcu_dereference() of perf_event_ctxp
1186 */
1187 task->perf_event_ctxp = next_ctx;
1188 next->perf_event_ctxp = ctx;
1189 ctx->task = next;
1190 next_ctx->task = task;
1191 do_switch = 0;
1192
1193 perf_event_sync_stat(ctx, next_ctx);
1194 }
1195 spin_unlock(&next_ctx->lock);
1196 spin_unlock(&ctx->lock);
1197 }
1198 rcu_read_unlock();
1199
1200 if (do_switch) {
1201 __perf_event_sched_out(ctx, cpuctx);
1202 cpuctx->task_ctx = NULL;
1203 }
1204}
1205
1206/*
1207 * Called with IRQs disabled
1208 */
1209static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1210{
1211 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1212
1213 if (!cpuctx->task_ctx)
1214 return;
1215
1216 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1217 return;
1218
1219 __perf_event_sched_out(ctx, cpuctx);
1220 cpuctx->task_ctx = NULL;
1221}
1222
1223/*
1224 * Called with IRQs disabled
1225 */
1226static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1227{
1228 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1229}
1230
1231static void
1232__perf_event_sched_in(struct perf_event_context *ctx,
1233 struct perf_cpu_context *cpuctx, int cpu)
1234{
1235 struct perf_event *event;
1236 int can_add_hw = 1;
1237
1238 spin_lock(&ctx->lock);
1239 ctx->is_active = 1;
1240 if (likely(!ctx->nr_events))
1241 goto out;
1242
1243 ctx->timestamp = perf_clock();
1244
1245 perf_disable();
1246
1247 /*
1248 * First go through the list and put on any pinned groups
1249 * in order to give them the best chance of going on.
1250 */
1251 list_for_each_entry(event, &ctx->group_list, group_entry) {
1252 if (event->state <= PERF_EVENT_STATE_OFF ||
1253 !event->attr.pinned)
1254 continue;
1255 if (event->cpu != -1 && event->cpu != cpu)
1256 continue;
1257
1258 if (group_can_go_on(event, cpuctx, 1))
1259 group_sched_in(event, cpuctx, ctx, cpu);
1260
1261 /*
1262 * If this pinned group hasn't been scheduled,
1263 * put it in error state.
1264 */
1265 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1266 update_group_times(event);
1267 event->state = PERF_EVENT_STATE_ERROR;
1268 }
1269 }
1270
1271 list_for_each_entry(event, &ctx->group_list, group_entry) {
1272 /*
1273 * Ignore events in OFF or ERROR state, and
1274 * ignore pinned events since we did them already.
1275 */
1276 if (event->state <= PERF_EVENT_STATE_OFF ||
1277 event->attr.pinned)
1278 continue;
1279
1280 /*
1281 * Listen to the 'cpu' scheduling filter constraint
1282 * of events:
1283 */
1284 if (event->cpu != -1 && event->cpu != cpu)
1285 continue;
1286
1287 if (group_can_go_on(event, cpuctx, can_add_hw))
1288 if (group_sched_in(event, cpuctx, ctx, cpu))
1289 can_add_hw = 0;
1290 }
1291 perf_enable();
1292 out:
1293 spin_unlock(&ctx->lock);
1294}
1295
1296/*
1297 * Called from scheduler to add the events of the current task
1298 * with interrupts disabled.
1299 *
1300 * We restore the event value and then enable it.
1301 *
1302 * This does not protect us against NMI, but enable()
1303 * sets the enabled bit in the control field of event _before_
1304 * accessing the event control register. If an NMI hits, then it will
1305 * keep the event running.
1306 */
1307void perf_event_task_sched_in(struct task_struct *task, int cpu)
1308{
1309 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1310 struct perf_event_context *ctx = task->perf_event_ctxp;
1311
1312 if (likely(!ctx))
1313 return;
1314 if (cpuctx->task_ctx == ctx)
1315 return;
1316 __perf_event_sched_in(ctx, cpuctx, cpu);
1317 cpuctx->task_ctx = ctx;
1318}
1319
1320static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1321{
1322 struct perf_event_context *ctx = &cpuctx->ctx;
1323
1324 __perf_event_sched_in(ctx, cpuctx, cpu);
1325}
1326
1327#define MAX_INTERRUPTS (~0ULL)
1328
1329static void perf_log_throttle(struct perf_event *event, int enable);
1330
1331static void perf_adjust_period(struct perf_event *event, u64 events)
1332{
1333 struct hw_perf_event *hwc = &event->hw;
1334 u64 period, sample_period;
1335 s64 delta;
1336
1337 events *= hwc->sample_period;
1338 period = div64_u64(events, event->attr.sample_freq);
1339
1340 delta = (s64)(period - hwc->sample_period);
1341 delta = (delta + 7) / 8; /* low pass filter */
1342
1343 sample_period = hwc->sample_period + delta;
1344
1345 if (!sample_period)
1346 sample_period = 1;
1347
1348 hwc->sample_period = sample_period;
1349}
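/*
 * Worked example for the low pass filter above (numbers are illustrative):
 * with hwc->sample_period = 10000, attr.sample_freq = 1000 and roughly
 * 2000 interrupts observed over the last second, the estimated event rate
 * is 2000 * 10000 = 20,000,000/sec, so the ideal period is
 * 20,000,000 / 1000 = 20,000.  delta = 10,000 and (10,000 + 7) / 8 = 1250,
 * so the period only moves from 10,000 to 11,250 in this step; repeated
 * adjustments converge on the target without overreacting to bursts.
 */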
1350
1351static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1352{
1353 struct perf_event *event;
1354 struct hw_perf_event *hwc;
1355 u64 interrupts, freq;
1356
1357 spin_lock(&ctx->lock);
1358 list_for_each_entry(event, &ctx->group_list, group_entry) {
1359 if (event->state != PERF_EVENT_STATE_ACTIVE)
1360 continue;
1361
1362 hwc = &event->hw;
1363
1364 interrupts = hwc->interrupts;
1365 hwc->interrupts = 0;
1366
1367 /*
1368 * unthrottle events on the tick
1369 */
1370 if (interrupts == MAX_INTERRUPTS) {
1371 perf_log_throttle(event, 1);
1372 event->pmu->unthrottle(event);
1373 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1374 }
1375
1376 if (!event->attr.freq || !event->attr.sample_freq)
1377 continue;
1378
1379 /*
1380 * if the specified freq < HZ then we need to skip ticks
1381 */
1382 if (event->attr.sample_freq < HZ) {
1383 freq = event->attr.sample_freq;
1384
1385 hwc->freq_count += freq;
1386 hwc->freq_interrupts += interrupts;
1387
1388 if (hwc->freq_count < HZ)
1389 continue;
1390
1391 interrupts = hwc->freq_interrupts;
1392 hwc->freq_interrupts = 0;
1393 hwc->freq_count -= HZ;
1394 } else
1395 freq = HZ;
1396
1397 perf_adjust_period(event, freq * interrupts);
1398
1399 /*
1400 * In order to avoid being stalled by an (accidental) huge
1401 * sample period, force reset the sample period if we didn't
1402 * get any events in this freq period.
1403 */
1404 if (!interrupts) {
1405 perf_disable();
1406 event->pmu->disable(event);
1407 atomic64_set(&hwc->period_left, 0);
1408 event->pmu->enable(event);
1409 perf_enable();
1410 }
1411 }
1412 spin_unlock(&ctx->lock);
1413}
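/*
 * Worked example for the freq < HZ path above: with HZ = 1000 and
 * attr.sample_freq = 100, each tick adds 100 to hwc->freq_count, so
 * freq_count reaches HZ every 10 ticks.  Only then are the interrupts
 * accumulated in hwc->freq_interrupts handed to perf_adjust_period(),
 * i.e. the period is re-evaluated roughly 100 times a second instead of
 * on every tick.
 */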
1414
1415/*
1416 * Round-robin a context's events:
1417 */
1418static void rotate_ctx(struct perf_event_context *ctx)
1419{
1420 struct perf_event *event;
1421
1422 if (!ctx->nr_events)
1423 return;
1424
1425 spin_lock(&ctx->lock);
1426 /*
1427 * Rotate the first entry last (works just fine for group events too):
1428 */
1429 perf_disable();
1430 list_for_each_entry(event, &ctx->group_list, group_entry) {
1431 list_move_tail(&event->group_entry, &ctx->group_list);
1432 break;
1433 }
1434 perf_enable();
1435
1436 spin_unlock(&ctx->lock);
1437}
1438
1439void perf_event_task_tick(struct task_struct *curr, int cpu)
1440{
1441 struct perf_cpu_context *cpuctx;
1442 struct perf_event_context *ctx;
1443
1444 if (!atomic_read(&nr_events))
1445 return;
1446
1447 cpuctx = &per_cpu(perf_cpu_context, cpu);
1448 ctx = curr->perf_event_ctxp;
1449
1450 perf_ctx_adjust_freq(&cpuctx->ctx);
1451 if (ctx)
1452 perf_ctx_adjust_freq(ctx);
1453
1454 perf_event_cpu_sched_out(cpuctx);
1455 if (ctx)
1456 __perf_event_task_sched_out(ctx);
1457
1458 rotate_ctx(&cpuctx->ctx);
1459 if (ctx)
1460 rotate_ctx(ctx);
1461
1462 perf_event_cpu_sched_in(cpuctx, cpu);
1463 if (ctx)
1464 perf_event_task_sched_in(curr, cpu);
1465}
1466
1467/*
1468 * Enable all of a task's events that have been marked enable-on-exec.
1469 * This expects task == current.
1470 */
1471static void perf_event_enable_on_exec(struct task_struct *task)
1472{
1473 struct perf_event_context *ctx;
1474 struct perf_event *event;
1475 unsigned long flags;
1476 int enabled = 0;
1477
1478 local_irq_save(flags);
1479 ctx = task->perf_event_ctxp;
1480 if (!ctx || !ctx->nr_events)
1481 goto out;
1482
1483 __perf_event_task_sched_out(ctx);
1484
1485 spin_lock(&ctx->lock);
1486
1487 list_for_each_entry(event, &ctx->group_list, group_entry) {
1488 if (!event->attr.enable_on_exec)
1489 continue;
1490 event->attr.enable_on_exec = 0;
1491 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1492 continue;
1493 __perf_event_mark_enabled(event, ctx);
1494 enabled = 1;
1495 }
1496
1497 /*
1498 * Unclone this context if we enabled any event.
1499 */
1500 if (enabled)
1501 unclone_ctx(ctx);
1502
1503 spin_unlock(&ctx->lock);
1504
1505 perf_event_task_sched_in(task, smp_processor_id());
1506 out:
1507 local_irq_restore(flags);
1508}
1509
1510/*
1511 * Cross CPU call to read the hardware event
1512 */
1513static void __perf_event_read(void *info)
1514{
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519
1520 /*
1521 * If this is a task context, we need to check whether it is
1522 * the current task context of this cpu. If not it has been
1523 * scheduled out before the smp call arrived. In that case
1524 * event->count would have been updated to a recent sample
1525 * when the event was scheduled out.
1526 */
1527 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return;
1529
1530 local_irq_save(flags);
1531 if (ctx->is_active)
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event);
1535 local_irq_restore(flags);
1536}
1537
1538static u64 perf_event_read(struct perf_event *event)
1539{
1540 /*
1541 * If event is enabled and currently active on a CPU, update the
1542 * value in the event structure:
1543 */
1544 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1545 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1548 update_event_times(event);
1549 }
1550
1551 return atomic64_read(&event->count);
1552}
1553
1554/*
1555 * Initialize the perf_event context in a task_struct:
1556 */
1557static void
1558__perf_event_init_context(struct perf_event_context *ctx,
1559 struct task_struct *task)
1560{
1561 memset(ctx, 0, sizeof(*ctx));
1562 spin_lock_init(&ctx->lock);
1563 mutex_init(&ctx->mutex);
1564 INIT_LIST_HEAD(&ctx->group_list);
1565 INIT_LIST_HEAD(&ctx->event_list);
1566 atomic_set(&ctx->refcount, 1);
1567 ctx->task = task;
1568}
1569
1570static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1571{
1572 struct perf_event_context *ctx;
1573 struct perf_cpu_context *cpuctx;
1574 struct task_struct *task;
1575 unsigned long flags;
1576 int err;
1577
1578 /*
1579 * If cpu is not a wildcard then this is a percpu event:
1580 */
1581 if (cpu != -1) {
1582 /* Must be root to operate on a CPU event: */
1583 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1584 return ERR_PTR(-EACCES);
1585
1586 if (cpu < 0 || cpu > num_possible_cpus())
1587 return ERR_PTR(-EINVAL);
1588
1589 /*
1590 * We could be clever and allow attaching an event to an
1591 * offline CPU and activate it when the CPU comes up, but
1592 * that's for later.
1593 */
1594 if (!cpu_isset(cpu, cpu_online_map))
1595 return ERR_PTR(-ENODEV);
1596
1597 cpuctx = &per_cpu(perf_cpu_context, cpu);
1598 ctx = &cpuctx->ctx;
1599 get_ctx(ctx);
1600
1601 return ctx;
1602 }
1603
1604 rcu_read_lock();
1605 if (!pid)
1606 task = current;
1607 else
1608 task = find_task_by_vpid(pid);
1609 if (task)
1610 get_task_struct(task);
1611 rcu_read_unlock();
1612
1613 if (!task)
1614 return ERR_PTR(-ESRCH);
1615
1616 /*
1617 * Can't attach events to a dying task.
1618 */
1619 err = -ESRCH;
1620 if (task->flags & PF_EXITING)
1621 goto errout;
1622
1623 /* Reuse ptrace permission checks for now. */
1624 err = -EACCES;
1625 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1626 goto errout;
1627
1628 retry:
1629 ctx = perf_lock_task_context(task, &flags);
1630 if (ctx) {
1631 unclone_ctx(ctx);
1632 spin_unlock_irqrestore(&ctx->lock, flags);
1633 }
1634
1635 if (!ctx) {
1636 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1637 err = -ENOMEM;
1638 if (!ctx)
1639 goto errout;
1640 __perf_event_init_context(ctx, task);
1641 get_ctx(ctx);
1642 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1643 /*
1644 * We raced with some other task; use
1645 * the context they set.
1646 */
1647 kfree(ctx);
1648 goto retry;
1649 }
1650 get_task_struct(task);
1651 }
1652
1653 put_task_struct(task);
1654 return ctx;
1655
1656 errout:
1657 put_task_struct(task);
1658 return ERR_PTR(err);
1659}
1660
1661static void free_event_rcu(struct rcu_head *head)
1662{
1663 struct perf_event *event;
1664
1665 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns)
1667 put_pid_ns(event->ns);
1668 kfree(event);
1669}
1670
1671static void perf_pending_sync(struct perf_event *event);
1672
1673static void free_event(struct perf_event *event)
1674{
1675 perf_pending_sync(event);
1676
1677 if (!event->parent) {
1678 atomic_dec(&nr_events);
1679 if (event->attr.mmap)
1680 atomic_dec(&nr_mmap_events);
1681 if (event->attr.comm)
1682 atomic_dec(&nr_comm_events);
1683 if (event->attr.task)
1684 atomic_dec(&nr_task_events);
1685 }
1686
1687 if (event->output) {
1688 fput(event->output->filp);
1689 event->output = NULL;
1690 }
1691
1692 if (event->destroy)
1693 event->destroy(event);
1694
1695 put_ctx(event->ctx);
1696 call_rcu(&event->rcu_head, free_event_rcu);
1697}
1698
1699/*
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx;
1706
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event);
1712 mutex_unlock(&ctx->mutex);
1713
1714 mutex_lock(&event->owner->perf_event_mutex);
1715 list_del_init(&event->owner_entry);
1716 mutex_unlock(&event->owner->perf_event_mutex);
1717 put_task_struct(event->owner);
1718
1719 free_event(event);
1720
1721 return 0;
1722}
1723
1724static int perf_event_read_size(struct perf_event *event)
1725{
1726 int entry = sizeof(u64); /* value */
1727 int size = 0;
1728 int nr = 1;
1729
1730 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1731 size += sizeof(u64);
1732
1733 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1734 size += sizeof(u64);
1735
1736 if (event->attr.read_format & PERF_FORMAT_ID)
1737 entry += sizeof(u64);
1738
1739 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1740 nr += event->group_leader->nr_siblings;
1741 size += sizeof(u64);
1742 }
1743
1744 size += entry * nr;
1745
1746 return size;
1747}
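/*
 * Worked example for the size computation above: with
 * read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
 *               PERF_FORMAT_TOTAL_TIME_ENABLED
 * on a leader with two siblings: entry = 16 (value + id), nr = 3,
 * size = 16 (time_enabled plus the group 'nr' header), giving
 * 16 + 3 * 16 = 64 bytes that user space must offer to read().
 */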
1748
1749static u64 perf_event_read_value(struct perf_event *event)
1750{
1751 struct perf_event *child;
1752 u64 total = 0;
1753
1754 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list)
1756 total += perf_event_read(child);
1757
1758 return total;
1759}
1760
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778
1779static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf)
1781{
1782 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT;
1784 u64 values[3];
1785
1786 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1788 values[n++] = leader->total_time_enabled +
1789 atomic64_read(&leader->child_total_time_enabled);
1790 }
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1792 values[n++] = leader->total_time_running +
1793 atomic64_read(&leader->child_total_time_running);
1794 }
1795
1796 size = n * sizeof(u64);
1797
1798 if (copy_to_user(buf, values, size))
1799 return -EFAULT;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804
1805 size += err;
1806
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format,
1809 buf + size);
1810 if (err < 0)
1811 return err;
1812
1813 size += err;
1814 }
1815
1816 return size;
1817}
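/*
 * Illustrative layout of the buffer filled in above when
 * PERF_FORMAT_GROUP is set (leader plus N siblings):
 *
 *	u64 nr;				// 1 + N
 *	u64 time_enabled;		// if PERF_FORMAT_TOTAL_TIME_ENABLED
 *	u64 time_running;		// if PERF_FORMAT_TOTAL_TIME_RUNNING
 *	{ u64 value; u64 id; }		// id only if PERF_FORMAT_ID;
 *	  repeated (1 + N) times	// leader first, then each sibling
 */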
1818
1819static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf)
1821{
1822 u64 values[4];
1823 int n = 0;
1824
1825 values[n++] = perf_event_read_value(event);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1827 values[n++] = event->total_time_enabled +
1828 atomic64_read(&event->child_total_time_enabled);
1829 }
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event);
1836
1837 if (copy_to_user(buf, values, n * sizeof(u64)))
1838 return -EFAULT;
1839
1840 return n * sizeof(u64);
1841}
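/*
 * Illustrative user-space sketch (not part of this patch): opening one
 * hardware counter and parsing the non-group read format produced above.
 * Assumes the exported <linux/perf_event.h> header and the
 * __NR_perf_event_open syscall number; glibc has no wrapper.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	__u64 buf[4];		/* value, time_enabled, time_running, id */
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING |
			   PERF_FORMAT_ID;

	/* measure the current task on any CPU; no group leader, no flags */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* ... workload to be measured runs here ... */

	if (read(fd, buf, sizeof(buf)) == sizeof(buf))
		printf("count=%llu enabled=%llu running=%llu id=%llu\n",
		       (unsigned long long)buf[0], (unsigned long long)buf[1],
		       (unsigned long long)buf[2], (unsigned long long)buf[3]);
	close(fd);
	return 0;
}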
1842
1843/*
1844 * Read the performance event - simple non blocking version for now
1845 */
1846static ssize_t
1847perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1848{
1849 u64 read_format = event->attr.read_format;
1850 int ret;
1851
1852 /*
1853 * Return end-of-file for a read on an event that is in
1854 * error state (i.e. because it was pinned but it couldn't be
1855 * scheduled on to the CPU at some point).
1856 */
1857 if (event->state == PERF_EVENT_STATE_ERROR)
1858 return 0;
1859
1860 if (count < perf_event_read_size(event))
1861 return -ENOSPC;
1862
1863 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf);
1867 else
1868 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870
1871 return ret;
1872}
1873
1874static ssize_t
1875perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1876{
1877 struct perf_event *event = file->private_data;
1878
1879 return perf_read_hw(event, buf, count);
1880}
1881
1882static unsigned int perf_poll(struct file *file, poll_table *wait)
1883{
1884 struct perf_event *event = file->private_data;
1885 struct perf_mmap_data *data;
1886 unsigned int events = POLL_HUP;
1887
1888 rcu_read_lock();
1889 data = rcu_dereference(event->data);
1890 if (data)
1891 events = atomic_xchg(&data->poll, 0);
1892 rcu_read_unlock();
1893
1894 poll_wait(file, &event->waitq, wait);
1895
1896 return events;
1897}
1898
1899static void perf_event_reset(struct perf_event *event)
1900{
1901 (void)perf_event_read(event);
1902 atomic64_set(&event->count, 0);
1903 perf_event_update_userpage(event);
1904}
1905
1906/*
1907 * Holding the top-level event's child_mutex means that any
1908 * descendant process that has inherited this event will block
1909 * in sync_child_event if it goes to exit, thus satisfying the
1910 * task existence requirements of perf_event_enable/disable.
1911 */
1912static void perf_event_for_each_child(struct perf_event *event,
1913 void (*func)(struct perf_event *))
1914{
1915 struct perf_event *child;
1916
1917 WARN_ON_ONCE(event->ctx->parent_ctx);
1918 mutex_lock(&event->child_mutex);
1919 func(event);
1920 list_for_each_entry(child, &event->child_list, child_list)
1921 func(child);
1922 mutex_unlock(&event->child_mutex);
1923}
1924
1925static void perf_event_for_each(struct perf_event *event,
1926 void (*func)(struct perf_event *))
1927{
1928 struct perf_event_context *ctx = event->ctx;
1929 struct perf_event *sibling;
1930
1931 WARN_ON_ONCE(ctx->parent_ctx);
1932 mutex_lock(&ctx->mutex);
1933 event = event->group_leader;
1934
1935 perf_event_for_each_child(event, func);
1936 func(event);
1937 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1938 perf_event_for_each_child(event, func);
1939 mutex_unlock(&ctx->mutex);
1940}
1941
1942static int perf_event_period(struct perf_event *event, u64 __user *arg)
1943{
1944 struct perf_event_context *ctx = event->ctx;
1945 unsigned long size;
1946 int ret = 0;
1947 u64 value;
1948
1949 if (!event->attr.sample_period)
1950 return -EINVAL;
1951
1952 size = copy_from_user(&value, arg, sizeof(value));
1953 if (size != sizeof(value))
1954 return -EFAULT;
1955
1956 if (!value)
1957 return -EINVAL;
1958
1959 spin_lock_irq(&ctx->lock);
1960 if (event->attr.freq) {
1961 if (value > sysctl_perf_event_sample_rate) {
1962 ret = -EINVAL;
1963 goto unlock;
1964 }
1965
1966 event->attr.sample_freq = value;
1967 } else {
1968 event->attr.sample_period = value;
1969 event->hw.sample_period = value;
1970 }
1971unlock:
1972 spin_unlock_irq(&ctx->lock);
1973
1974 return ret;
1975}
1976
1977int perf_event_set_output(struct perf_event *event, int output_fd);
1978
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{
1981 struct perf_event *event = file->private_data;
1982 void (*func)(struct perf_event *);
1983 u32 flags = arg;
1984
1985 switch (cmd) {
1986 case PERF_EVENT_IOC_ENABLE:
1987 func = perf_event_enable;
1988 break;
1989 case PERF_EVENT_IOC_DISABLE:
1990 func = perf_event_disable;
1991 break;
1992 case PERF_EVENT_IOC_RESET:
1993 func = perf_event_reset;
1994 break;
1995
1996 case PERF_EVENT_IOC_REFRESH:
1997 return perf_event_refresh(event, arg);
1998
1999 case PERF_EVENT_IOC_PERIOD:
2000 return perf_event_period(event, (u64 __user *)arg);
2001
2002 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg);
2004
2005 default:
2006 return -ENOTTY;
2007 }
2008
2009 if (flags & PERF_IOC_FLAG_GROUP)
2010 perf_event_for_each(event, func);
2011 else
2012 perf_event_for_each_child(event, func);
2013
2014 return 0;
2015}
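/*
 * Illustrative user-space sketch: driving the ioctls handled above on an
 * event fd returned by perf_event_open() (the 'fd' parameter here is
 * hypothetical).  PERF_IOC_FLAG_GROUP extends the operation to the whole
 * group via perf_event_for_each().
 */
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static void measure_region(int fd)
{
	ioctl(fd, PERF_EVENT_IOC_RESET, 0);	/* zero the count */
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);	/* start counting */
	/* ... code under measurement ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
}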
2016
2017int perf_event_task_enable(void)
2018{
2019 struct perf_event *event;
2020
2021 mutex_lock(&current->perf_event_mutex);
2022 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2023 perf_event_for_each_child(event, perf_event_enable);
2024 mutex_unlock(&current->perf_event_mutex);
2025
2026 return 0;
2027}
2028
2029int perf_event_task_disable(void)
2030{
2031 struct perf_event *event;
2032
2033 mutex_lock(&current->perf_event_mutex);
2034 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2035 perf_event_for_each_child(event, perf_event_disable);
2036 mutex_unlock(&current->perf_event_mutex);
2037
2038 return 0;
2039}
2040
2041#ifndef PERF_EVENT_INDEX_OFFSET
2042# define PERF_EVENT_INDEX_OFFSET 0
2043#endif
2044
2045static int perf_event_index(struct perf_event *event)
2046{
2047 if (event->state != PERF_EVENT_STATE_ACTIVE)
2048 return 0;
2049
2050 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2051}
2052
2053/*
2054 * Callers need to ensure there can be no nesting of this function, otherwise
2055 * the seqlock logic goes bad. We can not serialize this because the arch
2056 * code calls this from NMI context.
2057 */
2058void perf_event_update_userpage(struct perf_event *event)
2059{
2060 struct perf_event_mmap_page *userpg;
2061 struct perf_mmap_data *data;
2062
2063 rcu_read_lock();
2064 data = rcu_dereference(event->data);
2065 if (!data)
2066 goto unlock;
2067
2068 userpg = data->user_page;
2069
2070 /*
2071 * Disable preemption so as to not let the corresponding user-space
2072 * spin too long if we get preempted.
2073 */
2074 preempt_disable();
2075 ++userpg->lock;
2076 barrier();
2077 userpg->index = perf_event_index(event);
2078 userpg->offset = atomic64_read(&event->count);
2079 if (event->state == PERF_EVENT_STATE_ACTIVE)
2080 userpg->offset -= atomic64_read(&event->hw.prev_count);
2081
2082 userpg->time_enabled = event->total_time_enabled +
2083 atomic64_read(&event->child_total_time_enabled);
2084
2085 userpg->time_running = event->total_time_running +
2086 atomic64_read(&event->child_total_time_running);
2087
2088 barrier();
2089 ++userpg->lock;
2090 preempt_enable();
2091unlock:
2092 rcu_read_unlock();
2093}
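/*
 * Illustrative user-space sketch (not part of this patch): the ->lock
 * increments and barriers above form a seqcount, so a reader of the
 * mmap()ed first page retries if it raced with an update.  Field names
 * follow the exported struct perf_event_mmap_page; __sync_synchronize()
 * stands in for the matching read barriers.
 */
#include <linux/perf_event.h>

static __s64 read_user_page_offset(volatile struct perf_event_mmap_page *pc)
{
	__u32 seq;
	__s64 offset;

	do {
		seq = pc->lock;
		__sync_synchronize();
		offset = pc->offset;	/* self-monitoring would add the hw counter value */
		__sync_synchronize();
	} while (pc->lock != seq);

	return offset;
}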
2094
2095static unsigned long perf_data_size(struct perf_mmap_data *data)
2096{
2097 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2098}
2099
2100#ifndef CONFIG_PERF_USE_VMALLOC
2101
2102/*
2103 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
2104 */
2105
2106static struct page *
2107perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2108{
2109 if (pgoff > data->nr_pages)
2110 return NULL;
2111
2112 if (pgoff == 0)
2113 return virt_to_page(data->user_page);
2114
2115 return virt_to_page(data->data_pages[pgoff - 1]);
2116}
2117
2118static struct perf_mmap_data *
2119perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2120{
2121 struct perf_mmap_data *data;
2122 unsigned long size;
2123 int i;
2124
2125 WARN_ON(atomic_read(&event->mmap_count));
2126
2127 size = sizeof(struct perf_mmap_data);
2128 size += nr_pages * sizeof(void *);
2129
2130 data = kzalloc(size, GFP_KERNEL);
2131 if (!data)
2132 goto fail;
2133
2134 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2135 if (!data->user_page)
2136 goto fail_user_page;
2137
2138 for (i = 0; i < nr_pages; i++) {
2139 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2140 if (!data->data_pages[i])
2141 goto fail_data_pages;
2142 }
2143
2144 data->data_order = 0;
2145 data->nr_pages = nr_pages;
2146
2147 return data;
2148
2149fail_data_pages:
2150 for (i--; i >= 0; i--)
2151 free_page((unsigned long)data->data_pages[i]);
2152
2153 free_page((unsigned long)data->user_page);
2154
2155fail_user_page:
2156 kfree(data);
2157
2158fail:
2159 return NULL;
2160}
2161
2162static void perf_mmap_free_page(unsigned long addr)
2163{
2164 struct page *page = virt_to_page((void *)addr);
2165
2166 page->mapping = NULL;
2167 __free_page(page);
2168}
2169
2170static void perf_mmap_data_free(struct perf_mmap_data *data)
2171{
2172 int i;
2173
2174 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2177}
2178
2179#else
2180
2181/*
2182 * Back perf_mmap() with vmalloc memory.
2183 *
2184 * Required for architectures that have d-cache aliasing issues.
2185 */
2186
2187static struct page *
2188perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2189{
2190 if (pgoff > (1UL << data->data_order))
2191 return NULL;
2192
2193 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
2194}
2195
2196static void perf_mmap_unmark_page(void *addr)
2197{
2198 struct page *page = vmalloc_to_page(addr);
2199
2200 page->mapping = NULL;
2201}
2202
2203static void perf_mmap_data_free_work(struct work_struct *work)
2204{
2205 struct perf_mmap_data *data;
2206 void *base;
2207 int i, nr;
2208
2209 data = container_of(work, struct perf_mmap_data, work);
2210 nr = 1 << data->data_order;
2211
2212 base = data->user_page;
2213 for (i = 0; i < nr + 1; i++)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215
2216 vfree(base);
2217}
2218
2219static void perf_mmap_data_free(struct perf_mmap_data *data)
2220{
2221 schedule_work(&data->work);
2222}
2223
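/*
 * The vmalloc variant allocates the control page and all data pages as one
 * contiguous area and publishes it as a single data "page" of order
 * ilog2(nr_pages), so perf_data_size() still reports nr_pages * PAGE_SIZE
 * (nr_pages is a power of two, enforced in perf_mmap()).
 */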
2224static struct perf_mmap_data *
2225perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2226{
2227 struct perf_mmap_data *data;
2228 unsigned long size;
2229 void *all_buf;
2230
2231 WARN_ON(atomic_read(&event->mmap_count));
2232
2233 size = sizeof(struct perf_mmap_data);
2234 size += sizeof(void *);
2235
2236 data = kzalloc(size, GFP_KERNEL);
2237 if (!data)
2238 goto fail;
2239
2240 INIT_WORK(&data->work, perf_mmap_data_free_work);
2241
2242 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
2243 if (!all_buf)
2244 goto fail_all_buf;
2245
2246 data->user_page = all_buf;
2247 data->data_pages[0] = all_buf + PAGE_SIZE;
2248 data->data_order = ilog2(nr_pages);
2249 data->nr_pages = 1;
2250
2251 return data;
2252
2253fail_all_buf:
2254 kfree(data);
2255
2256fail:
2257 return NULL;
2258}
2259
2260#endif
2261
2262static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2263{
2264 struct perf_event *event = vma->vm_file->private_data;
2265 struct perf_mmap_data *data;
2266 int ret = VM_FAULT_SIGBUS;
2267
2268 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2269 if (vmf->pgoff == 0)
2270 ret = 0;
2271 return ret;
2272 }
2273
2274 rcu_read_lock();
2275 data = rcu_dereference(event->data);
2276 if (!data)
2277 goto unlock;
2278
2279 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
2280 goto unlock;
2281
2282 vmf->page = perf_mmap_to_page(data, vmf->pgoff);
2283 if (!vmf->page)
2284 goto unlock;
2285
2286 get_page(vmf->page);
2287 vmf->page->mapping = vma->vm_file->f_mapping;
2288 vmf->page->index = vmf->pgoff;
2289
2290 ret = 0;
2291unlock:
2292 rcu_read_unlock();
2293
2294 return ret;
2295}
2296
2297static void
2298perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2299{
2300 long max_size = perf_data_size(data);
2301
2302 atomic_set(&data->lock, -1);
2303
2304 if (event->attr.watermark) {
2305 data->watermark = min_t(long, max_size,
2306 event->attr.wakeup_watermark);
2307 }
2308
2309 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
2311
2312
2313 rcu_assign_pointer(event->data, data);
2314}
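/*
 * The watermark set above is compared against the amount of not-yet-consumed
 * data in perf_output_begin(); once that much data is pending, a wakeup is
 * requested. The default is half the buffer, but at least one page.
 */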
2315
2316static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2317{
2318 struct perf_mmap_data *data;
2319
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data);
2322 kfree(data);
2323}
2324
2325static void perf_mmap_data_release(struct perf_event *event)
2326{
2327 struct perf_mmap_data *data = event->data;
2328
2329 WARN_ON(atomic_read(&event->mmap_count));
2330
2331 rcu_assign_pointer(event->data, NULL);
2332 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2333}
2334
2335static void perf_mmap_open(struct vm_area_struct *vma)
2336{
2337 struct perf_event *event = vma->vm_file->private_data;
2338
2339 atomic_inc(&event->mmap_count);
2340}
2341
2342static void perf_mmap_close(struct vm_area_struct *vma)
2343{
2344 struct perf_event *event = vma->vm_file->private_data;
2345
2346 WARN_ON_ONCE(event->ctx->parent_ctx);
2347 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2348 unsigned long size = perf_data_size(event->data);
2349 struct user_struct *user = current_user();
2350
2351 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2352 vma->vm_mm->locked_vm -= event->data->nr_locked;
2353 perf_mmap_data_release(event);
2354 mutex_unlock(&event->mmap_mutex);
2355 }
2356}
2357
2358static const struct vm_operations_struct perf_mmap_vmops = {
2359 .open = perf_mmap_open,
2360 .close = perf_mmap_close,
2361 .fault = perf_mmap_fault,
2362 .page_mkwrite = perf_mmap_fault,
2363};
2364
2365static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2366{
2367 struct perf_event *event = file->private_data;
2368 unsigned long user_locked, user_lock_limit;
2369 struct user_struct *user = current_user();
2370 unsigned long locked, lock_limit;
2371 struct perf_mmap_data *data;
2372 unsigned long vma_size;
2373 unsigned long nr_pages;
2374 long user_extra, extra;
2375 int ret = 0;
2376
2377 if (!(vma->vm_flags & VM_SHARED))
2378 return -EINVAL;
2379
2380 vma_size = vma->vm_end - vma->vm_start;
2381 nr_pages = (vma_size / PAGE_SIZE) - 1;
2382
2383 /*
2384 * If we have data pages, ensure their count is a power of two, so we
2385 * can use bitmasks instead of modulo.
2386 */
2387 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2388 return -EINVAL;
2389
2390 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2391 return -EINVAL;
2392
2393 if (vma->vm_pgoff != 0)
2394 return -EINVAL;
2395
2396 WARN_ON_ONCE(event->ctx->parent_ctx);
2397 mutex_lock(&event->mmap_mutex);
2398 if (event->output) {
2399 ret = -EINVAL;
2400 goto unlock;
2401 }
2402
2403 if (atomic_inc_not_zero(&event->mmap_count)) {
2404 if (nr_pages != event->data->nr_pages)
2405 ret = -EINVAL;
2406 goto unlock;
2407 }
2408
2409 user_extra = nr_pages + 1;
2410 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2411
2412 /*
2413 * Increase the limit linearly with more CPUs:
2414 */
2415 user_lock_limit *= num_online_cpus();
2416
2417 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2418
2419 extra = 0;
2420 if (user_locked > user_lock_limit)
2421 extra = user_locked - user_lock_limit;
2422
2423 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2424 lock_limit >>= PAGE_SHIFT;
2425 locked = vma->vm_mm->locked_vm + extra;
2426
2427 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2428 !capable(CAP_IPC_LOCK)) {
2429 ret = -EPERM;
2430 goto unlock;
2431 }
2432
2433 WARN_ON(event->data);
2434
2435 data = perf_mmap_data_alloc(event, nr_pages);
2436 ret = -ENOMEM;
2437 if (!data)
2438 goto unlock;
2439
2440 ret = 0;
2441 perf_mmap_data_init(event, data);
2442
2443 atomic_set(&event->mmap_count, 1);
2444 atomic_long_add(user_extra, &user->locked_vm);
2445 vma->vm_mm->locked_vm += extra;
2446 event->data->nr_locked = extra;
2447 if (vma->vm_flags & VM_WRITE)
2448 event->data->writable = 1;
2449
2450unlock:
2451 mutex_unlock(&event->mmap_mutex);
2452
2453 vma->vm_flags |= VM_RESERVED;
2454 vma->vm_ops = &perf_mmap_vmops;
2455
2456 return ret;
2457}
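/*
 * A rough user-space sketch of the mapping this function accepts (not taken
 * from this file; 'fd' is assumed to come from sys_perf_event_open(), and
 * the 8 data pages are just an example of a power-of-two count):
 *
 *	len  = (1 + 8) * page_size;
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * i.e. a shared mapping at pgoff 0 with one control page up front;
 * PROT_WRITE marks the buffer writable so that the reader's data_tail is
 * honoured by perf_output_space().
 */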
2458
2459static int perf_fasync(int fd, struct file *filp, int on)
2460{
2461 struct inode *inode = filp->f_path.dentry->d_inode;
2462 struct perf_event *event = filp->private_data;
2463 int retval;
2464
2465 mutex_lock(&inode->i_mutex);
2466 retval = fasync_helper(fd, filp, on, &event->fasync);
2467 mutex_unlock(&inode->i_mutex);
2468
2469 if (retval < 0)
2470 return retval;
2471
2472 return 0;
2473}
2474
2475static const struct file_operations perf_fops = {
2476 .release = perf_release,
2477 .read = perf_read,
2478 .poll = perf_poll,
2479 .unlocked_ioctl = perf_ioctl,
2480 .compat_ioctl = perf_ioctl,
2481 .mmap = perf_mmap,
2482 .fasync = perf_fasync,
2483};
2484
2485/*
2486 * Perf event wakeup
2487 *
2488 * If there's data, ensure we set the poll() state and publish everything
2489 * to user-space before waking everybody up.
2490 */
2491
2492void perf_event_wakeup(struct perf_event *event)
2493{
2494 wake_up_all(&event->waitq);
2495
2496 if (event->pending_kill) {
2497 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2498 event->pending_kill = 0;
2499 }
2500}
2501
2502/*
2503 * Pending wakeups
2504 *
2505 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2506 *
2507 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2508 * singly linked list and use cmpxchg() to add entries locklessly.
2509 */
2510
2511static void perf_pending_event(struct perf_pending_entry *entry)
2512{
2513 struct perf_event *event = container_of(entry,
2514 struct perf_event, pending);
2515
2516 if (event->pending_disable) {
2517 event->pending_disable = 0;
2518 __perf_event_disable(event);
2519 }
2520
2521 if (event->pending_wakeup) {
2522 event->pending_wakeup = 0;
2523 perf_event_wakeup(event);
2524 }
2525}
2526
2527#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2528
2529static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2530 PENDING_TAIL,
2531};
2532
2533static void perf_pending_queue(struct perf_pending_entry *entry,
2534 void (*func)(struct perf_pending_entry *))
2535{
2536 struct perf_pending_entry **head;
2537
2538 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2539 return;
2540
2541 entry->func = func;
2542
2543 head = &get_cpu_var(perf_pending_head);
2544
2545 do {
2546 entry->next = *head;
2547 } while (cmpxchg(head, entry->next, entry) != entry->next);
2548
2549 set_perf_event_pending();
2550
2551 put_cpu_var(perf_pending_head);
2552}
2553
2554static int __perf_pending_run(void)
2555{
2556 struct perf_pending_entry *list;
2557 int nr = 0;
2558
2559 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2560 while (list != PENDING_TAIL) {
2561 void (*func)(struct perf_pending_entry *);
2562 struct perf_pending_entry *entry = list;
2563
2564 list = list->next;
2565
2566 func = entry->func;
2567 entry->next = NULL;
2568 /*
2569 * Ensure we observe the unqueue before we issue the wakeup,
2570 * so that we won't be waiting forever.
2571 * -- see perf_not_pending().
2572 */
2573 smp_wmb();
2574
2575 func(entry);
2576 nr++;
2577 }
2578
2579 return nr;
2580}
2581
2582static inline int perf_not_pending(struct perf_event *event)
2583{
2584 /*
2585 * If we flush on whatever CPU we happen to be running on, there is a
2586 * chance we don't need to wait.
2587 */
2588 get_cpu();
2589 __perf_pending_run();
2590 put_cpu();
2591
2592 /*
2593 * Ensure we see the proper queue state before going to sleep
2594 * so that we do not miss the wakeup. -- see __perf_pending_run()
2595 */
2596 smp_rmb();
2597 return event->pending.next == NULL;
2598}
2599
2600static void perf_pending_sync(struct perf_event *event)
2601{
2602 wait_event(event->waitq, perf_not_pending(event));
2603}
2604
2605void perf_event_do_pending(void)
2606{
2607 __perf_pending_run();
2608}
2609
2610/*
2611 * Callchain support -- arch specific
2612 */
2613
2614__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2615{
2616 return NULL;
2617}
2618
2619/*
2620 * Output
2621 */
2622static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2623 unsigned long offset, unsigned long head)
2624{
2625 unsigned long mask;
2626
2627 if (!data->writable)
2628 return true;
2629
2630 mask = perf_data_size(data) - 1;
2631
2632 offset = (offset - tail) & mask;
2633 head = (head - tail) & mask;
2634
2635 if ((int)(head - offset) < 0)
2636 return false;
2637
2638 return true;
2639}
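/*
 * Worked example of the check above: with a 4096-byte data area (mask 4095),
 * tail = 0 and a reservation running from offset 3000 to head 5000, the
 * rebased head is 5000 & 4095 = 904, which is smaller than the rebased
 * offset 3000 -- the write would overrun the reader, so we refuse the space.
 */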
2640
2641static void perf_output_wakeup(struct perf_output_handle *handle)
2642{
2643 atomic_set(&handle->data->poll, POLL_IN);
2644
2645 if (handle->nmi) {
2646 handle->event->pending_wakeup = 1;
2647 perf_pending_queue(&handle->event->pending,
2648 perf_pending_event);
2649 } else
2650 perf_event_wakeup(handle->event);
2651}
2652
2653/*
2654 * Curious locking construct.
2655 *
2656 * We need to ensure a later event doesn't publish a head while an earlier
2657 * event is still writing. However, since we need to deal with NMIs we
2658 * cannot fully serialize things.
2659 *
2660 * What we do is serialize between CPUs so we only have to deal with NMI
2661 * nesting on a single CPU.
2662 *
2663 * We only publish the head (and generate a wakeup) when the outer-most
2664 * event completes.
2665 */
2666static void perf_output_lock(struct perf_output_handle *handle)
2667{
2668 struct perf_mmap_data *data = handle->data;
2669 int cpu;
2670
2671 handle->locked = 0;
2672
2673 local_irq_save(handle->flags);
2674 cpu = smp_processor_id();
2675
2676 if (in_nmi() && atomic_read(&data->lock) == cpu)
2677 return;
2678
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax();
2681
2682 handle->locked = 1;
2683}
2684
2685static void perf_output_unlock(struct perf_output_handle *handle)
2686{
2687 struct perf_mmap_data *data = handle->data;
2688 unsigned long head;
2689 int cpu;
2690
2691 data->done_head = data->head;
2692
2693 if (!handle->locked)
2694 goto out;
2695
2696again:
2697 /*
2698 * The xchg implies a full barrier that ensures all writes are done
2699 * before we publish the new head, matched by a rmb() in userspace when
2700 * reading this position.
2701 */
2702 while ((head = atomic_long_xchg(&data->done_head, 0)))
2703 data->user_page->data_head = head;
2704
2705 /*
2706 * NMI can happen here, which means we can miss a done_head update.
2707 */
2708
2709 cpu = atomic_xchg(&data->lock, -1);
2710 WARN_ON_ONCE(cpu != smp_processor_id());
2711
2712 /*
2713 * Therefore we have to check whether we indeed missed one.
2714 */
2715 if (unlikely(atomic_long_read(&data->done_head))) {
2716 /*
2717 * Since we had it locked, we can lock it again.
2718 */
2719 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2720 cpu_relax();
2721
2722 goto again;
2723 }
2724
2725 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle);
2727out:
2728 local_irq_restore(handle->flags);
2729}
2730
2731void perf_output_copy(struct perf_output_handle *handle,
2732 const void *buf, unsigned int len)
2733{
2734 unsigned int pages_mask;
2735 unsigned long offset;
2736 unsigned int size;
2737 void **pages;
2738
2739 offset = handle->offset;
2740 pages_mask = handle->data->nr_pages - 1;
2741 pages = handle->data->data_pages;
2742
2743 do {
2744 unsigned long page_offset;
2745 unsigned long page_size;
2746 int nr;
2747
2748 nr = (offset >> PAGE_SHIFT) & pages_mask;
2749 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2750 page_offset = offset & (page_size - 1);
2751 size = min_t(unsigned int, page_size - page_offset, len);
2752
2753 memcpy(pages[nr] + page_offset, buf, size);
2754
2755 len -= size;
2756 buf += size;
2757 offset += size;
2758 } while (len);
2759
2760 handle->offset = offset;
2761
2762 /*
2763 * Check we didn't copy past our reservation window, taking the
2764 * possible unsigned int wrap into account.
2765 */
2766 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2767}
2768
2769int perf_output_begin(struct perf_output_handle *handle,
2770 struct perf_event *event, unsigned int size,
2771 int nmi, int sample)
2772{
2773 struct perf_event *output_event;
2774 struct perf_mmap_data *data;
2775 unsigned long tail, offset, head;
2776 int have_lost;
2777 struct {
2778 struct perf_event_header header;
2779 u64 id;
2780 u64 lost;
2781 } lost_event;
2782
2783 rcu_read_lock();
2784 /*
2785 * For inherited events we send all the output towards the parent.
2786 */
2787 if (event->parent)
2788 event = event->parent;
2789
2790 output_event = rcu_dereference(event->output);
2791 if (output_event)
2792 event = output_event;
2793
2794 data = rcu_dereference(event->data);
2795 if (!data)
2796 goto out;
2797
2798 handle->data = data;
2799 handle->event = event;
2800 handle->nmi = nmi;
2801 handle->sample = sample;
2802
2803 if (!data->nr_pages)
2804 goto fail;
2805
2806 have_lost = atomic_read(&data->lost);
2807 if (have_lost)
2808 size += sizeof(lost_event);
2809
2810 perf_output_lock(handle);
2811
2812 do {
2813 /*
2814 * Userspace could choose to issue an mb() before updating the
2815 * tail pointer, so that all reads are completed before the
2816 * write is issued.
2817 */
2818 tail = ACCESS_ONCE(data->user_page->data_tail);
2819 smp_rmb();
2820 offset = head = atomic_long_read(&data->head);
2821 head += size;
2822 if (unlikely(!perf_output_space(data, tail, offset, head)))
2823 goto fail;
2824 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2825
2826 handle->offset = offset;
2827 handle->head = head;
2828
2829 if (head - tail > data->watermark)
2830 atomic_set(&data->wakeup, 1);
2831
2832 if (have_lost) {
2833 lost_event.header.type = PERF_RECORD_LOST;
2834 lost_event.header.misc = 0;
2835 lost_event.header.size = sizeof(lost_event);
2836 lost_event.id = event->id;
2837 lost_event.lost = atomic_xchg(&data->lost, 0);
2838
2839 perf_output_put(handle, lost_event);
2840 }
2841
2842 return 0;
2843
2844fail:
2845 atomic_inc(&data->lost);
2846 perf_output_unlock(handle);
2847out:
2848 rcu_read_unlock();
2849
2850 return -ENOSPC;
2851}
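/*
 * Typical usage, as seen in the record emitters below:
 *
 *	if (perf_output_begin(&handle, event, size, nmi, sample))
 *		return;
 *	perf_output_put(&handle, record);
 *	perf_output_end(&handle);
 */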
2852
2853void perf_output_end(struct perf_output_handle *handle)
2854{
2855 struct perf_event *event = handle->event;
2856 struct perf_mmap_data *data = handle->data;
2857
2858 int wakeup_events = event->attr.wakeup_events;
2859
2860 if (handle->sample && wakeup_events) {
2861 int events = atomic_inc_return(&data->events);
2862 if (events >= wakeup_events) {
2863 atomic_sub(wakeup_events, &data->events);
2864 atomic_set(&data->wakeup, 1);
2865 }
2866 }
2867
2868 perf_output_unlock(handle);
2869 rcu_read_unlock();
2870}
2871
2872static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2873{
2874 /*
2875 * only top level events have the pid namespace they were created in
2876 */
2877 if (event->parent)
2878 event = event->parent;
2879
2880 return task_tgid_nr_ns(p, event->ns);
2881}
2882
2883static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2884{
2885 /*
2886 * only top level events have the pid namespace they were created in
2887 */
2888 if (event->parent)
2889 event = event->parent;
2890
2891 return task_pid_nr_ns(p, event->ns);
2892}
2893
2894static void perf_output_read_one(struct perf_output_handle *handle,
2895 struct perf_event *event)
2896{
2897 u64 read_format = event->attr.read_format;
2898 u64 values[4];
2899 int n = 0;
2900
2901 values[n++] = atomic64_read(&event->count);
2902 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2903 values[n++] = event->total_time_enabled +
2904 atomic64_read(&event->child_total_time_enabled);
2905 }
2906 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2907 values[n++] = event->total_time_running +
2908 atomic64_read(&event->child_total_time_running);
2909 }
2910 if (read_format & PERF_FORMAT_ID)
2911 values[n++] = primary_event_id(event);
2912
2913 perf_output_copy(handle, values, n * sizeof(u64));
2914}
2915
2916/*
2917 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2918 */
2919static void perf_output_read_group(struct perf_output_handle *handle,
2920 struct perf_event *event)
2921{
2922 struct perf_event *leader = event->group_leader, *sub;
2923 u64 read_format = event->attr.read_format;
2924 u64 values[5];
2925 int n = 0;
2926
2927 values[n++] = 1 + leader->nr_siblings;
2928
2929 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2930 values[n++] = leader->total_time_enabled;
2931
2932 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2933 values[n++] = leader->total_time_running;
2934
2935 if (leader != event)
2936 leader->pmu->read(leader);
2937
2938 values[n++] = atomic64_read(&leader->count);
2939 if (read_format & PERF_FORMAT_ID)
2940 values[n++] = primary_event_id(leader);
2941
2942 perf_output_copy(handle, values, n * sizeof(u64));
2943
2944 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2945 n = 0;
2946
2947 if (sub != event)
2948 sub->pmu->read(sub);
2949
2950 values[n++] = atomic64_read(&sub->count);
2951 if (read_format & PERF_FORMAT_ID)
2952 values[n++] = primary_event_id(sub);
2953
2954 perf_output_copy(handle, values, n * sizeof(u64));
2955 }
2956}
2957
2958static void perf_output_read(struct perf_output_handle *handle,
2959 struct perf_event *event)
2960{
2961 if (event->attr.read_format & PERF_FORMAT_GROUP)
2962 perf_output_read_group(handle, event);
2963 else
2964 perf_output_read_one(handle, event);
2965}
2966
2967void perf_output_sample(struct perf_output_handle *handle,
2968 struct perf_event_header *header,
2969 struct perf_sample_data *data,
2970 struct perf_event *event)
2971{
2972 u64 sample_type = data->type;
2973
2974 perf_output_put(handle, *header);
2975
2976 if (sample_type & PERF_SAMPLE_IP)
2977 perf_output_put(handle, data->ip);
2978
2979 if (sample_type & PERF_SAMPLE_TID)
2980 perf_output_put(handle, data->tid_entry);
2981
2982 if (sample_type & PERF_SAMPLE_TIME)
2983 perf_output_put(handle, data->time);
2984
2985 if (sample_type & PERF_SAMPLE_ADDR)
2986 perf_output_put(handle, data->addr);
2987
2988 if (sample_type & PERF_SAMPLE_ID)
2989 perf_output_put(handle, data->id);
2990
2991 if (sample_type & PERF_SAMPLE_STREAM_ID)
2992 perf_output_put(handle, data->stream_id);
2993
2994 if (sample_type & PERF_SAMPLE_CPU)
2995 perf_output_put(handle, data->cpu_entry);
2996
2997 if (sample_type & PERF_SAMPLE_PERIOD)
2998 perf_output_put(handle, data->period);
2999
3000 if (sample_type & PERF_SAMPLE_READ)
3001 perf_output_read(handle, event);
3002
3003 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3004 if (data->callchain) {
3005 int size = 1;
3006
3007 /* one u64 for ->nr plus one per entry */
3008 size += data->callchain->nr;
3009
3010 size *= sizeof(u64);
3011
3012 perf_output_copy(handle, data->callchain, size);
3013 } else {
3014 u64 nr = 0;
3015 perf_output_put(handle, nr);
3016 }
3017 }
3018
3019 if (sample_type & PERF_SAMPLE_RAW) {
3020 if (data->raw) {
3021 perf_output_put(handle, data->raw->size);
3022 perf_output_copy(handle, data->raw->data,
3023 data->raw->size);
3024 } else {
3025 struct {
3026 u32 size;
3027 u32 data;
3028 } raw = {
3029 .size = sizeof(u32),
3030 .data = 0,
3031 };
3032 perf_output_put(handle, raw);
3033 }
3034 }
3035}
3036
3037void perf_prepare_sample(struct perf_event_header *header,
3038 struct perf_sample_data *data,
3039 struct perf_event *event,
3040 struct pt_regs *regs)
3041{
3042 u64 sample_type = event->attr.sample_type;
3043
3044 data->type = sample_type;
3045
3046 header->type = PERF_RECORD_SAMPLE;
3047 header->size = sizeof(*header);
3048
3049 header->misc = 0;
3050 header->misc |= perf_misc_flags(regs);
3051
3052 if (sample_type & PERF_SAMPLE_IP) {
3053 data->ip = perf_instruction_pointer(regs);
3054
3055 header->size += sizeof(data->ip);
3056 }
3057
3058 if (sample_type & PERF_SAMPLE_TID) {
3059 /* namespace issues */
3060 data->tid_entry.pid = perf_event_pid(event, current);
3061 data->tid_entry.tid = perf_event_tid(event, current);
3062
3063 header->size += sizeof(data->tid_entry);
3064 }
3065
3066 if (sample_type & PERF_SAMPLE_TIME) {
3067 data->time = perf_clock();
3068
3069 header->size += sizeof(data->time);
3070 }
3071
3072 if (sample_type & PERF_SAMPLE_ADDR)
3073 header->size += sizeof(data->addr);
3074
3075 if (sample_type & PERF_SAMPLE_ID) {
3076 data->id = primary_event_id(event);
3077
3078 header->size += sizeof(data->id);
3079 }
3080
3081 if (sample_type & PERF_SAMPLE_STREAM_ID) {
3082 data->stream_id = event->id;
3083
3084 header->size += sizeof(data->stream_id);
3085 }
3086
3087 if (sample_type & PERF_SAMPLE_CPU) {
3088 data->cpu_entry.cpu = raw_smp_processor_id();
3089 data->cpu_entry.reserved = 0;
3090
3091 header->size += sizeof(data->cpu_entry);
3092 }
3093
3094 if (sample_type & PERF_SAMPLE_PERIOD)
3095 header->size += sizeof(data->period);
3096
3097 if (sample_type & PERF_SAMPLE_READ)
3098 header->size += perf_event_read_size(event);
3099
3100 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3101 int size = 1;
3102
3103 data->callchain = perf_callchain(regs);
3104
3105 if (data->callchain)
3106 size += data->callchain->nr;
3107
3108 header->size += size * sizeof(u64);
3109 }
3110
3111 if (sample_type & PERF_SAMPLE_RAW) {
3112 int size = sizeof(u32);
3113
3114 if (data->raw)
3115 size += data->raw->size;
3116 else
3117 size += sizeof(u32);
3118
3119 WARN_ON_ONCE(size & (sizeof(u64)-1));
3120 header->size += size;
3121 }
3122}
3123
3124static void perf_event_output(struct perf_event *event, int nmi,
3125 struct perf_sample_data *data,
3126 struct pt_regs *regs)
3127{
3128 struct perf_output_handle handle;
3129 struct perf_event_header header;
3130
3131 perf_prepare_sample(&header, data, event, regs);
3132
3133 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3134 return;
3135
3136 perf_output_sample(&handle, &header, data, event);
3137
3138 perf_output_end(&handle);
3139}
3140
3141/*
3142 * read event_id
3143 */
3144
3145struct perf_read_event {
3146 struct perf_event_header header;
3147
3148 u32 pid;
3149 u32 tid;
3150};
3151
3152static void
3153perf_event_read_event(struct perf_event *event,
3154 struct task_struct *task)
3155{
3156 struct perf_output_handle handle;
3157 struct perf_read_event read_event = {
3158 .header = {
3159 .type = PERF_RECORD_READ,
3160 .misc = 0,
3161 .size = sizeof(read_event) + perf_event_read_size(event),
3162 },
3163 .pid = perf_event_pid(event, task),
3164 .tid = perf_event_tid(event, task),
3165 };
3166 int ret;
3167
3168 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3169 if (ret)
3170 return;
3171
3172 perf_output_put(&handle, read_event);
3173 perf_output_read(&handle, event);
3174
3175 perf_output_end(&handle);
3176}
3177
3178/*
3179 * task tracking -- fork/exit
3180 *
3181 * enabled by: attr.comm | attr.mmap | attr.task
3182 */
3183
3184struct perf_task_event {
3185 struct task_struct *task;
3186 struct perf_event_context *task_ctx;
3187
3188 struct {
3189 struct perf_event_header header;
3190
3191 u32 pid;
3192 u32 ppid;
3193 u32 tid;
3194 u32 ptid;
3195 u64 time;
3196 } event_id;
3197};
3198
3199static void perf_event_task_output(struct perf_event *event,
3200 struct perf_task_event *task_event)
3201{
3202 struct perf_output_handle handle;
3203 int size;
3204 struct task_struct *task = task_event->task;
3205 int ret;
3206
3207 size = task_event->event_id.header.size;
3208 ret = perf_output_begin(&handle, event, size, 0, 0);
3209
3210 if (ret)
3211 return;
3212
3213 task_event->event_id.pid = perf_event_pid(event, task);
3214 task_event->event_id.ppid = perf_event_pid(event, current);
3215
3216 task_event->event_id.tid = perf_event_tid(event, task);
3217 task_event->event_id.ptid = perf_event_tid(event, current);
3218
3219 task_event->event_id.time = perf_clock();
3220
3221 perf_output_put(&handle, task_event->event_id);
3222
3223 perf_output_end(&handle);
3224}
3225
3226static int perf_event_task_match(struct perf_event *event)
3227{
3228 if (event->attr.comm || event->attr.mmap || event->attr.task)
3229 return 1;
3230
3231 return 0;
3232}
3233
3234static void perf_event_task_ctx(struct perf_event_context *ctx,
3235 struct perf_task_event *task_event)
3236{
3237 struct perf_event *event;
3238
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event);
3246 }
3247 rcu_read_unlock();
3248}
3249
3250static void perf_event_task_event(struct perf_task_event *task_event)
3251{
3252 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx;
3254
3255 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context);
3258
3259 rcu_read_lock();
3260 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3262 if (ctx)
3263 perf_event_task_ctx(ctx, task_event);
3264 rcu_read_unlock();
3265}
3266
3267static void perf_event_task(struct task_struct *task,
3268 struct perf_event_context *task_ctx,
3269 int new)
3270{
3271 struct perf_task_event task_event;
3272
3273 if (!atomic_read(&nr_comm_events) &&
3274 !atomic_read(&nr_mmap_events) &&
3275 !atomic_read(&nr_task_events))
3276 return;
3277
3278 task_event = (struct perf_task_event){
3279 .task = task,
3280 .task_ctx = task_ctx,
3281 .event_id = {
3282 .header = {
3283 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3284 .misc = 0,
3285 .size = sizeof(task_event.event_id),
3286 },
3287 /* .pid */
3288 /* .ppid */
3289 /* .tid */
3290 /* .ptid */
3291 },
3292 };
3293
3294 perf_event_task_event(&task_event);
3295}
3296
3297void perf_event_fork(struct task_struct *task)
3298{
3299 perf_event_task(task, NULL, 1);
3300}
3301
3302/*
3303 * comm tracking
3304 */
3305
3306struct perf_comm_event {
3307 struct task_struct *task;
3308 char *comm;
3309 int comm_size;
3310
3311 struct {
3312 struct perf_event_header header;
3313
3314 u32 pid;
3315 u32 tid;
3316 } event_id;
3317};
3318
3319static void perf_event_comm_output(struct perf_event *event,
3320 struct perf_comm_event *comm_event)
3321{
3322 struct perf_output_handle handle;
3323 int size = comm_event->event_id.header.size;
3324 int ret = perf_output_begin(&handle, event, size, 0, 0);
3325
3326 if (ret)
3327 return;
3328
3329 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3330 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3331
3332 perf_output_put(&handle, comm_event->event_id);
3333 perf_output_copy(&handle, comm_event->comm,
3334 comm_event->comm_size);
3335 perf_output_end(&handle);
3336}
3337
3338static int perf_event_comm_match(struct perf_event *event)
3339{
3340 if (event->attr.comm)
3341 return 1;
3342
3343 return 0;
3344}
3345
3346static void perf_event_comm_ctx(struct perf_event_context *ctx,
3347 struct perf_comm_event *comm_event)
3348{
3349 struct perf_event *event;
3350
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event);
3358 }
3359 rcu_read_unlock();
3360}
3361
3362static void perf_event_comm_event(struct perf_comm_event *comm_event)
3363{
3364 struct perf_cpu_context *cpuctx;
3365 struct perf_event_context *ctx;
3366 unsigned int size;
3367 char comm[TASK_COMM_LEN];
3368
3369 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372
3373 comm_event->comm = comm;
3374 comm_event->comm_size = size;
3375
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377
3378 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context);
3381
3382 rcu_read_lock();
3383 /*
3384 * It doesn't really matter which of the child contexts the
3385 * event ends up in.
3386 */
3387 ctx = rcu_dereference(current->perf_event_ctxp);
3388 if (ctx)
3389 perf_event_comm_ctx(ctx, comm_event);
3390 rcu_read_unlock();
3391}
3392
3393void perf_event_comm(struct task_struct *task)
3394{
3395 struct perf_comm_event comm_event;
3396
3397 if (task->perf_event_ctxp)
3398 perf_event_enable_on_exec(task);
3399
3400 if (!atomic_read(&nr_comm_events))
3401 return;
3402
3403 comm_event = (struct perf_comm_event){
3404 .task = task,
3405 /* .comm */
3406 /* .comm_size */
3407 .event_id = {
3408 .header = {
3409 .type = PERF_RECORD_COMM,
3410 .misc = 0,
3411 /* .size */
3412 },
3413 /* .pid */
3414 /* .tid */
3415 },
3416 };
3417
3418 perf_event_comm_event(&comm_event);
3419}
3420
3421/*
3422 * mmap tracking
3423 */
3424
3425struct perf_mmap_event {
3426 struct vm_area_struct *vma;
3427
3428 const char *file_name;
3429 int file_size;
3430
3431 struct {
3432 struct perf_event_header header;
3433
3434 u32 pid;
3435 u32 tid;
3436 u64 start;
3437 u64 len;
3438 u64 pgoff;
3439 } event_id;
3440};
3441
3442static void perf_event_mmap_output(struct perf_event *event,
3443 struct perf_mmap_event *mmap_event)
3444{
3445 struct perf_output_handle handle;
3446 int size = mmap_event->event_id.header.size;
3447 int ret = perf_output_begin(&handle, event, size, 0, 0);
3448
3449 if (ret)
3450 return;
3451
3452 mmap_event->event_id.pid = perf_event_pid(event, current);
3453 mmap_event->event_id.tid = perf_event_tid(event, current);
3454
3455 perf_output_put(&handle, mmap_event->event_id);
3456 perf_output_copy(&handle, mmap_event->file_name,
3457 mmap_event->file_size);
3458 perf_output_end(&handle);
3459}
3460
3461static int perf_event_mmap_match(struct perf_event *event,
3462 struct perf_mmap_event *mmap_event)
3463{
3464 if (event->attr.mmap)
3465 return 1;
3466
3467 return 0;
3468}
3469
3470static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3471 struct perf_mmap_event *mmap_event)
3472{
3473 struct perf_event *event;
3474
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event);
3482 }
3483 rcu_read_unlock();
3484}
3485
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3487{
3488 struct perf_cpu_context *cpuctx;
3489 struct perf_event_context *ctx;
3490 struct vm_area_struct *vma = mmap_event->vma;
3491 struct file *file = vma->vm_file;
3492 unsigned int size;
3493 char tmp[16];
3494 char *buf = NULL;
3495 const char *name;
3496
3497 memset(tmp, 0, sizeof(tmp));
3498
3499 if (file) {
3500 /*
3501 * d_path works from the end of the buffer backwards, so we
3502 * need to add enough zero bytes after the string to handle
3503 * the 64bit alignment we do later.
3504 */
3505 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3506 if (!buf) {
3507 name = strncpy(tmp, "//enomem", sizeof(tmp));
3508 goto got_name;
3509 }
3510 name = d_path(&file->f_path, buf, PATH_MAX);
3511 if (IS_ERR(name)) {
3512 name = strncpy(tmp, "//toolong", sizeof(tmp));
3513 goto got_name;
3514 }
3515 } else {
3516 if (arch_vma_name(mmap_event->vma)) {
3517 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3518 sizeof(tmp));
3519 goto got_name;
3520 }
3521
3522 if (!vma->vm_mm) {
3523 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3524 goto got_name;
3525 }
3526
3527 name = strncpy(tmp, "//anon", sizeof(tmp));
3528 goto got_name;
3529 }
3530
3531got_name:
3532 size = ALIGN(strlen(name)+1, sizeof(u64));
3533
3534 mmap_event->file_name = name;
3535 mmap_event->file_size = size;
3536
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538
3539 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context);
3542
3543 rcu_read_lock();
3544 /*
3545 * It doesn't really matter which of the child contexts the
3546 * event ends up in.
3547 */
3548 ctx = rcu_dereference(current->perf_event_ctxp);
3549 if (ctx)
3550 perf_event_mmap_ctx(ctx, mmap_event);
3551 rcu_read_unlock();
3552
3553 kfree(buf);
3554}
3555
3556void __perf_event_mmap(struct vm_area_struct *vma)
3557{
3558 struct perf_mmap_event mmap_event;
3559
3560 if (!atomic_read(&nr_mmap_events))
3561 return;
3562
3563 mmap_event = (struct perf_mmap_event){
3564 .vma = vma,
3565 /* .file_name */
3566 /* .file_size */
3567 .event_id = {
3568 .header = {
3569 .type = PERF_RECORD_MMAP,
3570 .misc = 0,
3571 /* .size */
3572 },
3573 /* .pid */
3574 /* .tid */
3575 .start = vma->vm_start,
3576 .len = vma->vm_end - vma->vm_start,
3577 .pgoff = vma->vm_pgoff,
3578 },
3579 };
3580
3581 perf_event_mmap_event(&mmap_event);
3582}
3583
3584/*
3585 * IRQ throttle logging
3586 */
3587
3588static void perf_log_throttle(struct perf_event *event, int enable)
3589{
3590 struct perf_output_handle handle;
3591 int ret;
3592
3593 struct {
3594 struct perf_event_header header;
3595 u64 time;
3596 u64 id;
3597 u64 stream_id;
3598 } throttle_event = {
3599 .header = {
3600 .type = PERF_RECORD_THROTTLE,
3601 .misc = 0,
3602 .size = sizeof(throttle_event),
3603 },
3604 .time = perf_clock(),
3605 .id = primary_event_id(event),
3606 .stream_id = event->id,
3607 };
3608
3609 if (enable)
3610 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3611
3612 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3613 if (ret)
3614 return;
3615
3616 perf_output_put(&handle, throttle_event);
3617 perf_output_end(&handle);
3618}
3619
3620/*
3621 * Generic event overflow handling, sampling.
3622 */
3623
3624static int __perf_event_overflow(struct perf_event *event, int nmi,
3625 int throttle, struct perf_sample_data *data,
3626 struct pt_regs *regs)
3627{
3628 int events = atomic_read(&event->event_limit);
3629 struct hw_perf_event *hwc = &event->hw;
3630 int ret = 0;
3631
3632 throttle = (throttle && event->pmu->unthrottle != NULL);
3633
3634 if (!throttle) {
3635 hwc->interrupts++;
3636 } else {
3637 if (hwc->interrupts != MAX_INTERRUPTS) {
3638 hwc->interrupts++;
3639 if (HZ * hwc->interrupts >
3640 (u64)sysctl_perf_event_sample_rate) {
3641 hwc->interrupts = MAX_INTERRUPTS;
3642 perf_log_throttle(event, 0);
3643 ret = 1;
3644 }
3645 } else {
3646 /*
3647 * Keep re-disabling the event even though we disabled it on the
3648 * previous pass - just in case we raced with a
3649 * sched-in and the event got enabled again:
3650 */
3651 ret = 1;
3652 }
3653 }
3654
3655 if (event->attr.freq) {
3656 u64 now = perf_clock();
3657 s64 delta = now - hwc->freq_stamp;
3658
3659 hwc->freq_stamp = now;
3660
3661 if (delta > 0 && delta < TICK_NSEC)
3662 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3663 }
3664
3665 /*
3666 * XXX event_limit might not quite work as expected on inherited
3667 * events
3668 */
3669
3670 event->pending_kill = POLL_IN;
3671 if (events && atomic_dec_and_test(&event->event_limit)) {
3672 ret = 1;
3673 event->pending_kill = POLL_HUP;
3674 if (nmi) {
3675 event->pending_disable = 1;
3676 perf_pending_queue(&event->pending,
3677 perf_pending_event);
3678 } else
3679 perf_event_disable(event);
3680 }
3681
3682 perf_event_output(event, nmi, data, regs);
3683 return ret;
3684}
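/*
 * A non-zero return value asks the caller to stop generating samples for
 * now: perf_swevent_overflow() stops looping on it and the hrtimer path
 * turns it into HRTIMER_NORESTART.
 */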
3685
3686int perf_event_overflow(struct perf_event *event, int nmi,
3687 struct perf_sample_data *data,
3688 struct pt_regs *regs)
3689{
3690 return __perf_event_overflow(event, nmi, 1, data, regs);
3691}
3692
3693/*
3694 * Generic software event infrastructure
3695 */
3696
3697/*
3698 * We directly increment event->count and keep a second value in
3699 * event->hw.period_left to count intervals. This period value
3700 * is kept in the range [-sample_period, 0] so that we can use the
3701 * sign as a trigger.
3702 */
3703
3704static u64 perf_swevent_set_period(struct perf_event *event)
3705{
3706 struct hw_perf_event *hwc = &event->hw;
3707 u64 period = hwc->last_period;
3708 u64 nr, offset;
3709 s64 old, val;
3710
3711 hwc->last_period = hwc->sample_period;
3712
3713again:
3714 old = val = atomic64_read(&hwc->period_left);
3715 if (val < 0)
3716 return 0;
3717
3718 nr = div64_u64(period + val, period);
3719 offset = nr * period;
3720 val -= offset;
3721 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3722 goto again;
3723
3724 return nr;
3725}
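/*
 * Example with a sample period of 100: if the pending count has climbed to
 * +20, nr = (100 + 20) / 100 = 1 period has elapsed and period_left becomes
 * 20 - 100 = -80; with +250 pending, nr = 3 and period_left becomes -50.
 */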
3726
3727static void perf_swevent_overflow(struct perf_event *event,
3728 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs)
3730{
3731 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0;
3733 u64 overflow;
3734
3735 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event);
3737
3738 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return;
3740
3741 for (; overflow; overflow--) {
3742 if (__perf_event_overflow(event, nmi, throttle,
3743 data, regs)) {
3744 /*
3745 * We inhibit the overflow from happening when
3746 * hwc->interrupts == MAX_INTERRUPTS.
3747 */
3748 break;
3749 }
3750 throttle = 1;
3751 }
3752}
3753
3754static void perf_swevent_unthrottle(struct perf_event *event)
3755{
3756 /*
3757 * Nothing to do, we already reset hwc->interrupts.
3758 */
3759}
3760
3761static void perf_swevent_add(struct perf_event *event, u64 nr,
3762 int nmi, struct perf_sample_data *data,
3763 struct pt_regs *regs)
3764{
3765 struct hw_perf_event *hwc = &event->hw;
3766
3767 atomic64_add(nr, &event->count);
3768
3769 if (!hwc->sample_period)
3770 return;
3771
3772 if (!regs)
3773 return;
3774
3775 if (!atomic64_add_negative(nr, &hwc->period_left))
3776 perf_swevent_overflow(event, nmi, data, regs);
3777}
3778
3779static int perf_swevent_is_counting(struct perf_event *event)
3780{
3781 /*
3782 * The event is active, we're good!
3783 */
3784 if (event->state == PERF_EVENT_STATE_ACTIVE)
3785 return 1;
3786
3787 /*
3788 * The event is off/error, not counting.
3789 */
3790 if (event->state != PERF_EVENT_STATE_INACTIVE)
3791 return 0;
3792
3793 /*
3794 * The event is inactive. If the context is active,
3795 * we're part of a group that didn't make it onto the 'pmu',
3796 * so we're not counting.
3797 */
3798 if (event->ctx->is_active)
3799 return 0;
3800
3801 /*
3802 * We're inactive and the context is too; this means the
3803 * task is scheduled out, and we're counting events that happen
3804 * to us, like migration events.
3805 */
3806 return 1;
3807}
3808
3809static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs)
3812{
3813 if (!perf_swevent_is_counting(event))
3814 return 0;
3815
3816 if (event->attr.type != type)
3817 return 0;
3818 if (event->attr.config != event_id)
3819 return 0;
3820
3821 if (regs) {
3822 if (event->attr.exclude_user && user_mode(regs))
3823 return 0;
3824
3825 if (event->attr.exclude_kernel && !user_mode(regs))
3826 return 0;
3827 }
3828
3829 return 1;
3830}
3831
3832static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3833 enum perf_type_id type,
3834 u32 event_id, u64 nr, int nmi,
3835 struct perf_sample_data *data,
3836 struct pt_regs *regs)
3837{
3838 struct perf_event *event;
3839
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs);
3847 }
3848 rcu_read_unlock();
3849}
3850
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3852{
3853 if (in_nmi())
3854 return &cpuctx->recursion[3];
3855
3856 if (in_irq())
3857 return &cpuctx->recursion[2];
3858
3859 if (in_softirq())
3860 return &cpuctx->recursion[1];
3861
3862 return &cpuctx->recursion[0];
3863}
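/*
 * One recursion counter per execution context -- task, softirq, hardirq and
 * NMI -- so that a software event raised while we are already processing one
 * in the same context is dropped rather than recursing.
 */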
3864
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi,
3867 struct perf_sample_data *data,
3868 struct pt_regs *regs)
3869{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx;
3873
3874 if (*recursion)
3875 goto out;
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /*
3884 * It doesn't really matter which of the child contexts the
3885 * event ends up in.
3886 */
3887 ctx = rcu_dereference(current->perf_event_ctxp);
3888 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897}
3898
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr)
3901{
3902 struct perf_sample_data data = {
3903 .addr = addr,
3904 };
3905
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3907 &data, regs);
3908}
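/*
 * Callers normally go through the perf_sw_event() inline wrapper in
 * <linux/perf_event.h>, which checks perf_swevent_enabled[event_id] before
 * taking this slower path, so unused software events stay cheap.
 */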
3909
3910static void perf_swevent_read(struct perf_event *event)
3911{
3912}
3913
3914static int perf_swevent_enable(struct perf_event *event)
3915{
3916 struct hw_perf_event *hwc = &event->hw;
3917
3918 if (hwc->sample_period) {
3919 hwc->last_period = hwc->sample_period;
3920 perf_swevent_set_period(event);
3921 }
3922 return 0;
3923}
3924
3925static void perf_swevent_disable(struct perf_event *event)
3926{
3927}
3928
3929static const struct pmu perf_ops_generic = {
3930 .enable = perf_swevent_enable,
3931 .disable = perf_swevent_disable,
3932 .read = perf_swevent_read,
3933 .unthrottle = perf_swevent_unthrottle,
3934};
3935
3936/*
3937 * hrtimer based swevent callback
3938 */
3939
3940static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3941{
3942 enum hrtimer_restart ret = HRTIMER_RESTART;
3943 struct perf_sample_data data;
3944 struct pt_regs *regs;
3945 struct perf_event *event;
3946 u64 period;
3947
3948 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3949 event->pmu->read(event);
3950
3951 data.addr = 0;
3952 regs = get_irq_regs();
3953 /*
3954 * If we exclude kernel IPs or are somehow not in interrupt
3955 * context, provide the next best thing: the user IP.
3956 */
3957 if ((event->attr.exclude_kernel || !regs) &&
3958 !event->attr.exclude_user)
3959 regs = task_pt_regs(current);
3960
3961 if (regs) {
3962 if (perf_event_overflow(event, 0, &data, regs))
3963 ret = HRTIMER_NORESTART;
3964 }
3965
3966 period = max_t(u64, 10000, event->hw.sample_period);
3967 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3968
3969 return ret;
3970}
3971
3972/*
3973 * Software event: cpu wall time clock
3974 */
3975
3976static void cpu_clock_perf_event_update(struct perf_event *event)
3977{
3978 int cpu = raw_smp_processor_id();
3979 s64 prev;
3980 u64 now;
3981
3982 now = cpu_clock(cpu);
3983 prev = atomic64_read(&event->hw.prev_count);
3984 atomic64_set(&event->hw.prev_count, now);
3985 atomic64_add(now - prev, &event->count);
3986}
3987
3988static int cpu_clock_perf_event_enable(struct perf_event *event)
3989{
3990 struct hw_perf_event *hwc = &event->hw;
3991 int cpu = raw_smp_processor_id();
3992
3993 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3994 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3995 hwc->hrtimer.function = perf_swevent_hrtimer;
3996 if (hwc->sample_period) {
3997 u64 period = max_t(u64, 10000, hwc->sample_period);
3998 __hrtimer_start_range_ns(&hwc->hrtimer,
3999 ns_to_ktime(period), 0,
4000 HRTIMER_MODE_REL, 0);
4001 }
4002
4003 return 0;
4004}
4005
4006static void cpu_clock_perf_event_disable(struct perf_event *event)
4007{
4008 if (event->hw.sample_period)
4009 hrtimer_cancel(&event->hw.hrtimer);
4010 cpu_clock_perf_event_update(event);
4011}
4012
4013static void cpu_clock_perf_event_read(struct perf_event *event)
4014{
4015 cpu_clock_perf_event_update(event);
4016}
4017
4018static const struct pmu perf_ops_cpu_clock = {
4019 .enable = cpu_clock_perf_event_enable,
4020 .disable = cpu_clock_perf_event_disable,
4021 .read = cpu_clock_perf_event_read,
4022};
4023
4024/*
4025 * Software event: task time clock
4026 */
4027
4028static void task_clock_perf_event_update(struct perf_event *event, u64 now)
4029{
4030 u64 prev;
4031 s64 delta;
4032
4033 prev = atomic64_xchg(&event->hw.prev_count, now);
4034 delta = now - prev;
4035 atomic64_add(delta, &event->count);
4036}
4037
4038static int task_clock_perf_event_enable(struct perf_event *event)
4039{
4040 struct hw_perf_event *hwc = &event->hw;
4041 u64 now;
4042
4043 now = event->ctx->time;
4044
4045 atomic64_set(&hwc->prev_count, now);
4046 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4047 hwc->hrtimer.function = perf_swevent_hrtimer;
4048 if (hwc->sample_period) {
4049 u64 period = max_t(u64, 10000, hwc->sample_period);
4050 __hrtimer_start_range_ns(&hwc->hrtimer,
4051 ns_to_ktime(period), 0,
4052 HRTIMER_MODE_REL, 0);
4053 }
4054
4055 return 0;
4056}
4057
4058static void task_clock_perf_event_disable(struct perf_event *event)
4059{
4060 if (event->hw.sample_period)
4061 hrtimer_cancel(&event->hw.hrtimer);
4062 task_clock_perf_event_update(event, event->ctx->time);
4063
4064}
4065
4066static void task_clock_perf_event_read(struct perf_event *event)
4067{
4068 u64 time;
4069
4070 if (!in_nmi()) {
4071 update_context_time(event->ctx);
4072 time = event->ctx->time;
4073 } else {
4074 u64 now = perf_clock();
4075 u64 delta = now - event->ctx->timestamp;
4076 time = event->ctx->time + delta;
4077 }
4078
4079 task_clock_perf_event_update(event, time);
4080}
4081
4082static const struct pmu perf_ops_task_clock = {
4083 .enable = task_clock_perf_event_enable,
4084 .disable = task_clock_perf_event_disable,
4085 .read = task_clock_perf_event_read,
4086};
4087
4088#ifdef CONFIG_EVENT_PROFILE
4089void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4090 int entry_size)
4091{
4092 struct perf_raw_record raw = {
4093 .size = entry_size,
4094 .data = record,
4095 };
4096
4097 struct perf_sample_data data = {
4098 .addr = addr,
4099 .raw = &raw,
4100 };
4101
4102 struct pt_regs *regs = get_irq_regs();
4103
4104 if (!regs)
4105 regs = task_pt_regs(current);
4106
4107 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4108 &data, regs);
4109}
4110EXPORT_SYMBOL_GPL(perf_tp_event);
4111
4112extern int ftrace_profile_enable(int);
4113extern void ftrace_profile_disable(int);
4114
4115static void tp_perf_event_destroy(struct perf_event *event)
4116{
4117 ftrace_profile_disable(event->attr.config);
4118}
4119
4120static const struct pmu *tp_perf_event_init(struct perf_event *event)
4121{
4122 /*
4123 * Raw tracepoint data is a severe data leak; only allow root to
4124 * have it.
4125 */
4126 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4127 perf_paranoid_tracepoint_raw() &&
4128 !capable(CAP_SYS_ADMIN))
4129 return ERR_PTR(-EPERM);
4130
4131 if (ftrace_profile_enable(event->attr.config))
4132 return NULL;
4133
4134 event->destroy = tp_perf_event_destroy;
4135
4136 return &perf_ops_generic;
4137}
4138#else
4139static const struct pmu *tp_perf_event_init(struct perf_event *event)
4140{
4141 return NULL;
4142}
4143#endif
4144
4145atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4146
4147static void sw_perf_event_destroy(struct perf_event *event)
4148{
4149 u64 event_id = event->attr.config;
4150
4151 WARN_ON(event->parent);
4152
4153 atomic_dec(&perf_swevent_enabled[event_id]);
4154}
4155
4156static const struct pmu *sw_perf_event_init(struct perf_event *event)
4157{
4158 const struct pmu *pmu = NULL;
4159 u64 event_id = event->attr.config;
4160
4161 /*
4162 * Software events (currently) can't in general distinguish
4163 * between user, kernel and hypervisor events.
4164 * However, context switches and cpu migrations are considered
4165 * to be kernel events, and page faults are never hypervisor
4166 * events.
4167 */
4168 switch (event_id) {
4169 case PERF_COUNT_SW_CPU_CLOCK:
4170 pmu = &perf_ops_cpu_clock;
4171
4172 break;
4173 case PERF_COUNT_SW_TASK_CLOCK:
4174 /*
4175 * If the user instantiates this as a per-cpu event,
4176 * use the cpu_clock event instead.
4177 */
4178 if (event->ctx->task)
4179 pmu = &perf_ops_task_clock;
4180 else
4181 pmu = &perf_ops_cpu_clock;
4182
4183 break;
4184 case PERF_COUNT_SW_PAGE_FAULTS:
4185 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4186 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4187 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4188 case PERF_COUNT_SW_CPU_MIGRATIONS:
4189 if (!event->parent) {
4190 atomic_inc(&perf_swevent_enabled[event_id]);
4191 event->destroy = sw_perf_event_destroy;
4192 }
4193 pmu = &perf_ops_generic;
4194 break;
4195 }
4196
4197 return pmu;
4198}
4199
4200/*
4201 * Allocate and initialize an event structure
4202 */
4203static struct perf_event *
4204perf_event_alloc(struct perf_event_attr *attr,
4205 int cpu,
4206 struct perf_event_context *ctx,
4207 struct perf_event *group_leader,
4208 struct perf_event *parent_event,
4209 gfp_t gfpflags)
4210{
4211 const struct pmu *pmu;
4212 struct perf_event *event;
4213 struct hw_perf_event *hwc;
4214 long err;
4215
4216 event = kzalloc(sizeof(*event), gfpflags);
4217 if (!event)
4218 return ERR_PTR(-ENOMEM);
4219
4220 /*
4221 * Single events are their own group leaders, with an
4222 * empty sibling list:
4223 */
4224 if (!group_leader)
4225 group_leader = event;
4226
4227 mutex_init(&event->child_mutex);
4228 INIT_LIST_HEAD(&event->child_list);
4229
4230 INIT_LIST_HEAD(&event->group_entry);
4231 INIT_LIST_HEAD(&event->event_entry);
4232 INIT_LIST_HEAD(&event->sibling_list);
4233 init_waitqueue_head(&event->waitq);
4234
4235 mutex_init(&event->mmap_mutex);
4236
4237 event->cpu = cpu;
4238 event->attr = *attr;
4239 event->group_leader = group_leader;
4240 event->pmu = NULL;
4241 event->ctx = ctx;
4242 event->oncpu = -1;
4243
4244 event->parent = parent_event;
4245
4246 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4247 event->id = atomic64_inc_return(&perf_event_id);
4248
4249 event->state = PERF_EVENT_STATE_INACTIVE;
4250
4251 if (attr->disabled)
4252 event->state = PERF_EVENT_STATE_OFF;
4253
4254 pmu = NULL;
4255
4256 hwc = &event->hw;
4257 hwc->sample_period = attr->sample_period;
4258 if (attr->freq && attr->sample_freq)
4259 hwc->sample_period = 1;
4260 hwc->last_period = hwc->sample_period;
4261
4262 atomic64_set(&hwc->period_left, hwc->sample_period);
4263
4264 /*
4265 * we currently do not support PERF_FORMAT_GROUP on inherited events
4266 */
4267 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4268 goto done;
4269
4270 switch (attr->type) {
4271 case PERF_TYPE_RAW:
4272 case PERF_TYPE_HARDWARE:
4273 case PERF_TYPE_HW_CACHE:
4274 pmu = hw_perf_event_init(event);
4275 break;
4276
4277 case PERF_TYPE_SOFTWARE:
4278 pmu = sw_perf_event_init(event);
4279 break;
4280
4281 case PERF_TYPE_TRACEPOINT:
4282 pmu = tp_perf_event_init(event);
4283 break;
4284
4285 default:
4286 break;
4287 }
4288done:
4289 err = 0;
4290 if (!pmu)
4291 err = -EINVAL;
4292 else if (IS_ERR(pmu))
4293 err = PTR_ERR(pmu);
4294
4295 if (err) {
4296 if (event->ns)
4297 put_pid_ns(event->ns);
4298 kfree(event);
4299 return ERR_PTR(err);
4300 }
4301
4302 event->pmu = pmu;
4303
4304 if (!event->parent) {
4305 atomic_inc(&nr_events);
4306 if (event->attr.mmap)
4307 atomic_inc(&nr_mmap_events);
4308 if (event->attr.comm)
4309 atomic_inc(&nr_comm_events);
4310 if (event->attr.task)
4311 atomic_inc(&nr_task_events);
4312 }
4313
4314 return event;
4315}
4316
4317static int perf_copy_attr(struct perf_event_attr __user *uattr,
4318 struct perf_event_attr *attr)
4319{
4320 u32 size;
4321 int ret;
4322
4323 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4324 return -EFAULT;
4325
4326 /*
4327 * zero the full structure, so that a short copy leaves the remaining fields zeroed.
4328 */
4329 memset(attr, 0, sizeof(*attr));
4330
4331 ret = get_user(size, &uattr->size);
4332 if (ret)
4333 return ret;
4334
4335 if (size > PAGE_SIZE) /* silly large */
4336 goto err_size;
4337
4338 if (!size) /* abi compat */
4339 size = PERF_ATTR_SIZE_VER0;
4340
4341 if (size < PERF_ATTR_SIZE_VER0)
4342 goto err_size;
4343
4344 /*
4345 * If we're handed a bigger struct than we know of,
4346 * ensure all the unknown bits are 0 - i.e. new
4347 * user-space does not rely on any kernel feature
4348 * extensions we don't know about yet.
4349 */
4350 if (size > sizeof(*attr)) {
4351 unsigned char __user *addr;
4352 unsigned char __user *end;
4353 unsigned char val;
4354
4355 addr = (void __user *)uattr + sizeof(*attr);
4356 end = (void __user *)uattr + size;
4357
4358 for (; addr < end; addr++) {
4359 ret = get_user(val, addr);
4360 if (ret)
4361 return ret;
4362 if (val)
4363 goto err_size;
4364 }
4365 size = sizeof(*attr);
4366 }
4367
4368 ret = copy_from_user(attr, uattr, size);
4369 if (ret)
4370 return -EFAULT;
4371
4372 /*
4373 * If the type exists, the corresponding type-specific initialization
4374 * will verify attr->config.
4375 */
4376 if (attr->type >= PERF_TYPE_MAX)
4377 return -EINVAL;
4378
4379 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4380 return -EINVAL;
4381
4382 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4383 return -EINVAL;
4384
4385 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4386 return -EINVAL;
4387
4388out:
4389 return ret;
4390
4391err_size:
4392 put_user(sizeof(*attr), &uattr->size);
4393 ret = -E2BIG;
4394 goto out;
4395}
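/*
 * A minimal user-space sketch of an attr this copies in cleanly (not part of
 * this file; values are illustrative only):
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size   = sizeof(attr);
 *	attr.type   = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *
 * Older binaries built against a smaller struct pass a smaller ->size and the
 * memset() above zero-fills the rest; a larger struct is only accepted if all
 * the bytes we do not know about are zero.
 */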
4396
4397int perf_event_set_output(struct perf_event *event, int output_fd)
4398{
4399 struct perf_event *output_event = NULL;
4400 struct file *output_file = NULL;
4401 struct perf_event *old_output;
4402 int fput_needed = 0;
4403 int ret = -EINVAL;
4404
4405 if (!output_fd)
4406 goto set;
4407
4408 output_file = fget_light(output_fd, &fput_needed);
4409 if (!output_file)
4410 return -EBADF;
4411
4412 if (output_file->f_op != &perf_fops)
4413 goto out;
4414
4415 output_event = output_file->private_data;
4416
4417 /* Don't chain output fds */
4418 if (output_event->output)
4419 goto out;
4420
4421 /* Don't set an output fd when we already have an output channel */
4422 if (event->data)
4423 goto out;
4424
4425 atomic_long_inc(&output_file->f_count);
4426
4427set:
4428 mutex_lock(&event->mmap_mutex);
4429 old_output = event->output;
4430 rcu_assign_pointer(event->output, output_event);
4431 mutex_unlock(&event->mmap_mutex);
4432
4433 if (old_output) {
4434 /*
4435 * we need to make sure no existing perf_output_*()
4436 * is still referencing this event.
4437 */
4438 synchronize_rcu();
4439 fput(old_output->filp);
4440 }
4441
4442 ret = 0;
4443out:
4444 fput_light(output_file, fput_needed);
4445 return ret;
4446}
4447
4448/**
4449 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4450 *
4451 * @attr_uptr: event_id type attributes for monitoring/sampling
4452 * @pid: target pid
4453 * @cpu: target cpu
4454 * @group_fd: group leader event fd
4455 */
4456SYSCALL_DEFINE5(perf_event_open,
4457 struct perf_event_attr __user *, attr_uptr,
4458 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4459{
4460 struct perf_event *event, *group_leader;
4461 struct perf_event_attr attr;
4462 struct perf_event_context *ctx;
4463 struct file *event_file = NULL;
4464 struct file *group_file = NULL;
4465 int fput_needed = 0;
4466 int fput_needed2 = 0;
4467 int err;
4468
4469 /* for future expandability... */
4470 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4471 return -EINVAL;
4472
4473 err = perf_copy_attr(attr_uptr, &attr);
4474 if (err)
4475 return err;
4476
4477 if (!attr.exclude_kernel) {
4478 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4479 return -EACCES;
4480 }
4481
4482 if (attr.freq) {
4483 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4484 return -EINVAL;
4485 }
4486
4487 /*
4488 * Get the target context (task or percpu):
4489 */
4490 ctx = find_get_context(pid, cpu);
4491 if (IS_ERR(ctx))
4492 return PTR_ERR(ctx);
4493
4494 /*
4495 * Look up the group leader (we will attach this event to it):
4496 */
4497 group_leader = NULL;
4498 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4499 err = -EINVAL;
4500 group_file = fget_light(group_fd, &fput_needed);
4501 if (!group_file)
4502 goto err_put_context;
4503 if (group_file->f_op != &perf_fops)
4504 goto err_put_context;
4505
4506 group_leader = group_file->private_data;
4507 /*
4508 * Do not allow a recursive hierarchy (this new sibling
4509 * becoming part of another group-sibling):
4510 */
4511 if (group_leader->group_leader != group_leader)
4512 goto err_put_context;
4513 /*
4514 * Do not allow to attach to a group in a different
4515 * task or CPU context:
4516 */
4517 if (group_leader->ctx != ctx)
4518 goto err_put_context;
4519 /*
4520 * Only a group leader can be exclusive or pinned
4521 */
4522 if (attr.exclusive || attr.pinned)
4523 goto err_put_context;
4524 }
4525
4526 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4527 NULL, GFP_KERNEL);
4528 err = PTR_ERR(event);
4529 if (IS_ERR(event))
4530 goto err_put_context;
4531
4532 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4533 if (err < 0)
4534 goto err_free_put_context;
4535
4536 event_file = fget_light(err, &fput_needed2);
4537 if (!event_file)
4538 goto err_free_put_context;
4539
4540 if (flags & PERF_FLAG_FD_OUTPUT) {
4541 err = perf_event_set_output(event, group_fd);
4542 if (err)
4543 goto err_fput_free_put_context;
4544 }
4545
4546 event->filp = event_file;
4547 WARN_ON_ONCE(ctx->parent_ctx);
4548 mutex_lock(&ctx->mutex);
4549 perf_install_in_context(ctx, event, cpu);
4550 ++ctx->generation;
4551 mutex_unlock(&ctx->mutex);
4552
4553 event->owner = current;
4554 get_task_struct(current);
4555 mutex_lock(&current->perf_event_mutex);
4556 list_add_tail(&event->owner_entry, &current->perf_event_list);
4557 mutex_unlock(&current->perf_event_mutex);
4558
4559err_fput_free_put_context:
4560 fput_light(event_file, fput_needed2);
4561
4562err_free_put_context:
4563 if (err < 0)
4564 kfree(event);
4565
4566err_put_context:
4567 if (err < 0)
4568 put_ctx(ctx);
4569
4570 fput_light(group_file, fput_needed);
4571
4572 return err;
4573}
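
There is no C-library wrapper for the new system call, so user space goes through syscall(2). A minimal, illustrative sketch (not part of the patch; it assumes __NR_perf_event_open and <linux/perf_event.h> are provided by the installed headers):

/* Count CPU cycles on the calling task, any CPU, then read the counter. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	fd = sys_perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */,
				 -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %lld\n", count);
	close(fd);
	return 0;
}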
4574
4575/*
4576 * inherit an event from parent task to child task:
4577 */
4578static struct perf_event *
4579inherit_event(struct perf_event *parent_event,
4580 struct task_struct *parent,
4581 struct perf_event_context *parent_ctx,
4582 struct task_struct *child,
4583 struct perf_event *group_leader,
4584 struct perf_event_context *child_ctx)
4585{
4586 struct perf_event *child_event;
4587
4588 /*
4589 * Instead of creating recursive hierarchies of events,
4590 * we link inherited events back to the original parent,
4591 * which has a filp for sure, which we use as the reference
4592 * count:
4593 */
4594 if (parent_event->parent)
4595 parent_event = parent_event->parent;
4596
4597 child_event = perf_event_alloc(&parent_event->attr,
4598 parent_event->cpu, child_ctx,
4599 group_leader, parent_event,
4600 GFP_KERNEL);
4601 if (IS_ERR(child_event))
4602 return child_event;
4603 get_ctx(child_ctx);
4604
4605 /*
4606 * Make the child state follow the state of the parent event,
4607 * not its attr.disabled bit. We hold the parent's mutex,
4608 * so we won't race with perf_event_{en, dis}able_family.
4609 */
4610 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4611 child_event->state = PERF_EVENT_STATE_INACTIVE;
4612 else
4613 child_event->state = PERF_EVENT_STATE_OFF;
4614
4615 if (parent_event->attr.freq)
4616 child_event->hw.sample_period = parent_event->hw.sample_period;
4617
4618 /*
4619 * Link it up in the child's context:
4620 */
4621 add_event_to_ctx(child_event, child_ctx);
4622
4623 /*
4624 * Get a reference to the parent filp - we will fput it
4625 * when the child event exits. This is safe to do because
4626 * we are in the parent and we know that the filp still
4627 * exists and has a nonzero count:
4628 */
4629 atomic_long_inc(&parent_event->filp->f_count);
4630
4631 /*
4632 * Link this into the parent event's child list
4633 */
4634 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4635 mutex_lock(&parent_event->child_mutex);
4636 list_add_tail(&child_event->child_list, &parent_event->child_list);
4637 mutex_unlock(&parent_event->child_mutex);
4638
4639 return child_event;
4640}
4641
4642static int inherit_group(struct perf_event *parent_event,
4643 struct task_struct *parent,
4644 struct perf_event_context *parent_ctx,
4645 struct task_struct *child,
4646 struct perf_event_context *child_ctx)
4647{
4648 struct perf_event *leader;
4649 struct perf_event *sub;
4650 struct perf_event *child_ctr;
4651
4652 leader = inherit_event(parent_event, parent, parent_ctx,
4653 child, NULL, child_ctx);
4654 if (IS_ERR(leader))
4655 return PTR_ERR(leader);
4656 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4657 child_ctr = inherit_event(sub, parent, parent_ctx,
4658 child, leader, child_ctx);
4659 if (IS_ERR(child_ctr))
4660 return PTR_ERR(child_ctr);
4661 }
4662 return 0;
4663}
4664
4665static void sync_child_event(struct perf_event *child_event,
4666 struct task_struct *child)
4667{
4668 struct perf_event *parent_event = child_event->parent;
4669 u64 child_val;
4670
4671 if (child_event->attr.inherit_stat)
4672 perf_event_read_event(child_event, child);
4673
4674 child_val = atomic64_read(&child_event->count);
4675
4676 /*
4677 * Add back the child's count to the parent's count:
4678 */
4679 atomic64_add(child_val, &parent_event->count);
4680 atomic64_add(child_event->total_time_enabled,
4681 &parent_event->child_total_time_enabled);
4682 atomic64_add(child_event->total_time_running,
4683 &parent_event->child_total_time_running);
4684
4685 /*
4686 * Remove this event from the parent's list
4687 */
4688 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4689 mutex_lock(&parent_event->child_mutex);
4690 list_del_init(&child_event->child_list);
4691 mutex_unlock(&parent_event->child_mutex);
4692
4693 /*
4694 * Release the parent event, if this was the last
4695 * reference to it.
4696 */
4697 fput(parent_event->filp);
4698}
4699
4700static void
4701__perf_event_exit_task(struct perf_event *child_event,
4702 struct perf_event_context *child_ctx,
4703 struct task_struct *child)
4704{
4705 struct perf_event *parent_event;
4706
4707 update_event_times(child_event);
4708 perf_event_remove_from_context(child_event);
4709
4710 parent_event = child_event->parent;
4711 /*
4712 * It can happen that parent exits first, and has events
4713 * that are still around due to the child reference. These
4714 * events need to be zapped - but otherwise linger.
4715 */
4716 if (parent_event) {
4717 sync_child_event(child_event, child);
4718 free_event(child_event);
4719 }
4720}
4721
4722/*
4723 * When a child task exits, feed back event values to parent events.
4724 */
4725void perf_event_exit_task(struct task_struct *child)
4726{
4727 struct perf_event *child_event, *tmp;
4728 struct perf_event_context *child_ctx;
4729 unsigned long flags;
4730
4731 if (likely(!child->perf_event_ctxp)) {
4732 perf_event_task(child, NULL, 0);
4733 return;
4734 }
4735
4736 local_irq_save(flags);
4737 /*
4738 * We can't reschedule here because interrupts are disabled,
4739 * and either child is current or it is a task that can't be
4740 * scheduled, so we are now safe from rescheduling changing
4741 * our context.
4742 */
4743 child_ctx = child->perf_event_ctxp;
4744 __perf_event_task_sched_out(child_ctx);
4745
4746 /*
4747 * Take the context lock here so that if find_get_context is
4748 * reading child->perf_event_ctxp, we wait until it has
4749 * incremented the context's refcount before we do put_ctx below.
4750 */
4751 spin_lock(&child_ctx->lock);
4752 child->perf_event_ctxp = NULL;
4753 /*
4754 * If this context is a clone; unclone it so it can't get
4755 * swapped to another process while we're removing all
4756 * the events from it.
4757 */
4758 unclone_ctx(child_ctx);
4759 spin_unlock_irqrestore(&child_ctx->lock, flags);
4760
4761 /*
4762 * Report the task dead after unscheduling the events so that we
4763 * won't get any samples after PERF_RECORD_EXIT. We can however still
4764 * get a few PERF_RECORD_READ events.
4765 */
4766 perf_event_task(child, child_ctx, 0);
4767
4768 /*
4769 * We can recurse on the same lock type through:
4770 *
4771 * __perf_event_exit_task()
4772 * sync_child_event()
4773 * fput(parent_event->filp)
4774 * perf_release()
4775 * mutex_lock(&ctx->mutex)
4776 *
4777 * But since it's the parent context it won't be the same instance.
4778 */
4779 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4780
4781again:
4782 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4783 group_entry)
4784 __perf_event_exit_task(child_event, child_ctx, child);
4785
4786 /*
4787 * If the last event was a group event, it will have appended all
4788 * its siblings to the list, but we obtained 'tmp' before that which
4789 * will still point to the list head terminating the iteration.
4790 */
4791 if (!list_empty(&child_ctx->group_list))
4792 goto again;
4793
4794 mutex_unlock(&child_ctx->mutex);
4795
4796 put_ctx(child_ctx);
4797}
4798
4799/*
4800 * free an unexposed, unused context as created by inheritance by
4801 * init_task below, used by fork() in case of failure.
4802 */
4803void perf_event_free_task(struct task_struct *task)
4804{
4805 struct perf_event_context *ctx = task->perf_event_ctxp;
4806 struct perf_event *event, *tmp;
4807
4808 if (!ctx)
4809 return;
4810
4811 mutex_lock(&ctx->mutex);
4812again:
4813 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4814 struct perf_event *parent = event->parent;
4815
4816 if (WARN_ON_ONCE(!parent))
4817 continue;
4818
4819 mutex_lock(&parent->child_mutex);
4820 list_del_init(&event->child_list);
4821 mutex_unlock(&parent->child_mutex);
4822
4823 fput(parent->filp);
4824
4825 list_del_event(event, ctx);
4826 free_event(event);
4827 }
4828
4829 if (!list_empty(&ctx->group_list))
4830 goto again;
4831
4832 mutex_unlock(&ctx->mutex);
4833
4834 put_ctx(ctx);
4835}
4836
4837/*
4838 * Initialize the perf_event context in task_struct
4839 */
4840int perf_event_init_task(struct task_struct *child)
4841{
4842 struct perf_event_context *child_ctx, *parent_ctx;
4843 struct perf_event_context *cloned_ctx;
4844 struct perf_event *event;
4845 struct task_struct *parent = current;
4846 int inherited_all = 1;
4847 int ret = 0;
4848
4849 child->perf_event_ctxp = NULL;
4850
4851 mutex_init(&child->perf_event_mutex);
4852 INIT_LIST_HEAD(&child->perf_event_list);
4853
4854 if (likely(!parent->perf_event_ctxp))
4855 return 0;
4856
4857 /*
4858 * This is executed from the parent task context, so inherit
4859 * events that have been marked for cloning.
4860 * First allocate and initialize a context for the child.
4861 */
4862
4863 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4864 if (!child_ctx)
4865 return -ENOMEM;
4866
4867 __perf_event_init_context(child_ctx, child);
4868 child->perf_event_ctxp = child_ctx;
4869 get_task_struct(child);
4870
4871 /*
4872 * If the parent's context is a clone, pin it so it won't get
4873 * swapped under us.
4874 */
4875 parent_ctx = perf_pin_task_context(parent);
4876
4877 /*
4878 * No need to check if parent_ctx != NULL here; since we saw
4879 * it non-NULL earlier, the only reason for it to become NULL
4880 * is if we exit, and since we're currently in the middle of
4881 * a fork we can't be exiting at the same time.
4882 */
4883
4884 /*
4885 * Lock the parent list. No need to lock the child - not PID
4886 * hashed yet and not running, so nobody can access it.
4887 */
4888 mutex_lock(&parent_ctx->mutex);
4889
4890 /*
4891 * We don't have to disable NMIs - we are only looking at
4892 * the list, not manipulating it:
4893 */
4894 list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
4895
4896 if (!event->attr.inherit) {
4897 inherited_all = 0;
4898 continue;
4899 }
4900
4901 ret = inherit_group(event, parent, parent_ctx,
4902 child, child_ctx);
4903 if (ret) {
4904 inherited_all = 0;
4905 break;
4906 }
4907 }
4908
4909 if (inherited_all) {
4910 /*
4911 * Mark the child context as a clone of the parent
4912 * context, or of whatever the parent is a clone of.
4913 * Note that if the parent is a clone, it could get
4914 * uncloned at any point, but that doesn't matter
4915 * because the list of events and the generation
4916 * count can't have changed since we took the mutex.
4917 */
4918 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4919 if (cloned_ctx) {
4920 child_ctx->parent_ctx = cloned_ctx;
4921 child_ctx->parent_gen = parent_ctx->parent_gen;
4922 } else {
4923 child_ctx->parent_ctx = parent_ctx;
4924 child_ctx->parent_gen = parent_ctx->generation;
4925 }
4926 get_ctx(child_ctx->parent_ctx);
4927 }
4928
4929 mutex_unlock(&parent_ctx->mutex);
4930
4931 perf_unpin_context(parent_ctx);
4932
4933 return ret;
4934}
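
The inheritance machinery above only runs for events opened with attr.inherit set; otherwise nothing is cloned into the child. An illustrative user-space fragment (not part of the patch; the helper name is made up):

/* Hypothetical helper: request a counter that follows the task across fork(). */
#include <string.h>
#include <linux/perf_event.h>

static void setup_inherited_counter(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));
	attr->size = sizeof(*attr);
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_INSTRUCTIONS;
	attr->inherit = 1;	/* perf_event_init_task() clones this into children */
	attr->inherit_stat = 1;	/* sync_child_event() emits PERF_RECORD_READ on child exit */
}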
4935
4936static void __cpuinit perf_event_init_cpu(int cpu)
4937{
4938 struct perf_cpu_context *cpuctx;
4939
4940 cpuctx = &per_cpu(perf_cpu_context, cpu);
4941 __perf_event_init_context(&cpuctx->ctx, NULL);
4942
4943 spin_lock(&perf_resource_lock);
4944 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4945 spin_unlock(&perf_resource_lock);
4946
4947 hw_perf_event_setup(cpu);
4948}
4949
4950#ifdef CONFIG_HOTPLUG_CPU
4951static void __perf_event_exit_cpu(void *info)
4952{
4953 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4954 struct perf_event_context *ctx = &cpuctx->ctx;
4955 struct perf_event *event, *tmp;
4956
4957 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
4958 __perf_event_remove_from_context(event);
4959}
4960static void perf_event_exit_cpu(int cpu)
4961{
4962 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4963 struct perf_event_context *ctx = &cpuctx->ctx;
4964
4965 mutex_lock(&ctx->mutex);
4966 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
4967 mutex_unlock(&ctx->mutex);
4968}
4969#else
4970static inline void perf_event_exit_cpu(int cpu) { }
4971#endif
4972
4973static int __cpuinit
4974perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4975{
4976 unsigned int cpu = (long)hcpu;
4977
4978 switch (action) {
4979
4980 case CPU_UP_PREPARE:
4981 case CPU_UP_PREPARE_FROZEN:
4982 perf_event_init_cpu(cpu);
4983 break;
4984
4985 case CPU_ONLINE:
4986 case CPU_ONLINE_FROZEN:
4987 hw_perf_event_setup_online(cpu);
4988 break;
4989
4990 case CPU_DOWN_PREPARE:
4991 case CPU_DOWN_PREPARE_FROZEN:
4992 perf_event_exit_cpu(cpu);
4993 break;
4994
4995 default:
4996 break;
4997 }
4998
4999 return NOTIFY_OK;
5000}
5001
5002/*
5003 * This has to have a higher priority than migration_notifier in sched.c.
5004 */
5005static struct notifier_block __cpuinitdata perf_cpu_nb = {
5006 .notifier_call = perf_cpu_notify,
5007 .priority = 20,
5008};
5009
5010void __init perf_event_init(void)
5011{
5012 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5013 (void *)(long)smp_processor_id());
5014 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
5015 (void *)(long)smp_processor_id());
5016 register_cpu_notifier(&perf_cpu_nb);
5017}
5018
5019static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
5020{
5021 return sprintf(buf, "%d\n", perf_reserved_percpu);
5022}
5023
5024static ssize_t
5025perf_set_reserve_percpu(struct sysdev_class *class,
5026 const char *buf,
5027 size_t count)
5028{
5029 struct perf_cpu_context *cpuctx;
5030 unsigned long val;
5031 int err, cpu, mpt;
5032
5033 err = strict_strtoul(buf, 10, &val);
5034 if (err)
5035 return err;
5036 if (val > perf_max_events)
5037 return -EINVAL;
5038
5039 spin_lock(&perf_resource_lock);
5040 perf_reserved_percpu = val;
5041 for_each_online_cpu(cpu) {
5042 cpuctx = &per_cpu(perf_cpu_context, cpu);
5043 spin_lock_irq(&cpuctx->ctx.lock);
5044 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5045 perf_max_events - perf_reserved_percpu);
5046 cpuctx->max_pertask = mpt;
5047 spin_unlock_irq(&cpuctx->ctx.lock);
5048 }
5049 spin_unlock(&perf_resource_lock);
5050
5051 return count;
5052}
5053
5054static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
5055{
5056 return sprintf(buf, "%d\n", perf_overcommit);
5057}
5058
5059static ssize_t
5060perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
5061{
5062 unsigned long val;
5063 int err;
5064
5065 err = strict_strtoul(buf, 10, &val);
5066 if (err)
5067 return err;
5068 if (val > 1)
5069 return -EINVAL;
5070
5071 spin_lock(&perf_resource_lock);
5072 perf_overcommit = val;
5073 spin_unlock(&perf_resource_lock);
5074
5075 return count;
5076}
5077
5078static SYSDEV_CLASS_ATTR(
5079 reserve_percpu,
5080 0644,
5081 perf_show_reserve_percpu,
5082 perf_set_reserve_percpu
5083 );
5084
5085static SYSDEV_CLASS_ATTR(
5086 overcommit,
5087 0644,
5088 perf_show_overcommit,
5089 perf_set_overcommit
5090 );
5091
5092static struct attribute *perfclass_attrs[] = {
5093 &attr_reserve_percpu.attr,
5094 &attr_overcommit.attr,
5095 NULL
5096};
5097
5098static struct attribute_group perfclass_attr_group = {
5099 .attrs = perfclass_attrs,
5100 .name = "perf_events",
5101};
5102
5103static int __init perf_event_sysfs_init(void)
5104{
5105 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
5106 &perfclass_attr_group);
5107}
5108device_initcall(perf_event_sysfs_init);
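
Since perfclass_attr_group hangs off the cpu sysdev class, the two knobs should surface as reserve_percpu and overcommit under a perf_events directory in sysfs; the exact path below is an assumption, not something stated in the patch. A small illustrative reader:

#include <stdio.h>

int main(void)
{
	/* Assumed location: /sys/devices/system/cpu/perf_events/reserve_percpu */
	FILE *f = fopen("/sys/devices/system/cpu/perf_events/reserve_percpu", "r");
	char buf[32];

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("reserve_percpu = %s", buf);
	fclose(f);
	return 0;
}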
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
 static struct hlist_head *pid_hash;
-static int pidhash_shift;
+static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 void __init pidhash_init(void)
 {
 	int i, pidhash_size;
-	unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
 
-	pidhash_shift = max(4, fls(megabytes * 4));
-	pidhash_shift = min(12, pidhash_shift);
+	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
+					   HASH_EARLY | HASH_SMALL,
+					   &pidhash_shift, NULL, 4096);
 	pidhash_size = 1 << pidhash_shift;
 
-	printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
-		pidhash_size, pidhash_shift,
-		pidhash_size * sizeof(struct hlist_head));
-
-	pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
-	if (!pid_hash)
-		panic("Could not alloc pidhash!\n");
 	for (i = 0; i < pidhash_size; i++)
 		INIT_HLIST_HEAD(&pid_hash[i]);
 }
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
 {
 	if (!(flags & CLONE_NEWPID))
 		return get_pid_ns(old_ns);
-	if (flags & CLONE_THREAD)
+	if (flags & (CLONE_THREAD|CLONE_PARENT))
 		return ERR_PTR(-EINVAL);
 	return create_pid_namespace(old_ns);
 }
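
With this change, asking for a new PID namespace together with CLONE_PARENT is refused the same way CLONE_NEWPID|CLONE_THREAD already was. An illustrative sketch (not part of the patch; run with CAP_SYS_ADMIN, otherwise the namespace request fails earlier with EPERM, and it assumes the libc headers expose CLONE_NEWPID):

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>

static char stack[64 * 1024];	/* child stack; grows down on x86 */

static int child_fn(void *arg)
{
	return 0;
}

int main(void)
{
	int pid = clone(child_fn, stack + sizeof(stack),
			CLONE_NEWPID | CLONE_PARENT | SIGCHLD, NULL);

	if (pid < 0 && errno == EINVAL)
		printf("CLONE_NEWPID|CLONE_PARENT rejected, as expected\n");
	return 0;
}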
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e33a21cb9407..5c9dc228747b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,17 +8,18 @@
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
11 12
12/* 13/*
13 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
14 */ 15 */
15void update_rlimit_cpu(unsigned long rlim_new) 16void update_rlimit_cpu(unsigned long rlim_new)
16{ 17{
17 cputime_t cputime; 18 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
18 20
19 cputime = secs_to_cputime(rlim_new); 21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
542 now); 543 now);
543} 544}
544 545
546static inline int expires_gt(cputime_t expires, cputime_t new_exp)
547{
548 return cputime_eq(expires, cputime_zero) ||
549 cputime_gt(expires, new_exp);
550}
551
552static inline int expires_le(cputime_t expires, cputime_t new_exp)
553{
554 return !cputime_eq(expires, cputime_zero) &&
555 cputime_le(expires, new_exp);
556}
545/* 557/*
546 * Insert the timer on the appropriate list before any timers that 558 * Insert the timer on the appropriate list before any timers that
547 * expire later. This must be called with the tasklist_lock held 559 * expire later. This must be called with the tasklist_lock held
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
586 */ 598 */
587 599
588 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 600 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
601 union cpu_time_count *exp = &nt->expires;
602
589 switch (CPUCLOCK_WHICH(timer->it_clock)) { 603 switch (CPUCLOCK_WHICH(timer->it_clock)) {
590 default: 604 default:
591 BUG(); 605 BUG();
592 case CPUCLOCK_PROF: 606 case CPUCLOCK_PROF:
593 if (cputime_eq(p->cputime_expires.prof_exp, 607 if (expires_gt(p->cputime_expires.prof_exp,
594 cputime_zero) || 608 exp->cpu))
595 cputime_gt(p->cputime_expires.prof_exp, 609 p->cputime_expires.prof_exp = exp->cpu;
596 nt->expires.cpu))
597 p->cputime_expires.prof_exp =
598 nt->expires.cpu;
599 break; 610 break;
600 case CPUCLOCK_VIRT: 611 case CPUCLOCK_VIRT:
601 if (cputime_eq(p->cputime_expires.virt_exp, 612 if (expires_gt(p->cputime_expires.virt_exp,
602 cputime_zero) || 613 exp->cpu))
603 cputime_gt(p->cputime_expires.virt_exp, 614 p->cputime_expires.virt_exp = exp->cpu;
604 nt->expires.cpu))
605 p->cputime_expires.virt_exp =
606 nt->expires.cpu;
607 break; 615 break;
608 case CPUCLOCK_SCHED: 616 case CPUCLOCK_SCHED:
609 if (p->cputime_expires.sched_exp == 0 || 617 if (p->cputime_expires.sched_exp == 0 ||
610 p->cputime_expires.sched_exp > 618 p->cputime_expires.sched_exp > exp->sched)
611 nt->expires.sched)
612 p->cputime_expires.sched_exp = 619 p->cputime_expires.sched_exp =
613 nt->expires.sched; 620 exp->sched;
614 break; 621 break;
615 } 622 }
616 } else { 623 } else {
624 struct signal_struct *const sig = p->signal;
625 union cpu_time_count *exp = &timer->it.cpu.expires;
626
617 /* 627 /*
618 * For a process timer, set the cached expiration time. 628 * For a process timer, set the cached expiration time.
619 */ 629 */
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
621 default: 631 default:
622 BUG(); 632 BUG();
623 case CPUCLOCK_VIRT: 633 case CPUCLOCK_VIRT:
624 if (!cputime_eq(p->signal->it_virt_expires, 634 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
625 cputime_zero) && 635 exp->cpu))
626 cputime_lt(p->signal->it_virt_expires,
627 timer->it.cpu.expires.cpu))
628 break; 636 break;
629 p->signal->cputime_expires.virt_exp = 637 sig->cputime_expires.virt_exp = exp->cpu;
630 timer->it.cpu.expires.cpu;
631 break; 638 break;
632 case CPUCLOCK_PROF: 639 case CPUCLOCK_PROF:
633 if (!cputime_eq(p->signal->it_prof_expires, 640 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
634 cputime_zero) && 641 exp->cpu))
635 cputime_lt(p->signal->it_prof_expires,
636 timer->it.cpu.expires.cpu))
637 break; 642 break;
638 i = p->signal->rlim[RLIMIT_CPU].rlim_cur; 643 i = sig->rlim[RLIMIT_CPU].rlim_cur;
639 if (i != RLIM_INFINITY && 644 if (i != RLIM_INFINITY &&
640 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 645 i <= cputime_to_secs(exp->cpu))
641 break; 646 break;
642 p->signal->cputime_expires.prof_exp = 647 sig->cputime_expires.prof_exp = exp->cpu;
643 timer->it.cpu.expires.cpu;
644 break; 648 break;
645 case CPUCLOCK_SCHED: 649 case CPUCLOCK_SCHED:
646 p->signal->cputime_expires.sched_exp = 650 sig->cputime_expires.sched_exp = exp->sched;
647 timer->it.cpu.expires.sched;
648 break; 651 break;
649 } 652 }
650 } 653 }
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1072} 1075}
1073 1076
1077static u32 onecputick;
1078
1079static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1080 cputime_t *expires, cputime_t cur_time, int signo)
1081{
1082 if (cputime_eq(it->expires, cputime_zero))
1083 return;
1084
1085 if (cputime_ge(cur_time, it->expires)) {
1086 if (!cputime_eq(it->incr, cputime_zero)) {
1087 it->expires = cputime_add(it->expires, it->incr);
1088 it->error += it->incr_error;
1089 if (it->error >= onecputick) {
1090 it->expires = cputime_sub(it->expires,
1091 cputime_one_jiffy);
1092 it->error -= onecputick;
1093 }
1094 } else {
1095 it->expires = cputime_zero;
1096 }
1097
1098 trace_itimer_expire(signo == SIGPROF ?
1099 ITIMER_PROF : ITIMER_VIRTUAL,
1100 tsk->signal->leader_pid, cur_time);
1101 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1102 }
1103
1104 if (!cputime_eq(it->expires, cputime_zero) &&
1105 (cputime_eq(*expires, cputime_zero) ||
1106 cputime_lt(it->expires, *expires))) {
1107 *expires = it->expires;
1108 }
1109}
1110
1074/* 1111/*
1075 * Check for any per-thread CPU timers that have fired and move them 1112 * Check for any per-thread CPU timers that have fired and move them
1076 * off the tsk->*_timers list onto the firing list. Per-thread timers 1113 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
1090 * Don't sample the current process CPU clocks if there are no timers. 1127 * Don't sample the current process CPU clocks if there are no timers.
1091 */ 1128 */
1092 if (list_empty(&timers[CPUCLOCK_PROF]) && 1129 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1093 cputime_eq(sig->it_prof_expires, cputime_zero) && 1130 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1094 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1131 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1095 list_empty(&timers[CPUCLOCK_VIRT]) && 1132 list_empty(&timers[CPUCLOCK_VIRT]) &&
1096 cputime_eq(sig->it_virt_expires, cputime_zero) && 1133 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1097 list_empty(&timers[CPUCLOCK_SCHED])) { 1134 list_empty(&timers[CPUCLOCK_SCHED])) {
1098 stop_process_timers(tsk); 1135 stop_process_timers(tsk);
1099 return; 1136 return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
1153 /* 1190 /*
1154 * Check for the special case process timers. 1191 * Check for the special case process timers.
1155 */ 1192 */
1156 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1193 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1157 if (cputime_ge(ptime, sig->it_prof_expires)) { 1194 SIGPROF);
1158 /* ITIMER_PROF fires and reloads. */ 1195 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1159 sig->it_prof_expires = sig->it_prof_incr; 1196 SIGVTALRM);
1160 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1197
1161 sig->it_prof_expires = cputime_add(
1162 sig->it_prof_expires, ptime);
1163 }
1164 __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
1165 }
1166 if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
1167 (cputime_eq(prof_expires, cputime_zero) ||
1168 cputime_lt(sig->it_prof_expires, prof_expires))) {
1169 prof_expires = sig->it_prof_expires;
1170 }
1171 }
1172 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1173 if (cputime_ge(utime, sig->it_virt_expires)) {
1174 /* ITIMER_VIRTUAL fires and reloads. */
1175 sig->it_virt_expires = sig->it_virt_incr;
1176 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1177 sig->it_virt_expires = cputime_add(
1178 sig->it_virt_expires, utime);
1179 }
1180 __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
1181 }
1182 if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
1183 (cputime_eq(virt_expires, cputime_zero) ||
1184 cputime_lt(sig->it_virt_expires, virt_expires))) {
1185 virt_expires = sig->it_virt_expires;
1186 }
1187 }
1188 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1198 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1189 unsigned long psecs = cputime_to_secs(ptime); 1199 unsigned long psecs = cputime_to_secs(ptime);
1190 cputime_t x; 1200 cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 if (!cputime_eq(*oldval, cputime_zero)) { 1467 if (!cputime_eq(*oldval, cputime_zero)) {
1458 if (cputime_le(*oldval, now.cpu)) { 1468 if (cputime_le(*oldval, now.cpu)) {
1459 /* Just about to fire. */ 1469 /* Just about to fire. */
1460 *oldval = jiffies_to_cputime(1); 1470 *oldval = cputime_one_jiffy;
1461 } else { 1471 } else {
1462 *oldval = cputime_sub(*oldval, now.cpu); 1472 *oldval = cputime_sub(*oldval, now.cpu);
1463 } 1473 }
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
1703 .nsleep = thread_cpu_nsleep, 1713 .nsleep = thread_cpu_nsleep,
1704 .nsleep_restart = thread_cpu_nsleep_restart, 1714 .nsleep_restart = thread_cpu_nsleep_restart,
1705 }; 1715 };
1716 struct timespec ts;
1706 1717
1707 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1718 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1708 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1719 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1709 1720
1721 cputime_to_timespec(cputime_one_jiffy, &ts);
1722 onecputick = ts.tv_nsec;
1723 WARN_ON(ts.tv_sec != 0);
1724
1710 return 0; 1725 return 0;
1711} 1726}
1712__initcall(init_posix_cpu_timers); 1727__initcall(init_posix_cpu_timers);
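
The check_cpu_itimer() helper introduced above is what now fires ITIMER_PROF/ITIMER_VIRTUAL for the whole process. For reference, a small user-space program that exercises that path (illustrative only, standard setitimer usage):

#include <signal.h>
#include <stdio.h>
#include <sys/time.h>

static volatile sig_atomic_t fired;

static void on_vtalrm(int sig)
{
	fired = 1;
}

int main(void)
{
	struct itimerval it = {
		.it_value    = { .tv_sec = 0, .tv_usec = 100 * 1000 },
		.it_interval = { .tv_sec = 0, .tv_usec = 100 * 1000 },
	};

	signal(SIGVTALRM, on_vtalrm);
	setitimer(ITIMER_VIRTUAL, &it, NULL);	/* counts user CPU time only */

	while (!fired)
		;	/* burn user CPU so the virtual timer advances */

	printf("ITIMER_VIRTUAL expired after ~100ms of user CPU time\n");
	return 0;
}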
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+{
+	*tp = current_kernel_time();
+	return 0;
+}
+
+static int posix_get_monotonic_coarse(clockid_t which_clock,
+						struct timespec *tp)
+{
+	*tp = get_monotonic_coarse();
+	return 0;
+}
+
+int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+{
+	*tp = ktime_to_timespec(KTIME_LOW_RES);
+	return 0;
+}
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
 		.timer_create = no_timer_create,
 		.nsleep = no_nsleep,
 	};
+	struct k_clock clock_realtime_coarse = {
+		.clock_getres = posix_get_coarse_res,
+		.clock_get = posix_get_realtime_coarse,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = no_timer_create,
+		.nsleep = no_nsleep,
+	};
+	struct k_clock clock_monotonic_coarse = {
+		.clock_getres = posix_get_coarse_res,
+		.clock_get = posix_get_monotonic_coarse,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = no_timer_create,
+		.nsleep = no_nsleep,
+	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
 	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
 	register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+	register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+	register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
 			sizeof (struct k_itimer), 0, SLAB_PANIC,
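
CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE registered here trade resolution (one tick, i.e. KTIME_LOW_RES) for a much cheaper read. An illustrative user-space check (not part of the patch; it assumes the new clock ids are visible through the installed time headers, and older glibc may need -lrt):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec res, now;

	if (clock_getres(CLOCK_MONOTONIC_COARSE, &res) ||
	    clock_gettime(CLOCK_MONOTONIC_COARSE, &now)) {
		perror("coarse clock");
		return 1;
	}
	printf("resolution: %ld ns (one tick)\n", res.tv_nsec);
	printf("monotonic : %ld.%09ld\n", (long)now.tv_sec, now.tv_nsec);
	return 0;
}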
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
 #undef DEBUG
 
 #include <linux/interrupt.h>
+#include <linux/oom.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 97955b0e44f4..36cb168e4330 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
 		BUG_ON(!region);
 	} else
 		/* This allocation cannot fail */
-		region = alloc_bootmem_low(sizeof(struct nosave_region));
+		region = alloc_bootmem(sizeof(struct nosave_region));
 	region->start_pfn = start_pfn;
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..b101cdc4df3f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
 
 #include <linux/module.h>
 #include <linux/file.h>
-#include <linux/utsname.h>
 #include <linux/delay.h>
 #include <linux/bitops.h>
 #include <linux/genhd.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 602033acd6c7..f38b07f78a4e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup);
 #ifdef CONFIG_BOOT_PRINTK_DELAY
 
 static unsigned int boot_delay; /* msecs delay after each printk during bootup */
-static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */
+static unsigned long long loops_per_msec; /* based on boot_delay */
 
 static int __init boot_delay_setup(char *str)
 {
 	unsigned long lpj;
-	unsigned long long loops_per_msec;
 
 	lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
 	loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str)
 	if (boot_delay > 10 * 1000)
 		boot_delay = 0;
 
-	printk_delay_msec = loops_per_msec;
-	printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
-		"HZ: %d, printk_delay_msec: %llu\n",
-		boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
+	pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
+		"HZ: %d, loops_per_msec: %llu\n",
+		boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
 	return 1;
 }
 __setup("boot_delay=", boot_delay_setup);
@@ -236,7 +234,7 @@ static void boot_delay_msec(void)
 	if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
 		return;
 
-	k = (unsigned long long)printk_delay_msec * boot_delay;
+	k = (unsigned long long)loops_per_msec * boot_delay;
 
 	timeout = jiffies + msecs_to_jiffies(boot_delay);
 	while (k) {
@@ -655,6 +653,20 @@ static int recursion_bug;
 static int new_text_line = 1;
 static char printk_buf[1024];
 
+int printk_delay_msec __read_mostly;
+
+static inline void printk_delay(void)
+{
+	if (unlikely(printk_delay_msec)) {
+		int m = printk_delay_msec;
+
+		while (m--) {
+			mdelay(1);
+			touch_nmi_watchdog();
+		}
+	}
+}
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
 	int printed_len = 0;
@@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	char *p;
 
 	boot_delay_msec();
+	printk_delay();
 
 	preempt_disable();
 	/* This stops the holder of console_sem just where we want him */
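
printk_delay_msec defaults to 0, so the new printk_delay() hook is a no-op unless the value is raised; the accompanying sysctl change in this series is expected to expose it as /proc/sys/kernel/printk_delay (path assumed here, not shown in this hunk). Raising it slows every printk, which helps when console output scrolls by too fast to read:

#include <stdio.h>

int main(void)
{
	/* Assumed sysctl path for printk_delay_msec. */
	FILE *f = fopen("/proc/sys/kernel/printk_delay", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "50\n");	/* delay each printk by 50 ms */
	fclose(f);
	return 0;
}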
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
442 442
443#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
444#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
445#include <asm/uaccess.h> 446#include <asm/uaccess.h>
446 447
447static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
448 int count, int *eof, void *data)
449{ 449{
450 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
451 if (count - len < 2) 451 seq_putc(m, '\n');
452 return -EINVAL; 452 return 0;
453 len += sprintf(page + len, "\n");
454 return len;
455} 453}
456 454
457static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
458 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
459{ 462{
460 struct cpumask *mask = data;
461 unsigned long full_count = count, err;
462 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
463 465
464 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
465 return -ENOMEM; 467 return -ENOMEM;
466 468
467 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
468 if (!err) { 470 if (!err) {
469 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
470 err = full_count; 472 err = count;
471 } 473 }
472 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
473 return err; 475 return err;
474} 476}
475 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
476void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
477{ 487{
478 struct proc_dir_entry *entry;
479
480 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
481 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
482 if (!entry)
483 return;
484 entry->data = prof_cpu_mask;
485 entry->read_proc = prof_cpu_mask_read_proc;
486 entry->write_proc = prof_cpu_mask_write_proc;
487} 490}
488 491
489/* 492/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 307c285af59e..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
  * or self-reaping. Do notification now if it would have happened earlier.
  * If it should reap itself, return true.
  *
- * If it's our own child, there is no notification to do.
- * But if our normal children self-reap, then this child
- * was prevented by ptrace and we must reap it now.
+ * If it's our own child, there is no notification to do. But if our normal
+ * children self-reap, then this child was prevented by ptrace and we must
+ * reap it now, in that case we must also wake up sub-threads sleeping in
+ * do_wait().
  */
 static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 {
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 	if (!task_detached(p) && thread_group_empty(p)) {
 		if (!same_thread_group(p->real_parent, tracer))
 			do_notify_parent(p, p->exit_signal);
-		else if (ignoring_children(tracer->sighand))
+		else if (ignoring_children(tracer->sighand)) {
+			__wake_up_parent(p, tracer);
 			p->exit_signal = -1;
+		}
 	}
 	if (task_detached(p)) {
 		/* Mark it as in the process of being reaped. */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index bd5d5c8e5140..400183346ad2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * 22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers: 25 * Papers:
@@ -27,7 +27,7 @@
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 * 28 *
29 * For detailed explanation of Read-Copy Update mechanism see - 29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html 30 * http://lse.sourceforge.net/locking/rcupdate.html
31 * 31 *
32 */ 32 */
33#include <linux/types.h> 33#include <linux/types.h>
@@ -46,22 +46,15 @@
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h> 47#include <linux/kernel_stat.h>
48 48
49enum rcu_barrier { 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 RCU_BARRIER_STD, 50static struct lock_class_key rcu_lock_key;
51 RCU_BARRIER_BH, 51struct lockdep_map rcu_lock_map =
52 RCU_BARRIER_SCHED, 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53}; 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
54 55
55static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
56static atomic_t rcu_barrier_cpu_count;
57static DEFINE_MUTEX(rcu_barrier_mutex);
58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly; 56int rcu_scheduler_active __read_mostly;
60 57
61static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
62static struct rcu_head rcu_migrate_head[3];
63static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
64
65/* 58/*
66 * Awaken the corresponding synchronize_rcu() instance now that a 59 * Awaken the corresponding synchronize_rcu() instance now that a
67 * grace period has elapsed. 60 * grace period has elapsed.
@@ -74,6 +67,8 @@ void wakeme_after_rcu(struct rcu_head *head)
74 complete(&rcu->completion); 67 complete(&rcu->completion);
75} 68}
76 69
70#ifdef CONFIG_TREE_PREEMPT_RCU
71
77/** 72/**
78 * synchronize_rcu - wait until a grace period has elapsed. 73 * synchronize_rcu - wait until a grace period has elapsed.
79 * 74 *
@@ -87,7 +82,7 @@ void synchronize_rcu(void)
87{ 82{
88 struct rcu_synchronize rcu; 83 struct rcu_synchronize rcu;
89 84
90 if (rcu_blocking_is_gp()) 85 if (!rcu_scheduler_active)
91 return; 86 return;
92 87
93 init_completion(&rcu.completion); 88 init_completion(&rcu.completion);
@@ -98,6 +93,46 @@ void synchronize_rcu(void)
98} 93}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 94EXPORT_SYMBOL_GPL(synchronize_rcu);
100 95
96#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
97
98/**
99 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
100 *
101 * Control will return to the caller some time after a full rcu-sched
102 * grace period has elapsed, in other words after all currently executing
103 * rcu-sched read-side critical sections have completed. These read-side
104 * critical sections are delimited by rcu_read_lock_sched() and
105 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
106 * local_irq_disable(), and so on may be used in place of
107 * rcu_read_lock_sched().
108 *
109 * This means that all preempt_disable code sequences, including NMI and
110 * hardware-interrupt handlers, in progress on entry will have completed
111 * before this primitive returns. However, this does not guarantee that
112 * softirq handlers will have completed, since in some kernels, these
113 * handlers can run in process context, and can block.
114 *
115 * This primitive provides the guarantees made by the (now removed)
116 * synchronize_kernel() API. In contrast, synchronize_rcu() only
117 * guarantees that rcu_read_lock() sections will have completed.
118 * In "classic RCU", these two guarantees happen to be one and
119 * the same, but can differ in realtime RCU implementations.
120 */
121void synchronize_sched(void)
122{
123 struct rcu_synchronize rcu;
124
125 if (rcu_blocking_is_gp())
126 return;
127
128 init_completion(&rcu.completion);
129 /* Will wake me after RCU finished. */
130 call_rcu_sched(&rcu.head, wakeme_after_rcu);
131 /* Wait for it. */
132 wait_for_completion(&rcu.completion);
133}
134EXPORT_SYMBOL_GPL(synchronize_sched);
135
101/** 136/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 137 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 * 138 *
@@ -122,129 +157,10 @@ void synchronize_rcu_bh(void)
122} 157}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 158EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
124 159
125static void rcu_barrier_callback(struct rcu_head *notused)
126{
127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
128 complete(&rcu_barrier_completion);
129}
130
131/*
132 * Called with preemption disabled, and from cross-cpu IRQ context.
133 */
134static void rcu_barrier_func(void *type)
135{
136 int cpu = smp_processor_id();
137 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
138
139 atomic_inc(&rcu_barrier_cpu_count);
140 switch ((enum rcu_barrier)type) {
141 case RCU_BARRIER_STD:
142 call_rcu(head, rcu_barrier_callback);
143 break;
144 case RCU_BARRIER_BH:
145 call_rcu_bh(head, rcu_barrier_callback);
146 break;
147 case RCU_BARRIER_SCHED:
148 call_rcu_sched(head, rcu_barrier_callback);
149 break;
150 }
151}
152
153static inline void wait_migrated_callbacks(void)
154{
155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
157}
158
159/*
160 * Orchestrate the specified type of RCU barrier, waiting for all
161 * RCU callbacks of the specified type to complete.
162 */
163static void _rcu_barrier(enum rcu_barrier type)
164{
165 BUG_ON(in_interrupt());
166 /* Take cpucontrol mutex to protect against CPU hotplug */
167 mutex_lock(&rcu_barrier_mutex);
168 init_completion(&rcu_barrier_completion);
169 /*
170 * Initialize rcu_barrier_cpu_count to 1, then invoke
171 * rcu_barrier_func() on each CPU, so that each CPU also has
172 * incremented rcu_barrier_cpu_count. Only then is it safe to
173 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
174 * might complete its grace period before all of the other CPUs
175 * did their increment, causing this function to return too
176 * early.
177 */
178 atomic_set(&rcu_barrier_cpu_count, 1);
179 on_each_cpu(rcu_barrier_func, (void *)type, 1);
180 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
181 complete(&rcu_barrier_completion);
182 wait_for_completion(&rcu_barrier_completion);
183 mutex_unlock(&rcu_barrier_mutex);
184 wait_migrated_callbacks();
185}
186
187/**
188 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
189 */
190void rcu_barrier(void)
191{
192 _rcu_barrier(RCU_BARRIER_STD);
193}
194EXPORT_SYMBOL_GPL(rcu_barrier);
195
196/**
197 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
198 */
199void rcu_barrier_bh(void)
200{
201 _rcu_barrier(RCU_BARRIER_BH);
202}
203EXPORT_SYMBOL_GPL(rcu_barrier_bh);
204
205/**
206 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
207 */
208void rcu_barrier_sched(void)
209{
210 _rcu_barrier(RCU_BARRIER_SCHED);
211}
212EXPORT_SYMBOL_GPL(rcu_barrier_sched);
213
214static void rcu_migrate_callback(struct rcu_head *notused)
215{
216 if (atomic_dec_and_test(&rcu_migrate_type_count))
217 wake_up(&rcu_migrate_wq);
218}
219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 160static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
224 unsigned long action, void *hcpu) 161 unsigned long action, void *hcpu)
225{ 162{
226 rcu_cpu_notify(self, action, hcpu); 163 return rcu_cpu_notify(self, action, hcpu);
227 if (action == CPU_DYING) {
228 /*
229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
230 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
231 * returns, all online cpus have queued rcu_barrier_func(),
232 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
233 *
234 * These callbacks ensure _rcu_barrier() waits for all
235 * RCU callbacks of the specified type to complete.
236 */
237 atomic_set(&rcu_migrate_type_count, 3);
238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
244 wait_migrated_callbacks();
245 }
246
247 return NOTIFY_OK;
248} 164}
249 165
250void __init rcu_init(void) 166void __init rcu_init(void)
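
The synchronize_sched() kernel-doc added above describes the usual pairing with preempt-off readers. A minimal kernel-style sketch of that pattern (illustrative only; struct foo and global_foo are hypothetical, and a single updater is assumed):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int val;
};

static struct foo *global_foo;	/* hypothetical RCU-protected pointer */

static int reader(void)
{
	struct foo *f;
	int val;

	rcu_read_lock_sched();		/* disables preemption */
	f = rcu_dereference(global_foo);
	val = f ? f->val : -1;
	rcu_read_unlock_sched();
	return val;
}

static void updater(struct foo *newf)
{
	struct foo *old = global_foo;

	rcu_assign_pointer(global_foo, newf);
	synchronize_sched();		/* wait out all preempt-off readers */
	kfree(old);			/* now safe to free the old copy */
}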
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b33db539a8ad..697c0a0229d4 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -50,7 +50,7 @@
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
110}; 110};
111 111
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version;
115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116static DEFINE_SPINLOCK(rcu_torture_lock); 116static DEFINE_SPINLOCK(rcu_torture_lock);
117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
124static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
125static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask; 129static cpumask_var_t shuffle_tmp_mask;
130 130
131static int stutter_pause_test = 0; 131static int stutter_pause_test;
132 132
133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134#define RCUTORTURE_RUNNABLE_INIT 1 134#define RCUTORTURE_RUNNABLE_INIT 1
@@ -267,7 +267,8 @@ struct rcu_torture_ops {
267 int irq_capable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270
271static struct rcu_torture_ops *cur_ops;
271 272
272/* 273/*
273 * Definitions for rcu torture testing. 274 * Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
281 282
282static void rcu_read_delay(struct rcu_random_state *rrsp) 283static void rcu_read_delay(struct rcu_random_state *rrsp)
283{ 284{
284 long delay; 285 const unsigned long shortdelay_us = 200;
285 const long longdelay = 200; 286 const unsigned long longdelay_ms = 50;
286 287
287 /* We want there to be long-running readers, but not all the time. */ 288 /* We want a short delay sometimes to make a reader delay the grace
289 * period, and we want a long delay occasionally to trigger
290 * force_quiescent_state. */
288 291
289 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); 292 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
290 if (!delay) 293 mdelay(longdelay_ms);
291 udelay(longdelay); 294 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
295 udelay(shortdelay_us);
292} 296}
293 297
294static void rcu_torture_read_unlock(int idx) __releases(RCU) 298static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -339,8 +343,8 @@ static struct rcu_torture_ops rcu_ops = {
339 .sync = synchronize_rcu, 343 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 344 .cb_barrier = rcu_barrier,
341 .stats = NULL, 345 .stats = NULL,
342 .irq_capable = 1, 346 .irq_capable = 1,
343 .name = "rcu" 347 .name = "rcu"
344}; 348};
345 349
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 350static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -602,8 +606,6 @@ static struct rcu_torture_ops sched_ops_sync = {
602 .name = "sched_sync" 606 .name = "sched_sync"
603}; 607};
604 608
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = { 609static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init, 610 .init = rcu_sync_torture_init,
609 .cleanup = NULL, 611 .cleanup = NULL,
@@ -638,14 +640,15 @@ rcu_torture_writer(void *arg)
638 640
639 do { 641 do {
640 schedule_timeout_uninterruptible(1); 642 schedule_timeout_uninterruptible(1);
641 if ((rp = rcu_torture_alloc()) == NULL) 643 rp = rcu_torture_alloc();
644 if (rp == NULL)
642 continue; 645 continue;
643 rp->rtort_pipe_count = 0; 646 rp->rtort_pipe_count = 0;
644 udelay(rcu_random(&rand) & 0x3ff); 647 udelay(rcu_random(&rand) & 0x3ff);
645 old_rp = rcu_torture_current; 648 old_rp = rcu_torture_current;
646 rp->rtort_mbtest = 1; 649 rp->rtort_mbtest = 1;
647 rcu_assign_pointer(rcu_torture_current, rp); 650 rcu_assign_pointer(rcu_torture_current, rp);
648 smp_wmb(); 651 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
649 if (old_rp) { 652 if (old_rp) {
650 i = old_rp->rtort_pipe_count; 653 i = old_rp->rtort_pipe_count;
651 if (i > RCU_TORTURE_PIPE_LEN) 654 if (i > RCU_TORTURE_PIPE_LEN)
@@ -1110,7 +1113,7 @@ rcu_torture_init(void)
1110 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1113 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1111 torture_type); 1114 torture_type);
1112 mutex_unlock(&fullstop_mutex); 1115 mutex_unlock(&fullstop_mutex);
1113 return (-EINVAL); 1116 return -EINVAL;
1114 } 1117 }
1115 if (cur_ops->init) 1118 if (cur_ops->init)
1116 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1119 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1161,7 +1164,7 @@ rcu_torture_init(void)
1161 goto unwind; 1164 goto unwind;
1162 } 1165 }
1163 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1166 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1164 GFP_KERNEL); 1167 GFP_KERNEL);
1165 if (fakewriter_tasks == NULL) { 1168 if (fakewriter_tasks == NULL) {
1166 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1169 VERBOSE_PRINTK_ERRSTRING("out of memory");
1167 firsterr = -ENOMEM; 1170 firsterr = -ENOMEM;
@@ -1170,7 +1173,7 @@ rcu_torture_init(void)
1170 for (i = 0; i < nfakewriters; i++) { 1173 for (i = 0; i < nfakewriters; i++) {
1171 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1174 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1172 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1175 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1173 "rcu_torture_fakewriter"); 1176 "rcu_torture_fakewriter");
1174 if (IS_ERR(fakewriter_tasks[i])) { 1177 if (IS_ERR(fakewriter_tasks[i])) {
1175 firsterr = PTR_ERR(fakewriter_tasks[i]); 1178 firsterr = PTR_ERR(fakewriter_tasks[i]);
1176 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1179 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
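
The reworked rcu_read_delay() above fires a long mdelay() about once every nrealreaders * 2000 * longdelay_ms reads and a short udelay() about once every nrealreaders * 2 * shortdelay_us reads. A quick user-space sketch, assuming four readers and the constants from the patch, makes those odds concrete:

/* delay_odds.c: how often the two rcutorture reader delays would fire. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const long shortdelay_us = 200;         /* value from the patch */
        const long longdelay_ms = 50;           /* value from the patch */
        const long nrealreaders = 4;            /* assumed reader count */
        const long iters = 10 * 1000 * 1000;
        long nshort = 0, nlong = 0;

        srandom(1);
        for (long i = 0; i < iters; i++) {
                if (!(random() % (nrealreaders * 2000 * longdelay_ms)))
                        nlong++;                /* would be mdelay(longdelay_ms) */
                if (!(random() % (nrealreaders * 2 * shortdelay_us)))
                        nshort++;               /* would be udelay(shortdelay_us) */
        }
        printf("long delays:  %ld of %ld reads (expect ~1 in %ld)\n",
               nlong, iters, nrealreaders * 2000 * longdelay_ms);
        printf("short delays: %ld of %ld reads (expect ~1 in %ld)\n",
               nshort, iters, nrealreaders * 2 * shortdelay_us);
        return 0;
}
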
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b11b07cfe7f..705f02ac7433 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -49,13 +49,6 @@
49 49
50#include "rcutree.h" 50#include "rcutree.h"
51 51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Data structures. */ 52/* Data structures. */
60 53
61#define RCU_STATE_INITIALIZER(name) { \ 54#define RCU_STATE_INITIALIZER(name) { \
@@ -70,6 +63,9 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
70 .gpnum = -300, \ 63 .gpnum = -300, \
71 .completed = -300, \ 64 .completed = -300, \
72 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 65 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
66 .orphan_cbs_list = NULL, \
67 .orphan_cbs_tail = &name.orphan_cbs_list, \
68 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 69 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 70 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 71 .n_force_qs_ngp = 0, \
@@ -81,24 +77,16 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 77struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 78DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 79
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100 80
101#include "rcutree_plugin.h" 81/*
82 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
83 * permit this function to be invoked without holding the root rcu_node
84 * structure's ->lock, but of course results can be subject to change.
85 */
86static int rcu_gp_in_progress(struct rcu_state *rsp)
87{
88 return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
89}
102 90
103/* 91/*
104 * Note a quiescent state. Because we do not need to know 92 * Note a quiescent state. Because we do not need to know
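
rcu_gp_in_progress() above folds the repeated ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) test into one helper: the two counters differ exactly while a grace period is in flight. A stand-alone sketch of that predicate, with a fake state structure:

/* gp_in_progress.c: the lockless "is a grace period running?" test. */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))        /* same idiom as the kernel */

struct fake_rcu_state {
        long gpnum;             /* number of the most recently started grace period */
        long completed;         /* number of the most recently completed grace period */
};

static int gp_in_progress(struct fake_rcu_state *rsp)
{
        /* The counters are equal only when the last started grace period has ended. */
        return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
}

int main(void)
{
        struct fake_rcu_state s = { .gpnum = -300, .completed = -300 };

        printf("idle:     %d\n", gp_in_progress(&s));   /* 0 */
        s.gpnum++;                      /* what rcu_start_gp() does */
        printf("started:  %d\n", gp_in_progress(&s));   /* 1 */
        s.completed = s.gpnum;          /* what cpu_quiet_msk_finish() does */
        printf("finished: %d\n", gp_in_progress(&s));   /* 0 */
        return 0;
}
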
@@ -107,27 +95,23 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
107 */ 95 */
108void rcu_sched_qs(int cpu) 96void rcu_sched_qs(int cpu)
109{ 97{
110 unsigned long flags;
111 struct rcu_data *rdp; 98 struct rcu_data *rdp;
112 99
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu); 100 rdp = &per_cpu(rcu_sched_data, cpu);
115 rdp->passed_quiesc = 1;
116 rdp->passed_quiesc_completed = rdp->completed; 101 rdp->passed_quiesc_completed = rdp->completed;
117 rcu_preempt_qs(cpu); 102 barrier();
118 local_irq_restore(flags); 103 rdp->passed_quiesc = 1;
104 rcu_preempt_note_context_switch(cpu);
119} 105}
120 106
121void rcu_bh_qs(int cpu) 107void rcu_bh_qs(int cpu)
122{ 108{
123 unsigned long flags;
124 struct rcu_data *rdp; 109 struct rcu_data *rdp;
125 110
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu); 111 rdp = &per_cpu(rcu_bh_data, cpu);
128 rdp->passed_quiesc = 1;
129 rdp->passed_quiesc_completed = rdp->completed; 112 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags); 113 barrier();
114 rdp->passed_quiesc = 1;
131} 115}
132 116
133#ifdef CONFIG_NO_HZ 117#ifdef CONFIG_NO_HZ
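
The reordered stores in rcu_sched_qs() and rcu_bh_qs() above write the ->passed_quiesc_completed snapshot first and only then set ->passed_quiesc, with barrier() keeping the compiler from swapping them — the usual publish-the-payload-then-set-the-flag shape. In the kernel a compiler barrier is enough here because both fields are per-CPU; the user-space analogy below needs release/acquire atomics to get the same ordering across threads (all names invented):

/* publish_flag.c: write the payload, then the flag, so an observer that sees
 * the flag set also sees the payload that goes with it.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static long passed_quiesc_completed;            /* payload (snapshot) */
static atomic_int passed_quiesc;                /* flag */

static void *reporter(void *arg)
{
        passed_quiesc_completed = 42;                   /* payload first */
        atomic_store_explicit(&passed_quiesc, 1,        /* then the flag */
                              memory_order_release);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, reporter, NULL);
        while (!atomic_load_explicit(&passed_quiesc, memory_order_acquire))
                ;                                       /* spin until the flag is set */
        /* Acquire pairs with release: the payload is guaranteed visible here. */
        printf("snapshot = %ld\n", passed_quiesc_completed);
        pthread_join(t, NULL);
        return 0;
}
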
@@ -141,6 +125,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */
141static int qhimark = 10000; /* If this many pending, ignore blimit. */ 125static int qhimark = 10000; /* If this many pending, ignore blimit. */
142static int qlowmark = 100; /* Once only this many pending, use blimit. */ 126static int qlowmark = 100; /* Once only this many pending, use blimit. */
143 127
128module_param(blimit, int, 0);
129module_param(qhimark, int, 0);
130module_param(qlowmark, int, 0);
131
144static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 132static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu); 133static int rcu_pending(int cpu);
146 134
@@ -177,9 +165,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
177static int 165static int
178cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 166cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
179{ 167{
180 /* ACCESS_ONCE() because we are accessing outside of lock. */ 168 return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
181 return *rdp->nxttail[RCU_DONE_TAIL] &&
182 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
183} 169}
184 170
185/* 171/*
@@ -373,7 +359,7 @@ static long dyntick_recall_completed(struct rcu_state *rsp)
373/* 359/*
374 * Snapshot the specified CPU's dynticks counter so that we can later 360 * Snapshot the specified CPU's dynticks counter so that we can later
375 * credit them with an implicit quiescent state. Return 1 if this CPU 361 * credit them with an implicit quiescent state. Return 1 if this CPU
376 * is already in a quiescent state courtesy of dynticks idle mode. 362 * is in dynticks idle mode, which is an extended quiescent state.
377 */ 363 */
378static int dyntick_save_progress_counter(struct rcu_data *rdp) 364static int dyntick_save_progress_counter(struct rcu_data *rdp)
379{ 365{
@@ -479,30 +465,34 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
479 long delta; 465 long delta;
480 unsigned long flags; 466 unsigned long flags;
481 struct rcu_node *rnp = rcu_get_root(rsp); 467 struct rcu_node *rnp = rcu_get_root(rsp);
482 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
483 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
484 468
485 /* Only let one CPU complain about others per time interval. */ 469 /* Only let one CPU complain about others per time interval. */
486 470
487 spin_lock_irqsave(&rnp->lock, flags); 471 spin_lock_irqsave(&rnp->lock, flags);
488 delta = jiffies - rsp->jiffies_stall; 472 delta = jiffies - rsp->jiffies_stall;
489 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) { 473 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
490 spin_unlock_irqrestore(&rnp->lock, flags); 474 spin_unlock_irqrestore(&rnp->lock, flags);
491 return; 475 return;
492 } 476 }
493 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 477 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
478
479 /*
480 * Now rat on any tasks that got kicked up to the root rcu_node
481 * due to CPU offlining.
482 */
483 rcu_print_task_stall(rnp);
494 spin_unlock_irqrestore(&rnp->lock, flags); 484 spin_unlock_irqrestore(&rnp->lock, flags);
495 485
496 /* OK, time to rat on our buddy... */ 486 /* OK, time to rat on our buddy... */
497 487
498 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 488 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
499 for (; rnp_cur < rnp_end; rnp_cur++) { 489 rcu_for_each_leaf_node(rsp, rnp) {
500 rcu_print_task_stall(rnp); 490 rcu_print_task_stall(rnp);
501 if (rnp_cur->qsmask == 0) 491 if (rnp->qsmask == 0)
502 continue; 492 continue;
503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 493 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
504 if (rnp_cur->qsmask & (1UL << cpu)) 494 if (rnp->qsmask & (1UL << cpu))
505 printk(" %d", rnp_cur->grplo + cpu); 495 printk(" %d", rnp->grplo + cpu);
506 } 496 }
507 printk(" (detected by %d, t=%ld jiffies)\n", 497 printk(" (detected by %d, t=%ld jiffies)\n",
508 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 498 smp_processor_id(), (long)(jiffies - rsp->gp_start));
@@ -541,8 +531,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
541 /* We haven't checked in, so go dump stack. */ 531 /* We haven't checked in, so go dump stack. */
542 print_cpu_stall(rsp); 532 print_cpu_stall(rsp);
543 533
544 } else if (rsp->gpnum != rsp->completed && 534 } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
545 delta >= RCU_STALL_RAT_DELAY) {
546 535
547 /* They had two time units to dump stack, so complain. */ 536 /* They had two time units to dump stack, so complain. */
548 print_other_cpu_stall(rsp); 537 print_other_cpu_stall(rsp);
@@ -605,8 +594,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
605{ 594{
606 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 595 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
607 struct rcu_node *rnp = rcu_get_root(rsp); 596 struct rcu_node *rnp = rcu_get_root(rsp);
608 struct rcu_node *rnp_cur;
609 struct rcu_node *rnp_end;
610 597
611 if (!cpu_needs_another_gp(rsp, rdp)) { 598 if (!cpu_needs_another_gp(rsp, rdp)) {
612 spin_unlock_irqrestore(&rnp->lock, flags); 599 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -615,6 +602,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
615 602
616 /* Advance to a new grace period and initialize state. */ 603 /* Advance to a new grace period and initialize state. */
617 rsp->gpnum++; 604 rsp->gpnum++;
605 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
618 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 606 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
619 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 607 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
620 record_gp_stall_check_time(rsp); 608 record_gp_stall_check_time(rsp);
@@ -622,16 +610,24 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
622 note_new_gpnum(rsp, rdp); 610 note_new_gpnum(rsp, rdp);
623 611
624 /* 612 /*
625 * Because we are first, we know that all our callbacks will 613 * Because this CPU just now started the new grace period, we know
626 * be covered by this upcoming grace period, even the ones 614 * that all of its callbacks will be covered by this upcoming grace
627 * that were registered arbitrarily recently. 615 * period, even the ones that were registered arbitrarily recently.
616 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
617 *
618 * Other CPUs cannot be sure exactly when the grace period started.
619 * Therefore, their recently registered callbacks must pass through
620 * an additional RCU_NEXT_READY stage, so that they will be handled
621 * by the next RCU grace period.
628 */ 622 */
629 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 623 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
630 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 624 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
631 625
632 /* Special-case the common single-level case. */ 626 /* Special-case the common single-level case. */
633 if (NUM_RCU_NODES == 1) { 627 if (NUM_RCU_NODES == 1) {
628 rcu_preempt_check_blocked_tasks(rnp);
634 rnp->qsmask = rnp->qsmaskinit; 629 rnp->qsmask = rnp->qsmaskinit;
630 rnp->gpnum = rsp->gpnum;
635 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 631 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
636 spin_unlock_irqrestore(&rnp->lock, flags); 632 spin_unlock_irqrestore(&rnp->lock, flags);
637 return; 633 return;
@@ -644,42 +640,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
644 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 640 spin_lock(&rsp->onofflock); /* irqs already disabled. */
645 641
646 /* 642 /*
647 * Set the quiescent-state-needed bits in all the non-leaf RCU 643 * Set the quiescent-state-needed bits in all the rcu_node
648 * nodes for all currently online CPUs. This operation relies 644 * structures for all currently online CPUs in breadth-first
649 * on the layout of the hierarchy within the rsp->node[] array. 645 * order, starting from the root rcu_node structure. This
650 * Note that other CPUs will access only the leaves of the 646 * operation relies on the layout of the hierarchy within the
651 * hierarchy, which still indicate that no grace period is in 647 * rsp->node[] array. Note that other CPUs will access only
652 * progress. In addition, we have excluded CPU-hotplug operations. 648 * the leaves of the hierarchy, which still indicate that no
653 * 649 * grace period is in progress, at least until the corresponding
654 * We therefore do not need to hold any locks. Any required 650 * leaf node has been initialized. In addition, we have excluded
655 * memory barriers will be supplied by the locks guarding the 651 * CPU-hotplug operations.
656 * leaf rcu_nodes in the hierarchy.
657 */
658
659 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
660 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
661 rnp_cur->qsmask = rnp_cur->qsmaskinit;
662
663 /*
664 * Now set up the leaf nodes. Here we must be careful. First,
665 * we need to hold the lock in order to exclude other CPUs, which
666 * might be contending for the leaf nodes' locks. Second, as
667 * soon as we initialize a given leaf node, its CPUs might run
668 * up the rest of the hierarchy. We must therefore acquire locks
669 * for each node that we touch during this stage. (But we still
670 * are excluding CPU-hotplug operations.)
671 * 652 *
672 * Note that the grace period cannot complete until we finish 653 * Note that the grace period cannot complete until we finish
673 * the initialization process, as there will be at least one 654 * the initialization process, as there will be at least one
674 * qsmask bit set in the root node until that time, namely the 655 * qsmask bit set in the root node until that time, namely the
675 * one corresponding to this CPU. 656 * one corresponding to this CPU, due to the fact that we have
657 * irqs disabled.
676 */ 658 */
677 rnp_end = &rsp->node[NUM_RCU_NODES]; 659 rcu_for_each_node_breadth_first(rsp, rnp) {
678 rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 660 spin_lock(&rnp->lock); /* irqs already disabled. */
679 for (; rnp_cur < rnp_end; rnp_cur++) { 661 rcu_preempt_check_blocked_tasks(rnp);
680 spin_lock(&rnp_cur->lock); /* irqs already disabled. */ 662 rnp->qsmask = rnp->qsmaskinit;
681 rnp_cur->qsmask = rnp_cur->qsmaskinit; 663 rnp->gpnum = rsp->gpnum;
682 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ 664 spin_unlock(&rnp->lock); /* irqs already disabled. */
683 } 665 }
684 666
685 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 667 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
@@ -720,8 +702,9 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it. 702 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */ 703 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) 704static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock) 705 __releases(rcu_get_root(rsp)->lock)
724{ 706{
707 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
725 rsp->completed = rsp->gpnum; 708 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); 709 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 710 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -739,6 +722,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
739 unsigned long flags) 722 unsigned long flags)
740 __releases(rnp->lock) 723 __releases(rnp->lock)
741{ 724{
725 struct rcu_node *rnp_c;
726
742 /* Walk up the rcu_node hierarchy. */ 727 /* Walk up the rcu_node hierarchy. */
743 for (;;) { 728 for (;;) {
744 if (!(rnp->qsmask & mask)) { 729 if (!(rnp->qsmask & mask)) {
@@ -762,8 +747,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
762 break; 747 break;
763 } 748 }
764 spin_unlock_irqrestore(&rnp->lock, flags); 749 spin_unlock_irqrestore(&rnp->lock, flags);
750 rnp_c = rnp;
765 rnp = rnp->parent; 751 rnp = rnp->parent;
766 spin_lock_irqsave(&rnp->lock, flags); 752 spin_lock_irqsave(&rnp->lock, flags);
753 WARN_ON_ONCE(rnp_c->qsmask);
767 } 754 }
768 755
769 /* 756 /*
@@ -776,10 +763,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
776 763
777/* 764/*
778 * Record a quiescent state for the specified CPU, which must either be 765 * Record a quiescent state for the specified CPU, which must either be
779 * the current CPU or an offline CPU. The lastcomp argument is used to 766 * the current CPU. The lastcomp argument is used to make sure we are
780 * make sure we are still in the grace period of interest. We don't want 767 * still in the grace period of interest. We don't want to end the current
781 * to end the current grace period based on quiescent states detected in 768 * grace period based on quiescent states detected in an earlier grace
782 * an earlier grace period! 769 * period!
783 */ 770 */
784static void 771static void
785cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 772cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -814,7 +801,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
814 * This GP can't end until cpu checks in, so all of our 801 * This GP can't end until cpu checks in, so all of our
815 * callbacks can be processed during the next GP. 802 * callbacks can be processed during the next GP.
816 */ 803 */
817 rdp = rsp->rda[smp_processor_id()];
818 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 804 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
819 805
820 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 806 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -855,24 +841,70 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
855#ifdef CONFIG_HOTPLUG_CPU 841#ifdef CONFIG_HOTPLUG_CPU
856 842
857/* 843/*
844 * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
845 * specified flavor of RCU. The callbacks will be adopted by the next
846 * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
847 * comes first. Because this is invoked from the CPU_DYING notifier,
848 * irqs are already disabled.
849 */
850static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
851{
852 int i;
853 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
854
855 if (rdp->nxtlist == NULL)
856 return; /* irqs disabled, so comparison is stable. */
857 spin_lock(&rsp->onofflock); /* irqs already disabled. */
858 *rsp->orphan_cbs_tail = rdp->nxtlist;
859 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
860 rdp->nxtlist = NULL;
861 for (i = 0; i < RCU_NEXT_SIZE; i++)
862 rdp->nxttail[i] = &rdp->nxtlist;
863 rsp->orphan_qlen += rdp->qlen;
864 rdp->qlen = 0;
865 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
866}
867
868/*
869 * Adopt previously orphaned RCU callbacks.
870 */
871static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
872{
873 unsigned long flags;
874 struct rcu_data *rdp;
875
876 spin_lock_irqsave(&rsp->onofflock, flags);
877 rdp = rsp->rda[smp_processor_id()];
878 if (rsp->orphan_cbs_list == NULL) {
879 spin_unlock_irqrestore(&rsp->onofflock, flags);
880 return;
881 }
882 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
883 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
884 rdp->qlen += rsp->orphan_qlen;
885 rsp->orphan_cbs_list = NULL;
886 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
887 rsp->orphan_qlen = 0;
888 spin_unlock_irqrestore(&rsp->onofflock, flags);
889}
890
891/*
858 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 892 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
859 * and move all callbacks from the outgoing CPU to the current one. 893 * and move all callbacks from the outgoing CPU to the current one.
860 */ 894 */
861static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 895static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
862{ 896{
863 int i;
864 unsigned long flags; 897 unsigned long flags;
865 long lastcomp; 898 long lastcomp;
866 unsigned long mask; 899 unsigned long mask;
867 struct rcu_data *rdp = rsp->rda[cpu]; 900 struct rcu_data *rdp = rsp->rda[cpu];
868 struct rcu_data *rdp_me;
869 struct rcu_node *rnp; 901 struct rcu_node *rnp;
870 902
871 /* Exclude any attempts to start a new grace period. */ 903 /* Exclude any attempts to start a new grace period. */
872 spin_lock_irqsave(&rsp->onofflock, flags); 904 spin_lock_irqsave(&rsp->onofflock, flags);
873 905
874 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 906 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
875 rnp = rdp->mynode; 907 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
876 mask = rdp->grpmask; /* rnp->grplo is constant. */ 908 mask = rdp->grpmask; /* rnp->grplo is constant. */
877 do { 909 do {
878 spin_lock(&rnp->lock); /* irqs already disabled. */ 910 spin_lock(&rnp->lock); /* irqs already disabled. */
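
rcu_send_cbs_to_orphanage() and rcu_adopt_orphan_cbs() above splice whole callback lists in O(1) by keeping a tail pointer-to-pointer alongside each list head, the same shape as ->orphan_cbs_list/->orphan_cbs_tail. A user-space sketch of that splice idiom, with invented names:

/* orphan_splice.c: appending one singly linked list to another in O(1). */
#include <stdio.h>

struct cb {
        struct cb *next;
        int id;
};

struct cb_list {
        struct cb *head;        /* like ->orphan_cbs_list or ->nxtlist */
        struct cb **tail;       /* like ->orphan_cbs_tail: &head when empty */
};

static void list_init(struct cb_list *l)
{
        l->head = NULL;
        l->tail = &l->head;
}

static void list_append(struct cb_list *l, struct cb *cb)
{
        cb->next = NULL;
        *l->tail = cb;          /* link after the current last element */
        l->tail = &cb->next;    /* remember where the next link goes */
}

/* Move everything from 'src' onto the end of 'dst', leaving 'src' empty. */
static void list_splice_tail(struct cb_list *dst, struct cb_list *src)
{
        if (src->head == NULL)
                return;
        *dst->tail = src->head;
        dst->tail = src->tail;
        list_init(src);
}

int main(void)
{
        struct cb_list orphans, mine;
        struct cb cbs[5];

        list_init(&orphans);
        list_init(&mine);
        for (int i = 0; i < 3; i++) {
                cbs[i].id = i;
                list_append(&orphans, &cbs[i]); /* "dying CPU" hands these off */
        }
        for (int i = 3; i < 5; i++) {
                cbs[i].id = i;
                list_append(&mine, &cbs[i]);    /* surviving CPU's own callbacks */
        }
        list_splice_tail(&mine, &orphans);      /* rcu_adopt_orphan_cbs() analogy */
        for (struct cb *p = mine.head; p; p = p->next)
                printf("cb %d\n", p->id);
        return 0;
}
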
@@ -881,42 +913,16 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
881 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 913 spin_unlock(&rnp->lock); /* irqs remain disabled. */
882 break; 914 break;
883 } 915 }
884 rcu_preempt_offline_tasks(rsp, rnp); 916 rcu_preempt_offline_tasks(rsp, rnp, rdp);
885 mask = rnp->grpmask; 917 mask = rnp->grpmask;
886 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 918 spin_unlock(&rnp->lock); /* irqs remain disabled. */
887 rnp = rnp->parent; 919 rnp = rnp->parent;
888 } while (rnp != NULL); 920 } while (rnp != NULL);
889 lastcomp = rsp->completed; 921 lastcomp = rsp->completed;
890 922
891 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 923 spin_unlock_irqrestore(&rsp->onofflock, flags);
892
893 /* Being offline is a quiescent state, so go record it. */
894 cpu_quiet(cpu, rsp, rdp, lastcomp);
895 924
896 /* 925 rcu_adopt_orphan_cbs(rsp);
897 * Move callbacks from the outgoing CPU to the running CPU.
898 * Note that the outgoing CPU is now quiescent, so it is now
899 * (uncharacteristically) safe to access its rcu_data structure.
900 * Note also that we must carefully retain the order of the
901 * outgoing CPU's callbacks in order for rcu_barrier() to work
902 * correctly. Finally, note that we start all the callbacks
903 * afresh, even those that have passed through a grace period
904 * and are therefore ready to invoke. The theory is that hotplug
905 * events are rare, and that if they are frequent enough to
906 * indefinitely delay callbacks, you have far worse things to
907 * be worrying about.
908 */
909 rdp_me = rsp->rda[smp_processor_id()];
910 if (rdp->nxtlist != NULL) {
911 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
912 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
913 rdp->nxtlist = NULL;
914 for (i = 0; i < RCU_NEXT_SIZE; i++)
915 rdp->nxttail[i] = &rdp->nxtlist;
916 rdp_me->qlen += rdp->qlen;
917 rdp->qlen = 0;
918 }
919 local_irq_restore(flags);
920} 926}
921 927
922/* 928/*
@@ -934,6 +940,14 @@ static void rcu_offline_cpu(int cpu)
934 940
935#else /* #ifdef CONFIG_HOTPLUG_CPU */ 941#else /* #ifdef CONFIG_HOTPLUG_CPU */
936 942
943static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
944{
945}
946
947static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
948{
949}
950
937static void rcu_offline_cpu(int cpu) 951static void rcu_offline_cpu(int cpu)
938{ 952{
939} 953}
@@ -1066,33 +1080,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1066 int cpu; 1080 int cpu;
1067 unsigned long flags; 1081 unsigned long flags;
1068 unsigned long mask; 1082 unsigned long mask;
1069 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 1083 struct rcu_node *rnp;
1070 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
1071 1084
1072 for (; rnp_cur < rnp_end; rnp_cur++) { 1085 rcu_for_each_leaf_node(rsp, rnp) {
1073 mask = 0; 1086 mask = 0;
1074 spin_lock_irqsave(&rnp_cur->lock, flags); 1087 spin_lock_irqsave(&rnp->lock, flags);
1075 if (rsp->completed != lastcomp) { 1088 if (rsp->completed != lastcomp) {
1076 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1089 spin_unlock_irqrestore(&rnp->lock, flags);
1077 return 1; 1090 return 1;
1078 } 1091 }
1079 if (rnp_cur->qsmask == 0) { 1092 if (rnp->qsmask == 0) {
1080 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1093 spin_unlock_irqrestore(&rnp->lock, flags);
1081 continue; 1094 continue;
1082 } 1095 }
1083 cpu = rnp_cur->grplo; 1096 cpu = rnp->grplo;
1084 bit = 1; 1097 bit = 1;
1085 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { 1098 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1086 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1099 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1087 mask |= bit; 1100 mask |= bit;
1088 } 1101 }
1089 if (mask != 0 && rsp->completed == lastcomp) { 1102 if (mask != 0 && rsp->completed == lastcomp) {
1090 1103
1091 /* cpu_quiet_msk() releases rnp_cur->lock. */ 1104 /* cpu_quiet_msk() releases rnp->lock. */
1092 cpu_quiet_msk(mask, rsp, rnp_cur, flags); 1105 cpu_quiet_msk(mask, rsp, rnp, flags);
1093 continue; 1106 continue;
1094 } 1107 }
1095 spin_unlock_irqrestore(&rnp_cur->lock, flags); 1108 spin_unlock_irqrestore(&rnp->lock, flags);
1096 } 1109 }
1097 return 0; 1110 return 0;
1098} 1111}
@@ -1108,7 +1121,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1108 struct rcu_node *rnp = rcu_get_root(rsp); 1121 struct rcu_node *rnp = rcu_get_root(rsp);
1109 u8 signaled; 1122 u8 signaled;
1110 1123
1111 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) 1124 if (!rcu_gp_in_progress(rsp))
1112 return; /* No grace period in progress, nothing to force. */ 1125 return; /* No grace period in progress, nothing to force. */
1113 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1126 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1114 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1127 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
@@ -1267,7 +1280,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1267 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1280 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1268 1281
1269 /* Start a new grace period if one not already started. */ 1282 /* Start a new grace period if one not already started. */
1270 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) { 1283 if (!rcu_gp_in_progress(rsp)) {
1271 unsigned long nestflag; 1284 unsigned long nestflag;
1272 struct rcu_node *rnp_root = rcu_get_root(rsp); 1285 struct rcu_node *rnp_root = rcu_get_root(rsp);
1273 1286
@@ -1347,7 +1360,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1347 } 1360 }
1348 1361
1349 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1362 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1350 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1363 if (rcu_gp_in_progress(rsp) &&
1351 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1364 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
1352 rdp->n_rp_need_fqs++; 1365 rdp->n_rp_need_fqs++;
1353 return 1; 1366 return 1;
@@ -1384,6 +1397,82 @@ int rcu_needs_cpu(int cpu)
1384 rcu_preempt_needs_cpu(cpu); 1397 rcu_preempt_needs_cpu(cpu);
1385} 1398}
1386 1399
1400static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1401static atomic_t rcu_barrier_cpu_count;
1402static DEFINE_MUTEX(rcu_barrier_mutex);
1403static struct completion rcu_barrier_completion;
1404
1405static void rcu_barrier_callback(struct rcu_head *notused)
1406{
1407 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1408 complete(&rcu_barrier_completion);
1409}
1410
1411/*
1412 * Called with preemption disabled, and from cross-cpu IRQ context.
1413 */
1414static void rcu_barrier_func(void *type)
1415{
1416 int cpu = smp_processor_id();
1417 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
1418 void (*call_rcu_func)(struct rcu_head *head,
1419 void (*func)(struct rcu_head *head));
1420
1421 atomic_inc(&rcu_barrier_cpu_count);
1422 call_rcu_func = type;
1423 call_rcu_func(head, rcu_barrier_callback);
1424}
1425
1426/*
1427 * Orchestrate the specified type of RCU barrier, waiting for all
1428 * RCU callbacks of the specified type to complete.
1429 */
1430static void _rcu_barrier(struct rcu_state *rsp,
1431 void (*call_rcu_func)(struct rcu_head *head,
1432 void (*func)(struct rcu_head *head)))
1433{
1434 BUG_ON(in_interrupt());
1435 /* Take mutex to serialize concurrent rcu_barrier() requests. */
1436 mutex_lock(&rcu_barrier_mutex);
1437 init_completion(&rcu_barrier_completion);
1438 /*
1439 * Initialize rcu_barrier_cpu_count to 1, then invoke
1440 * rcu_barrier_func() on each CPU, so that each CPU also has
1441 * incremented rcu_barrier_cpu_count. Only then is it safe to
1442 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
1443 * might complete its grace period before all of the other CPUs
1444 * did their increment, causing this function to return too
1445 * early.
1446 */
1447 atomic_set(&rcu_barrier_cpu_count, 1);
1448 preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
1449 rcu_adopt_orphan_cbs(rsp);
1450 on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
1451 preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
1452 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
1453 complete(&rcu_barrier_completion);
1454 wait_for_completion(&rcu_barrier_completion);
1455 mutex_unlock(&rcu_barrier_mutex);
1456}
1457
1458/**
1459 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
1460 */
1461void rcu_barrier_bh(void)
1462{
1463 _rcu_barrier(&rcu_bh_state, call_rcu_bh);
1464}
1465EXPORT_SYMBOL_GPL(rcu_barrier_bh);
1466
1467/**
1468 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
1469 */
1470void rcu_barrier_sched(void)
1471{
1472 _rcu_barrier(&rcu_sched_state, call_rcu_sched);
1473}
1474EXPORT_SYMBOL_GPL(rcu_barrier_sched);
1475
1387/* 1476/*
1388 * Do boot-time initialization of a CPU's per-CPU RCU data. 1477 * Do boot-time initialization of a CPU's per-CPU RCU data.
1389 */ 1478 */
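
rcu_barrier_func() above receives its call_rcu variant through on_each_cpu()'s void * argument and converts it back to a function pointer before use. A user-space sketch of that hand-off — casting between void * and function pointers is implementation-defined in ISO C, but it is the idiom relied on here; all names below are invented:

/* fnptr_arg.c: smuggling a function pointer through a void * callback argument. */
#include <stdio.h>

typedef void (*enqueue_fn)(int item);

static void enqueue_fast(int item) { printf("fast  %d\n", item); }
static void enqueue_slow(int item) { printf("slow  %d\n", item); }

/* Stands in for on_each_cpu(): runs 'func(info)' once per "CPU". */
static void for_each_cpu_run(void (*func)(void *info), void *info, int ncpus)
{
        for (int cpu = 0; cpu < ncpus; cpu++)
                func(info);
}

/* Stands in for rcu_barrier_func(): recover the function pointer and call it. */
static void barrier_func(void *type)
{
        enqueue_fn enqueue = (enqueue_fn)type;

        enqueue(42);
}

int main(void)
{
        for_each_cpu_run(barrier_func, (void *)enqueue_fast, 2);
        for_each_cpu_run(barrier_func, (void *)enqueue_slow, 2);
        return 0;
}
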
@@ -1457,20 +1546,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1457 rnp = rnp->parent; 1546 rnp = rnp->parent;
1458 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1547 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1459 1548
1460 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1549 spin_unlock_irqrestore(&rsp->onofflock, flags);
1461
1462 /*
1463 * A new grace period might start here. If so, we will be part of
1464 * it, and its gpnum will be greater than ours, so we will
1465 * participate. It is also possible for the gpnum to have been
1466 * incremented before this function was called, and the bitmasks
1467 * to not be filled out until now, in which case we will also
1468 * participate due to our gpnum being behind.
1469 */
1470
1471 /* Since it is coming online, the CPU is in a quiescent state. */
1472 cpu_quiet(cpu, rsp, rdp, lastcomp);
1473 local_irq_restore(flags);
1474} 1550}
1475 1551
1476static void __cpuinit rcu_online_cpu(int cpu) 1552static void __cpuinit rcu_online_cpu(int cpu)
@@ -1493,6 +1569,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1493 case CPU_UP_PREPARE_FROZEN: 1569 case CPU_UP_PREPARE_FROZEN:
1494 rcu_online_cpu(cpu); 1570 rcu_online_cpu(cpu);
1495 break; 1571 break;
1572 case CPU_DYING:
1573 case CPU_DYING_FROZEN:
1574 /*
1575 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
1576 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
1577 * returns, all online cpus have queued rcu_barrier_func().
1578 * The dying CPU clears its cpu_online_mask bit and
1579 * moves all of its RCU callbacks to ->orphan_cbs_list
1580 * in the context of stop_machine(), so subsequent calls
1581 * to _rcu_barrier() will adopt these callbacks and only
1582 * then queue rcu_barrier_func() on all remaining CPUs.
1583 */
1584 rcu_send_cbs_to_orphanage(&rcu_bh_state);
1585 rcu_send_cbs_to_orphanage(&rcu_sched_state);
1586 rcu_preempt_send_cbs_to_orphanage();
1587 break;
1496 case CPU_DEAD: 1588 case CPU_DEAD:
1497 case CPU_DEAD_FROZEN: 1589 case CPU_DEAD_FROZEN:
1498 case CPU_UP_CANCELED: 1590 case CPU_UP_CANCELED:
@@ -1555,7 +1647,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1555 cpustride *= rsp->levelspread[i]; 1647 cpustride *= rsp->levelspread[i];
1556 rnp = rsp->level[i]; 1648 rnp = rsp->level[i];
1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1649 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1558 spin_lock_init(&rnp->lock); 1650 if (rnp != rcu_get_root(rsp))
1651 spin_lock_init(&rnp->lock);
1559 rnp->gpnum = 0; 1652 rnp->gpnum = 0;
1560 rnp->qsmask = 0; 1653 rnp->qsmask = 0;
1561 rnp->qsmaskinit = 0; 1654 rnp->qsmaskinit = 0;
@@ -1578,6 +1671,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]); 1671 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1579 } 1672 }
1580 } 1673 }
1674 spin_lock_init(&rcu_get_root(rsp)->lock);
1581} 1675}
1582 1676
1583/* 1677/*
@@ -1587,6 +1681,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1587 */ 1681 */
1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1682#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1589do { \ 1683do { \
1684 int i; \
1685 int j; \
1686 struct rcu_node *rnp; \
1687 \
1590 rcu_init_one(rsp); \ 1688 rcu_init_one(rsp); \
1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1689 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1592 j = 0; \ 1690 j = 0; \
@@ -1599,31 +1697,8 @@ do { \
1599 } \ 1697 } \
1600} while (0) 1698} while (0)
1601 1699
1602#ifdef CONFIG_TREE_PREEMPT_RCU
1603
1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{
1617}
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1620
1621void __init __rcu_init(void) 1700void __init __rcu_init(void)
1622{ 1701{
1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1624 int j;
1625 struct rcu_node *rnp;
1626
1627 rcu_bootup_announce(); 1702 rcu_bootup_announce();
1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1703#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1704 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
@@ -1634,6 +1709,4 @@ void __init __rcu_init(void)
1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1709 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1635} 1710}
1636 1711
1637module_param(blimit, int, 0); 1712#include "rcutree_plugin.h"
1638module_param(qhimark, int, 0);
1639module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index bf8a6f9f134d..b40ac5706040 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -48,14 +48,14 @@
48#elif NR_CPUS <= RCU_FANOUT_SQ 48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2 49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1 50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) 51# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS) 52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0 53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE 54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3 55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1 56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) 57# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) 58# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
59# define NUM_RCU_LVL_3 NR_CPUS 59# define NUM_RCU_LVL_3 NR_CPUS
60#else 60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
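
The DIV_ROUND_UP() form adopted above is plain ceiling division, identical to the open-coded (n + d - 1) / d it replaces. A small sketch, assuming the usual 64-way fan-out as an example:

/* div_round_up.c: sizing the per-level node counts with ceiling division. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d)) /* as in linux/kernel.h */

int main(void)
{
        const int RCU_FANOUT = 64;                      /* example fan-out */
        const int RCU_FANOUT_SQ = RCU_FANOUT * RCU_FANOUT;

        for (int nr_cpus = 1; nr_cpus <= 4096; nr_cpus *= 4)
                printf("NR_CPUS=%4d  lvl1=%2d  lvl2=%3d\n", nr_cpus,
                       DIV_ROUND_UP(nr_cpus, RCU_FANOUT_SQ),
                       DIV_ROUND_UP(nr_cpus, RCU_FANOUT));
        return 0;
}
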
@@ -79,15 +79,21 @@ struct rcu_dynticks {
79 * Definition for node within the RCU grace-period-detection hierarchy. 79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */ 80 */
81struct rcu_node { 81struct rcu_node {
82 spinlock_t lock; 82 spinlock_t lock; /* Root rcu_node's lock protects some */
83 /* rcu_state fields as well as following. */
83 long gpnum; /* Current grace period for this node. */ 84 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */ 85 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */ 86 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */ 87 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/ 88 /* order for current grace period to proceed.*/
89 /* In leaf rcu_node, each bit corresponds to */
90 /* an rcu_data structure, otherwise, each */
91 /* bit corresponds to a child rcu_node */
92 /* structure. */
88 unsigned long qsmaskinit; 93 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */ 94 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 95 unsigned long grpmask; /* Mask to apply to parent qsmask. */
96 /* Only one bit will be set in this mask. */
91 int grplo; /* lowest-numbered CPU or group here. */ 97 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */ 98 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */ 99 u8 grpnum; /* CPU/group number for next level up. */
@@ -95,8 +101,23 @@ struct rcu_node {
95 struct rcu_node *parent; 101 struct rcu_node *parent;
96 struct list_head blocked_tasks[2]; 102 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */ 103 /* Tasks blocked in RCU read-side critsect. */
104 /* Grace period number (->gpnum) x blocked */
105 /* by tasks on the (x & 0x1) element of the */
106 /* blocked_tasks[] array. */
98} ____cacheline_internodealigned_in_smp; 107} ____cacheline_internodealigned_in_smp;
99 108
109/*
110 * Do a full breadth-first scan of the rcu_node structures for the
111 * specified rcu_state structure.
112 */
113#define rcu_for_each_node_breadth_first(rsp, rnp) \
114 for ((rnp) = &(rsp)->node[0]; \
115 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
116
117#define rcu_for_each_leaf_node(rsp, rnp) \
118 for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
119 (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
120
100/* Index values for nxttail array in struct rcu_data. */ 121/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ 122#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ 123#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
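
The two iteration macros added above work because the rcu_node tree is stored level by level in the flat node[] array: a linear scan of the whole array is therefore a breadth-first walk, and level[NUM_RCU_LVLS - 1] points at the first leaf, so scanning from there to the end touches only leaves. A toy two-level example:

/* node_walks.c: breadth-first and leaf-only walks over a level-ordered array. */
#include <stdio.h>

#define NUM_LVLS        2
#define NUM_LVL_0       1
#define NUM_LVL_1       4
#define NUM_NODES       (NUM_LVL_0 + NUM_LVL_1)

struct node {
        int grplo, grphi;       /* CPU range covered, as in struct rcu_node */
};

int main(void)
{
        struct node node[NUM_NODES] = {
                { 0, 15 },                                      /* root covers everything */
                { 0, 3 }, { 4, 7 }, { 8, 11 }, { 12, 15 }       /* leaves */
        };
        struct node *level[NUM_LVLS] = { &node[0], &node[NUM_LVL_0] };
        struct node *np;

        /* rcu_for_each_node_breadth_first(): root first, then each leaf. */
        for (np = &node[0]; np < &node[NUM_NODES]; np++)
                printf("all : CPUs %2d-%2d\n", np->grplo, np->grphi);

        /* rcu_for_each_leaf_node(): skip straight to the last level. */
        for (np = level[NUM_LVLS - 1]; np < &node[NUM_NODES]; np++)
                printf("leaf: CPUs %2d-%2d\n", np->grplo, np->grphi);
        return 0;
}
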
@@ -126,23 +147,26 @@ struct rcu_data {
126 * Any of the partitions might be empty, in which case the 147 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for 148 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of 149 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL. 150 * the nxttail elements point to the ->nxtlist pointer itself,
151 * which in that case is NULL.
130 * 152 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]): 153 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed 154 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and 155 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved 156 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks(). 157 * here temporarily in rcu_process_callbacks().
158 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
159 * Entries that batch # <= ->completed - 1: waiting for current GP
160 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
161 * Entries known to have arrived before current GP ended
162 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
163 * Entries that might have arrived after current GP ended
164 * Note that the value of *nxttail[RCU_NEXT_TAIL] will
165 * always be NULL, as this is the end of the list.
142 */ 166 */
143 struct rcu_head *nxtlist; 167 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 168 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */ 169 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */ 170 long blimit; /* Upper limit on a processed batch */
147 171
148#ifdef CONFIG_NO_HZ 172#ifdef CONFIG_NO_HZ
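
The reordered comment above describes the segmented callback list: ->nxttail[i] points at the ->next pointer (or the list head) that terminates segment i, so an empty segment simply shares its tail with the following one and *nxttail[RCU_NEXT_TAIL] is always NULL. A toy model of the enqueue and of the advance that rcu_start_gp() performs — the names mirror the kernel's, but this is only an illustration:

/* nxttail_model.c: a toy model of the four-segment callback list. */
#include <stdio.h>

enum { DONE_TAIL, WAIT_TAIL, NEXT_READY_TAIL, NEXT_TAIL, NEXT_SIZE };

struct cb {
        struct cb *next;
        int id;
};

static struct cb *nxtlist;
static struct cb **nxttail[NEXT_SIZE];

static void init(void)
{
        nxtlist = NULL;
        for (int i = 0; i < NEXT_SIZE; i++)
                nxttail[i] = &nxtlist;          /* all segments empty */
}

/* __call_rcu(): new callbacks always land in the NEXT segment. */
static void enqueue(struct cb *cb)
{
        cb->next = NULL;
        *nxttail[NEXT_TAIL] = cb;
        nxttail[NEXT_TAIL] = &cb->next;
}

/* What rcu_start_gp() does for the CPU starting the grace period: every
 * callback already queued here is covered by the new grace period.
 */
static void start_gp_advance(void)
{
        nxttail[NEXT_READY_TAIL] = nxttail[NEXT_TAIL];
        nxttail[WAIT_TAIL] = nxttail[NEXT_TAIL];
}

static void show(const char *when)
{
        int n = 0;

        for (struct cb *p = nxtlist; p; p = p->next)
                n++;
        printf("%-12s total=%d  wait-segment-empty=%d\n", when, n,
               nxttail[DONE_TAIL] == nxttail[WAIT_TAIL]);
}

int main(void)
{
        struct cb cbs[3];

        init();
        show("empty");
        for (int i = 0; i < 3; i++) {
                cbs[i].id = i;
                enqueue(&cbs[i]);
        }
        show("enqueued");               /* all three sit in the NEXT segment */
        start_gp_advance();
        show("gp started");             /* now they wait for this grace period */
        return 0;
}
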
@@ -216,8 +240,19 @@ struct rcu_state {
216 /* Force QS state. */ 240 /* Force QS state. */
217 long gpnum; /* Current gp number. */ 241 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */ 242 long completed; /* # of last completed gp. */
243
244 /* End of fields guarded by root rcu_node's lock. */
245
219 spinlock_t onofflock; /* exclude on/offline and */ 246 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */ 247 /* starting new GP. Also */
248 /* protects the following */
249 /* orphan_cbs fields. */
250 struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */
251 /* orphaned by all CPUs in */
252 /* a given leaf rcu_node */
253 /* going offline. */
254 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
255 long orphan_qlen; /* Number of orphaned cbs. */
221 spinlock_t fqslock; /* Only one task forcing */ 256 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */ 257 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */ 258 unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -255,5 +290,30 @@ extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 290DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 291#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257 292
258#endif /* #ifdef RCU_TREE_NONCORE */ 293#else /* #ifdef RCU_TREE_NONCORE */
294
295/* Forward declarations for rcutree_plugin.h */
296static inline void rcu_bootup_announce(void);
297long rcu_batches_completed(void);
298static void rcu_preempt_note_context_switch(int cpu);
299static int rcu_preempted_readers(struct rcu_node *rnp);
300#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
301static void rcu_print_task_stall(struct rcu_node *rnp);
302#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
303static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
304#ifdef CONFIG_HOTPLUG_CPU
305static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
306 struct rcu_node *rnp,
307 struct rcu_data *rdp);
308static void rcu_preempt_offline_cpu(int cpu);
309#endif /* #ifdef CONFIG_HOTPLUG_CPU */
310static void rcu_preempt_check_callbacks(int cpu);
311static void rcu_preempt_process_callbacks(void);
312void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
313static int rcu_preempt_pending(int cpu);
314static int rcu_preempt_needs_cpu(int cpu);
315static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
316static void rcu_preempt_send_cbs_to_orphanage(void);
317static void __init __rcu_init_preempt(void);
259 318
319#endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 47789369ea59..c0cb783aa16a 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -64,22 +64,31 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
64 * not in a quiescent state. There might be any number of tasks blocked 64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section. 65 * while in an RCU read-side critical section.
66 */ 66 */
67static void rcu_preempt_qs_record(int cpu) 67static void rcu_preempt_qs(int cpu)
68{ 68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed; 70 rdp->passed_quiesc_completed = rdp->completed;
71 barrier();
72 rdp->passed_quiesc = 1;
72} 73}
73 74
74/* 75/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd. 76 * We have entered the scheduler, and the current task might soon be
76 * If we are in an RCU read-side critical section, we need to reflect 77 * context-switched away from. If this task is in an RCU read-side
77 * that in the state of the rcu_node structure corresponding to this CPU. 78 * critical section, we will no longer be able to rely on the CPU to
78 * Caller must disable hardirqs. 79 * record that fact, so we enqueue the task on the appropriate entry
80 * of the blocked_tasks[] array. The task will dequeue itself when
81 * it exits the outermost enclosing RCU read-side critical section.
82 * Therefore, the current grace period cannot be permitted to complete
83 * until the blocked_tasks[] entry indexed by the low-order bit of
84 * rnp->gpnum empties.
85 *
86 * Caller must disable preemption.
79 */ 87 */
80static void rcu_preempt_qs(int cpu) 88static void rcu_preempt_note_context_switch(int cpu)
81{ 89{
82 struct task_struct *t = current; 90 struct task_struct *t = current;
91 unsigned long flags;
83 int phase; 92 int phase;
84 struct rcu_data *rdp; 93 struct rcu_data *rdp;
85 struct rcu_node *rnp; 94 struct rcu_node *rnp;
@@ -90,7 +99,7 @@ static void rcu_preempt_qs(int cpu)
90 /* Possibly blocking in an RCU read-side critical section. */ 99 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu]; 100 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode; 101 rnp = rdp->mynode;
93 spin_lock(&rnp->lock); 102 spin_lock_irqsave(&rnp->lock, flags);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp; 104 t->rcu_blocked_node = rnp;
96 105
@@ -103,11 +112,15 @@ static void rcu_preempt_qs(int cpu)
103 * state for the current grace period), then as long 112 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period 113 * as that task remains queued, the current grace period
105 * cannot end. 114 * cannot end.
115 *
116 * But first, note that the current CPU must still be
117 * on line!
106 */ 118 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1); 119 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
120 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
121 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */ 123 spin_unlock_irqrestore(&rnp->lock, flags);
110 spin_unlock(&rnp->lock);
111 } 124 }
112 125
113 /* 126 /*
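
The new phase computation above decides which blocked_tasks[] list a preempted reader joins: if the CPU's bit is still set in rnp->qsmask, the task must block the current grace period (index gpnum & 1); otherwise it can only affect the next one ((gpnum + 1) & 1). A toy check of both cases, with made-up mask values:

/* blocked_phase.c: which blocked_tasks[] list a preempted reader lands on. */
#include <stdio.h>

static int blocked_phase(long gpnum, unsigned long qsmask, unsigned long grpmask)
{
        return (gpnum + !(qsmask & grpmask)) & 0x1;     /* as in the patch */
}

int main(void)
{
        long gpnum = 7;                 /* current grace-period number (odd here) */
        unsigned long grpmask = 0x4;    /* this CPU's bit in its leaf rcu_node */

        /* CPU has not yet passed a quiescent state: block the current GP. */
        printf("still owes QS -> list %d (gpnum & 1 = %ld)\n",
               blocked_phase(gpnum, 0x6, grpmask), gpnum & 1);

        /* CPU already checked in: the task can only block the next GP. */
        printf("already quiet -> list %d ((gpnum + 1) & 1 = %ld)\n",
               blocked_phase(gpnum, 0x2, grpmask), (gpnum + 1) & 1);
        return 0;
}
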
@@ -119,9 +132,10 @@ static void rcu_preempt_qs(int cpu)
119 * grace period, then the fact that the task has been enqueued 132 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period. 133 * means that we continue to block the current grace period.
121 */ 134 */
122 rcu_preempt_qs_record(cpu); 135 rcu_preempt_qs(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS | 136 local_irq_save(flags);
124 RCU_READ_UNLOCK_GOT_QS); 137 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
138 local_irq_restore(flags);
125} 139}
126 140
127/* 141/*
@@ -136,6 +150,16 @@ void __rcu_read_lock(void)
136} 150}
137EXPORT_SYMBOL_GPL(__rcu_read_lock); 151EXPORT_SYMBOL_GPL(__rcu_read_lock);
138 152
153/*
154 * Check for preempted RCU readers blocking the current grace period
155 * for the specified rcu_node structure. If the caller needs a reliable
156 * answer, it must hold the rcu_node's ->lock.
157 */
158static int rcu_preempted_readers(struct rcu_node *rnp)
159{
160 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
161}
162
139static void rcu_read_unlock_special(struct task_struct *t) 163static void rcu_read_unlock_special(struct task_struct *t)
140{ 164{
141 int empty; 165 int empty;
@@ -157,7 +181,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
157 special = t->rcu_read_unlock_special; 181 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) { 182 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 183 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS; 184 rcu_preempt_qs(smp_processor_id());
161 } 185 }
162 186
163 /* Hardware IRQ handlers cannot block. */ 187 /* Hardware IRQ handlers cannot block. */
@@ -177,12 +201,12 @@ static void rcu_read_unlock_special(struct task_struct *t)
177 */ 201 */
178 for (;;) { 202 for (;;) {
179 rnp = t->rcu_blocked_node; 203 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock); 204 spin_lock(&rnp->lock); /* irqs already disabled. */
181 if (rnp == t->rcu_blocked_node) 205 if (rnp == t->rcu_blocked_node)
182 break; 206 break;
183 spin_unlock(&rnp->lock); 207 spin_unlock(&rnp->lock); /* irqs remain disabled. */
184 } 208 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 209 empty = !rcu_preempted_readers(rnp);
186 list_del_init(&t->rcu_node_entry); 210 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL; 211 t->rcu_blocked_node = NULL;
188 212
@@ -193,10 +217,9 @@ static void rcu_read_unlock_special(struct task_struct *t)
193 * drop rnp->lock and restore irq. 217 * drop rnp->lock and restore irq.
194 */ 218 */
195 if (!empty && rnp->qsmask == 0 && 219 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) { 220 !rcu_preempted_readers(rnp)) {
197 t->rcu_read_unlock_special &= 221 struct rcu_node *rnp_p;
198 ~(RCU_READ_UNLOCK_NEED_QS | 222
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) { 223 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */ 224 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags); 225 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
@@ -205,9 +228,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
205 /* Report up the rest of the hierarchy. */ 228 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask; 229 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags); 230 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent; 231 rnp_p = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags); 232 spin_lock_irqsave(&rnp_p->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags); 233 WARN_ON_ONCE(rnp->qsmask);
234 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
211 return; 235 return;
212 } 236 }
213 spin_unlock(&rnp->lock); 237 spin_unlock(&rnp->lock);
@@ -243,12 +267,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
243{ 267{
244 unsigned long flags; 268 unsigned long flags;
245 struct list_head *lp; 269 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1; 270 int phase;
247 struct task_struct *t; 271 struct task_struct *t;
248 272
249 if (!list_empty(&rnp->blocked_tasks[phase])) { 273 if (rcu_preempted_readers(rnp)) {
250 spin_lock_irqsave(&rnp->lock, flags); 274 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */ 275 phase = rnp->gpnum & 0x1;
252 lp = &rnp->blocked_tasks[phase]; 276 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry) 277 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid); 278 printk(" P%d", t->pid);
@@ -259,13 +283,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 283#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260 284
261/* 285/*
262 * Check for preempted RCU readers for the specified rcu_node structure. 286 * Check that the list of blocked tasks for the newly completed grace
263 * If the caller needs a reliable answer, it must hold the rcu_node's 287 * period is in fact empty. It is a serious bug to complete a grace
 264 * ->lock. 288 * period that still has RCU readers blocked! This function must be
289 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
290 * must be held by the caller.
265 */ 291 */
266static int rcu_preempted_readers(struct rcu_node *rnp) 292static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
267{ 293{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]); 294 WARN_ON_ONCE(rcu_preempted_readers(rnp));
295 WARN_ON_ONCE(rnp->qsmask);
269} 296}
270 297
271#ifdef CONFIG_HOTPLUG_CPU 298#ifdef CONFIG_HOTPLUG_CPU
@@ -280,7 +307,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
280 * The caller must hold rnp->lock with irqs disabled. 307 * The caller must hold rnp->lock with irqs disabled.
281 */ 308 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 309static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp) 310 struct rcu_node *rnp,
311 struct rcu_data *rdp)
284{ 312{
285 int i; 313 int i;
286 struct list_head *lp; 314 struct list_head *lp;
@@ -292,6 +320,9 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
292 WARN_ONCE(1, "Last CPU thought to be offlined?"); 320 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */ 321 return; /* Shouldn't happen: at least one CPU online. */
294 } 322 }
323 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1])));
295 326
296 /* 327 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the 328 * Move tasks up to root rcu_node. Rely on the fact that the
@@ -335,20 +366,12 @@ static void rcu_preempt_check_callbacks(int cpu)
335 struct task_struct *t = current; 366 struct task_struct *t = current;
336 367
337 if (t->rcu_read_lock_nesting == 0) { 368 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &= 369 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS); 370 rcu_preempt_qs(cpu);
340 rcu_preempt_qs_record(cpu);
341 return; 371 return;
342 } 372 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) { 373 if (per_cpu(rcu_preempt_data, cpu).qs_pending)
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) { 374 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352} 375}
353 376
354/* 377/*
@@ -387,6 +410,15 @@ static int rcu_preempt_needs_cpu(int cpu)
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388} 411}
389 412
413/**
414 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
415 */
416void rcu_barrier(void)
417{
418 _rcu_barrier(&rcu_preempt_state, call_rcu);
419}
420EXPORT_SYMBOL_GPL(rcu_barrier);
421
390/* 422/*
391 * Initialize preemptable RCU's per-CPU data. 423 * Initialize preemptable RCU's per-CPU data.
392 */ 424 */
@@ -396,6 +428,22 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
396} 428}
397 429
398/* 430/*
431 * Move preemptable RCU's callbacks to ->orphan_cbs_list.
432 */
433static void rcu_preempt_send_cbs_to_orphanage(void)
434{
435 rcu_send_cbs_to_orphanage(&rcu_preempt_state);
436}
437
438/*
439 * Initialize preemptable RCU's state structures.
440 */
441static void __init __rcu_init_preempt(void)
442{
443 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
444}
445
446/*
399 * Check for a task exiting while in a preemptable-RCU read-side 447 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings, 448 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep 449 * as debug_check_no_locks_held() already does this if lockdep
@@ -434,8 +482,17 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
434 * Because preemptable RCU does not exist, we never have to check for 482 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states. 483 * CPUs being in quiescent states.
436 */ 484 */
437static void rcu_preempt_qs(int cpu) 485static void rcu_preempt_note_context_switch(int cpu)
486{
487}
488
489/*
490 * Because preemptable RCU does not exist, there are never any preempted
491 * RCU readers.
492 */
493static int rcu_preempted_readers(struct rcu_node *rnp)
438{ 494{
495 return 0;
439} 496}
440 497
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 498#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -451,12 +508,13 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 508#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452 509
453/* 510/*
454 * Because preemptable RCU does not exist, there are never any preempted 511 * Because there is no preemptable RCU, there can be no readers blocked,
455 * RCU readers. 512 * so there is no need to check for blocked tasks. So check only for
513 * bogus qsmask values.
456 */ 514 */
457static int rcu_preempted_readers(struct rcu_node *rnp) 515static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
458{ 516{
459 return 0; 517 WARN_ON_ONCE(rnp->qsmask);
460} 518}
461 519
462#ifdef CONFIG_HOTPLUG_CPU 520#ifdef CONFIG_HOTPLUG_CPU
@@ -466,7 +524,8 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
466 * tasks that were blocked within RCU read-side critical sections. 524 * tasks that were blocked within RCU read-side critical sections.
467 */ 525 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp, 526static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp) 527 struct rcu_node *rnp,
528 struct rcu_data *rdp)
470{ 529{
471} 530}
472 531
@@ -484,7 +543,7 @@ static void rcu_preempt_offline_cpu(int cpu)
484 * Because preemptable RCU does not exist, it never has any callbacks 543 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check. 544 * to check.
486 */ 545 */
487void rcu_preempt_check_callbacks(int cpu) 546static void rcu_preempt_check_callbacks(int cpu)
488{ 547{
489} 548}
490 549
@@ -492,7 +551,7 @@ void rcu_preempt_check_callbacks(int cpu)
492 * Because preemptable RCU does not exist, it never has any callbacks 551 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process. 552 * to process.
494 */ 553 */
495void rcu_preempt_process_callbacks(void) 554static void rcu_preempt_process_callbacks(void)
496{ 555{
497} 556}
498 557
@@ -522,6 +581,16 @@ static int rcu_preempt_needs_cpu(int cpu)
522} 581}
523 582
524/* 583/*
584 * Because preemptable RCU does not exist, rcu_barrier() is just
585 * another name for rcu_barrier_sched().
586 */
587void rcu_barrier(void)
588{
589 rcu_barrier_sched();
590}
591EXPORT_SYMBOL_GPL(rcu_barrier);
592
593/*
525 * Because preemptable RCU does not exist, there is no per-CPU 594 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize. 595 * data to initialize.
527 */ 596 */
@@ -529,4 +598,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{ 598{
530} 599}
531 600
601/*
602 * Because there is no preemptable RCU, there are no callbacks to move.
603 */
604static void rcu_preempt_send_cbs_to_orphanage(void)
605{
606}
607
608/*
609 * Because preemptable RCU does not exist, it need not be initialized.
610 */
611static void __init __rcu_init_preempt(void)
612{
613}
614
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 615#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
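The rcutree_plugin.h changes above all orbit one mechanism: a task preempted inside an RCU read-side critical section is queued on one of the two rnp->blocked_tasks[] lists, chosen from the low-order bit of the grace-period number, and the new rcu_preempted_readers() helper simply asks whether the list for the current grace period is non-empty. The user-space sketch below models only that phase selection; the toy_* names and the simplified singly-linked list are invented for illustration and are not kernel code.

#include <stdio.h>
#include <stdbool.h>

struct toy_task {
	const char *name;
	struct toy_task *next;
};

struct toy_node {
	unsigned long gpnum;		/* current grace-period number */
	bool qs_pending;		/* does this CPU still owe a quiescent state? */
	struct toy_task *blocked[2];	/* indexed by low-order bit of gpnum */
};

/* Analogue of rcu_preempted_readers(): does anyone block the current GP? */
static bool toy_preempted_readers(struct toy_node *rnp)
{
	return rnp->blocked[rnp->gpnum & 0x1] != NULL;
}

/* Analogue of the enqueue done by rcu_preempt_note_context_switch(). */
static void toy_note_context_switch(struct toy_node *rnp, struct toy_task *t)
{
	/*
	 * A reader preempted before this CPU's quiescent state blocks the
	 * current grace period; one preempted after it can only block the
	 * next grace period, so it lands on the other list.
	 */
	int phase = (rnp->gpnum + !rnp->qs_pending) & 0x1;

	t->next = rnp->blocked[phase];
	rnp->blocked[phase] = t;
}

int main(void)
{
	struct toy_node rnp = { .gpnum = 4, .qs_pending = true };
	struct toy_task a = { .name = "A" }, b = { .name = "B" };

	toy_note_context_switch(&rnp, &a);	/* A blocks GP 4 */
	rnp.qs_pending = false;			/* CPU reports its QS ... */
	toy_note_context_switch(&rnp, &b);	/* ... so B can only block GP 5 */

	printf("GP %lu blocked by: %s\n", rnp.gpnum,
	       toy_preempted_readers(&rnp) ?
	       rnp.blocked[rnp.gpnum & 0x1]->name : "nobody");

	rnp.gpnum++;	/* pretend GP 4 completed (really only after A dequeues) */
	printf("GP %lu blocked by: %s\n", rnp.gpnum,
	       toy_preempted_readers(&rnp) ?
	       rnp.blocked[rnp.gpnum & 0x1]->name : "nobody");
	return 0;
}

Running it prints that A blocks grace period 4 while B, enqueued after the CPU's quiescent state, blocks grace period 5, which mirrors the phase computation warned about in the first hunk.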
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0ea1bff69727..4b31c779e62e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 21 *
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 * 24 *
25 */ 25 */
26#include <linux/types.h> 26#include <linux/types.h>
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
93 return single_open(file, show_rcudata, NULL); 93 return single_open(file, show_rcudata, NULL);
94} 94}
95 95
96static struct file_operations rcudata_fops = { 96static const struct file_operations rcudata_fops = {
97 .owner = THIS_MODULE, 97 .owner = THIS_MODULE,
98 .open = rcudata_open, 98 .open = rcudata_open,
99 .read = seq_read, 99 .read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
145 return single_open(file, show_rcudata_csv, NULL); 145 return single_open(file, show_rcudata_csv, NULL);
146} 146}
147 147
148static struct file_operations rcudata_csv_fops = { 148static const struct file_operations rcudata_csv_fops = {
149 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
150 .open = rcudata_csv_open, 150 .open = rcudata_csv_open,
151 .read = seq_read, 151 .read = seq_read,
@@ -159,13 +159,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
159 struct rcu_node *rnp; 159 struct rcu_node *rnp;
160 160
161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 161 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 162 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
163 rsp->completed, rsp->gpnum, rsp->signaled, 163 rsp->completed, rsp->gpnum, rsp->signaled,
164 (long)(rsp->jiffies_force_qs - jiffies), 164 (long)(rsp->jiffies_force_qs - jiffies),
165 (int)(jiffies & 0xffff), 165 (int)(jiffies & 0xffff),
166 rsp->n_force_qs, rsp->n_force_qs_ngp, 166 rsp->n_force_qs, rsp->n_force_qs_ngp,
167 rsp->n_force_qs - rsp->n_force_qs_ngp, 167 rsp->n_force_qs - rsp->n_force_qs_ngp,
168 rsp->n_force_qs_lh); 168 rsp->n_force_qs_lh, rsp->orphan_qlen);
169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { 169 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
170 if (rnp->level != level) { 170 if (rnp->level != level) {
171 seq_puts(m, "\n"); 171 seq_puts(m, "\n");
@@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
196 return single_open(file, show_rcuhier, NULL); 196 return single_open(file, show_rcuhier, NULL);
197} 197}
198 198
199static struct file_operations rcuhier_fops = { 199static const struct file_operations rcuhier_fops = {
200 .owner = THIS_MODULE, 200 .owner = THIS_MODULE,
201 .open = rcuhier_open, 201 .open = rcuhier_open,
202 .read = seq_read, 202 .read = seq_read,
@@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
222 return single_open(file, show_rcugp, NULL); 222 return single_open(file, show_rcugp, NULL);
223} 223}
224 224
225static struct file_operations rcugp_fops = { 225static const struct file_operations rcugp_fops = {
226 .owner = THIS_MODULE, 226 .owner = THIS_MODULE,
227 .open = rcugp_open, 227 .open = rcugp_open,
228 .read = seq_read, 228 .read = seq_read,
@@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
276 return single_open(file, show_rcu_pending, NULL); 276 return single_open(file, show_rcu_pending, NULL);
277} 277}
278 278
279static struct file_operations rcu_pending_fops = { 279static const struct file_operations rcu_pending_fops = {
280 .owner = THIS_MODULE, 280 .owner = THIS_MODULE,
281 .open = rcu_pending_open, 281 .open = rcu_pending_open,
282 .read = seq_read, 282 .read = seq_read,
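All four rcutree_trace.c hunks make the same mechanical change: the seq_file-backed debugfs operation tables become const struct file_operations, so they can be placed in read-only data instead of remaining writable function-pointer arrays. A minimal sketch of the resulting pattern follows; example_show, example_fops and the "example" debugfs file are placeholder names, not symbols from this patch, and error handling is simplified.

#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static int example_show(struct seq_file *m, void *unused)
{
	seq_printf(m, "hello from a read-only fops table\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_show, NULL);
}

/* const: the method table is data that is never modified at run time. */
static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

static struct dentry *example_dentry;

static int __init example_init(void)
{
	example_dentry = debugfs_create_file("example", 0444, NULL, NULL,
					     &example_fops);
	return example_dentry ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	debugfs_remove(example_dentry);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The relay.c hunk below applies the same reasoning to a vm_operations_struct.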
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..760c26209a3c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
60/* 60/*
61 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
62 */ 62 */
63static struct vm_operations_struct relay_file_mmap_ops = { 63static const struct vm_operations_struct relay_file_mmap_ops = {
64 .fault = relay_buf_fault, 64 .fault = relay_buf_fault,
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 102 return &counter->limit;
102 case RES_FAILCNT: 103 case RES_FAILCNT:
103 return &counter->failcnt; 104 return &counter->failcnt;
105 case RES_SOFT_LIMIT:
106 return &counter->soft_limit;
104 }; 107 };
105 108
106 BUG(); 109 BUG();
diff --git a/kernel/resource.c b/kernel/resource.c
index 78b087221c15..fb11a58b9594 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
223 223
224EXPORT_SYMBOL(release_resource); 224EXPORT_SYMBOL(release_resource);
225 225
226#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) 226#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
227/* 227/*
228 * Finds the lowest memory resource that exists within [res->start, res->end) 228 * Finds the lowest memory resource that exists within [res->start, res->end)
229 * the caller must specify res->start, res->end, res->flags. 229 * the caller must specify res->start, res->end, res->flags and "name".
230 * If found, returns 0, res is overwritten, if not found, returns -1. 230 * If found, returns 0, res is overwritten, if not found, returns -1.
231 */ 231 */
232static int find_next_system_ram(struct resource *res) 232static int find_next_system_ram(struct resource *res, char *name)
233{ 233{
234 resource_size_t start, end; 234 resource_size_t start, end;
235 struct resource *p; 235 struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
245 /* system ram is just marked as IORESOURCE_MEM */ 245 /* system ram is just marked as IORESOURCE_MEM */
246 if (p->flags != res->flags) 246 if (p->flags != res->flags)
247 continue; 247 continue;
248 if (name && strcmp(p->name, name))
249 continue;
248 if (p->start > end) { 250 if (p->start > end) {
249 p = NULL; 251 p = NULL;
250 break; 252 break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
262 res->end = p->end; 264 res->end = p->end;
263 return 0; 265 return 0;
264} 266}
265int 267
266walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, 268/*
267 int (*func)(unsigned long, unsigned long, void *)) 269 * This function calls the callback against all memory ranges of "System RAM"
 270 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
 271 * For now, this function is only for "System RAM".
272 */
273int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *))
268{ 275{
269 struct resource res; 276 struct resource res;
270 unsigned long pfn, len; 277 unsigned long pfn, len;
271 u64 orig_end; 278 u64 orig_end;
272 int ret = -1; 279 int ret = -1;
280
273 res.start = (u64) start_pfn << PAGE_SHIFT; 281 res.start = (u64) start_pfn << PAGE_SHIFT;
274 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 282 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
275 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 283 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
276 orig_end = res.end; 284 orig_end = res.end;
277 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { 285 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) {
278 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 287 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
279 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
280 ret = (*func)(pfn, len, arg); 289 ret = (*func)(pfn, len, arg);
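For reference, a caller of the renamed helper would look roughly like the sketch below, based only on the prototype visible in this hunk: walk_system_ram_range() invokes the callback once per matching "System RAM" range with a starting PFN and a page count, and, judging from the ret handling above, a non-zero return value presumably stops the walk. The count_* names are invented, and the sketch assumes the prototype is exported through a header such as linux/ioport.h.

#include <linux/ioport.h>
#include <linux/kernel.h>

/* Callback: accumulate the number of "System RAM" pages seen so far. */
static int count_ram_pages(unsigned long start_pfn, unsigned long nr_pages,
			   void *arg)
{
	unsigned long *total = arg;

	*total += nr_pages;
	return 0;			/* keep walking */
}

/* Count the System RAM pages that fall inside an arbitrary PFN window. */
static unsigned long count_system_ram(unsigned long start_pfn,
				      unsigned long nr_pages)
{
	unsigned long total = 0;

	walk_system_ram_range(start_pfn, nr_pages, &total, count_ram_pages);
	return total;
}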
diff --git a/kernel/sched.c b/kernel/sched.c
index d9db3fb17573..e88689522e66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -119,8 +119,6 @@
119 */ 119 */
120#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
121 121
122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
123
124static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
125{ 123{
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
378 376
379#else 377#else
380 378
381#ifdef CONFIG_SMP
382static int root_task_group_empty(void)
383{
384 return 1;
385}
386#endif
387
388static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
389static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
390{ 381{
@@ -514,14 +505,6 @@ struct root_domain {
514#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
515 struct cpupri cpupri; 506 struct cpupri cpupri;
516#endif 507#endif
517#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
518 /*
519 * Preferred wake up cpu nominated by sched_mc balance that will be
520 * used when most cpus are idle in the system indicating overall very
521 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
522 */
523 unsigned int sched_mc_preferred_wakeup_cpu;
524#endif
525}; 508};
526 509
527/* 510/*
@@ -646,9 +629,10 @@ struct rq {
646 629
647static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
648 631
649static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
650{ 634{
651 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
652} 636}
653 637
654static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -692,20 +676,15 @@ inline void update_rq_clock(struct rq *rq)
692 676
693/** 677/**
694 * runqueue_is_locked 678 * runqueue_is_locked
679 * @cpu: the processor in question.
695 * 680 *
696 * Returns true if the current cpu runqueue is locked. 681 * Returns true if the current cpu runqueue is locked.
697 * This interface allows printk to be called with the runqueue lock 682 * This interface allows printk to be called with the runqueue lock
698 * held and know whether or not it is OK to wake up the klogd. 683 * held and know whether or not it is OK to wake up the klogd.
699 */ 684 */
700int runqueue_is_locked(void) 685int runqueue_is_locked(int cpu)
701{ 686{
702 int cpu = get_cpu(); 687 return spin_is_locked(&cpu_rq(cpu)->lock);
703 struct rq *rq = cpu_rq(cpu);
704 int ret;
705
706 ret = spin_is_locked(&rq->lock);
707 put_cpu();
708 return ret;
709} 688}
710 689
711/* 690/*
@@ -802,7 +781,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
802 return single_open(filp, sched_feat_show, NULL); 781 return single_open(filp, sched_feat_show, NULL);
803} 782}
804 783
805static struct file_operations sched_feat_fops = { 784static const struct file_operations sched_feat_fops = {
806 .open = sched_feat_open, 785 .open = sched_feat_open,
807 .write = sched_feat_write, 786 .write = sched_feat_write,
808 .read = seq_read, 787 .read = seq_read,
@@ -1509,8 +1488,65 @@ static int tg_nop(struct task_group *tg, void *data)
1509#endif 1488#endif
1510 1489
1511#ifdef CONFIG_SMP 1490#ifdef CONFIG_SMP
1512static unsigned long source_load(int cpu, int type); 1491/* Used instead of source_load when we know the type == 0 */
1513static unsigned long target_load(int cpu, int type); 1492static unsigned long weighted_cpuload(const int cpu)
1493{
1494 return cpu_rq(cpu)->load.weight;
1495}
1496
1497/*
1498 * Return a low guess at the load of a migration-source cpu weighted
1499 * according to the scheduling class and "nice" value.
1500 *
1501 * We want to under-estimate the load of migration sources, to
1502 * balance conservatively.
1503 */
1504static unsigned long source_load(int cpu, int type)
1505{
1506 struct rq *rq = cpu_rq(cpu);
1507 unsigned long total = weighted_cpuload(cpu);
1508
1509 if (type == 0 || !sched_feat(LB_BIAS))
1510 return total;
1511
1512 return min(rq->cpu_load[type-1], total);
1513}
1514
1515/*
1516 * Return a high guess at the load of a migration-target cpu weighted
1517 * according to the scheduling class and "nice" value.
1518 */
1519static unsigned long target_load(int cpu, int type)
1520{
1521 struct rq *rq = cpu_rq(cpu);
1522 unsigned long total = weighted_cpuload(cpu);
1523
1524 if (type == 0 || !sched_feat(LB_BIAS))
1525 return total;
1526
1527 return max(rq->cpu_load[type-1], total);
1528}
1529
1530static struct sched_group *group_of(int cpu)
1531{
1532 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1533
1534 if (!sd)
1535 return NULL;
1536
1537 return sd->groups;
1538}
1539
1540static unsigned long power_of(int cpu)
1541{
1542 struct sched_group *group = group_of(cpu);
1543
1544 if (!group)
1545 return SCHED_LOAD_SCALE;
1546
1547 return group->cpu_power;
1548}
1549
1514static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1550static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1515 1551
1516static unsigned long cpu_avg_load_per_task(int cpu) 1552static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1695,6 +1731,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1695 1731
1696#ifdef CONFIG_PREEMPT 1732#ifdef CONFIG_PREEMPT
1697 1733
1734static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1735
1698/* 1736/*
1699 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1737 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1700 * way at the expense of forcing extra atomic operations in all 1738 * way at the expense of forcing extra atomic operations in all
@@ -1959,13 +1997,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1959} 1997}
1960 1998
1961#ifdef CONFIG_SMP 1999#ifdef CONFIG_SMP
1962
1963/* Used instead of source_load when we know the type == 0 */
1964static unsigned long weighted_cpuload(const int cpu)
1965{
1966 return cpu_rq(cpu)->load.weight;
1967}
1968
1969/* 2000/*
1970 * Is this task likely cache-hot: 2001 * Is this task likely cache-hot:
1971 */ 2002 */
@@ -2023,7 +2054,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2023 if (task_hot(p, old_rq->clock, NULL)) 2054 if (task_hot(p, old_rq->clock, NULL))
2024 schedstat_inc(p, se.nr_forced2_migrations); 2055 schedstat_inc(p, se.nr_forced2_migrations);
2025#endif 2056#endif
2026 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2057 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2027 1, 1, NULL, 0); 2058 1, 1, NULL, 0);
2028 } 2059 }
2029 p->se.vruntime -= old_cfsrq->min_vruntime - 2060 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2239,185 +2270,6 @@ void kick_process(struct task_struct *p)
2239 preempt_enable(); 2270 preempt_enable();
2240} 2271}
2241EXPORT_SYMBOL_GPL(kick_process); 2272EXPORT_SYMBOL_GPL(kick_process);
2242
2243/*
2244 * Return a low guess at the load of a migration-source cpu weighted
2245 * according to the scheduling class and "nice" value.
2246 *
2247 * We want to under-estimate the load of migration sources, to
2248 * balance conservatively.
2249 */
2250static unsigned long source_load(int cpu, int type)
2251{
2252 struct rq *rq = cpu_rq(cpu);
2253 unsigned long total = weighted_cpuload(cpu);
2254
2255 if (type == 0 || !sched_feat(LB_BIAS))
2256 return total;
2257
2258 return min(rq->cpu_load[type-1], total);
2259}
2260
2261/*
2262 * Return a high guess at the load of a migration-target cpu weighted
2263 * according to the scheduling class and "nice" value.
2264 */
2265static unsigned long target_load(int cpu, int type)
2266{
2267 struct rq *rq = cpu_rq(cpu);
2268 unsigned long total = weighted_cpuload(cpu);
2269
2270 if (type == 0 || !sched_feat(LB_BIAS))
2271 return total;
2272
2273 return max(rq->cpu_load[type-1], total);
2274}
2275
2276/*
2277 * find_idlest_group finds and returns the least busy CPU group within the
2278 * domain.
2279 */
2280static struct sched_group *
2281find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2282{
2283 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2284 unsigned long min_load = ULONG_MAX, this_load = 0;
2285 int load_idx = sd->forkexec_idx;
2286 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2287
2288 do {
2289 unsigned long load, avg_load;
2290 int local_group;
2291 int i;
2292
2293 /* Skip over this group if it has no CPUs allowed */
2294 if (!cpumask_intersects(sched_group_cpus(group),
2295 &p->cpus_allowed))
2296 continue;
2297
2298 local_group = cpumask_test_cpu(this_cpu,
2299 sched_group_cpus(group));
2300
2301 /* Tally up the load of all CPUs in the group */
2302 avg_load = 0;
2303
2304 for_each_cpu(i, sched_group_cpus(group)) {
2305 /* Bias balancing toward cpus of our domain */
2306 if (local_group)
2307 load = source_load(i, load_idx);
2308 else
2309 load = target_load(i, load_idx);
2310
2311 avg_load += load;
2312 }
2313
2314 /* Adjust by relative CPU power of the group */
2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2316
2317 if (local_group) {
2318 this_load = avg_load;
2319 this = group;
2320 } else if (avg_load < min_load) {
2321 min_load = avg_load;
2322 idlest = group;
2323 }
2324 } while (group = group->next, group != sd->groups);
2325
2326 if (!idlest || 100*this_load < imbalance*min_load)
2327 return NULL;
2328 return idlest;
2329}
2330
2331/*
2332 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2333 */
2334static int
2335find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2336{
2337 unsigned long load, min_load = ULONG_MAX;
2338 int idlest = -1;
2339 int i;
2340
2341 /* Traverse only the allowed CPUs */
2342 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2343 load = weighted_cpuload(i);
2344
2345 if (load < min_load || (load == min_load && i == this_cpu)) {
2346 min_load = load;
2347 idlest = i;
2348 }
2349 }
2350
2351 return idlest;
2352}
2353
2354/*
2355 * sched_balance_self: balance the current task (running on cpu) in domains
2356 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2357 * SD_BALANCE_EXEC.
2358 *
2359 * Balance, ie. select the least loaded group.
2360 *
2361 * Returns the target CPU number, or the same CPU if no balancing is needed.
2362 *
2363 * preempt must be disabled.
2364 */
2365static int sched_balance_self(int cpu, int flag)
2366{
2367 struct task_struct *t = current;
2368 struct sched_domain *tmp, *sd = NULL;
2369
2370 for_each_domain(cpu, tmp) {
2371 /*
2372 * If power savings logic is enabled for a domain, stop there.
2373 */
2374 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2375 break;
2376 if (tmp->flags & flag)
2377 sd = tmp;
2378 }
2379
2380 if (sd)
2381 update_shares(sd);
2382
2383 while (sd) {
2384 struct sched_group *group;
2385 int new_cpu, weight;
2386
2387 if (!(sd->flags & flag)) {
2388 sd = sd->child;
2389 continue;
2390 }
2391
2392 group = find_idlest_group(sd, t, cpu);
2393 if (!group) {
2394 sd = sd->child;
2395 continue;
2396 }
2397
2398 new_cpu = find_idlest_cpu(group, t, cpu);
2399 if (new_cpu == -1 || new_cpu == cpu) {
2400 /* Now try balancing at a lower domain level of cpu */
2401 sd = sd->child;
2402 continue;
2403 }
2404
2405 /* Now try balancing at a lower domain level of new_cpu */
2406 cpu = new_cpu;
2407 weight = cpumask_weight(sched_domain_span(sd));
2408 sd = NULL;
2409 for_each_domain(cpu, tmp) {
2410 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2411 break;
2412 if (tmp->flags & flag)
2413 sd = tmp;
2414 }
2415 /* while loop will break here if sd == NULL */
2416 }
2417
2418 return cpu;
2419}
2420
2421#endif /* CONFIG_SMP */ 2273#endif /* CONFIG_SMP */
2422 2274
2423/** 2275/**
@@ -2455,37 +2307,22 @@ void task_oncpu_function_call(struct task_struct *p,
2455 * 2307 *
2456 * returns failure only if the task is already active. 2308 * returns failure only if the task is already active.
2457 */ 2309 */
2458static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2310static int try_to_wake_up(struct task_struct *p, unsigned int state,
2311 int wake_flags)
2459{ 2312{
2460 int cpu, orig_cpu, this_cpu, success = 0; 2313 int cpu, orig_cpu, this_cpu, success = 0;
2461 unsigned long flags; 2314 unsigned long flags;
2462 long old_state; 2315 struct rq *rq, *orig_rq;
2463 struct rq *rq;
2464 2316
2465 if (!sched_feat(SYNC_WAKEUPS)) 2317 if (!sched_feat(SYNC_WAKEUPS))
2466 sync = 0; 2318 wake_flags &= ~WF_SYNC;
2467 2319
2468#ifdef CONFIG_SMP 2320 this_cpu = get_cpu();
2469 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2470 struct sched_domain *sd;
2471
2472 this_cpu = raw_smp_processor_id();
2473 cpu = task_cpu(p);
2474
2475 for_each_domain(this_cpu, sd) {
2476 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2477 update_shares(sd);
2478 break;
2479 }
2480 }
2481 }
2482#endif
2483 2321
2484 smp_wmb(); 2322 smp_wmb();
2485 rq = task_rq_lock(p, &flags); 2323 rq = orig_rq = task_rq_lock(p, &flags);
2486 update_rq_clock(rq); 2324 update_rq_clock(rq);
2487 old_state = p->state; 2325 if (!(p->state & state))
2488 if (!(old_state & state))
2489 goto out; 2326 goto out;
2490 2327
2491 if (p->se.on_rq) 2328 if (p->se.on_rq)
@@ -2493,27 +2330,33 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2493 2330
2494 cpu = task_cpu(p); 2331 cpu = task_cpu(p);
2495 orig_cpu = cpu; 2332 orig_cpu = cpu;
2496 this_cpu = smp_processor_id();
2497 2333
2498#ifdef CONFIG_SMP 2334#ifdef CONFIG_SMP
2499 if (unlikely(task_running(rq, p))) 2335 if (unlikely(task_running(rq, p)))
2500 goto out_activate; 2336 goto out_activate;
2501 2337
2502 cpu = p->sched_class->select_task_rq(p, sync); 2338 /*
2503 if (cpu != orig_cpu) { 2339 * In order to handle concurrent wakeups and release the rq->lock
2340 * we put the task in TASK_WAKING state.
2341 *
2342 * First fix up the nr_uninterruptible count:
2343 */
2344 if (task_contributes_to_load(p))
2345 rq->nr_uninterruptible--;
2346 p->state = TASK_WAKING;
2347 task_rq_unlock(rq, &flags);
2348
2349 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2350 if (cpu != orig_cpu)
2504 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2505 task_rq_unlock(rq, &flags);
2506 /* might preempt at this point */
2507 rq = task_rq_lock(p, &flags);
2508 old_state = p->state;
2509 if (!(old_state & state))
2510 goto out;
2511 if (p->se.on_rq)
2512 goto out_running;
2513 2352
2514 this_cpu = smp_processor_id(); 2353 rq = task_rq_lock(p, &flags);
2515 cpu = task_cpu(p); 2354
2516 } 2355 if (rq != orig_rq)
2356 update_rq_clock(rq);
2357
2358 WARN_ON(p->state != TASK_WAKING);
2359 cpu = task_cpu(p);
2517 2360
2518#ifdef CONFIG_SCHEDSTATS 2361#ifdef CONFIG_SCHEDSTATS
2519 schedstat_inc(rq, ttwu_count); 2362 schedstat_inc(rq, ttwu_count);
@@ -2533,7 +2376,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2533out_activate: 2376out_activate:
2534#endif /* CONFIG_SMP */ 2377#endif /* CONFIG_SMP */
2535 schedstat_inc(p, se.nr_wakeups); 2378 schedstat_inc(p, se.nr_wakeups);
2536 if (sync) 2379 if (wake_flags & WF_SYNC)
2537 schedstat_inc(p, se.nr_wakeups_sync); 2380 schedstat_inc(p, se.nr_wakeups_sync);
2538 if (orig_cpu != cpu) 2381 if (orig_cpu != cpu)
2539 schedstat_inc(p, se.nr_wakeups_migrate); 2382 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2562,7 +2405,7 @@ out_activate:
2562 2405
2563out_running: 2406out_running:
2564 trace_sched_wakeup(rq, p, success); 2407 trace_sched_wakeup(rq, p, success);
2565 check_preempt_curr(rq, p, sync); 2408 check_preempt_curr(rq, p, wake_flags);
2566 2409
2567 p->state = TASK_RUNNING; 2410 p->state = TASK_RUNNING;
2568#ifdef CONFIG_SMP 2411#ifdef CONFIG_SMP
@@ -2571,6 +2414,7 @@ out_running:
2571#endif 2414#endif
2572out: 2415out:
2573 task_rq_unlock(rq, &flags); 2416 task_rq_unlock(rq, &flags);
2417 put_cpu();
2574 2418
2575 return success; 2419 return success;
2576} 2420}
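The wakeup rework above replaces the old boolean sync argument with a wake_flags bitmask (WF_SYNC here, WF_FORK further down), so extra wakeup attributes can be added without changing every signature again. A compressed, user-space illustration of that idiom follows; the TOY_WF_* values are stand-ins, not the kernel's definitions.

#include <stdio.h>

/* Stand-in flag bits; the kernel defines its own WF_* values. */
#define TOY_WF_SYNC	0x01	/* waker intends to sleep right after */
#define TOY_WF_FORK	0x02	/* wakeup of a freshly forked child   */

/* Before: one boolean per attribute forces a new parameter each time. */
static void wake_with_bool(int sync, int fork)
{
	printf("bool API:  sync=%d fork=%d\n", sync, fork);
}

/* After: one flags word carries any combination of attributes. */
static void wake_with_flags(unsigned int flags)
{
	printf("flags API: sync=%d fork=%d\n",
	       !!(flags & TOY_WF_SYNC), !!(flags & TOY_WF_FORK));
}

int main(void)
{
	wake_with_bool(1, 0);
	wake_with_flags(TOY_WF_SYNC);
	wake_with_flags(TOY_WF_SYNC | TOY_WF_FORK);	/* no new parameter needed */
	return 0;
}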
@@ -2613,6 +2457,7 @@ static void __sched_fork(struct task_struct *p)
2613 p->se.avg_overlap = 0; 2457 p->se.avg_overlap = 0;
2614 p->se.start_runtime = 0; 2458 p->se.start_runtime = 0;
2615 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2459 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2460 p->se.avg_running = 0;
2616 2461
2617#ifdef CONFIG_SCHEDSTATS 2462#ifdef CONFIG_SCHEDSTATS
2618 p->se.wait_start = 0; 2463 p->se.wait_start = 0;
@@ -2674,28 +2519,18 @@ void sched_fork(struct task_struct *p, int clone_flags)
2674 2519
2675 __sched_fork(p); 2520 __sched_fork(p);
2676 2521
2677#ifdef CONFIG_SMP
2678 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2679#endif
2680 set_task_cpu(p, cpu);
2681
2682 /*
2683 * Make sure we do not leak PI boosting priority to the child.
2684 */
2685 p->prio = current->normal_prio;
2686
2687 /* 2522 /*
2688 * Revert to default priority/policy on fork if requested. 2523 * Revert to default priority/policy on fork if requested.
2689 */ 2524 */
2690 if (unlikely(p->sched_reset_on_fork)) { 2525 if (unlikely(p->sched_reset_on_fork)) {
2691 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) 2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2692 p->policy = SCHED_NORMAL; 2527 p->policy = SCHED_NORMAL;
2693 2528 p->normal_prio = p->static_prio;
2694 if (p->normal_prio < DEFAULT_PRIO) 2529 }
2695 p->prio = DEFAULT_PRIO;
2696 2530
2697 if (PRIO_TO_NICE(p->static_prio) < 0) { 2531 if (PRIO_TO_NICE(p->static_prio) < 0) {
2698 p->static_prio = NICE_TO_PRIO(0); 2532 p->static_prio = NICE_TO_PRIO(0);
2533 p->normal_prio = p->static_prio;
2699 set_load_weight(p); 2534 set_load_weight(p);
2700 } 2535 }
2701 2536
@@ -2706,9 +2541,19 @@ void sched_fork(struct task_struct *p, int clone_flags)
2706 p->sched_reset_on_fork = 0; 2541 p->sched_reset_on_fork = 0;
2707 } 2542 }
2708 2543
2544 /*
2545 * Make sure we do not leak PI boosting priority to the child.
2546 */
2547 p->prio = current->normal_prio;
2548
2709 if (!rt_prio(p->prio)) 2549 if (!rt_prio(p->prio))
2710 p->sched_class = &fair_sched_class; 2550 p->sched_class = &fair_sched_class;
2711 2551
2552#ifdef CONFIG_SMP
2553 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2554#endif
2555 set_task_cpu(p, cpu);
2556
2712#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2557#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2713 if (likely(sched_info_on())) 2558 if (likely(sched_info_on()))
2714 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2559 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2741,8 +2586,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2741 BUG_ON(p->state != TASK_RUNNING); 2586 BUG_ON(p->state != TASK_RUNNING);
2742 update_rq_clock(rq); 2587 update_rq_clock(rq);
2743 2588
2744 p->prio = effective_prio(p);
2745
2746 if (!p->sched_class->task_new || !current->se.on_rq) { 2589 if (!p->sched_class->task_new || !current->se.on_rq) {
2747 activate_task(rq, p, 0); 2590 activate_task(rq, p, 0);
2748 } else { 2591 } else {
@@ -2754,7 +2597,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2754 inc_nr_running(rq); 2597 inc_nr_running(rq);
2755 } 2598 }
2756 trace_sched_wakeup_new(rq, p, 1); 2599 trace_sched_wakeup_new(rq, p, 1);
2757 check_preempt_curr(rq, p, 0); 2600 check_preempt_curr(rq, p, WF_FORK);
2758#ifdef CONFIG_SMP 2601#ifdef CONFIG_SMP
2759 if (p->sched_class->task_wake_up) 2602 if (p->sched_class->task_wake_up)
2760 p->sched_class->task_wake_up(rq, p); 2603 p->sched_class->task_wake_up(rq, p);
@@ -2878,7 +2721,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2878 */ 2721 */
2879 prev_state = prev->state; 2722 prev_state = prev->state;
2880 finish_arch_switch(prev); 2723 finish_arch_switch(prev);
2881 perf_counter_task_sched_in(current, cpu_of(rq)); 2724 perf_event_task_sched_in(current, cpu_of(rq));
2882 finish_lock_switch(rq, prev); 2725 finish_lock_switch(rq, prev);
2883 2726
2884 fire_sched_in_preempt_notifiers(current); 2727 fire_sched_in_preempt_notifiers(current);
@@ -3064,6 +2907,19 @@ unsigned long nr_iowait(void)
3064 return sum; 2907 return sum;
3065} 2908}
3066 2909
2910unsigned long nr_iowait_cpu(void)
2911{
2912 struct rq *this = this_rq();
2913 return atomic_read(&this->nr_iowait);
2914}
2915
2916unsigned long this_cpu_load(void)
2917{
2918 struct rq *this = this_rq();
2919 return this->cpu_load[0];
2920}
2921
2922
3067/* Variables and functions for calc_load */ 2923/* Variables and functions for calc_load */
3068static atomic_long_t calc_load_tasks; 2924static atomic_long_t calc_load_tasks;
3069static unsigned long calc_load_update; 2925static unsigned long calc_load_update;
@@ -3263,7 +3119,7 @@ out:
3263void sched_exec(void) 3119void sched_exec(void)
3264{ 3120{
3265 int new_cpu, this_cpu = get_cpu(); 3121 int new_cpu, this_cpu = get_cpu();
3266 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3122 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3267 put_cpu(); 3123 put_cpu();
3268 if (new_cpu != this_cpu) 3124 if (new_cpu != this_cpu)
3269 sched_migrate_task(current, new_cpu); 3125 sched_migrate_task(current, new_cpu);
@@ -3683,11 +3539,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3683 *imbalance = sds->min_load_per_task; 3539 *imbalance = sds->min_load_per_task;
3684 sds->busiest = sds->group_min; 3540 sds->busiest = sds->group_min;
3685 3541
3686 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3687 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3688 group_first_cpu(sds->group_leader);
3689 }
3690
3691 return 1; 3542 return 1;
3692 3543
3693} 3544}
@@ -3711,7 +3562,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3711} 3562}
3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3563#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3713 3564
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3565
3566unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3567{
3568 return SCHED_LOAD_SCALE;
3569}
3570
3571unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3572{
3573 return default_scale_freq_power(sd, cpu);
3574}
3575
3576unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3715{ 3577{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3578 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain; 3579 unsigned long smt_gain = sd->smt_gain;
@@ -3721,6 +3583,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3721 return smt_gain; 3583 return smt_gain;
3722} 3584}
3723 3585
3586unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3587{
3588 return default_scale_smt_power(sd, cpu);
3589}
3590
3724unsigned long scale_rt_power(int cpu) 3591unsigned long scale_rt_power(int cpu)
3725{ 3592{
3726 struct rq *rq = cpu_rq(cpu); 3593 struct rq *rq = cpu_rq(cpu);
@@ -3745,10 +3612,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3745 unsigned long power = SCHED_LOAD_SCALE; 3612 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups; 3613 struct sched_group *sdg = sd->groups;
3747 3614
3748 /* here we could scale based on cpufreq */ 3615 if (sched_feat(ARCH_POWER))
3616 power *= arch_scale_freq_power(sd, cpu);
3617 else
3618 power *= default_scale_freq_power(sd, cpu);
3619
3620 power >>= SCHED_LOAD_SHIFT;
3749 3621
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3622 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu); 3623 if (sched_feat(ARCH_POWER))
3624 power *= arch_scale_smt_power(sd, cpu);
3625 else
3626 power *= default_scale_smt_power(sd, cpu);
3627
3752 power >>= SCHED_LOAD_SHIFT; 3628 power >>= SCHED_LOAD_SHIFT;
3753 } 3629 }
3754 3630
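The update_cpu_power() change stacks two fixed-point scalings: power starts at SCHED_LOAD_SCALE, is multiplied by a frequency factor and shifted right by SCHED_LOAD_SHIFT, and for SMT siblings is multiplied by an SMT factor and shifted again. A small arithmetic sketch of that scheme, assuming the usual SCHED_LOAD_SHIFT of 10 (so full capacity is 1024); the toy_ names and the sample factor values are illustrative only.

#include <stdio.h>

#define TOY_LOAD_SHIFT	10
#define TOY_LOAD_SCALE	(1UL << TOY_LOAD_SHIFT)	/* 1024 == "full" capacity */

/*
 * Each factor is expressed on the same 1024 scale, so every multiply is
 * followed by a shift that drops the result back into fixed-point range.
 */
static unsigned long toy_cpu_power(unsigned long freq_factor,
				   unsigned long smt_factor, int smt_siblings)
{
	unsigned long power = TOY_LOAD_SCALE;

	power *= freq_factor;		/* e.g. 768 for a CPU at 75% of fmax */
	power >>= TOY_LOAD_SHIFT;

	if (smt_siblings > 1) {
		power *= smt_factor;	/* e.g. ~589 when two siblings share a core */
		power >>= TOY_LOAD_SHIFT;
	}
	return power;
}

int main(void)
{
	/* Full-speed CPU with no SMT sharing keeps the full 1024. */
	printf("%lu\n", toy_cpu_power(1024, 1024, 1));
	/* 75%-speed CPU sharing a core with one sibling: 1024 -> 768 -> 441. */
	printf("%lu\n", toy_cpu_power(768, 589, 2));
	return 0;
}

Each factor therefore multiplies into the final power independently, and sched_feat(ARCH_POWER) merely selects whether the architecture's override or the default_scale_*() fallback supplies it.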
@@ -3785,6 +3661,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3785 3661
3786/** 3662/**
3787 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3663 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3664 * @sd: The sched_domain whose statistics are to be updated.
3788 * @group: sched_group whose statistics are to be updated. 3665 * @group: sched_group whose statistics are to be updated.
3789 * @this_cpu: Cpu for which load balance is currently performed. 3666 * @this_cpu: Cpu for which load balance is currently performed.
3790 * @idle: Idle status of this_cpu 3667 * @idle: Idle status of this_cpu
@@ -4161,26 +4038,6 @@ ret:
4161 return NULL; 4038 return NULL;
4162} 4039}
4163 4040
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
4184/* 4041/*
4185 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4042 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4186 */ 4043 */
@@ -5239,17 +5096,16 @@ void account_idle_time(cputime_t cputime)
5239 */ 5096 */
5240void account_process_tick(struct task_struct *p, int user_tick) 5097void account_process_tick(struct task_struct *p, int user_tick)
5241{ 5098{
5242 cputime_t one_jiffy = jiffies_to_cputime(1); 5099 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5243 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5244 struct rq *rq = this_rq(); 5100 struct rq *rq = this_rq();
5245 5101
5246 if (user_tick) 5102 if (user_tick)
5247 account_user_time(p, one_jiffy, one_jiffy_scaled); 5103 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5248 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5104 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5249 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 5105 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5250 one_jiffy_scaled); 5106 one_jiffy_scaled);
5251 else 5107 else
5252 account_idle_time(one_jiffy); 5108 account_idle_time(cputime_one_jiffy);
5253} 5109}
5254 5110
5255/* 5111/*
@@ -5353,7 +5209,7 @@ void scheduler_tick(void)
5353 curr->sched_class->task_tick(rq, curr, 0); 5209 curr->sched_class->task_tick(rq, curr, 0);
5354 spin_unlock(&rq->lock); 5210 spin_unlock(&rq->lock);
5355 5211
5356 perf_counter_task_tick(curr, cpu); 5212 perf_event_task_tick(curr, cpu);
5357 5213
5358#ifdef CONFIG_SMP 5214#ifdef CONFIG_SMP
5359 rq->idle_at_tick = idle_cpu(cpu); 5215 rq->idle_at_tick = idle_cpu(cpu);
@@ -5465,14 +5321,13 @@ static inline void schedule_debug(struct task_struct *prev)
5465#endif 5321#endif
5466} 5322}
5467 5323
5468static void put_prev_task(struct rq *rq, struct task_struct *prev) 5324static void put_prev_task(struct rq *rq, struct task_struct *p)
5469{ 5325{
5470 if (prev->state == TASK_RUNNING) { 5326 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5471 u64 runtime = prev->se.sum_exec_runtime;
5472 5327
5473 runtime -= prev->se.prev_sum_exec_runtime; 5328 update_avg(&p->se.avg_running, runtime);
5474 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5475 5329
5330 if (p->state == TASK_RUNNING) {
5476 /* 5331 /*
5477 * In order to avoid avg_overlap growing stale when we are 5332 * In order to avoid avg_overlap growing stale when we are
5478 * indeed overlapping and hence not getting put to sleep, grow 5333 * indeed overlapping and hence not getting put to sleep, grow
@@ -5482,9 +5337,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5482 * correlates to the amount of cache footprint a task can 5337 * correlates to the amount of cache footprint a task can
5483 * build up. 5338 * build up.
5484 */ 5339 */
5485 update_avg(&prev->se.avg_overlap, runtime); 5340 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5341 update_avg(&p->se.avg_overlap, runtime);
5342 } else {
5343 update_avg(&p->se.avg_running, 0);
5486 } 5344 }
5487 prev->sched_class->put_prev_task(rq, prev); 5345 p->sched_class->put_prev_task(rq, p);
5488} 5346}
5489 5347
5490/* 5348/*
@@ -5567,7 +5425,7 @@ need_resched_nonpreemptible:
5567 5425
5568 if (likely(prev != next)) { 5426 if (likely(prev != next)) {
5569 sched_info_switch(prev, next); 5427 sched_info_switch(prev, next);
5570 perf_counter_task_sched_out(prev, next, cpu); 5428 perf_event_task_sched_out(prev, next, cpu);
5571 5429
5572 rq->nr_switches++; 5430 rq->nr_switches++;
5573 rq->curr = next; 5431 rq->curr = next;
@@ -5716,10 +5574,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5716 5574
5717#endif /* CONFIG_PREEMPT */ 5575#endif /* CONFIG_PREEMPT */
5718 5576
5719int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5577int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5720 void *key) 5578 void *key)
5721{ 5579{
5722 return try_to_wake_up(curr->private, mode, sync); 5580 return try_to_wake_up(curr->private, mode, wake_flags);
5723} 5581}
5724EXPORT_SYMBOL(default_wake_function); 5582EXPORT_SYMBOL(default_wake_function);
5725 5583
@@ -5733,14 +5591,14 @@ EXPORT_SYMBOL(default_wake_function);
5733 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5591 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5734 */ 5592 */
5735static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5593static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5736 int nr_exclusive, int sync, void *key) 5594 int nr_exclusive, int wake_flags, void *key)
5737{ 5595{
5738 wait_queue_t *curr, *next; 5596 wait_queue_t *curr, *next;
5739 5597
5740 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5598 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5741 unsigned flags = curr->flags; 5599 unsigned flags = curr->flags;
5742 5600
5743 if (curr->func(curr, mode, sync, key) && 5601 if (curr->func(curr, mode, wake_flags, key) &&
5744 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5602 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5745 break; 5603 break;
5746 } 5604 }
@@ -5801,16 +5659,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5801 int nr_exclusive, void *key) 5659 int nr_exclusive, void *key)
5802{ 5660{
5803 unsigned long flags; 5661 unsigned long flags;
5804 int sync = 1; 5662 int wake_flags = WF_SYNC;
5805 5663
5806 if (unlikely(!q)) 5664 if (unlikely(!q))
5807 return; 5665 return;
5808 5666
5809 if (unlikely(!nr_exclusive)) 5667 if (unlikely(!nr_exclusive))
5810 sync = 0; 5668 wake_flags = 0;
5811 5669
5812 spin_lock_irqsave(&q->lock, flags); 5670 spin_lock_irqsave(&q->lock, flags);
5813 __wake_up_common(q, mode, nr_exclusive, sync, key); 5671 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5814 spin_unlock_irqrestore(&q->lock, flags); 5672 spin_unlock_irqrestore(&q->lock, flags);
5815} 5673}
5816EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5674EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6866,9 +6724,6 @@ EXPORT_SYMBOL(yield);
6866/* 6724/*
6867 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6725 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6868 * that process accounting knows that this is a task in IO wait state. 6726 * that process accounting knows that this is a task in IO wait state.
6869 *
6870 * But don't do that if it is a deliberate, throttling IO wait (this task
6871 * has set its backing_dev_info: the queue against which it should throttle)
6872 */ 6727 */
6873void __sched io_schedule(void) 6728void __sched io_schedule(void)
6874{ 6729{
@@ -6977,23 +6832,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6977 if (retval) 6832 if (retval)
6978 goto out_unlock; 6833 goto out_unlock;
6979 6834
6980 /* 6835 time_slice = p->sched_class->get_rr_interval(p);
6981 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6982 * tasks that are on an otherwise idle runqueue:
6983 */
6984 time_slice = 0;
6985 if (p->policy == SCHED_RR) {
6986 time_slice = DEF_TIMESLICE;
6987 } else if (p->policy != SCHED_FIFO) {
6988 struct sched_entity *se = &p->se;
6989 unsigned long flags;
6990 struct rq *rq;
6991 6836
6992 rq = task_rq_lock(p, &flags);
6993 if (rq->cfs.load.weight)
6994 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6995 task_rq_unlock(rq, &flags);
6996 }
6997 read_unlock(&tasklist_lock); 6837 read_unlock(&tasklist_lock);
6998 jiffies_to_timespec(time_slice, &t); 6838 jiffies_to_timespec(time_slice, &t);
6999 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6839 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -7844,7 +7684,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7844/* 7684/*
7845 * Register at high priority so that task migration (migrate_all_tasks) 7685 * Register at high priority so that task migration (migrate_all_tasks)
7846 * happens before everything else. This has to be lower priority than 7686 * happens before everything else. This has to be lower priority than
7847 * the notifier in the perf_counter subsystem, though. 7687 * the notifier in the perf_event subsystem, though.
7848 */ 7688 */
7849static struct notifier_block __cpuinitdata migration_notifier = { 7689static struct notifier_block __cpuinitdata migration_notifier = {
7850 .notifier_call = migration_call, 7690 .notifier_call = migration_call,
@@ -8000,9 +7840,7 @@ static int sd_degenerate(struct sched_domain *sd)
8000 } 7840 }
8001 7841
8002 /* Following flags don't use groups */ 7842 /* Following flags don't use groups */
8003 if (sd->flags & (SD_WAKE_IDLE | 7843 if (sd->flags & (SD_WAKE_AFFINE))
8004 SD_WAKE_AFFINE |
8005 SD_WAKE_BALANCE))
8006 return 0; 7844 return 0;
8007 7845
8008 return 1; 7846 return 1;
@@ -8019,10 +7857,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
8019 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7857 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
8020 return 0; 7858 return 0;
8021 7859
8022 /* Does parent contain flags not in child? */
8023 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
8024 if (cflags & SD_WAKE_AFFINE)
8025 pflags &= ~SD_WAKE_BALANCE;
8026 /* Flags needing groups don't count if only 1 group in parent */ 7860 /* Flags needing groups don't count if only 1 group in parent */
8027 if (parent->groups == parent->groups->next) { 7861 if (parent->groups == parent->groups->next) {
8028 pflags &= ~(SD_LOAD_BALANCE | 7862 pflags &= ~(SD_LOAD_BALANCE |
@@ -8708,10 +8542,10 @@ static void set_domain_attribute(struct sched_domain *sd,
8708 request = attr->relax_domain_level; 8542 request = attr->relax_domain_level;
8709 if (request < sd->level) { 8543 if (request < sd->level) {
8710 /* turn off idle balance on this domain */ 8544 /* turn off idle balance on this domain */
8711 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8545 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8712 } else { 8546 } else {
8713 /* turn on idle balance on this domain */ 8547 /* turn on idle balance on this domain */
8714 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8548 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8715 } 8549 }
8716} 8550}
8717 8551
@@ -9329,6 +9163,7 @@ void __init sched_init_smp(void)
9329 cpumask_var_t non_isolated_cpus; 9163 cpumask_var_t non_isolated_cpus;
9330 9164
9331 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9165 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9166 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9332 9167
9333#if defined(CONFIG_NUMA) 9168#if defined(CONFIG_NUMA)
9334 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9169 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9360,7 +9195,6 @@ void __init sched_init_smp(void)
9360 sched_init_granularity(); 9195 sched_init_granularity();
9361 free_cpumask_var(non_isolated_cpus); 9196 free_cpumask_var(non_isolated_cpus);
9362 9197
9363 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9364 init_sched_rt_class(); 9198 init_sched_rt_class();
9365} 9199}
9366#else 9200#else
@@ -9707,7 +9541,7 @@ void __init sched_init(void)
9707 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9541 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9708#endif /* SMP */ 9542#endif /* SMP */
9709 9543
9710 perf_counter_init(); 9544 perf_event_init();
9711 9545
9712 scheduler_running = 1; 9546 scheduler_running = 1;
9713} 9547}
@@ -10479,7 +10313,7 @@ static int sched_rt_global_constraints(void)
10479#endif /* CONFIG_RT_GROUP_SCHED */ 10313#endif /* CONFIG_RT_GROUP_SCHED */
10480 10314
10481int sched_rt_handler(struct ctl_table *table, int write, 10315int sched_rt_handler(struct ctl_table *table, int write,
10482 struct file *filp, void __user *buffer, size_t *lenp, 10316 void __user *buffer, size_t *lenp,
10483 loff_t *ppos) 10317 loff_t *ppos)
10484{ 10318{
10485 int ret; 10319 int ret;
@@ -10490,7 +10324,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10490 old_period = sysctl_sched_rt_period; 10324 old_period = sysctl_sched_rt_period;
10491 old_runtime = sysctl_sched_rt_runtime; 10325 old_runtime = sysctl_sched_rt_runtime;
10492 10326
10493 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10327 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10494 10328
10495 if (!ret && write) { 10329 if (!ret && write) {
10496 ret = sched_rt_global_constraints(); 10330 ret = sched_rt_global_constraints();
@@ -10544,8 +10378,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10544} 10378}
10545 10379
10546static int 10380static int
10547cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10381cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10548 struct task_struct *tsk)
10549{ 10382{
10550#ifdef CONFIG_RT_GROUP_SCHED 10383#ifdef CONFIG_RT_GROUP_SCHED
10551 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10555,15 +10388,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10555 if (tsk->sched_class != &fair_sched_class) 10388 if (tsk->sched_class != &fair_sched_class)
10556 return -EINVAL; 10389 return -EINVAL;
10557#endif 10390#endif
10391 return 0;
10392}
10558 10393
10394static int
10395cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10396 struct task_struct *tsk, bool threadgroup)
10397{
10398 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10399 if (retval)
10400 return retval;
10401 if (threadgroup) {
10402 struct task_struct *c;
10403 rcu_read_lock();
10404 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10405 retval = cpu_cgroup_can_attach_task(cgrp, c);
10406 if (retval) {
10407 rcu_read_unlock();
10408 return retval;
10409 }
10410 }
10411 rcu_read_unlock();
10412 }
10559 return 0; 10413 return 0;
10560} 10414}
10561 10415
10562static void 10416static void
10563cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10417cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10564 struct cgroup *old_cont, struct task_struct *tsk) 10418 struct cgroup *old_cont, struct task_struct *tsk,
10419 bool threadgroup)
10565{ 10420{
10566 sched_move_task(tsk); 10421 sched_move_task(tsk);
10422 if (threadgroup) {
10423 struct task_struct *c;
10424 rcu_read_lock();
10425 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10426 sched_move_task(c);
10427 }
10428 rcu_read_unlock();
10429 }
10567} 10430}
10568 10431
10569#ifdef CONFIG_FAIR_GROUP_SCHED 10432#ifdef CONFIG_FAIR_GROUP_SCHED
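
The new can_attach/attach callbacks take a threadgroup flag and, when it is set, walk tsk->thread_group under rcu_read_lock(): every thread is validated before any of them is moved, and only then is each one moved in turn. The sketch below shows that validate-all-then-apply-all shape over a plain singly linked list; the list, the uses_fair_class field and the function names are invented stand-ins, not the cgroup API.

    #include <stdio.h>

    struct thread {
        int id;
        int uses_fair_class;          /* stand-in for the sched_class check */
        struct thread *next;
    };

    /* Validate one thread; mirrors the per-task can-attach check. */
    static int can_attach_one(const struct thread *t)
    {
        return t->uses_fair_class ? 0 : -1;
    }

    /* First pass: reject the whole group if any member fails the check. */
    static int can_attach_group(struct thread *leader)
    {
        for (struct thread *t = leader; t; t = t->next)
            if (can_attach_one(t))
                return -1;
        return 0;
    }

    /* Second pass: only runs after every member passed validation. */
    static void attach_group(struct thread *leader)
    {
        for (struct thread *t = leader; t; t = t->next)
            printf("moving thread %d\n", t->id);
    }

    int main(void)
    {
        struct thread t2 = { 2, 1, NULL };
        struct thread t1 = { 1, 1, &t2 };

        if (can_attach_group(&t1) == 0)
            attach_group(&t1);
        return 0;
    }
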
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both locks
149 * taken to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg64(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
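
sched_clock_local() drops the per-CPU raw spinlock: it recomputes the clamped clock and publishes it with cmpxchg64(), retrying whenever another context won the race, and sched_clock_remote() couples two CPUs' clocks the same way. A sketch of that compare-and-swap retry loop, using C11 atomics rather than the kernel primitive; the function name, window value and sample numbers are invented.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t clock_ns;

    static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }
    static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

    /*
     * Clamp 'raw' into [floor, floor + window], never let the published
     * value move backwards, and retry if another thread updated it first.
     */
    static uint64_t clock_update(uint64_t raw, uint64_t floor, uint64_t window)
    {
        uint64_t old, val;

        do {
            old = atomic_load(&clock_ns);
            val = max_u64(raw, floor);            /* filter backward motion   */
            val = min_u64(val, floor + window);   /* filter crazy values      */
            val = max_u64(val, old);              /* keep the clock monotonic */
        } while (!atomic_compare_exchange_weak(&clock_ns, &old, val));

        return val;
    }

    int main(void)
    {
        atomic_store(&clock_ns, 1000);
        printf("%llu\n", (unsigned long long)clock_update(900, 1000, 4000000));
        return 0;
    }
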
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5ddbd0891267..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..4e777b47eeda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
384 384
385#ifdef CONFIG_SCHED_DEBUG 385#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 386int sched_nr_latency_handler(struct ctl_table *table, int write,
387 struct file *filp, void __user *buffer, size_t *lenp, 387 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 388 loff_t *ppos)
389{ 389{
390 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
391 391
392 if (ret || !write) 392 if (ret || !write)
393 return ret; 393 return ret;
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
513 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
514 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
515 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
516 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
517 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
518 } 519 }
@@ -709,24 +710,28 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
709 if (initial && sched_feat(START_DEBIT)) 710 if (initial && sched_feat(START_DEBIT))
710 vruntime += sched_vslice(cfs_rq, se); 711 vruntime += sched_vslice(cfs_rq, se);
711 712
712 if (!initial) { 713 /* sleeps up to a single latency don't count. */
713 /* sleeps upto a single latency don't count. */ 714 if (!initial && sched_feat(FAIR_SLEEPERS)) {
714 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 unsigned long thresh = sysctl_sched_latency;
715 unsigned long thresh = sysctl_sched_latency;
716 716
717 /* 717 /*
718 * Convert the sleeper threshold into virtual time. 718 * Convert the sleeper threshold into virtual time.
719 * SCHED_IDLE is a special sub-class. We care about 719 * SCHED_IDLE is a special sub-class. We care about
720 * fairness only relative to other SCHED_IDLE tasks, 720 * fairness only relative to other SCHED_IDLE tasks,
721 * all of which have the same weight. 721 * all of which have the same weight.
722 */ 722 */
723 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
724 (!entity_is_task(se) || 724 task_of(se)->policy != SCHED_IDLE))
725 task_of(se)->policy != SCHED_IDLE)) 725 thresh = calc_delta_fair(thresh, se);
726 thresh = calc_delta_fair(thresh, se);
727 726
728 vruntime -= thresh; 727 /*
729 } 728 * Halve their sleep time's effect, to allow
729 * for a gentler effect of sleepers:
730 */
731 if (sched_feat(GENTLE_FAIR_SLEEPERS))
732 thresh >>= 1;
733
734 vruntime -= thresh;
730 } 735 }
731 736
732 /* ensure we never gain time by being placed backwards. */ 737 /* ensure we never gain time by being placed backwards. */
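
With FAIR_SLEEPERS a waking task is credited up to one latency period, and GENTLE_FAIR_SLEEPERS halves that credit so a burst of sleepers cannot rip the spread apart. A worked example with invented numbers (a 6 ms latency target):

    #include <stdio.h>

    int main(void)
    {
        /* Invented values: 6 ms latency target expressed in nanoseconds. */
        unsigned long long min_vruntime = 100000000ULL;
        unsigned long long thresh = 6000000ULL;       /* sysctl_sched_latency */
        int gentle_fair_sleepers = 1;

        if (gentle_fair_sleepers)
            thresh >>= 1;                             /* halve the sleeper credit */

        printf("placed at %llu (credit %llu ns behind min_vruntime)\n",
               min_vruntime - thresh, thresh);
        return 0;
    }
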
@@ -757,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
757 762
758static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 763static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
759{ 764{
760 if (cfs_rq->last == se) 765 if (!se || cfs_rq->last == se)
761 cfs_rq->last = NULL; 766 cfs_rq->last = NULL;
762 767
763 if (cfs_rq->next == se) 768 if (!se || cfs_rq->next == se)
764 cfs_rq->next = NULL; 769 cfs_rq->next = NULL;
765} 770}
766 771
@@ -1062,83 +1067,6 @@ static void yield_task_fair(struct rq *rq)
1062 se->vruntime = rightmost->vruntime + 1; 1067 se->vruntime = rightmost->vruntime + 1;
1063} 1068}
1064 1069
1065/*
1066 * wake_idle() will wake a task on an idle cpu if task->cpu is
1067 * not idle and an idle cpu is available. The span of cpus to
1068 * search starts with cpus closest then further out as needed,
1069 * so we always favor a closer, idle cpu.
1070 * Domains may include CPUs that are not usable for migration,
1071 * hence we need to mask them out (rq->rd->online)
1072 *
1073 * Returns the CPU we should wake onto.
1074 */
1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1079static int wake_idle(int cpu, struct task_struct *p)
1080{
1081 struct sched_domain *sd;
1082 int i;
1083 unsigned int chosen_wakeup_cpu;
1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1086
1087 /*
1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1089 * are idle and this is not a kernel thread and this task's affinity
1090 * allows it to be moved to preferred cpu, then just move!
1091 */
1092
1093 this_cpu = smp_processor_id();
1094 chosen_wakeup_cpu =
1095 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1096
1097 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1098 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1099 p->mm && !(p->flags & PF_KTHREAD) &&
1100 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1101 return chosen_wakeup_cpu;
1102
1103 /*
1104 * If it is idle, then it is the best cpu to run this task.
1105 *
1106 * This cpu is also the best, if it has more than one task already.
1107 * Siblings must be also busy(in most cases) as they didn't already
1108 * pickup the extra load from this cpu and hence we need not check
1109 * sibling runqueue info. This will avoid the checks and cache miss
1110 * penalities associated with that.
1111 */
1112 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1113 return cpu;
1114
1115 for_each_domain(cpu, sd) {
1116 if ((sd->flags & SD_WAKE_IDLE)
1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1118 && !task_hot(p, task_rq->clock, sd))) {
1119 for_each_cpu_and(i, sched_domain_span(sd),
1120 &p->cpus_allowed) {
1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1122 if (i != task_cpu(p)) {
1123 schedstat_inc(p,
1124 se.nr_wakeups_idle);
1125 }
1126 return i;
1127 }
1128 }
1129 } else {
1130 break;
1131 }
1132 }
1133 return cpu;
1134}
1135#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1136static inline int wake_idle(int cpu, struct task_struct *p)
1137{
1138 return cpu;
1139}
1140#endif
1141
1142#ifdef CONFIG_SMP 1070#ifdef CONFIG_SMP
1143 1071
1144#ifdef CONFIG_FAIR_GROUP_SCHED 1072#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1225 1153
1226#endif 1154#endif
1227 1155
1228static int 1156static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1229wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1230 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1231 int idx, unsigned long load, unsigned long this_load,
1232 unsigned int imbalance)
1233{ 1157{
1234 struct task_struct *curr = this_rq->curr; 1158 struct task_struct *curr = current;
1235 struct task_group *tg; 1159 unsigned long this_load, load;
1236 unsigned long tl = this_load; 1160 int idx, this_cpu, prev_cpu;
1237 unsigned long tl_per_task; 1161 unsigned long tl_per_task;
1162 unsigned int imbalance;
1163 struct task_group *tg;
1238 unsigned long weight; 1164 unsigned long weight;
1239 int balanced; 1165 int balanced;
1240 1166
1241 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1167 idx = sd->wake_idx;
1242 return 0; 1168 this_cpu = smp_processor_id();
1169 prev_cpu = task_cpu(p);
1170 load = source_load(prev_cpu, idx);
1171 this_load = target_load(this_cpu, idx);
1243 1172
1244 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1173 if (sync) {
1245 p->se.avg_overlap > sysctl_sched_migration_cost)) 1174 if (sched_feat(SYNC_LESS) &&
1246 sync = 0; 1175 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1177 sync = 0;
1178 } else {
1179 if (sched_feat(SYNC_MORE) &&
1180 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1181 p->se.avg_overlap < sysctl_sched_migration_cost))
1182 sync = 1;
1183 }
1247 1184
1248 /* 1185 /*
1249 * If sync wakeup then subtract the (maximum possible) 1186 * If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1254 tg = task_group(current); 1191 tg = task_group(current);
1255 weight = current->se.load.weight; 1192 weight = current->se.load.weight;
1256 1193
1257 tl += effective_load(tg, this_cpu, -weight, -weight); 1194 this_load += effective_load(tg, this_cpu, -weight, -weight);
1258 load += effective_load(tg, prev_cpu, 0, -weight); 1195 load += effective_load(tg, prev_cpu, 0, -weight);
1259 } 1196 }
1260 1197
1261 tg = task_group(p); 1198 tg = task_group(p);
1262 weight = p->se.load.weight; 1199 weight = p->se.load.weight;
1263 1200
1201 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1202
1264 /* 1203 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1204 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have 1205 * due to the sync cause above having dropped this_load to 0, we'll
1267 * an imbalance, but there's really nothing you can do about that, so 1206 * always have an imbalance, but there's really nothing you can do
1268 * that's good too. 1207 * about that, so that's good too.
1269 * 1208 *
1270 * Otherwise check if either cpus are near enough in load to allow this 1209 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu. 1210 * task to be woken on this_cpu.
1272 */ 1211 */
1273 balanced = !tl || 1212 balanced = !this_load ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1213 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1214 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1276 1215
1277 /* 1216 /*
@@ -1285,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1285 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1224 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1286 tl_per_task = cpu_avg_load_per_task(this_cpu); 1225 tl_per_task = cpu_avg_load_per_task(this_cpu);
1287 1226
1288 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1227 if (balanced ||
1289 tl_per_task)) { 1228 (this_load <= load &&
1229 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1290 /* 1230 /*
1291 * This domain has SD_WAKE_AFFINE and 1231 * This domain has SD_WAKE_AFFINE and
1292 * p is cache cold in this domain, and 1232 * p is cache cold in this domain, and
1293 * there is no bad imbalance. 1233 * there is no bad imbalance.
1294 */ 1234 */
1295 schedstat_inc(this_sd, ttwu_move_affine); 1235 schedstat_inc(sd, ttwu_move_affine);
1296 schedstat_inc(p, se.nr_wakeups_affine); 1236 schedstat_inc(p, se.nr_wakeups_affine);
1297 1237
1298 return 1; 1238 return 1;
@@ -1300,65 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1300 return 0; 1240 return 0;
1301} 1241}
1302 1242
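
wake_affine() now gathers its own loads and imbalance factor and calls the wakeup balanced when this_cpu, with the woken task added, would stay within imbalance_pct/2 of prev_cpu. A numeric sketch of that test with invented load-weight values; the effective_load() group-scheduling corrections are omitted here.

    #include <stdio.h>

    int main(void)
    {
        /* Invented numbers: prev_cpu carries 'load', this_cpu 'this_load'. */
        unsigned long this_load = 1024, load = 2048, task_weight = 1024;
        unsigned int imbalance_pct = 125;

        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;   /* => 112 */

        /* Balanced if this_cpu plus the woken task stays within the margin. */
        int balanced = !this_load ||
            100 * (this_load + task_weight) <= (unsigned long)imbalance * load;

        printf("imbalance=%u balanced=%d\n", imbalance, balanced);
        return 0;
    }
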
1303static int select_task_rq_fair(struct task_struct *p, int sync) 1243/*
1244 * find_idlest_group finds and returns the least busy CPU group within the
1245 * domain.
1246 */
1247static struct sched_group *
1248find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1249 int this_cpu, int load_idx)
1304{ 1250{
1305 struct sched_domain *sd, *this_sd = NULL; 1251 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1306 int prev_cpu, this_cpu, new_cpu; 1252 unsigned long min_load = ULONG_MAX, this_load = 0;
1307 unsigned long load, this_load; 1253 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1308 struct rq *this_rq;
1309 unsigned int imbalance;
1310 int idx;
1311 1254
1312 prev_cpu = task_cpu(p); 1255 do {
1313 this_cpu = smp_processor_id(); 1256 unsigned long load, avg_load;
1314 this_rq = cpu_rq(this_cpu); 1257 int local_group;
1315 new_cpu = prev_cpu; 1258 int i;
1316 1259
1317 /* 1260 /* Skip over this group if it has no CPUs allowed */
1318 * 'this_sd' is the first domain that both 1261 if (!cpumask_intersects(sched_group_cpus(group),
1319 * this_cpu and prev_cpu are present in: 1262 &p->cpus_allowed))
1320 */ 1263 continue;
1321 for_each_domain(this_cpu, sd) { 1264
1322 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1265 local_group = cpumask_test_cpu(this_cpu,
1323 this_sd = sd; 1266 sched_group_cpus(group));
1324 break; 1267
1268 /* Tally up the load of all CPUs in the group */
1269 avg_load = 0;
1270
1271 for_each_cpu(i, sched_group_cpus(group)) {
1272 /* Bias balancing toward cpus of our domain */
1273 if (local_group)
1274 load = source_load(i, load_idx);
1275 else
1276 load = target_load(i, load_idx);
1277
1278 avg_load += load;
1279 }
1280
1281 /* Adjust by relative CPU power of the group */
1282 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1283
1284 if (local_group) {
1285 this_load = avg_load;
1286 this = group;
1287 } else if (avg_load < min_load) {
1288 min_load = avg_load;
1289 idlest = group;
1290 }
1291 } while (group = group->next, group != sd->groups);
1292
1293 if (!idlest || 100*this_load < imbalance*min_load)
1294 return NULL;
1295 return idlest;
1296}
1297
1298/*
1299 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1300 */
1301static int
1302find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1303{
1304 unsigned long load, min_load = ULONG_MAX;
1305 int idlest = -1;
1306 int i;
1307
1308 /* Traverse only the allowed CPUs */
1309 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1310 load = weighted_cpuload(i);
1311
1312 if (load < min_load || (load == min_load && i == this_cpu)) {
1313 min_load = load;
1314 idlest = i;
1325 } 1315 }
1326 } 1316 }
1327 1317
1328 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1318 return idlest;
1329 goto out; 1319}
1330 1320
1331 /* 1321/*
1332 * Check for affine wakeup and passive balancing possibilities. 1322 * sched_balance_self: balance the current task (running on cpu) in domains
1333 */ 1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1334 if (!this_sd) 1324 * SD_BALANCE_EXEC.
1325 *
1326 * Balance, ie. select the least loaded group.
1327 *
1328 * Returns the target CPU number, or the same CPU if no balancing is needed.
1329 *
1330 * preempt must be disabled.
1331 */
1332static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1333{
1334 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1335 int cpu = smp_processor_id();
1336 int prev_cpu = task_cpu(p);
1337 int new_cpu = cpu;
1338 int want_affine = 0;
1339 int want_sd = 1;
1340 int sync = wake_flags & WF_SYNC;
1341
1342 if (sd_flag & SD_BALANCE_WAKE) {
1343 if (sched_feat(AFFINE_WAKEUPS) &&
1344 cpumask_test_cpu(cpu, &p->cpus_allowed))
1345 want_affine = 1;
1346 new_cpu = prev_cpu;
1347 }
1348
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) {
1351 /*
1352 * If power savings logic is enabled for a domain, see if we
1353 * are not overloaded, if so, don't balance wider.
1354 */
1355 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1356 unsigned long power = 0;
1357 unsigned long nr_running = 0;
1358 unsigned long capacity;
1359 int i;
1360
1361 for_each_cpu(i, sched_domain_span(tmp)) {
1362 power += power_of(i);
1363 nr_running += cpu_rq(i)->cfs.nr_running;
1364 }
1365
1366 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1367
1368 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1369 nr_running /= 2;
1370
1371 if (nr_running < capacity)
1372 want_sd = 0;
1373 }
1374
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1377
1378 affine_sd = tmp;
1379 want_affine = 0;
1380 }
1381
1382 if (!want_sd && !want_affine)
1383 break;
1384
1385 if (!(tmp->flags & sd_flag))
1386 continue;
1387
1388 if (want_sd)
1389 sd = tmp;
1390 }
1391
1392 if (sched_feat(LB_SHARES_UPDATE)) {
1393 /*
1394 * Pick the largest domain to update shares over
1395 */
1396 tmp = sd;
1397 if (affine_sd && (!tmp ||
1398 cpumask_weight(sched_domain_span(affine_sd)) >
1399 cpumask_weight(sched_domain_span(sd))))
1400 tmp = affine_sd;
1401
1402 if (tmp)
1403 update_shares(tmp);
1404 }
1405
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1407 new_cpu = cpu;
1335 goto out; 1408 goto out;
1409 }
1336 1410
1337 idx = this_sd->wake_idx; 1411 while (sd) {
1412 int load_idx = sd->forkexec_idx;
1413 struct sched_group *group;
1414 int weight;
1338 1415
1339 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1416 if (!(sd->flags & sd_flag)) {
1417 sd = sd->child;
1418 continue;
1419 }
1340 1420
1341 load = source_load(prev_cpu, idx); 1421 if (sd_flag & SD_BALANCE_WAKE)
1342 this_load = target_load(this_cpu, idx); 1422 load_idx = sd->wake_idx;
1343 1423
1344 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1424 group = find_idlest_group(sd, p, cpu, load_idx);
1345 load, this_load, imbalance)) 1425 if (!group) {
1346 return this_cpu; 1426 sd = sd->child;
1427 continue;
1428 }
1347 1429
1348 /* 1430 new_cpu = find_idlest_cpu(group, p, cpu);
1349 * Start passive balancing when half the imbalance_pct 1431 if (new_cpu == -1 || new_cpu == cpu) {
1350 * limit is reached. 1432 /* Now try balancing at a lower domain level of cpu */
1351 */ 1433 sd = sd->child;
1352 if (this_sd->flags & SD_WAKE_BALANCE) { 1434 continue;
1353 if (imbalance*this_load <= 100*load) { 1435 }
1354 schedstat_inc(this_sd, ttwu_move_balance); 1436
1355 schedstat_inc(p, se.nr_wakeups_passive); 1437 /* Now try balancing at a lower domain level of new_cpu */
1356 return this_cpu; 1438 cpu = new_cpu;
1439 weight = cpumask_weight(sched_domain_span(sd));
1440 sd = NULL;
1441 for_each_domain(cpu, tmp) {
1442 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1443 break;
1444 if (tmp->flags & sd_flag)
1445 sd = tmp;
1357 } 1446 }
1447 /* while loop will break here if sd == NULL */
1358 } 1448 }
1359 1449
1360out: 1450out:
1361 return wake_idle(new_cpu, p); 1451 rcu_read_unlock();
1452 return new_cpu;
1362} 1453}
1363#endif /* CONFIG_SMP */ 1454#endif /* CONFIG_SMP */
1364 1455
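
select_task_rq_fair() folds the old sched_balance_self() walk in: find the widest domain carrying the requested SD_BALANCE_* flag, pick the least-loaded group with find_idlest_group(), pick the least-loaded CPU inside it with find_idlest_cpu(), then descend to that CPU's child domain and repeat. A stripped-down sketch of the two selection steps over a toy topology; the arrays and load numbers are invented and no real sched-domain hierarchy is modelled.

    #include <stdio.h>
    #include <limits.h>

    #define NR_CPUS 4

    /* Invented stand-in for per-cpu load; index = cpu id. */
    static unsigned long cpu_load[NR_CPUS] = { 300, 120, 500, 80 };

    struct group {
        int cpus[NR_CPUS];
        int nr;
    };

    /* Pick the group with the smallest average load. */
    static const struct group *find_idlest_group(const struct group *g, int nr_groups)
    {
        const struct group *idlest = NULL;
        unsigned long min_avg = ULONG_MAX;

        for (int i = 0; i < nr_groups; i++) {
            unsigned long sum = 0;
            for (int j = 0; j < g[i].nr; j++)
                sum += cpu_load[g[i].cpus[j]];
            unsigned long avg = sum / g[i].nr;
            if (avg < min_avg) {
                min_avg = avg;
                idlest = &g[i];
            }
        }
        return idlest;
    }

    /* Pick the least loaded cpu within the chosen group. */
    static int find_idlest_cpu(const struct group *g)
    {
        unsigned long min_load = ULONG_MAX;
        int idlest = -1;

        for (int j = 0; j < g->nr; j++) {
            if (cpu_load[g->cpus[j]] < min_load) {
                min_load = cpu_load[g->cpus[j]];
                idlest = g->cpus[j];
            }
        }
        return idlest;
    }

    int main(void)
    {
        /* One "domain" split into two groups of two cpus each. */
        struct group groups[2] = {
            { { 0, 1 }, 2 },
            { { 2, 3 }, 2 },
        };

        const struct group *g = find_idlest_group(groups, 2);
        printf("target cpu = %d\n", find_idlest_cpu(g));
        return 0;
    }
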
@@ -1471,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
1471/* 1562/*
1472 * Preempt the current task with a newly woken task if needed: 1563 * Preempt the current task with a newly woken task if needed:
1473 */ 1564 */
1474static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1565static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1475{ 1566{
1476 struct task_struct *curr = rq->curr; 1567 struct task_struct *curr = rq->curr;
1477 struct sched_entity *se = &curr->se, *pse = &p->se; 1568 struct sched_entity *se = &curr->se, *pse = &p->se;
1478 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC;
1479 1571
1480 update_curr(cfs_rq); 1572 update_curr(cfs_rq);
1481 1573
@@ -1501,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1501 */ 1593 */
1502 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1503 set_last_buddy(se); 1595 set_last_buddy(se);
1504 set_next_buddy(pse); 1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse);
1505 1598
1506 /* 1599 /*
1507 * We can come here with TIF_NEED_RESCHED already set from new task 1600 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1523 return; 1616 return;
1524 } 1617 }
1525 1618
1526 if (!sched_feat(WAKEUP_PREEMPT)) 1619 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1527 return; 1620 (sched_feat(WAKEUP_OVERLAP) &&
1528 1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1529 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1530 (se->avg_overlap < sysctl_sched_migration_cost &&
1531 pse->avg_overlap < sysctl_sched_migration_cost))) {
1532 resched_task(curr); 1623 resched_task(curr);
1533 return; 1624 return;
1534 } 1625 }
1535 1626
1627 if (sched_feat(WAKEUP_RUNNING)) {
1628 if (pse->avg_running < se->avg_running) {
1629 set_next_buddy(pse);
1630 resched_task(curr);
1631 return;
1632 }
1633 }
1634
1635 if (!sched_feat(WAKEUP_PREEMPT))
1636 return;
1637
1536 find_matching_se(&se, &pse); 1638 find_matching_se(&se, &pse);
1537 1639
1538 BUG_ON(!pse); 1640 BUG_ON(!pse);
@@ -1555,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1555 /* 1657 /*
1556 * If se was a buddy, clear it so that it will have to earn 1658 * If se was a buddy, clear it so that it will have to earn
1557 * the favour again. 1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was eligible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1558 */ 1665 */
1559 __clear_buddies(cfs_rq, se); 1666 __clear_buddies(cfs_rq, NULL);
1560 set_next_entity(cfs_rq, se); 1667 set_next_entity(cfs_rq, se);
1561 cfs_rq = group_cfs_rq(se); 1668 cfs_rq = group_cfs_rq(se);
1562 } while (cfs_rq); 1669 } while (cfs_rq);
@@ -1832,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
1832} 1939}
1833#endif 1940#endif
1834 1941
1942unsigned int get_rr_interval_fair(struct task_struct *task)
1943{
1944 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0;
1948
1949 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue:
1952 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957
1958 return rr_interval;
1959}
1960
1835/* 1961/*
1836 * All the scheduling class methods: 1962 * All the scheduling class methods:
1837 */ 1963 */
@@ -1860,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
1860 .prio_changed = prio_changed_fair, 1986 .prio_changed = prio_changed_fair,
1861 .switched_to = switched_to_fair, 1987 .switched_to = switched_to_fair,
1862 1988
1989 .get_rr_interval = get_rr_interval_fair,
1990
1863#ifdef CONFIG_FAIR_GROUP_SCHED 1991#ifdef CONFIG_FAIR_GROUP_SCHED
1864 .moved_group = moved_group_fair, 1992 .moved_group = moved_group_fair,
1865#endif 1993#endif
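
Each scheduling class now exposes a .get_rr_interval() method, so the sched_rr_get_interval() syscall can dispatch through the class ops table instead of special-casing policies. A minimal sketch of that dispatch; the struct layout, the opaque task type and the returned jiffy counts are invented.

    #include <stdio.h>

    struct task;                              /* opaque for the example */

    struct sched_class_ops {
        const char *name;
        unsigned int (*get_rr_interval)(struct task *t);
    };

    static unsigned int rr_interval_fair(struct task *t) { (void)t; return 4; }
    static unsigned int rr_interval_rt(struct task *t)   { (void)t; return 100; }

    static const struct sched_class_ops fair_ops = { "fair", rr_interval_fair };
    static const struct sched_class_ops rt_ops   = { "rt",   rr_interval_rt };

    /* The syscall side only needs the class pointer, not the policy. */
    static unsigned int get_interval(const struct sched_class_ops *cls,
                                     struct task *t)
    {
        return cls->get_rr_interval(t);
    }

    int main(void)
    {
        printf("fair: %u jiffies, rt: %u jiffies\n",
               get_interval(&fair_ops, NULL), get_interval(&rt_ops, NULL));
        return 0;
    }
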
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index e2dc63a5815d..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot, decreases the likelihood of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
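
sched_features.h is an X-macro list: it is included several times with different definitions of SCHED_FEAT(), once to build an enum of feature bits and once to build the default bitmask that sched_feat() tests at runtime. A compressed sketch of that double expansion; the FEATURE_LIST/__FEAT_*/feat() names are invented and the real plumbing lives in kernel/sched.c.

    #include <stdio.h>

    /* The feature list would normally be a separate include; inlined here. */
    #define FEATURE_LIST(F)            \
        F(FAIR_SLEEPERS, 1)            \
        F(GENTLE_FAIR_SLEEPERS, 1)     \
        F(NEXT_BUDDY, 0)

    /* First expansion: one enum bit per feature. */
    #define F_ENUM(name, enabled) __FEAT_##name,
    enum { FEATURE_LIST(F_ENUM) __FEAT_NR };

    /* Second expansion: default bitmask built from the 'enabled' column. */
    #define F_DEFAULT(name, enabled) ((enabled) << __FEAT_##name) |
    static const unsigned int sysctl_features = FEATURE_LIST(F_DEFAULT) 0;

    #define feat(name) (sysctl_features & (1u << __FEAT_##name))

    int main(void)
    {
        printf("GENTLE_FAIR_SLEEPERS=%d NEXT_BUDDY=%d\n",
               !!feat(GENTLE_FAIR_SLEEPERS), !!feat(NEXT_BUDDY));
        return 0;
    }
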
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 11 return task_cpu(p); /* IDLE tasks as never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task)
101{
102 return 0;
103}
104
100/* 105/*
101 * Simple, special scheduling class for the per-CPU idle tasks: 106 * Simple, special scheduling class for the per-CPU idle tasks:
102 */ 107 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
122 .set_curr_task = set_curr_task_idle, 127 .set_curr_task = set_curr_task_idle,
123 .task_tick = task_tick_idle, 128 .task_tick = task_tick_idle,
124 129
130 .get_rr_interval = get_rr_interval_idle,
131
125 .prio_changed = prio_changed_idle, 132 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle, 133 .switched_to = switched_to_idle,
127 134
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2eb4bd6a526c..a4d790cddb19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
938#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
939static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
940 940
941static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
942{ 942{
943 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
944 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
945 /* 948 /*
946 * If the current task is an RT task, then 949 * If the current task is an RT task, then
947 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
999/* 1002/*
1000 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
1001 */ 1004 */
1002static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1003{ 1006{
1004 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
1005 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1731,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
1731 dequeue_pushable_task(rq, p); 1734 dequeue_pushable_task(rq, p);
1732} 1735}
1733 1736
1737unsigned int get_rr_interval_rt(struct task_struct *task)
1738{
1739 /*
1740 * Time slice is 0 for SCHED_FIFO tasks
1741 */
1742 if (task->policy == SCHED_RR)
1743 return DEF_TIMESLICE;
1744 else
1745 return 0;
1746}
1747
1734static const struct sched_class rt_sched_class = { 1748static const struct sched_class rt_sched_class = {
1735 .next = &fair_sched_class, 1749 .next = &fair_sched_class,
1736 .enqueue_task = enqueue_task_rt, 1750 .enqueue_task = enqueue_task_rt,
@@ -1759,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
1759 .set_curr_task = set_curr_task_rt, 1773 .set_curr_task = set_curr_task_rt,
1760 .task_tick = task_tick_rt, 1774 .task_tick = task_tick_rt,
1761 1775
1776 .get_rr_interval = get_rr_interval_rt,
1777
1762 .prio_changed = prio_changed_rt, 1778 .prio_changed = prio_changed_rt,
1763 .switched_to = switched_to_rt, 1779 .switched_to = switched_to_rt,
1764}; 1780};
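
select_task_rq_rt() now short-circuits to the current CPU for anything other than an SD_BALANCE_WAKE pass, and get_rr_interval_rt() reports a timeslice only for SCHED_RR. A tiny illustration of that policy split; the timeslice value is invented (the real DEF_TIMESLICE depends on HZ).

    #include <stdio.h>

    enum policy { POL_FIFO, POL_RR };

    #define DEF_TIMESLICE_EXAMPLE 100     /* invented value, in jiffies */

    static unsigned int rt_rr_interval(enum policy p)
    {
        /* SCHED_FIFO runs until it blocks or yields: no timeslice at all. */
        return p == POL_RR ? DEF_TIMESLICE_EXAMPLE : 0;
    }

    int main(void)
    {
        printf("FIFO=%u RR=%u\n",
               rt_rr_interval(POL_FIFO), rt_rr_interval(POL_RR));
        return 0;
    }
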
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..6705320784fd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 705
706 if (why) { 706 if (why) {
707 /* 707 /*
708 * The first thread which returns from finish_stop() 708 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 709 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 710 * notify its parent. See get_signal_to_deliver().
711 */ 711 */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 971 return send_signal(sig, info, t, 0);
972} 972}
973 973
974int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
975 bool group)
976{
977 unsigned long flags;
978 int ret = -ESRCH;
979
980 if (lock_task_sighand(p, &flags)) {
981 ret = send_signal(sig, info, p, group);
982 unlock_task_sighand(p, &flags);
983 }
984
985 return ret;
986}
987
974/* 988/*
975 * Force a signal that the process can't ignore: if necessary 989 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 990 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1050 }
1037} 1051}
1038 1052
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1053struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1054{
1047 struct sighand_struct *sighand; 1055 struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1076 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1077int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1078{
1071 unsigned long flags; 1079 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073 1080
1074 ret = check_kill_permission(sig, info, p); 1081 if (!ret && sig)
1075 1082 ret = do_send_sig_info(sig, info, p, true);
1076 if (!ret && sig) {
1077 ret = -ESRCH;
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1083
1084 return ret; 1084 return ret;
1085} 1085}
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1224 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1225 */
1226 1226
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1227int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1228send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1229{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1230 /*
1237 * Make sure legacy kernel users don't send in bad values 1231 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1232 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1234 if (!valid_signal(sig))
1241 return -EINVAL; 1235 return -EINVAL;
1242 1236
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1237 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1238}
1248 1239
1249#define __si_special(priv) \ 1240#define __si_special(priv) \
@@ -1383,15 +1374,6 @@ ret:
1383} 1374}
1384 1375
1385/* 1376/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1377 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1378 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1379 *
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1655 spin_unlock_irq(&current->sighand->siglock);
1674} 1656}
1675 1657
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1658/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1659 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1660 * We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1664static int do_signal_stop(int signr)
1706{ 1665{
1707 struct signal_struct *sig = current->signal; 1666 struct signal_struct *sig = current->signal;
1708 int stop_count; 1667 int notify;
1709 1668
1710 if (sig->group_stop_count > 0) { 1669 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1670 struct task_struct *t;
1718 1671
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1672 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
1725 */ 1678 */
1726 sig->group_exit_code = signr; 1679 sig->group_exit_code = signr;
1727 1680
1728 stop_count = 0; 1681 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1682 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1683 /*
1731 * Setting state to TASK_STOPPED for a group 1684 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
1734 */ 1687 */
1735 if (!(t->flags & PF_EXITING) && 1688 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1689 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1690 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1691 signal_wake_up(t, 0);
1739 } 1692 }
1740 sig->group_stop_count = stop_count;
1741 } 1693 }
1694 /*
1695 * If there are no other threads in the group, or if there is
1696 * a group stop in progress and we are the last to stop, report
1697 * to the parent. When ptraced, every thread reports itself.
1698 */
1699 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1700 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1701 /*
1702 * tracehook_notify_jctl() can drop and reacquire siglock, so
1703 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1704 * or SIGKILL comes in between ->group_stop_count == 0.
1705 */
1706 if (sig->group_stop_count) {
1707 if (!--sig->group_stop_count)
1708 sig->flags = SIGNAL_STOP_STOPPED;
1709 current->exit_code = sig->group_exit_code;
1710 __set_current_state(TASK_STOPPED);
1711 }
1712 spin_unlock_irq(&current->sighand->siglock);
1742 1713
1743 if (stop_count == 0) 1714 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1715 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1716 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1717 read_unlock(&tasklist_lock);
1718 }
1719
1720 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1721 do {
1722 schedule();
1723 } while (try_to_freeze());
1724
1725 tracehook_finish_jctl();
1726 current->exit_code = 0;
1747 1727
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1728 return 1;
1751} 1729}
1752 1730
@@ -1815,14 +1793,15 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1793 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1794 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1795 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1796
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1797 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1798 spin_unlock_irq(&sighand->siglock);
1822 1799
1823 read_lock(&tasklist_lock); 1800 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1801 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1802 do_notify_parent_cldstop(current->group_leader, why);
1803 read_unlock(&tasklist_lock);
1804 }
1826 goto relock; 1805 goto relock;
1827 } 1806 }
1828 1807
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 1966 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 1967 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 1968 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 1969 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 1970 }
1992out: 1971out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 1972 spin_unlock_irq(&tsk->sighand->siglock);
1994 1973
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 1974 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 1975 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1976 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 1977 read_unlock(&tasklist_lock);
1999 } 1978 }
2000} 1979}
@@ -2290,7 +2269,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2269do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2270{
2292 struct task_struct *p; 2271 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2272 int error = -ESRCH;
2295 2273
2296 rcu_read_lock(); 2274 rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2278 /*
2301 * The null signal is a permissions and process existence 2279 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2280 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2281 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2282 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2283 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2284 /*
2285 * If lock_task_sighand() failed we pretend the task
2286 * dies after receiving the signal. The window is tiny,
2287 * and the signal is private anyway.
2288 */
2289 if (unlikely(error == -ESRCH))
2290 error = 0;
2311 } 2291 }
2312 } 2292 }
2313 rcu_read_unlock(); 2293 rcu_read_unlock();
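
The send paths are consolidated into do_send_sig_info(), which takes the target's sighand via lock_task_sighand() and returns -ESRCH when the task has already let go of it, instead of every caller open-coding that locking. A userspace sketch of the lock-if-still-alive shape using a pthread mutex; none of this is the kernel locking, and the struct and error value are invented.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdbool.h>

    #define ESRCH_EXAMPLE 3               /* stand-in for -ESRCH */

    struct target {
        pthread_mutex_t lock;
        bool alive;                       /* cleared when the "task" exits */
        int pending;                      /* pretend signal queue depth    */
    };

    /* Deliver one "signal" if the target still exists; else say it is gone. */
    static int do_send(struct target *t)
    {
        int ret = -ESRCH_EXAMPLE;

        pthread_mutex_lock(&t->lock);
        if (t->alive) {
            t->pending++;
            ret = 0;
        }
        pthread_mutex_unlock(&t->lock);
        return ret;
    }

    int main(void)
    {
        static struct target t = { PTHREAD_MUTEX_INITIALIZER, true, 0 };

        printf("send #1 -> %d\n", do_send(&t));
        t.alive = false;
        printf("send #2 -> %d\n", do_send(&t));
        return 0;
    }
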
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..0d31135efbf4 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 26static void slow_work_oom_timeout(unsigned long);
27 27
28#ifdef CONFIG_SYSCTL 28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 29static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
31 31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 32static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 33 void __user *, size_t *, loff_t *);
34#endif 34#endif
35 35
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 493 * Handle adjustment of the minimum number of threads
494 */ 494 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 496 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 497 size_t *lenp, loff_t *ppos)
498{ 498{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 499 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 500 int n;
501 501
502 if (ret == 0) { 502 if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 521 * Handle adjustment of the maximum number of threads
522 */ 522 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 524 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 525 size_t *lenp, loff_t *ppos)
526{ 526{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 527 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 528 int n;
529 529
530 if (ret == 0) { 530 if (ret == 0) {
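
Both handlers lose their struct file * argument because ctl_table proc handlers were reduced to (table, write, buffer, lenp, ppos), and proc_dointvec_minmax() is called with the same shorter list. The sketch below only mirrors the parse-then-clamp behaviour of such a min/max handler in plain C; it is not the kernel sysctl API, and the names are invented.

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Toy stand-in for a min/max sysctl handler: parse an integer string,
     * clamp it to [lo, hi], then apply it to the tunable.
     */
    static int set_threads(const char *buf, int lo, int hi, int *val)
    {
        char *end;
        long n = strtol(buf, &end, 10);

        if (end == buf)
            return -1;                    /* nothing parsed */
        if (n < lo)
            n = lo;
        if (n > hi)
            n = hi;
        *val = (int)n;
        return 0;
    }

    int main(void)
    {
        int min_threads = 2;

        if (set_threads("17", 2, 255, &min_threads) == 0)
            printf("slow-work min threads now %d\n", min_threads);
        return 0;
    }
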
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e218500ab14..c9d1c7835c2f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,8 +29,7 @@ enum {
29 29
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data csd; 31 struct call_single_data csd;
32 spinlock_t lock; 32 atomic_t refs;
33 unsigned int refs;
34 cpumask_var_t cpumask; 33 cpumask_var_t cpumask;
35}; 34};
36 35
@@ -39,9 +38,7 @@ struct call_single_queue {
39 spinlock_t lock; 38 spinlock_t lock;
40}; 39};
41 40
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45 42
46static int 43static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -196,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
196 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 193 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
197 int refs; 194 int refs;
198 195
199 spin_lock(&data->lock); 196 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
200 if (!cpumask_test_cpu(cpu, data->cpumask)) {
201 spin_unlock(&data->lock);
202 continue; 197 continue;
203 }
204 cpumask_clear_cpu(cpu, data->cpumask);
205 spin_unlock(&data->lock);
206 198
207 data->csd.func(data->csd.info); 199 data->csd.func(data->csd.info);
208 200
209 spin_lock(&data->lock); 201 refs = atomic_dec_return(&data->refs);
210 WARN_ON(data->refs == 0); 202 WARN_ON(refs < 0);
211 refs = --data->refs;
212 if (!refs) { 203 if (!refs) {
213 spin_lock(&call_function.lock); 204 spin_lock(&call_function.lock);
214 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
215 spin_unlock(&call_function.lock); 206 spin_unlock(&call_function.lock);
216 } 207 }
217 spin_unlock(&data->lock);
218 208
219 if (refs) 209 if (refs)
220 continue; 210 continue;
@@ -357,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
357 generic_exec_single(cpu, data, wait); 347 generic_exec_single(cpu, data, wait);
358} 348}
359 349
360/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
361
362#ifndef arch_send_call_function_ipi_mask
363# define arch_send_call_function_ipi_mask(maskp) \
364 arch_send_call_function_ipi(*(maskp))
365#endif
366
367/** 350/**
368 * smp_call_function_many(): Run a function on a set of other CPUs. 351 * smp_call_function_many(): Run a function on a set of other CPUs.
369 * @mask: The set of cpus to run on (only runs on online subset). 352 * @mask: The set of cpus to run on (only runs on online subset).
@@ -419,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask,
419 data = &__get_cpu_var(cfd_data); 402 data = &__get_cpu_var(cfd_data);
420 csd_lock(&data->csd); 403 csd_lock(&data->csd);
421 404
422 spin_lock_irqsave(&data->lock, flags);
423 data->csd.func = func; 405 data->csd.func = func;
424 data->csd.info = info; 406 data->csd.info = info;
425 cpumask_and(data->cpumask, mask, cpu_online_mask); 407 cpumask_and(data->cpumask, mask, cpu_online_mask);
426 cpumask_clear_cpu(this_cpu, data->cpumask); 408 cpumask_clear_cpu(this_cpu, data->cpumask);
427 data->refs = cpumask_weight(data->cpumask); 409 atomic_set(&data->refs, cpumask_weight(data->cpumask));
428 410
429 spin_lock(&call_function.lock); 411 spin_lock_irqsave(&call_function.lock, flags);
430 /* 412 /*
431 * Place entry at the _HEAD_ of the list, so that any cpu still 413 * Place entry at the _HEAD_ of the list, so that any cpu still
432 * observing the entry in generic_smp_call_function_interrupt() 414 * observing the entry in generic_smp_call_function_interrupt()
433 * will not miss any other list entries: 415 * will not miss any other list entries:
434 */ 416 */
435 list_add_rcu(&data->csd.list, &call_function.queue); 417 list_add_rcu(&data->csd.list, &call_function.queue);
436 spin_unlock(&call_function.lock); 418 spin_unlock_irqrestore(&call_function.lock, flags);
437
438 spin_unlock_irqrestore(&data->lock, flags);
439 419
440 /* 420 /*
441 * Make the list addition visible before sending the ipi. 421 * Make the list addition visible before sending the ipi.
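
The kernel/smp.c change replaces the per-entry spinlock plus plain integer with an atomic_t reference count: each interrupted CPU clears its cpumask bit, runs the callback, and the CPU whose atomic_dec_return() reaches zero unlinks the entry under the global queue lock. A condensed sketch of that "last decrementer cleans up" idiom, assuming the usual kernel headers; the structure and lock names are simplified stand-ins, not the exact symbols from the file:

    struct cfd_sketch {
            atomic_t                refs;
            struct list_head        list;
    };

    static void cfd_put(struct cfd_sketch *data, spinlock_t *queue_lock)
    {
            int refs = atomic_dec_return(&data->refs);

            WARN_ON(refs < 0);
            if (refs)
                    return;         /* other CPUs are still in the callback */

            spin_lock(queue_lock);
            list_del_rcu(&data->list);
            spin_unlock(queue_lock);
    }
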
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7db25067cd2d..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..255475d163e0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/resource.h> 18#include <linux/resource.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338 unsigned long flags; 1338 unsigned long flags;
1339 cputime_t utime, stime; 1339 cputime_t utime, stime;
1340 struct task_cputime cputime; 1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0;
1341 1342
1342 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1343 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1346 utime = task_utime(current); 1347 utime = task_utime(current);
1347 stime = task_stime(current); 1348 stime = task_stime(current);
1348 accumulate_thread_rusage(p, r); 1349 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss;
1349 goto out; 1351 goto out;
1350 } 1352 }
1351 1353
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1363 r->ru_majflt = p->signal->cmaj_flt; 1365 r->ru_majflt = p->signal->cmaj_flt;
1364 r->ru_inblock = p->signal->cinblock; 1366 r->ru_inblock = p->signal->cinblock;
1365 r->ru_oublock = p->signal->coublock; 1367 r->ru_oublock = p->signal->coublock;
1368 maxrss = p->signal->cmaxrss;
1366 1369
1367 if (who == RUSAGE_CHILDREN) 1370 if (who == RUSAGE_CHILDREN)
1368 break; 1371 break;
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1377 r->ru_majflt += p->signal->maj_flt; 1380 r->ru_majflt += p->signal->maj_flt;
1378 r->ru_inblock += p->signal->inblock; 1381 r->ru_inblock += p->signal->inblock;
1379 r->ru_oublock += p->signal->oublock; 1382 r->ru_oublock += p->signal->oublock;
1383 if (maxrss < p->signal->maxrss)
1384 maxrss = p->signal->maxrss;
1380 t = p; 1385 t = p;
1381 do { 1386 do {
1382 accumulate_thread_rusage(t, r); 1387 accumulate_thread_rusage(t, r);
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1392out: 1397out:
1393 cputime_to_timeval(utime, &r->ru_utime); 1398 cputime_to_timeval(utime, &r->ru_utime);
1394 cputime_to_timeval(stime, &r->ru_stime); 1399 cputime_to_timeval(stime, &r->ru_stime);
1400
1401 if (who != RUSAGE_CHILDREN) {
1402 struct mm_struct *mm = get_task_mm(p);
1403 if (mm) {
1404 setmax_mm_hiwater_rss(&maxrss, mm);
1405 mmput(mm);
1406 }
1407 }
1408 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1395} 1409}
1396 1410
1397int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1411int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1511 case PR_SET_TSC: 1525 case PR_SET_TSC:
1512 error = SET_TSC_CTL(arg2); 1526 error = SET_TSC_CTL(arg2);
1513 break; 1527 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE: 1528 case PR_TASK_PERF_EVENTS_DISABLE:
1515 error = perf_counter_task_disable(); 1529 error = perf_event_task_disable();
1516 break; 1530 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE: 1531 case PR_TASK_PERF_EVENTS_ENABLE:
1518 error = perf_counter_task_enable(); 1532 error = perf_event_task_enable();
1519 break; 1533 break;
1520 case PR_GET_TIMERSLACK: 1534 case PR_GET_TIMERSLACK:
1521 error = current->timer_slack_ns; 1535 error = current->timer_slack_ns;
@@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1528 current->timer_slack_ns = arg2; 1542 current->timer_slack_ns = arg2;
1529 error = 0; 1543 error = 0;
1530 break; 1544 break;
1545 case PR_MCE_KILL:
1546 if (arg4 | arg5)
1547 return -EINVAL;
1548 switch (arg2) {
1549 case 0:
1550 if (arg3 != 0)
1551 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS;
1553 break;
1554 case 1:
1555 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0)
1557 current->flags |= PF_MCE_EARLY;
1558 else
1559 current->flags &= ~PF_MCE_EARLY;
1560 break;
1561 default:
1562 return -EINVAL;
1563 }
1564 error = 0;
1565 break;
1566
1531 default: 1567 default:
1532 error = -EINVAL; 1568 error = -EINVAL;
1533 break; 1569 break;
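
Two user-visible additions in kernel/sys.c above: getrusage() now reports the peak resident set size in ru_maxrss (pages converted to kilobytes), and prctl() accepts PR_MCE_KILL to set the per-process memory-failure policy via PF_MCE_PROCESS/PF_MCE_EARLY. A small userspace check for the first of these, using only standard libc interfaces:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
            struct rusage ru;

            if (getrusage(RUSAGE_SELF, &ru) == 0)
                    /* With this patch ru_maxrss is the peak RSS in kB. */
                    printf("peak RSS: %ld kB\n", ru.ru_maxrss);
            return 0;
    }
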
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..e06d0b8d1951 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(compat_sys_recvmsg); 51cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom);
52cond_syscall(sys_socketcall); 53cond_syscall(sys_socketcall);
53cond_syscall(sys_futex); 54cond_syscall(sys_futex);
54cond_syscall(compat_sys_futex); 55cond_syscall(compat_sys_futex);
@@ -177,4 +178,4 @@ cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 178cond_syscall(sys_eventfd2);
178 179
179/* performance counters: */ 180/* performance counters: */
180cond_syscall(sys_perf_counter_open); 181cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6bb59f707402..0d949c517412 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -50,7 +49,7 @@
50#include <linux/reboot.h> 49#include <linux/reboot.h>
51#include <linux/ftrace.h> 50#include <linux/ftrace.h>
52#include <linux/slow-work.h> 51#include <linux/slow-work.h>
53#include <linux/perf_counter.h> 52#include <linux/perf_event.h>
54 53
55#include <asm/uaccess.h> 54#include <asm/uaccess.h>
56#include <asm/processor.h> 55#include <asm/processor.h>
@@ -77,6 +76,7 @@ extern int max_threads;
77extern int core_uses_pid; 76extern int core_uses_pid;
78extern int suid_dumpable; 77extern int suid_dumpable;
79extern char core_pattern[]; 78extern char core_pattern[];
79extern unsigned int core_pipe_limit;
80extern int pid_max; 80extern int pid_max;
81extern int min_free_kbytes; 81extern int min_free_kbytes;
82extern int pid_max_min, pid_max_max; 82extern int pid_max_min, pid_max_max;
@@ -91,7 +91,9 @@ extern int sysctl_nr_trim_pages;
91#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable; 92extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
94extern int blk_iopoll_enabled; 95extern int blk_iopoll_enabled;
96#endif
95 97
96/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
97#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -104,6 +106,9 @@ static int __maybe_unused one = 1;
104static int __maybe_unused two = 2; 106static int __maybe_unused two = 2;
105static unsigned long one_ul = 1; 107static unsigned long one_ul = 1;
106static int one_hundred = 100; 108static int one_hundred = 100;
109#ifdef CONFIG_PRINTK
110static int ten_thousand = 10000;
111#endif
107 112
108/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ 113/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
109static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; 114static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -158,9 +163,9 @@ extern int max_lock_depth;
158#endif 163#endif
159 164
160#ifdef CONFIG_PROC_SYSCTL 165#ifdef CONFIG_PROC_SYSCTL
161static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 166static int proc_do_cad_pid(struct ctl_table *table, int write,
162 void __user *buffer, size_t *lenp, loff_t *ppos); 167 void __user *buffer, size_t *lenp, loff_t *ppos);
163static int proc_taint(struct ctl_table *table, int write, struct file *filp, 168static int proc_taint(struct ctl_table *table, int write,
164 void __user *buffer, size_t *lenp, loff_t *ppos); 169 void __user *buffer, size_t *lenp, loff_t *ppos);
165#endif 170#endif
166 171
@@ -419,6 +424,14 @@ static struct ctl_table kern_table[] = {
419 .proc_handler = &proc_dostring, 424 .proc_handler = &proc_dostring,
420 .strategy = &sysctl_string, 425 .strategy = &sysctl_string,
421 }, 426 },
427 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int),
432 .mode = 0644,
433 .proc_handler = &proc_dointvec,
434 },
422#ifdef CONFIG_PROC_SYSCTL 435#ifdef CONFIG_PROC_SYSCTL
423 { 436 {
424 .procname = "tainted", 437 .procname = "tainted",
@@ -720,6 +733,17 @@ static struct ctl_table kern_table[] = {
720 .mode = 0644, 733 .mode = 0644,
721 .proc_handler = &proc_dointvec, 734 .proc_handler = &proc_dointvec,
722 }, 735 },
736 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay",
739 .data = &printk_delay_msec,
740 .maxlen = sizeof(int),
741 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero,
745 .extra2 = &ten_thousand,
746 },
723#endif 747#endif
724 { 748 {
725 .ctl_name = KERN_NGROUPS_MAX, 749 .ctl_name = KERN_NGROUPS_MAX,
@@ -962,28 +986,28 @@ static struct ctl_table kern_table[] = {
962 .child = slow_work_sysctls, 986 .child = slow_work_sysctls,
963 }, 987 },
964#endif 988#endif
965#ifdef CONFIG_PERF_COUNTERS 989#ifdef CONFIG_PERF_EVENTS
966 { 990 {
967 .ctl_name = CTL_UNNUMBERED, 991 .ctl_name = CTL_UNNUMBERED,
968 .procname = "perf_counter_paranoid", 992 .procname = "perf_event_paranoid",
969 .data = &sysctl_perf_counter_paranoid, 993 .data = &sysctl_perf_event_paranoid,
970 .maxlen = sizeof(sysctl_perf_counter_paranoid), 994 .maxlen = sizeof(sysctl_perf_event_paranoid),
971 .mode = 0644, 995 .mode = 0644,
972 .proc_handler = &proc_dointvec, 996 .proc_handler = &proc_dointvec,
973 }, 997 },
974 { 998 {
975 .ctl_name = CTL_UNNUMBERED, 999 .ctl_name = CTL_UNNUMBERED,
976 .procname = "perf_counter_mlock_kb", 1000 .procname = "perf_event_mlock_kb",
977 .data = &sysctl_perf_counter_mlock, 1001 .data = &sysctl_perf_event_mlock,
978 .maxlen = sizeof(sysctl_perf_counter_mlock), 1002 .maxlen = sizeof(sysctl_perf_event_mlock),
979 .mode = 0644, 1003 .mode = 0644,
980 .proc_handler = &proc_dointvec, 1004 .proc_handler = &proc_dointvec,
981 }, 1005 },
982 { 1006 {
983 .ctl_name = CTL_UNNUMBERED, 1007 .ctl_name = CTL_UNNUMBERED,
984 .procname = "perf_counter_max_sample_rate", 1008 .procname = "perf_event_max_sample_rate",
985 .data = &sysctl_perf_counter_sample_rate, 1009 .data = &sysctl_perf_event_sample_rate,
986 .maxlen = sizeof(sysctl_perf_counter_sample_rate), 1010 .maxlen = sizeof(sysctl_perf_event_sample_rate),
987 .mode = 0644, 1011 .mode = 0644,
988 .proc_handler = &proc_dointvec, 1012 .proc_handler = &proc_dointvec,
989 }, 1013 },
@@ -998,6 +1022,7 @@ static struct ctl_table kern_table[] = {
998 .proc_handler = &proc_dointvec, 1022 .proc_handler = &proc_dointvec,
999 }, 1023 },
1000#endif 1024#endif
1025#ifdef CONFIG_BLOCK
1001 { 1026 {
1002 .ctl_name = CTL_UNNUMBERED, 1027 .ctl_name = CTL_UNNUMBERED,
1003 .procname = "blk_iopoll", 1028 .procname = "blk_iopoll",
@@ -1006,6 +1031,7 @@ static struct ctl_table kern_table[] = {
1006 .mode = 0644, 1031 .mode = 0644,
1007 .proc_handler = &proc_dointvec, 1032 .proc_handler = &proc_dointvec,
1008 }, 1033 },
1034#endif
1009/* 1035/*
1010 * NOTE: do not add new entries to this table unless you have read 1036 * NOTE: do not add new entries to this table unless you have read
1011 * Documentation/sysctl/ctl_unnumbered.txt 1037 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1372,6 +1398,31 @@ static struct ctl_table vm_table[] = {
1372 .mode = 0644, 1398 .mode = 0644,
1373 .proc_handler = &scan_unevictable_handler, 1399 .proc_handler = &scan_unevictable_handler,
1374 }, 1400 },
1401#ifdef CONFIG_MEMORY_FAILURE
1402 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero,
1411 .extra2 = &one,
1412 },
1413 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero,
1422 .extra2 = &one,
1423 },
1424#endif
1425
1375/* 1426/*
1376 * NOTE: do not add new entries to this table unless you have read 1427 * NOTE: do not add new entries to this table unless you have read
1377 * Documentation/sysctl/ctl_unnumbered.txt 1428 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2200,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2200#ifdef CONFIG_PROC_SYSCTL 2251#ifdef CONFIG_PROC_SYSCTL
2201 2252
2202static int _proc_do_string(void* data, int maxlen, int write, 2253static int _proc_do_string(void* data, int maxlen, int write,
2203 struct file *filp, void __user *buffer, 2254 void __user *buffer,
2204 size_t *lenp, loff_t *ppos) 2255 size_t *lenp, loff_t *ppos)
2205{ 2256{
2206 size_t len; 2257 size_t len;
@@ -2261,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2261 * proc_dostring - read a string sysctl 2312 * proc_dostring - read a string sysctl
2262 * @table: the sysctl table 2313 * @table: the sysctl table
2263 * @write: %TRUE if this is a write to the sysctl file 2314 * @write: %TRUE if this is a write to the sysctl file
2264 * @filp: the file structure
2265 * @buffer: the user buffer 2315 * @buffer: the user buffer
2266 * @lenp: the size of the user buffer 2316 * @lenp: the size of the user buffer
2267 * @ppos: file position 2317 * @ppos: file position
@@ -2275,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2275 * 2325 *
2276 * Returns 0 on success. 2326 * Returns 0 on success.
2277 */ 2327 */
2278int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2328int proc_dostring(struct ctl_table *table, int write,
2279 void __user *buffer, size_t *lenp, loff_t *ppos) 2329 void __user *buffer, size_t *lenp, loff_t *ppos)
2280{ 2330{
2281 return _proc_do_string(table->data, table->maxlen, write, filp, 2331 return _proc_do_string(table->data, table->maxlen, write,
2282 buffer, lenp, ppos); 2332 buffer, lenp, ppos);
2283} 2333}
2284 2334
@@ -2303,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2303} 2353}
2304 2354
2305static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2355static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2306 int write, struct file *filp, void __user *buffer, 2356 int write, void __user *buffer,
2307 size_t *lenp, loff_t *ppos, 2357 size_t *lenp, loff_t *ppos,
2308 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2358 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2309 int write, void *data), 2359 int write, void *data),
@@ -2410,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2410#undef TMPBUFLEN 2460#undef TMPBUFLEN
2411} 2461}
2412 2462
2413static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2463static int do_proc_dointvec(struct ctl_table *table, int write,
2414 void __user *buffer, size_t *lenp, loff_t *ppos, 2464 void __user *buffer, size_t *lenp, loff_t *ppos,
2415 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2465 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2416 int write, void *data), 2466 int write, void *data),
2417 void *data) 2467 void *data)
2418{ 2468{
2419 return __do_proc_dointvec(table->data, table, write, filp, 2469 return __do_proc_dointvec(table->data, table, write,
2420 buffer, lenp, ppos, conv, data); 2470 buffer, lenp, ppos, conv, data);
2421} 2471}
2422 2472
@@ -2424,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2424 * proc_dointvec - read a vector of integers 2474 * proc_dointvec - read a vector of integers
2425 * @table: the sysctl table 2475 * @table: the sysctl table
2426 * @write: %TRUE if this is a write to the sysctl file 2476 * @write: %TRUE if this is a write to the sysctl file
2427 * @filp: the file structure
2428 * @buffer: the user buffer 2477 * @buffer: the user buffer
2429 * @lenp: the size of the user buffer 2478 * @lenp: the size of the user buffer
2430 * @ppos: file position 2479 * @ppos: file position
@@ -2434,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2434 * 2483 *
2435 * Returns 0 on success. 2484 * Returns 0 on success.
2436 */ 2485 */
2437int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2486int proc_dointvec(struct ctl_table *table, int write,
2438 void __user *buffer, size_t *lenp, loff_t *ppos) 2487 void __user *buffer, size_t *lenp, loff_t *ppos)
2439{ 2488{
2440 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2489 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2441 NULL,NULL); 2490 NULL,NULL);
2442} 2491}
2443 2492
@@ -2445,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2445 * Taint values can only be increased 2494 * Taint values can only be increased
2446 * This means we can safely use a temporary. 2495 * This means we can safely use a temporary.
2447 */ 2496 */
2448static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2497static int proc_taint(struct ctl_table *table, int write,
2449 void __user *buffer, size_t *lenp, loff_t *ppos) 2498 void __user *buffer, size_t *lenp, loff_t *ppos)
2450{ 2499{
2451 struct ctl_table t; 2500 struct ctl_table t;
@@ -2457,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2457 2506
2458 t = *table; 2507 t = *table;
2459 t.data = &tmptaint; 2508 t.data = &tmptaint;
2460 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2509 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2461 if (err < 0) 2510 if (err < 0)
2462 return err; 2511 return err;
2463 2512
@@ -2509,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2509 * proc_dointvec_minmax - read a vector of integers with min/max values 2558 * proc_dointvec_minmax - read a vector of integers with min/max values
2510 * @table: the sysctl table 2559 * @table: the sysctl table
2511 * @write: %TRUE if this is a write to the sysctl file 2560 * @write: %TRUE if this is a write to the sysctl file
2512 * @filp: the file structure
2513 * @buffer: the user buffer 2561 * @buffer: the user buffer
2514 * @lenp: the size of the user buffer 2562 * @lenp: the size of the user buffer
2515 * @ppos: file position 2563 * @ppos: file position
@@ -2522,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2522 * 2570 *
2523 * Returns 0 on success. 2571 * Returns 0 on success.
2524 */ 2572 */
2525int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2573int proc_dointvec_minmax(struct ctl_table *table, int write,
2526 void __user *buffer, size_t *lenp, loff_t *ppos) 2574 void __user *buffer, size_t *lenp, loff_t *ppos)
2527{ 2575{
2528 struct do_proc_dointvec_minmax_conv_param param = { 2576 struct do_proc_dointvec_minmax_conv_param param = {
2529 .min = (int *) table->extra1, 2577 .min = (int *) table->extra1,
2530 .max = (int *) table->extra2, 2578 .max = (int *) table->extra2,
2531 }; 2579 };
2532 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2580 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2533 do_proc_dointvec_minmax_conv, &param); 2581 do_proc_dointvec_minmax_conv, &param);
2534} 2582}
2535 2583
2536static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2584static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2537 struct file *filp,
2538 void __user *buffer, 2585 void __user *buffer,
2539 size_t *lenp, loff_t *ppos, 2586 size_t *lenp, loff_t *ppos,
2540 unsigned long convmul, 2587 unsigned long convmul,
@@ -2639,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2639} 2686}
2640 2687
2641static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2688static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2642 struct file *filp,
2643 void __user *buffer, 2689 void __user *buffer,
2644 size_t *lenp, loff_t *ppos, 2690 size_t *lenp, loff_t *ppos,
2645 unsigned long convmul, 2691 unsigned long convmul,
2646 unsigned long convdiv) 2692 unsigned long convdiv)
2647{ 2693{
2648 return __do_proc_doulongvec_minmax(table->data, table, write, 2694 return __do_proc_doulongvec_minmax(table->data, table, write,
2649 filp, buffer, lenp, ppos, convmul, convdiv); 2695 buffer, lenp, ppos, convmul, convdiv);
2650} 2696}
2651 2697
2652/** 2698/**
2653 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2699 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2654 * @table: the sysctl table 2700 * @table: the sysctl table
2655 * @write: %TRUE if this is a write to the sysctl file 2701 * @write: %TRUE if this is a write to the sysctl file
2656 * @filp: the file structure
2657 * @buffer: the user buffer 2702 * @buffer: the user buffer
2658 * @lenp: the size of the user buffer 2703 * @lenp: the size of the user buffer
2659 * @ppos: file position 2704 * @ppos: file position
@@ -2666,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2666 * 2711 *
2667 * Returns 0 on success. 2712 * Returns 0 on success.
2668 */ 2713 */
2669int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2714int proc_doulongvec_minmax(struct ctl_table *table, int write,
2670 void __user *buffer, size_t *lenp, loff_t *ppos) 2715 void __user *buffer, size_t *lenp, loff_t *ppos)
2671{ 2716{
2672 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2717 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2673} 2718}
2674 2719
2675/** 2720/**
2676 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2721 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2677 * @table: the sysctl table 2722 * @table: the sysctl table
2678 * @write: %TRUE if this is a write to the sysctl file 2723 * @write: %TRUE if this is a write to the sysctl file
2679 * @filp: the file structure
2680 * @buffer: the user buffer 2724 * @buffer: the user buffer
2681 * @lenp: the size of the user buffer 2725 * @lenp: the size of the user buffer
2682 * @ppos: file position 2726 * @ppos: file position
@@ -2691,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2691 * Returns 0 on success. 2735 * Returns 0 on success.
2692 */ 2736 */
2693int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2737int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2694 struct file *filp,
2695 void __user *buffer, 2738 void __user *buffer,
2696 size_t *lenp, loff_t *ppos) 2739 size_t *lenp, loff_t *ppos)
2697{ 2740{
2698 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2741 return do_proc_doulongvec_minmax(table, write, buffer,
2699 lenp, ppos, HZ, 1000l); 2742 lenp, ppos, HZ, 1000l);
2700} 2743}
2701 2744
@@ -2771,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2771 * proc_dointvec_jiffies - read a vector of integers as seconds 2814 * proc_dointvec_jiffies - read a vector of integers as seconds
2772 * @table: the sysctl table 2815 * @table: the sysctl table
2773 * @write: %TRUE if this is a write to the sysctl file 2816 * @write: %TRUE if this is a write to the sysctl file
2774 * @filp: the file structure
2775 * @buffer: the user buffer 2817 * @buffer: the user buffer
2776 * @lenp: the size of the user buffer 2818 * @lenp: the size of the user buffer
2777 * @ppos: file position 2819 * @ppos: file position
@@ -2783,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2783 * 2825 *
2784 * Returns 0 on success. 2826 * Returns 0 on success.
2785 */ 2827 */
2786int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2828int proc_dointvec_jiffies(struct ctl_table *table, int write,
2787 void __user *buffer, size_t *lenp, loff_t *ppos) 2829 void __user *buffer, size_t *lenp, loff_t *ppos)
2788{ 2830{
2789 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2831 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2790 do_proc_dointvec_jiffies_conv,NULL); 2832 do_proc_dointvec_jiffies_conv,NULL);
2791} 2833}
2792 2834
@@ -2794,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2794 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2836 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2795 * @table: the sysctl table 2837 * @table: the sysctl table
2796 * @write: %TRUE if this is a write to the sysctl file 2838 * @write: %TRUE if this is a write to the sysctl file
2797 * @filp: the file structure
2798 * @buffer: the user buffer 2839 * @buffer: the user buffer
2799 * @lenp: the size of the user buffer 2840 * @lenp: the size of the user buffer
2800 * @ppos: pointer to the file position 2841 * @ppos: pointer to the file position
@@ -2806,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2806 * 2847 *
2807 * Returns 0 on success. 2848 * Returns 0 on success.
2808 */ 2849 */
2809int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2850int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2810 void __user *buffer, size_t *lenp, loff_t *ppos) 2851 void __user *buffer, size_t *lenp, loff_t *ppos)
2811{ 2852{
2812 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2853 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2813 do_proc_dointvec_userhz_jiffies_conv,NULL); 2854 do_proc_dointvec_userhz_jiffies_conv,NULL);
2814} 2855}
2815 2856
@@ -2817,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2817 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2858 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2818 * @table: the sysctl table 2859 * @table: the sysctl table
2819 * @write: %TRUE if this is a write to the sysctl file 2860 * @write: %TRUE if this is a write to the sysctl file
2820 * @filp: the file structure
2821 * @buffer: the user buffer 2861 * @buffer: the user buffer
2822 * @lenp: the size of the user buffer 2862 * @lenp: the size of the user buffer
2823 * @ppos: file position 2863 * @ppos: file position
@@ -2830,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2830 * 2870 *
2831 * Returns 0 on success. 2871 * Returns 0 on success.
2832 */ 2872 */
2833int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2873int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2834 void __user *buffer, size_t *lenp, loff_t *ppos) 2874 void __user *buffer, size_t *lenp, loff_t *ppos)
2835{ 2875{
2836 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2876 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2837 do_proc_dointvec_ms_jiffies_conv, NULL); 2877 do_proc_dointvec_ms_jiffies_conv, NULL);
2838} 2878}
2839 2879
2840static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2880static int proc_do_cad_pid(struct ctl_table *table, int write,
2841 void __user *buffer, size_t *lenp, loff_t *ppos) 2881 void __user *buffer, size_t *lenp, loff_t *ppos)
2842{ 2882{
2843 struct pid *new_pid; 2883 struct pid *new_pid;
@@ -2846,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2846 2886
2847 tmp = pid_vnr(cad_pid); 2887 tmp = pid_vnr(cad_pid);
2848 2888
2849 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2889 r = __do_proc_dointvec(&tmp, table, write, buffer,
2850 lenp, ppos, NULL, NULL); 2890 lenp, ppos, NULL, NULL);
2851 if (r || !write) 2891 if (r || !write)
2852 return r; 2892 return r;
@@ -2861,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2861 2901
2862#else /* CONFIG_PROC_FS */ 2902#else /* CONFIG_PROC_FS */
2863 2903
2864int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2904int proc_dostring(struct ctl_table *table, int write,
2865 void __user *buffer, size_t *lenp, loff_t *ppos) 2905 void __user *buffer, size_t *lenp, loff_t *ppos)
2866{ 2906{
2867 return -ENOSYS; 2907 return -ENOSYS;
2868} 2908}
2869 2909
2870int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2910int proc_dointvec(struct ctl_table *table, int write,
2871 void __user *buffer, size_t *lenp, loff_t *ppos) 2911 void __user *buffer, size_t *lenp, loff_t *ppos)
2872{ 2912{
2873 return -ENOSYS; 2913 return -ENOSYS;
2874} 2914}
2875 2915
2876int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2916int proc_dointvec_minmax(struct ctl_table *table, int write,
2877 void __user *buffer, size_t *lenp, loff_t *ppos) 2917 void __user *buffer, size_t *lenp, loff_t *ppos)
2878{ 2918{
2879 return -ENOSYS; 2919 return -ENOSYS;
2880} 2920}
2881 2921
2882int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2922int proc_dointvec_jiffies(struct ctl_table *table, int write,
2883 void __user *buffer, size_t *lenp, loff_t *ppos) 2923 void __user *buffer, size_t *lenp, loff_t *ppos)
2884{ 2924{
2885 return -ENOSYS; 2925 return -ENOSYS;
2886} 2926}
2887 2927
2888int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2928int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2889 void __user *buffer, size_t *lenp, loff_t *ppos) 2929 void __user *buffer, size_t *lenp, loff_t *ppos)
2890{ 2930{
2891 return -ENOSYS; 2931 return -ENOSYS;
2892} 2932}
2893 2933
2894int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2934int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2895 void __user *buffer, size_t *lenp, loff_t *ppos) 2935 void __user *buffer, size_t *lenp, loff_t *ppos)
2896{ 2936{
2897 return -ENOSYS; 2937 return -ENOSYS;
2898} 2938}
2899 2939
2900int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2940int proc_doulongvec_minmax(struct ctl_table *table, int write,
2901 void __user *buffer, size_t *lenp, loff_t *ppos) 2941 void __user *buffer, size_t *lenp, loff_t *ppos)
2902{ 2942{
2903 return -ENOSYS; 2943 return -ENOSYS;
2904} 2944}
2905 2945
2906int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2946int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2907 struct file *filp,
2908 void __user *buffer, 2947 void __user *buffer,
2909 size_t *lenp, loff_t *ppos) 2948 size_t *lenp, loff_t *ppos)
2910{ 2949{
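
Beyond the filp removal from every proc_do* helper, the kernel/sysctl.c hunks add bounded integer entries (core_pipe_limit, printk_delay, the memory_failure_* knobs) in the CTL_UNNUMBERED style with proc_dointvec_minmax and extra1/extra2 limits. A sketch of one such entry, assuming the usual kernel headers; example_delay and its bounds are hypothetical, and the registration path (a parent "kernel" entry plus register_sysctl_table()) is omitted:

    static int example_delay_msec;          /* hypothetical knob */
    static int zero;
    static int ten_thousand = 10000;

    static struct ctl_table example_table[] = {
            {
                    .ctl_name       = CTL_UNNUMBERED,
                    .procname       = "example_delay",
                    .data           = &example_delay_msec,
                    .maxlen         = sizeof(int),
                    .mode           = 0644,
                    .proc_handler   = &proc_dointvec_minmax,
                    .strategy       = &sysctl_intvec,
                    .extra1         = &zero,
                    .extra2         = &ten_thousand,
            },
            { .ctl_name = 0 }
    };
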
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
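
The kernel/time.c change widens the nanosecond argument to s64 and inserts an empty asm("" : "+rm"(nsec)) into each normalization loop so the compiler cannot collapse the loop into a 64-bit division or modulo; in the common case nsec is only slightly out of range, so one or two subtractions are cheaper than a full divide, as in __iter_div_u64_rem() in include/linux/time.h. The idiom in isolation, as a plain C sketch rather than the kernel function itself:

    /* Normalize (sec, nsec) without letting the compiler emit a 64-bit division. */
    void normalize_ns(long long *sec, long long *nsec)
    {
            while (*nsec >= 1000000000LL) {
                    asm("" : "+rm"(*nsec));  /* keeps the loop opaque to the optimizer */
                    *nsec -= 1000000000LL;
                    ++*sec;
            }
            while (*nsec < 0) {
                    asm("" : "+rm"(*nsec));
                    *nsec += 1000000000LL;
                    --*sec;
            }
    }
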
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..5e18c6ab2c6a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
165 162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 list_for_each_entry(cs, &clocksource_list, list)
300
301 list_for_each_entry(cs, &clocksource_list, list) {
302 if (cs->resume) 398 if (cs->resume)
303 cs->resume(); 399 cs->resume();
304 }
305 400
306 clocksource_resume_watchdog(); 401 clocksource_resume_watchdog();
307
308 spin_unlock_irqrestore(&clocksource_lock, flags);
309} 402}
310 403
311/** 404/**
@@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 413 clocksource_resume_watchdog();
321} 414}
322 415
416#ifdef CONFIG_GENERIC_TIME
417
323/** 418/**
324 * clocksource_get_next - Returns the selected clocksource 419 * clocksource_select - Select the best clocksource available
325 * 420 *
421 * Private function. Must hold clocksource_mutex when called.
422 *
423 * Select the clocksource with the best rating, or the clocksource,
424 * which is selected by userspace override.
326 */ 425 */
327struct clocksource *clocksource_get_next(void) 426static void clocksource_select(void)
328{ 427{
329 unsigned long flags; 428 struct clocksource *best, *cs;
330 429
331 spin_lock_irqsave(&clocksource_lock, flags); 430 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 431 return;
333 curr_clocksource = next_clocksource; 432 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 433 best = list_first_entry(&clocksource_list, struct clocksource, list);
434 /* Check for the override clocksource. */
435 list_for_each_entry(cs, &clocksource_list, list) {
436 if (strcmp(cs->name, override_name) != 0)
437 continue;
438 /*
439 * Check to make sure we don't switch to a non-highres
440 * capable clocksource if the tick code is in oneshot
441 * mode (highres or nohz)
442 */
443 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
444 tick_oneshot_mode_active()) {
445 /* Override clocksource cannot be used. */
446 printk(KERN_WARNING "Override clocksource %s is not "
447 "HRT compatible. Cannot switch while in "
448 "HRT/NOHZ mode\n", cs->name);
449 override_name[0] = 0;
450 } else
451 /* Override clocksource can be used. */
452 best = cs;
453 break;
454 }
455 if (curr_clocksource != best) {
456 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
457 curr_clocksource = best;
458 timekeeping_notify(curr_clocksource);
335 } 459 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 460}
340 461
341/** 462#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 463
343 * 464static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 465
466#endif
467
468/*
469 * clocksource_done_booting - Called near the end of core bootup
345 * 470 *
346 * Select the clocksource with the best rating, or the clocksource, 471 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 472 * We use fs_initcall because we want this to start before
473 * device_initcall but after subsys_initcall.
348 */ 474 */
349static struct clocksource *select_clocksource(void) 475static int __init clocksource_done_booting(void)
350{ 476{
351 struct clocksource *next; 477 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 478
362 if (next == curr_clocksource) 479 /*
363 return NULL; 480 * Run the watchdog first to eliminate unstable clock sources
481 */
482 clocksource_watchdog_kthread(NULL);
364 483
365 return next; 484 mutex_lock(&clocksource_mutex);
485 clocksource_select();
486 mutex_unlock(&clocksource_mutex);
487 return 0;
366} 488}
489fs_initcall(clocksource_done_booting);
367 490
368/* 491/*
369 * Enqueue the clocksource sorted by rating 492 * Enqueue the clocksource sorted by rating
370 */ 493 */
371static int clocksource_enqueue(struct clocksource *c) 494static void clocksource_enqueue(struct clocksource *cs)
372{ 495{
373 struct list_head *tmp, *entry = &clocksource_list; 496 struct list_head *entry = &clocksource_list;
497 struct clocksource *tmp;
374 498
375 list_for_each(tmp, &clocksource_list) { 499 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place where to insert */ 500 /* Keep track of the place where to insert */
382 if (cs->rating >= c->rating) 501 if (tmp->rating >= cs->rating)
383 entry = tmp; 502 entry = &tmp->list;
384 } 503 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 504}
393 505
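
As a reading aid, the rating-ordered list plus name-override logic above can be modelled in a few lines of plain C. This is an illustrative userspace sketch only (arrays instead of list_head, no clocksource_mutex, no HRES/oneshot check), not kernel code:

/* Userspace model of clocksource_enqueue()/clocksource_select():
 * keep the list sorted by descending rating, prefer an override by name. */
#include <stdio.h>
#include <string.h>

struct cs { const char *name; int rating; };

static struct cs list[8];
static int nr;

/* Insert so the highest rating stays first; new entries go after equals. */
static void enqueue(struct cs c)
{
	int i = nr++;

	while (i > 0 && list[i - 1].rating < c.rating) {
		list[i] = list[i - 1];
		i--;
	}
	list[i] = c;
}

/* Best rating wins unless the override name matches a registered entry. */
static struct cs *select_best(const char *override)
{
	int i;

	if (!nr)
		return NULL;
	for (i = 0; i < nr; i++)
		if (override && !strcmp(list[i].name, override))
			return &list[i];
	return &list[0];
}

int main(void)
{
	enqueue((struct cs){ "jiffies", 1 });
	enqueue((struct cs){ "hpet", 250 });
	enqueue((struct cs){ "tsc", 300 });
	printf("best: %s\n", select_best(NULL)->name);	/* tsc */
	printf("override: %s\n", select_best("hpet")->name);	/* hpet */
	return 0;
}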
394/** 506/**
@@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 509 *
398 * Returns -EBUSY if registration fails, zero otherwise. 510 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 511 */
400int clocksource_register(struct clocksource *c) 512int clocksource_register(struct clocksource *cs)
401{ 513{
402 unsigned long flags; 514 mutex_lock(&clocksource_mutex);
403 int ret; 515 clocksource_enqueue(cs);
404 516 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 517 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 518 mutex_unlock(&clocksource_mutex);
407 if (!ret) 519 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 520}
414EXPORT_SYMBOL(clocksource_register); 521EXPORT_SYMBOL(clocksource_register);
415 522
523static void __clocksource_change_rating(struct clocksource *cs, int rating)
524{
525 list_del(&cs->list);
526 cs->rating = rating;
527 clocksource_enqueue(cs);
528 clocksource_select();
529}
530
416/** 531/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 532 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 533 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 534void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 535{
422 unsigned long flags; 536 mutex_lock(&clocksource_mutex);
423 537 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 538 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 539}
540EXPORT_SYMBOL(clocksource_change_rating);
431 541
432/** 542/**
433 * clocksource_unregister - remove a registered clocksource 543 * clocksource_unregister - remove a registered clocksource
434 */ 544 */
435void clocksource_unregister(struct clocksource *cs) 545void clocksource_unregister(struct clocksource *cs)
436{ 546{
437 unsigned long flags; 547 mutex_lock(&clocksource_mutex);
438 548 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 549 list_del(&cs->list);
441 if (clocksource_override == cs) 550 clocksource_select();
442 clocksource_override = NULL; 551 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 552}
553EXPORT_SYMBOL(clocksource_unregister);
446 554
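
For context, this mutex-protected API is what a clock driver calls once at init time. Below is a hedged driver-side sketch: the device, its counter accessor and its 10 MHz rate are invented for illustration, while the struct clocksource fields and helpers used (CLOCKSOURCE_MASK, clocksource_khz2mult, clocksource_register) do exist in this tree.

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/module.h>

extern u32 example_counter_read(void);	/* hypothetical HW accessor */

static cycle_t example_cs_read(struct clocksource *cs)
{
	return (cycle_t)example_counter_read();
}

static struct clocksource example_cs = {
	.name	= "example",
	.rating	= 200,
	.read	= example_cs_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init example_cs_init(void)
{
	/* 10 MHz == 10000 kHz; mult/shift values chosen for illustration */
	example_cs.shift = 20;
	example_cs.mult  = clocksource_khz2mult(10000, example_cs.shift);
	return clocksource_register(&example_cs);
}
module_init(example_cs_init);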
447#ifdef CONFIG_SYSFS 555#ifdef CONFIG_SYSFS
448/** 556/**
@@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 566{
459 ssize_t count = 0; 567 ssize_t count = 0;
460 568
461 spin_lock_irq(&clocksource_lock); 569 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 570 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 571 mutex_unlock(&clocksource_mutex);
464 572
465 return count; 573 return count;
466} 574}
@@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 586 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 587 const char *buf, size_t count)
480{ 588{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 589 size_t ret = count;
483 int len;
484 590
485 /* strings from sysfs write are not 0 terminated! */ 591 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 592 if (count >= sizeof(override_name))
@@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 596 if (buf[count-1] == '\n')
491 count--; 597 count--;
492 598
493 spin_lock_irq(&clocksource_lock); 599 mutex_lock(&clocksource_mutex);
494 600
495 if (count > 0) 601 if (count > 0)
496 memcpy(override_name, buf, count); 602 memcpy(override_name, buf, count);
497 override_name[count] = 0; 603 override_name[count] = 0;
604 clocksource_select();
498 605
499 len = strlen(override_name); 606 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 607
532 return ret; 608 return ret;
533} 609}
@@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 623 struct clocksource *src;
548 ssize_t count = 0; 624 ssize_t count = 0;
549 625
550 spin_lock_irq(&clocksource_lock); 626 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 627 list_for_each_entry(src, &clocksource_list, list) {
552 /* 628 /*
553 * Don't show non-HRES clocksource if the tick code is 629 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 635 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 636 "%s ", src->name);
561 } 637 }
562 spin_unlock_irq(&clocksource_lock); 638 mutex_unlock(&clocksource_mutex);
563 639
564 count += snprintf(buf + count, 640 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 641 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 690 */
615static int __init boot_override_clocksource(char* str) 691static int __init boot_override_clocksource(char* str)
616{ 692{
617 unsigned long flags; 693 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 694 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 695 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 696 mutex_unlock(&clocksource_mutex);
622 return 1; 697 return 1;
623} 698}
624 699
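
In practice the override recorded here comes either from the clocksource= boot parameter (for example clocksource=hpet on the kernel command line) or from a later write to sysfs, which ends up in sysfs_override_clocksource() above. A small userspace sketch of the sysfs route; the sysfs path matches this tree, the clocksource name is illustrative:

/* Request the "hpet" clocksource via sysfs (name must be registered). */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/clocksource/clocksource0/"
			"current_clocksource", "w");

	if (!f)
		return 1;
	fputs("hpet\n", f);
	return fclose(f) ? 1 : 0;
}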
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
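
The __weak definition above lets an architecture supply its own boot-time default instead of jiffies (s390, for example, returns its TOD clocksource). A hedged sketch of such an override; "arch_cs" is a made-up name standing in for the architecture's real clocksource:

#include <linux/clocksource.h>
#include <linux/init.h>

extern struct clocksource arch_cs;	/* hypothetical arch clocksource */

/* Non-weak definition: this wins over the jiffies fallback at link time. */
struct clocksource * __init clocksource_default_clock(void)
{
	return &arch_cs;
}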
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..89aed5933ed4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle)
231 if (!inidle && !ts->inidle) 231 if (!inidle && !ts->inidle)
232 goto end; 232 goto end;
233 233
234 /*
235 * Set ts->inidle unconditionally. Even if the system did not
236 * switch to NOHZ mode the cpu frequency governers rely on the
237 * update of the idle time accounting in tick_nohz_start_idle().
238 */
239 ts->inidle = 1;
240
234 now = tick_nohz_start_idle(ts); 241 now = tick_nohz_start_idle(ts);
235 242
236 /* 243 /*
@@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle)
248 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 255 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
249 goto end; 256 goto end;
250 257
251 ts->inidle = 1;
252
253 if (need_resched()) 258 if (need_resched())
254 goto end; 259 goto end;
255 260
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Nonzero if YEAR is a leap year (every 4 years,
35 * except every 100th isn't, and every 400th is).
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
42/* do a mathdiv for long type */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
48/* How many leap years between y1 and y2; y1 must be less than or equal to y2 */

49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* How many days come before each month (0-12). */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time to local broken-down time
71 *
72 * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC).
 74 * @offset offset seconds to add to totalsecs.
75 * @result pointer to struct tm variable to receive broken-down time
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday. */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
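
A quick sanity check of the algorithm: time_to_tm(0, 0, &tm) yields 1970-01-01 00:00:00 with tm_wday == 4 (Thursday), matching the comment in the code above. A hedged in-kernel usage sketch follows; the printing helper is invented, and struct tm here is the kernel's variant (tm_year is years since 1900, tm_mon runs 0..11):

#include <linux/kernel.h>
#include <linux/time.h>

static void example_print_time(time_t secs)
{
	struct tm tm;

	time_to_tm(secs, 0, &tm);	/* offset 0: plain UTC */
	printk(KERN_INFO "%04ld-%02d-%02d %02d:%02d:%02d UTC\n",
	       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
	       tm.tm_hour, tm.tm_min, tm.tm_sec);
}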
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..c3a4e2907eaa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -13,12 +13,123 @@
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/sched.h>
16#include <linux/sysdev.h> 17#include <linux/sysdev.h>
17#include <linux/clocksource.h> 18#include <linux/clocksource.h>
18#include <linux/jiffies.h> 19#include <linux/jiffies.h>
19#include <linux/time.h> 20#include <linux/time.h>
20#include <linux/tick.h> 21#include <linux/tick.h>
22#include <linux/stop_machine.h>
23
24/* Structure holding internal timekeeping values. */
25struct timekeeper {
26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock;
28 /* The shift value of the current clocksource. */
29 int shift;
30
31 /* Number of clock cycles in one NTP interval. */
32 cycle_t cycle_interval;
33 /* Number of clock shifted nano seconds in one NTP interval. */
34 u64 xtime_interval;
35 /* Raw nano seconds accumulated per NTP interval. */
36 u32 raw_interval;
37
38 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
39 u64 xtime_nsec;
40 /* Difference between accumulated time and NTP time in ntp
41 * shifted nano seconds. */
42 s64 ntp_error;
43 /* Shift conversion between clock shifted nano seconds and
44 * ntp shifted nano seconds. */
45 int ntp_error_shift;
46 /* NTP adjusted clock multiplier */
47 u32 mult;
48};
49
50struct timekeeper timekeeper;
51
52/**
53 * timekeeper_setup_internals - Set up internals to use clocksource clock.
54 *
55 * @clock: Pointer to clocksource.
56 *
57 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
58 * pair and interval request.
59 *
60 * Unless you're the timekeeping code, you should not be using this!
61 */
62static void timekeeper_setup_internals(struct clocksource *clock)
63{
64 cycle_t interval;
65 u64 tmp;
66
67 timekeeper.clock = clock;
68 clock->cycle_last = clock->read(clock);
21 69
70 /* Do the ns -> cycle conversion first, using original mult */
71 tmp = NTP_INTERVAL_LENGTH;
72 tmp <<= clock->shift;
73 tmp += clock->mult/2;
74 do_div(tmp, clock->mult);
75 if (tmp == 0)
76 tmp = 1;
77
78 interval = (cycle_t) tmp;
79 timekeeper.cycle_interval = interval;
80
81 /* Go back from cycles -> shifted ns */
82 timekeeper.xtime_interval = (u64) interval * clock->mult;
83 timekeeper.raw_interval =
84 ((u64) interval * clock->mult) >> clock->shift;
85
86 timekeeper.xtime_nsec = 0;
87 timekeeper.shift = clock->shift;
88
89 timekeeper.ntp_error = 0;
90 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
91
92 /*
93 * The timekeeper keeps its own mult values for the currently
 94 * active clocksource. These values will be adjusted via NTP
95 * to counteract clock drifting.
96 */
97 timekeeper.mult = clock->mult;
98}
99
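
To make the fixed-point setup above concrete, here is a userspace re-check of the rounding with made-up numbers: a hypothetical 24 MHz clocksource with shift 24 (mult ≈ 699050667) and HZ=1000, so one NTP interval is 1,000,000 ns.

/* Userspace check of the ns -> cycles rounding in timekeeper_setup_internals(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ntp_interval_ns = 1000000;	/* NSEC_PER_SEC / HZ, HZ = 1000 */
	uint32_t mult = 699050667, shift = 24;	/* illustrative 24 MHz counter */

	/* tmp = (ns << shift + mult/2) / mult: nearest whole cycle count */
	uint64_t tmp = (ntp_interval_ns << shift) + mult / 2;
	tmp /= mult;
	uint64_t cycles = tmp ? tmp : 1;

	/* Back from cycles: shifted ns per interval and raw ns per interval */
	uint64_t xtime_interval = cycles * (uint64_t)mult;
	uint64_t raw_interval = xtime_interval >> shift;

	printf("cycle_interval = %llu cycles\n",
	       (unsigned long long)cycles);		/* 24000 */
	printf("raw_interval   = %llu ns\n",
	       (unsigned long long)raw_interval);	/* 1000000 */
	return 0;
}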
100/* Timekeeper helper functions. */
101static inline s64 timekeeping_get_ns(void)
102{
103 cycle_t cycle_now, cycle_delta;
104 struct clocksource *clock;
105
106 /* read clocksource: */
107 clock = timekeeper.clock;
108 cycle_now = clock->read(clock);
109
110 /* calculate the delta since the last update_wall_time: */
111 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
112
113 /* return the delta converted to nanoseconds using the NTP-adjusted mult. */
114 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
115 timekeeper.shift);
116}
117
118static inline s64 timekeeping_get_ns_raw(void)
119{
120 cycle_t cycle_now, cycle_delta;
121 struct clocksource *clock;
122
123 /* read clocksource: */
124 clock = timekeeper.clock;
125 cycle_now = clock->read(clock);
126
127 /* calculate the delta since the last update_wall_time: */
128 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
129
130 /* return the delta converted to nanoseconds using the unadjusted clocksource mult. */
131 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
132}
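
Both helpers reduce to the same fixed-point conversion, clocksource_cyc2ns(delta, mult, shift) == (delta * mult) >> shift; the only difference is whose mult is used (the NTP-adjusted timekeeper.mult versus the clocksource's own). A one-line userspace check with the illustrative 24 MHz numbers from above:

#include <stdio.h>
#include <stdint.h>

static inline int64_t cyc2ns(int64_t delta, uint32_t mult, uint32_t shift)
{
	return (delta * mult) >> shift;	/* same form as clocksource_cyc2ns() */
}

int main(void)
{
	/* 24000 cycles of a 24 MHz counter read back as ~1 ms */
	printf("%lld ns\n", (long long)cyc2ns(24000, 699050667, 24));
	return 0;
}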
22 133
23/* 134/*
24 * This read-write spinlock protects us from races in SMP while 135 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +155,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 155 */
45struct timespec xtime __attribute__ ((aligned (16))); 156struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 157struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 158static struct timespec total_sleep_time;
159
160/*
161 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
162 */
163struct timespec raw_time;
48 164
49/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
@@ -56,35 +172,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 172 timespec_add_ns(&xtime_cache, nsec);
57} 173}
58 174
59struct clocksource *clock; 175/* must hold xtime_lock */
60 176void timekeeping_leap_insert(int leapsecond)
177{
178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock);
181}
61 182
62#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
184
63/** 185/**
64 * clocksource_forward_now - update clock to the current time 186 * timekeeping_forward_now - update clock to the current time
65 * 187 *
66 * Forward the current clock to update its state since the last call to 188 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 189 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 190 * as it avoids having to deal with this time offset explicitly.
69 */ 191 */
70static void clocksource_forward_now(void) 192static void timekeeping_forward_now(void)
71{ 193{
72 cycle_t cycle_now, cycle_delta; 194 cycle_t cycle_now, cycle_delta;
195 struct clocksource *clock;
73 s64 nsec; 196 s64 nsec;
74 197
75 cycle_now = clocksource_read(clock); 198 clock = timekeeper.clock;
199 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 200 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 201 clock->cycle_last = cycle_now;
78 202
79 nsec = cyc2ns(clock, cycle_delta); 203 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
204 timekeeper.shift);
80 205
81 /* If arch requires, add in gettimeoffset() */ 206 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 207 nsec += arch_gettimeoffset();
83 208
84 timespec_add_ns(&xtime, nsec); 209 timespec_add_ns(&xtime, nsec);
85 210
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 211 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 212 timespec_add_ns(&raw_time, nsec);
88} 213}
89 214
90/** 215/**
@@ -95,7 +220,6 @@ static void clocksource_forward_now(void)
95 */ 220 */
96void getnstimeofday(struct timespec *ts) 221void getnstimeofday(struct timespec *ts)
97{ 222{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 223 unsigned long seq;
100 s64 nsecs; 224 s64 nsecs;
101 225
@@ -105,15 +229,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 229 seq = read_seqbegin(&xtime_lock);
106 230
107 *ts = xtime; 231 *ts = xtime;
108 232 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 233
118 /* If arch requires, add in gettimeoffset() */ 234 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 235 nsecs += arch_gettimeoffset();
@@ -125,6 +241,57 @@ void getnstimeofday(struct timespec *ts)
125 241
126EXPORT_SYMBOL(getnstimeofday); 242EXPORT_SYMBOL(getnstimeofday);
127 243
244ktime_t ktime_get(void)
245{
246 unsigned int seq;
247 s64 secs, nsecs;
248
249 WARN_ON(timekeeping_suspended);
250
251 do {
252 seq = read_seqbegin(&xtime_lock);
253 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
254 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
255 nsecs += timekeeping_get_ns();
256
257 } while (read_seqretry(&xtime_lock, seq));
258 /*
259 * Use ktime_set/ktime_add_ns to create a proper ktime on
260 * 32-bit architectures without CONFIG_KTIME_SCALAR.
261 */
262 return ktime_add_ns(ktime_set(secs, 0), nsecs);
263}
264EXPORT_SYMBOL_GPL(ktime_get);
265
266/**
267 * ktime_get_ts - get the monotonic clock in timespec format
268 * @ts: pointer to timespec variable
269 *
270 * The function calculates the monotonic clock from the realtime
271 * clock and the wall_to_monotonic offset and stores the result
272 * in normalized timespec format in the variable pointed to by @ts.
273 */
274void ktime_get_ts(struct timespec *ts)
275{
276 struct timespec tomono;
277 unsigned int seq;
278 s64 nsecs;
279
280 WARN_ON(timekeeping_suspended);
281
282 do {
283 seq = read_seqbegin(&xtime_lock);
284 *ts = xtime;
285 tomono = wall_to_monotonic;
286 nsecs = timekeeping_get_ns();
287
288 } while (read_seqretry(&xtime_lock, seq));
289
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs);
292}
293EXPORT_SYMBOL_GPL(ktime_get_ts);
294
128/** 295/**
129 * do_gettimeofday - Returns the time of day in a timeval 296 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 297 * @tv: pointer to the timeval to be set
@@ -157,7 +324,7 @@ int do_settimeofday(struct timespec *tv)
157 324
158 write_seqlock_irqsave(&xtime_lock, flags); 325 write_seqlock_irqsave(&xtime_lock, flags);
159 326
160 clocksource_forward_now(); 327 timekeeping_forward_now();
161 328
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 329 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 330 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +334,10 @@ int do_settimeofday(struct timespec *tv)
167 334
168 update_xtime_cache(0); 335 update_xtime_cache(0);
169 336
170 clock->error = 0; 337 timekeeper.ntp_error = 0;
171 ntp_clear(); 338 ntp_clear();
172 339
173 update_vsyscall(&xtime, clock); 340 update_vsyscall(&xtime, timekeeper.clock);
174 341
175 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
176 343
@@ -187,44 +354,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 354 *
188 * Accumulates current time interval and initializes new clocksource 355 * Accumulates current time interval and initializes new clocksource
189 */ 356 */
190static void change_clocksource(void) 357static int change_clocksource(void *data)
191{ 358{
192 struct clocksource *new, *old; 359 struct clocksource *new, *old;
193 360
194 new = clocksource_get_next(); 361 new = (struct clocksource *) data;
362
363 timekeeping_forward_now();
364 if (!new->enable || new->enable(new) == 0) {
365 old = timekeeper.clock;
366 timekeeper_setup_internals(new);
367 if (old->disable)
368 old->disable(old);
369 }
370 return 0;
371}
195 372
196 if (clock == new) 373/**
374 * timekeeping_notify - Install a new clock source
375 * @clock: pointer to the clock source
376 *
377 * This function is called from clocksource.c after a new, better clock
378 * source has been registered. The caller holds the clocksource_mutex.
379 */
380void timekeeping_notify(struct clocksource *clock)
381{
382 if (timekeeper.clock == clock)
197 return; 383 return;
384 stop_machine(change_clocksource, clock, NULL);
385 tick_clock_notify();
386}
198 387
199 clocksource_forward_now(); 388#else /* GENERIC_TIME */
200 389
201 if (clocksource_enable(new)) 390static inline void timekeeping_forward_now(void) { }
202 return;
203 391
204 new->raw_time = clock->raw_time; 392/**
205 old = clock; 393 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 394 *
207 clocksource_disable(old); 395 * returns the time in ktime_t format
396 */
397ktime_t ktime_get(void)
398{
399 struct timespec now;
208 400
209 clock->cycle_last = 0; 401 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 402
215 tick_clock_notify(); 403 return timespec_to_ktime(now);
404}
405EXPORT_SYMBOL_GPL(ktime_get);
216 406
217 /* 407/**
218 * We're holding xtime lock and waking up klogd would deadlock 408 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 409 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 410 *
221 clock->name); 411 * The function calculates the monotonic clock from the realtime
222 */ 412 * clock and the wall_to_monotonic offset and stores the result
413 * in normalized timespec format in the variable pointed to by @ts.
414 */
415void ktime_get_ts(struct timespec *ts)
416{
417 struct timespec tomono;
418 unsigned long seq;
419
420 do {
421 seq = read_seqbegin(&xtime_lock);
422 getnstimeofday(ts);
423 tomono = wall_to_monotonic;
424
425 } while (read_seqretry(&xtime_lock, seq));
426
427 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
428 ts->tv_nsec + tomono.tv_nsec);
223} 429}
224#else 430EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 431
226static inline void change_clocksource(void) { } 432#endif /* !GENERIC_TIME */
227#endif 433
434/**
435 * ktime_get_real - get the real (wall-) time in ktime_t format
436 *
437 * returns the time in ktime_t format
438 */
439ktime_t ktime_get_real(void)
440{
441 struct timespec now;
442
443 getnstimeofday(&now);
444
445 return timespec_to_ktime(now);
446}
447EXPORT_SYMBOL_GPL(ktime_get_real);
228 448
229/** 449/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 450 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +456,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 456{
237 unsigned long seq; 457 unsigned long seq;
238 s64 nsecs; 458 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 459
241 do { 460 do {
242 seq = read_seqbegin(&xtime_lock); 461 seq = read_seqbegin(&xtime_lock);
243 462 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 463 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 464
255 } while (read_seqretry(&xtime_lock, seq)); 465 } while (read_seqretry(&xtime_lock, seq));
256 466
@@ -270,7 +480,7 @@ int timekeeping_valid_for_hres(void)
270 do { 480 do {
271 seq = read_seqbegin(&xtime_lock); 481 seq = read_seqbegin(&xtime_lock);
272 482
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 483 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 484
275 } while (read_seqretry(&xtime_lock, seq)); 485 } while (read_seqretry(&xtime_lock, seq));
276 486
@@ -278,17 +488,33 @@ int timekeeping_valid_for_hres(void)
278} 488}
279 489
280/** 490/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 491 * read_persistent_clock - Return time from the persistent clock.
282 * 492 *
283 * Weak dummy function for arches that do not yet support it. 493 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 494 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 495 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 496 *
287 * XXX - Do be sure to remove it once all arches implement it. 497 * XXX - Do be sure to remove it once all arches implement it.
288 */ 498 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 499void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 500{
291 return 0; 501 ts->tv_sec = 0;
502 ts->tv_nsec = 0;
503}
504
505/**
506 * read_boot_clock - Return time of the system start.
507 *
508 * Weak dummy function for arches that do not yet support it.
509 * Function to read the exact time the system has been started.
510 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
511 *
512 * XXX - Do be sure to remove it once all arches implement it.
513 */
514void __attribute__((weak)) read_boot_clock(struct timespec *ts)
515{
516 ts->tv_sec = 0;
517 ts->tv_nsec = 0;
292} 518}
293 519
294/* 520/*
@@ -296,29 +522,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 522 */
297void __init timekeeping_init(void) 523void __init timekeeping_init(void)
298{ 524{
525 struct clocksource *clock;
299 unsigned long flags; 526 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 527 struct timespec now, boot;
528
529 read_persistent_clock(&now);
530 read_boot_clock(&boot);
301 531
302 write_seqlock_irqsave(&xtime_lock, flags); 532 write_seqlock_irqsave(&xtime_lock, flags);
303 533
304 ntp_init(); 534 ntp_init();
305 535
306 clock = clocksource_get_next(); 536 clock = clocksource_default_clock();
307 clocksource_enable(clock); 537 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 538 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 539 timekeeper_setup_internals(clock);
310 540
311 xtime.tv_sec = sec; 541 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 542 xtime.tv_nsec = now.tv_nsec;
543 raw_time.tv_sec = 0;
544 raw_time.tv_nsec = 0;
545 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
546 boot.tv_sec = xtime.tv_sec;
547 boot.tv_nsec = xtime.tv_nsec;
548 }
313 set_normalized_timespec(&wall_to_monotonic, 549 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 550 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 551 update_xtime_cache(0);
316 total_sleep_time = 0; 552 total_sleep_time.tv_sec = 0;
553 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 554 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 555}
319 556
320/* time in seconds when suspend began */ 557/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 558static struct timespec timekeeping_suspend_time;
322 559
323/** 560/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 561 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +568,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 568static int timekeeping_resume(struct sys_device *dev)
332{ 569{
333 unsigned long flags; 570 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 571 struct timespec ts;
572
573 read_persistent_clock(&ts);
335 574
336 clocksource_resume(); 575 clocksource_resume();
337 576
338 write_seqlock_irqsave(&xtime_lock, flags); 577 write_seqlock_irqsave(&xtime_lock, flags);
339 578
340 if (now && (now > timekeeping_suspend_time)) { 579 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 580 ts = timespec_sub(ts, timekeeping_suspend_time);
342 581 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 582 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 583 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 584 }
347 update_xtime_cache(0); 585 update_xtime_cache(0);
348 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
349 clock->cycle_last = 0; 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 588 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 589 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 590 write_sequnlock_irqrestore(&xtime_lock, flags);
354 591
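
The resume path above now accounts suspend time at timespec (nanosecond) granularity instead of whole seconds. A userspace sketch of the same arithmetic, with invented persistent-clock readings; the kernel does this with timespec_sub() and timespec_add_safe():

#include <stdio.h>
#include <time.h>

static struct timespec ts_sub(struct timespec a, struct timespec b)
{
	struct timespec r = { a.tv_sec - b.tv_sec, a.tv_nsec - b.tv_nsec };

	if (r.tv_nsec < 0) {
		r.tv_sec--;
		r.tv_nsec += 1000000000L;
	}
	return r;
}

int main(void)
{
	struct timespec suspend = { 1000, 900000000 };	/* persistent clock at suspend */
	struct timespec resume  = { 1010, 250000000 };	/* persistent clock at resume  */
	struct timespec slept   = ts_sub(resume, suspend);

	/* xtime += slept; wall_to_monotonic -= slept; total_sleep_time += slept */
	printf("slept %ld.%09ld s\n", (long)slept.tv_sec, slept.tv_nsec); /* 9.350000000 */
	return 0;
}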
@@ -366,10 +603,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 603{
367 unsigned long flags; 604 unsigned long flags;
368 605
369 timekeeping_suspend_time = read_persistent_clock(); 606 read_persistent_clock(&timekeeping_suspend_time);
370 607
371 write_seqlock_irqsave(&xtime_lock, flags); 608 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 609 timekeeping_forward_now();
373 timekeeping_suspended = 1; 610 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&xtime_lock, flags);
375 612
@@ -404,7 +641,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 641 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 642 * to compensate for late or lost adjustments.
406 */ 643 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 644static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 645 s64 *offset)
409{ 646{
410 s64 tick_error, i; 647 s64 tick_error, i;
@@ -420,7 +657,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 657 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 658 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 659 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 660 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 661 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 662 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 663 error2 >>= 2;
@@ -429,8 +666,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 666 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 667 * remove the single look ahead already included in the error.
431 */ 668 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 669 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 670 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 671 error = ((error - tick_error) >> look_ahead) + tick_error;
435 672
436 /* Finally calculate the adjustment shift value. */ 673 /* Finally calculate the adjustment shift value. */
@@ -455,18 +692,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 692 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 693 * for other values we can do a bit more work.
457 */ 694 */
458static void clocksource_adjust(s64 offset) 695static void timekeeping_adjust(s64 offset)
459{ 696{
460 s64 error, interval = clock->cycle_interval; 697 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 698 int adj;
462 699
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 700 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 701 if (error > interval) {
465 error >>= 2; 702 error >>= 2;
466 if (likely(error <= interval)) 703 if (likely(error <= interval))
467 adj = 1; 704 adj = 1;
468 else 705 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 706 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 707 } else if (error < -interval) {
471 error >>= 2; 708 error >>= 2;
472 if (likely(error >= -interval)) { 709 if (likely(error >= -interval)) {
@@ -474,15 +711,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 711 interval = -interval;
475 offset = -offset; 712 offset = -offset;
476 } else 713 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 714 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 715 } else
479 return; 716 return;
480 717
481 clock->mult += adj; 718 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 719 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 720 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 721 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 722 timekeeper.ntp_error_shift;
486} 723}
487 724
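
The recurring shifts by timekeeper.ntp_error_shift above are just a change of fixed-point scale: ntp_error carries NTP_SCALE_SHIFT (32) fractional bits while xtime_nsec carries clock->shift of them, so converting between the two is a shift by 32 - clock->shift. A tiny userspace illustration with an assumed clock shift of 24:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int clock_shift = 24;			/* illustrative */
	int ntp_error_shift = 32 - clock_shift;	/* 8 */
	int64_t one_ns_clock = 1LL << clock_shift;

	printf("1 ns: %lld clock units, %lld ntp units\n",
	       (long long)one_ns_clock,
	       (long long)(one_ns_clock << ntp_error_shift));
	return 0;
}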
488/** 725/**
@@ -492,53 +729,59 @@ static void clocksource_adjust(s64 offset)
492 */ 729 */
493void update_wall_time(void) 730void update_wall_time(void)
494{ 731{
732 struct clocksource *clock;
495 cycle_t offset; 733 cycle_t offset;
734 u64 nsecs;
496 735
497 /* Make sure we're fully resumed: */ 736 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 737 if (unlikely(timekeeping_suspended))
499 return; 738 return;
500 739
740 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 741#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 742 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 743#else
504 offset = clock->cycle_interval; 744 offset = timekeeper.cycle_interval;
505#endif 745#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 747
508 /* normally this loop will run just once; however, in the 748 /* normally this loop will run just once; however, in the
509 * case of lost or late ticks, it will accumulate correctly. 749 * case of lost or late ticks, it will accumulate correctly.
510 */ 750 */
511 while (offset >= clock->cycle_interval) { 751 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
753
512 /* accumulate one interval */ 754 /* accumulate one interval */
513 offset -= clock->cycle_interval; 755 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 756 clock->cycle_last += timekeeper.cycle_interval;
515 757
516 clock->xtime_nsec += clock->xtime_interval; 758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 759 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 760 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 761 xtime.tv_sec++;
520 second_overflow(); 762 second_overflow();
521 } 763 }
522 764
523 clock->raw_time.tv_nsec += clock->raw_interval; 765 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 767 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 768 raw_time.tv_sec++;
527 } 769 }
528 770
529 /* accumulate error between NTP and clock interval */ 771 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 772 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
532 } 775 }
533 776
534 /* correct the clock when NTP error is too big */ 777 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 778 timekeeping_adjust(offset);
536 779
537 /* 780 /*
538 * Since in the loop above, we accumulate any amount of time 781 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, it's possible for 782 * in xtime_nsec over a second into xtime.tv_sec, it's possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 783 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 784 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * it's possible the required corrective factor to xtime_nsec could 785 * it's possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 786 * cause it to underflow.
544 * 787 *
@@ -550,24 +793,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 793 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 794 * xtime_nsec is not as small.
552 */ 795 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 796 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 797 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 798 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 799 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 800 }
558 801
559 /* store full nanoseconds into xtime after rounding it up and 802 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 803 * add the remainder to the error difference.
561 */ 804 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 805 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 806 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 807 timekeeper.ntp_error += timekeeper.xtime_nsec <<
808 timekeeper.ntp_error_shift;
565 809
566 update_xtime_cache(cyc2ns(clock, offset)); 810 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
811 update_xtime_cache(nsecs);
567 812
568 /* check to see if there is a new clocksource to use */ 813 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 814 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 815}
572 816
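
The accumulation loop in update_wall_time() reduces to: subtract whole cycle_interval chunks from the offset, add the corresponding shifted nanoseconds, and carry full seconds into xtime. A userspace model with the same illustrative 24 MHz / shift-24 numbers used earlier (NTP error handling and the raw clock are omitted):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t shift = 24;
	uint64_t cycle_interval = 24000;		/* cycles per NTP interval   */
	uint64_t xtime_interval = 24000ULL * 699050667;	/* shifted ns per interval   */
	uint64_t nsecps = (uint64_t)1000000000 << shift;/* one second, shifted       */

	uint64_t offset = 3 * cycle_interval + 123;	/* e.g. three late ticks     */
	uint64_t xtime_nsec = 0, tv_sec = 0;

	while (offset >= cycle_interval) {
		offset -= cycle_interval;
		xtime_nsec += xtime_interval;
		if (xtime_nsec >= nsecps) {
			xtime_nsec -= nsecps;
			tv_sec++;		/* second_overflow() in the kernel */
		}
	}
	printf("accumulated %llu s + %llu shifted ns, %llu cycles left\n",
	       (unsigned long long)tv_sec,
	       (unsigned long long)xtime_nsec,
	       (unsigned long long)offset);
	return 0;
}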
573/** 817/**
@@ -583,9 +827,12 @@ void update_wall_time(void)
583 */ 827 */
584void getboottime(struct timespec *ts) 828void getboottime(struct timespec *ts)
585{ 829{
586 set_normalized_timespec(ts, 830 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 831 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 832 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
833 };
834
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 836}
590 837
591/** 838/**
@@ -594,7 +841,7 @@ void getboottime(struct timespec *ts)
594 */ 841 */
595void monotonic_to_bootbased(struct timespec *ts) 842void monotonic_to_bootbased(struct timespec *ts)
596{ 843{
597 ts->tv_sec += total_sleep_time; 844 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 845}
599 846
600unsigned long get_seconds(void) 847unsigned long get_seconds(void)
@@ -603,6 +850,10 @@ unsigned long get_seconds(void)
603} 850}
604EXPORT_SYMBOL(get_seconds); 851EXPORT_SYMBOL(get_seconds);
605 852
853struct timespec __current_kernel_time(void)
854{
855 return xtime_cache;
856}
606 857
607struct timespec current_kernel_time(void) 858struct timespec current_kernel_time(void)
608{ 859{
@@ -618,3 +869,20 @@ struct timespec current_kernel_time(void)
618 return now; 869 return now;
619} 870}
620EXPORT_SYMBOL(current_kernel_time); 871EXPORT_SYMBOL(current_kernel_time);
872
873struct timespec get_monotonic_coarse(void)
874{
875 struct timespec now, mono;
876 unsigned long seq;
877
878 do {
879 seq = read_seqbegin(&xtime_lock);
880
881 now = xtime_cache;
882 mono = wall_to_monotonic;
883 } while (read_seqretry(&xtime_lock, seq));
884
885 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
886 now.tv_nsec + mono.tv_nsec);
887 return now;
888}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..1b5b7aa2fdfd 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
275 return single_open(filp, timer_list_show, NULL); 275 return single_open(filp, timer_list_show, NULL);
276} 276}
277 277
278static struct file_operations timer_list_fops = { 278static const struct file_operations timer_list_fops = {
279 .open = timer_list_open, 279 .open = timer_list_open,
280 .read = seq_read, 280 .read = seq_read,
281 .llseek = seq_lseek, 281 .llseek = seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..ee5681f8d7ec 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
395 return single_open(filp, tstats_show, NULL); 395 return single_open(filp, tstats_show, NULL);
396} 396}
397 397
398static struct file_operations tstats_fops = { 398static const struct file_operations tstats_fops = {
399 .open = tstats_open, 399 .open = tstats_open,
400 .read = seq_read, 400 .read = seq_read,
401 .write = tstats_write, 401 .write = tstats_write,
diff --git a/kernel/timer.c b/kernel/timer.c
index a3d25f415019..5db5a8d26811 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42 42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -46,6 +46,9 @@
46#include <asm/timex.h> 46#include <asm/timex.h>
47#include <asm/io.h> 47#include <asm/io.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/timer.h>
51
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 52u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50 53
51EXPORT_SYMBOL(jiffies_64); 54EXPORT_SYMBOL(jiffies_64);
@@ -72,6 +75,7 @@ struct tvec_base {
72 spinlock_t lock; 75 spinlock_t lock;
73 struct timer_list *running_timer; 76 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 77 unsigned long timer_jiffies;
78 unsigned long next_timer;
75 struct tvec_root tv1; 79 struct tvec_root tv1;
76 struct tvec tv2; 80 struct tvec tv2;
77 struct tvec tv3; 81 struct tvec tv3;
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
520static inline void debug_timer_deactivate(struct timer_list *timer) { } 524static inline void debug_timer_deactivate(struct timer_list *timer) { }
521#endif 525#endif
522 526
527static inline void debug_init(struct timer_list *timer)
528{
529 debug_timer_init(timer);
530 trace_timer_init(timer);
531}
532
533static inline void
534debug_activate(struct timer_list *timer, unsigned long expires)
535{
536 debug_timer_activate(timer);
537 trace_timer_start(timer, expires);
538}
539
540static inline void debug_deactivate(struct timer_list *timer)
541{
542 debug_timer_deactivate(timer);
543 trace_timer_cancel(timer);
544}
545
523static void __init_timer(struct timer_list *timer, 546static void __init_timer(struct timer_list *timer,
524 const char *name, 547 const char *name,
525 struct lock_class_key *key) 548 struct lock_class_key *key)
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
548 const char *name, 571 const char *name,
549 struct lock_class_key *key) 572 struct lock_class_key *key)
550{ 573{
551 debug_timer_init(timer); 574 debug_init(timer);
552 __init_timer(timer, name, key); 575 __init_timer(timer, name, key);
553} 576}
554EXPORT_SYMBOL(init_timer_key); 577EXPORT_SYMBOL(init_timer_key);
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
567{ 590{
568 struct list_head *entry = &timer->entry; 591 struct list_head *entry = &timer->entry;
569 592
570 debug_timer_deactivate(timer); 593 debug_deactivate(timer);
571 594
572 __list_del(entry->prev, entry->next); 595 __list_del(entry->prev, entry->next);
573 if (clear_pending) 596 if (clear_pending)
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 645
623 if (timer_pending(timer)) { 646 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 647 detach_timer(timer, 0);
648 if (timer->expires == base->next_timer &&
649 !tbase_get_deferrable(timer->base))
650 base->next_timer = base->timer_jiffies;
625 ret = 1; 651 ret = 1;
626 } else { 652 } else {
627 if (pending_only) 653 if (pending_only)
628 goto out_unlock; 654 goto out_unlock;
629 } 655 }
630 656
631 debug_timer_activate(timer); 657 debug_activate(timer, expires);
632 658
633 new_base = __get_cpu_var(tvec_bases); 659 new_base = __get_cpu_var(tvec_bases);
634 660
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 689 }
664 690
665 timer->expires = expires; 691 timer->expires = expires;
692 if (time_before(timer->expires, base->next_timer) &&
693 !tbase_get_deferrable(timer->base))
694 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 695 internal_add_timer(base, timer);
667 696
668out_unlock: 697out_unlock:
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
780 BUG_ON(timer_pending(timer) || !timer->function); 809 BUG_ON(timer_pending(timer) || !timer->function);
781 spin_lock_irqsave(&base->lock, flags); 810 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 811 timer_set_base(timer, base);
783 debug_timer_activate(timer); 812 debug_activate(timer, timer->expires);
813 if (time_before(timer->expires, base->next_timer) &&
814 !tbase_get_deferrable(timer->base))
815 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 816 internal_add_timer(base, timer);
785 /* 817 /*
786 * Check whether the other CPU is idle and needs to be 818 * Check whether the other CPU is idle and needs to be
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 849 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 850 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 851 detach_timer(timer, 1);
852 if (timer->expires == base->next_timer &&
853 !tbase_get_deferrable(timer->base))
854 base->next_timer = base->timer_jiffies;
820 ret = 1; 855 ret = 1;
821 } 856 }
822 spin_unlock_irqrestore(&base->lock, flags); 857 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 885 ret = 0;
851 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 887 detach_timer(timer, 1);
888 if (timer->expires == base->next_timer &&
889 !tbase_get_deferrable(timer->base))
890 base->next_timer = base->timer_jiffies;
853 ret = 1; 891 ret = 1;
854 } 892 }
855out: 893out:
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
984 */ 1022 */
985 lock_map_acquire(&lockdep_map); 1023 lock_map_acquire(&lockdep_map);
986 1024
1025 trace_timer_expire_entry(timer);
987 fn(data); 1026 fn(data);
1027 trace_timer_expire_exit(timer);
988 1028
989 lock_map_release(&lockdep_map); 1029 lock_map_release(&lockdep_map);
990 1030
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1047#ifdef CONFIG_NO_HZ
1008/* 1048/*
1009 * Find out when the next timer event is due to happen. This 1049 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1050 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1051 * This function needs to be called with interrupts disabled.
1012 */ 1052 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1053static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1054{
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1174 unsigned long expires;
1135 1175
1136 spin_lock(&base->lock); 1176 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1177 if (time_before_eq(base->next_timer, base->timer_jiffies))
1178 base->next_timer = __next_timer_interrupt(base);
1179 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1180 spin_unlock(&base->lock);
1139 1181
1140 if (time_before_eq(expires, now)) 1182 if (time_before_eq(expires, now))
@@ -1169,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
1169{ 1211{
1170 struct tvec_base *base = __get_cpu_var(tvec_bases); 1212 struct tvec_base *base = __get_cpu_var(tvec_bases);
1171 1213
1172 perf_counter_do_pending(); 1214 perf_event_do_pending();
1173 1215
1174 hrtimer_run_pending(); 1216 hrtimer_run_pending();
1175 1217
@@ -1522,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1522 INIT_LIST_HEAD(base->tv1.vec + j); 1564 INIT_LIST_HEAD(base->tv1.vec + j);
1523 1565
1524 base->timer_jiffies = jiffies; 1566 base->timer_jiffies = jiffies;
1567 base->next_timer = base->timer_jiffies;
1525 return 0; 1568 return 0;
1526} 1569}
1527 1570
@@ -1534,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1534 timer = list_first_entry(head, struct timer_list, entry); 1577 timer = list_first_entry(head, struct timer_list, entry);
1535 detach_timer(timer, 0); 1578 detach_timer(timer, 0);
1536 timer_set_base(timer, new_base); 1579 timer_set_base(timer, new_base);
1580 if (time_before(timer->expires, new_base->next_timer) &&
1581 !tbase_get_deferrable(timer->base))
1582 new_base->next_timer = timer->expires;
1537 internal_add_timer(new_base, timer); 1583 internal_add_timer(new_base, timer);
1538 } 1584 }
1539} 1585}
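The kernel/timer.c hunks above all serve one optimization: the earliest pending non-deferrable expiry is cached in base->next_timer. The add and migrate paths may only pull the cached value earlier, the removal paths invalidate it back to timer_jiffies, and get_next_timer_interrupt() rescans the timer wheel only when the cache has gone stale. The following is a minimal userspace sketch of that bookkeeping under stated assumptions; struct tbase, tbase_add() and tbase_next_expiry() are hypothetical names, plain comparisons stand in for the wraparound-safe time_before()/time_before_eq(), and no real timer wheel is implemented.

/* Minimal sketch of the next_timer caching idea (not kernel code). */
#include <stdbool.h>
#include <stdio.h>

struct tbase {
	unsigned long now;		/* analogue of base->timer_jiffies */
	unsigned long next_timer;	/* cached earliest pending expiry  */
};

static void tbase_add(struct tbase *b, unsigned long expires, bool deferrable)
{
	/* mirrors __mod_timer()/add_timer_on(): only non-deferrable timers
	 * may pull the cached value earlier (ignoring jiffies wraparound) */
	if (!deferrable && expires < b->next_timer)
		b->next_timer = expires;
	/* ...the timer would be inserted into the wheel here... */
}

static unsigned long tbase_next_expiry(struct tbase *b,
				       unsigned long (*rescan)(struct tbase *))
{
	/* mirrors get_next_timer_interrupt(): rescan the wheel only when
	 * the cached value is stale, i.e. already in the past */
	if (b->next_timer <= b->now)
		b->next_timer = rescan(b);
	return b->next_timer;
}

/* stand-in for __next_timer_interrupt(): pretend the wheel walk found 1250 */
static unsigned long fake_rescan(struct tbase *b) { (void)b; return 1250; }

int main(void)
{
	struct tbase b = { .now = 1000, .next_timer = 1000 };	/* stale cache */

	tbase_add(&b, 1250, false);	/* expires later, cache unchanged */
	/* cache is <= now, so the first query falls back to a full rescan */
	printf("next expiry: %lu\n", tbase_next_expiry(&b, fake_rescan));
	return 0;
}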
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1ea0d1234f4a..b416512ad17f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_SYSCALL_TRACEPOINTS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
73# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
74# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
75# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
 76# hidding of the automatic options options. 86# hiding of the automatic options.
77 87
78config TRACING 88config TRACING
79 bool 89 bool
@@ -469,6 +479,18 @@ config FTRACE_STARTUP_TEST
469 functioning properly. It will do tests on all the configured 479 functioning properly. It will do tests on all the configured
470 tracers of ftrace. 480 tracers of ftrace.
471 481
482config EVENT_TRACE_TEST_SYSCALLS
483 bool "Run selftest on syscall events"
484 depends on FTRACE_STARTUP_TEST
485 help
486 This option will also enable testing every syscall event.
 487 For each syscall event, it enables the event, runs various loads
 488 with the event enabled, and then disables it again. This adds a bit
 489 more time to kernel boot since it is done for every system call defined.
490
491 TBD - enable a way to actually call the syscalls as we test their
492 events
493
472config MMIOTRACE 494config MMIOTRACE
473 bool "Memory mapped IO tracing" 495 bool "Memory mapped IO tracing"
474 depends on HAVE_MMIOTRACE_SUPPORT && PCI 496 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 57
58libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3eb159c277c8..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
856} 856}
857 857
858/** 858/**
859 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
860 * @q: queue the io is for
861 * @rq: the source request
862 * @dev: target device
863 * @from: source sector
864 *
865 * Description:
 866 * The device mapper remaps a request to other devices.
867 * Add a trace for that action.
868 *
869 **/
870static void blk_add_trace_rq_remap(struct request_queue *q,
871 struct request *rq, dev_t dev,
872 sector_t from)
873{
874 struct blk_trace *bt = q->blk_trace;
875 struct blk_io_trace_remap r;
876
877 if (likely(!bt))
878 return;
879
880 r.device_from = cpu_to_be32(dev);
881 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
882 r.sector_from = cpu_to_be64(from);
883
884 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
885 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
886 sizeof(r), &r);
887}
888
889/**
859 * blk_add_driver_data - Add binary message with driver-specific data 890 * blk_add_driver_data - Add binary message with driver-specific data
860 * @q: queue the io is for 891 * @q: queue the io is for
861 * @rq: io request 892 * @rq: io request
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
922 WARN_ON(ret); 953 WARN_ON(ret);
923 ret = register_trace_block_remap(blk_add_trace_remap); 954 ret = register_trace_block_remap(blk_add_trace_remap);
924 WARN_ON(ret); 955 WARN_ON(ret);
956 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
957 WARN_ON(ret);
925} 958}
926 959
927static void blk_unregister_tracepoints(void) 960static void blk_unregister_tracepoints(void)
928{ 961{
962 unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
929 unregister_trace_block_remap(blk_add_trace_remap); 963 unregister_trace_block_remap(blk_add_trace_remap);
930 unregister_trace_block_split(blk_add_trace_split); 964 unregister_trace_block_split(blk_add_trace_split);
931 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 965 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
@@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev)
1657 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1691 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1658} 1692}
1659 1693
1694void blk_trace_remove_sysfs(struct device *dev)
1695{
1696 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
1697}
1698
1660#endif /* CONFIG_BLK_DEV_IO_TRACE */ 1699#endif /* CONFIG_BLK_DEV_IO_TRACE */
1661 1700
1662#ifdef CONFIG_EVENT_TRACING 1701#ifdef CONFIG_EVENT_TRACING
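blk_add_trace_rq_remap() above packs the remap details into a fixed-layout record whose fields are converted to big-endian before being handed to __blk_add_trace(), so the trace stream can be decoded on a host of either endianness; register_trace_block_rq_remap() then attaches the function as a tracepoint probe. The userspace sketch below shows only the fixed-endianness serialization idea: struct remap_record and emit_remap() are made-up names, and the glibc htobe32()/htobe64() helpers stand in for cpu_to_be32()/cpu_to_be64().

/* Sketch (not blktrace code): emit a record with explicitly big-endian fields. */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

struct remap_record {		/* analogue of struct blk_io_trace_remap */
	uint32_t device_from;
	uint32_t device_to;
	uint64_t sector_from;
};

static void emit_remap(uint32_t from_dev, uint32_t to_dev, uint64_t sector)
{
	struct remap_record r;

	r.device_from = htobe32(from_dev);	/* mirrors cpu_to_be32() */
	r.device_to   = htobe32(to_dev);
	r.sector_from = htobe64(sector);	/* mirrors cpu_to_be64() */

	/* stand-in for __blk_add_trace(..., sizeof(r), &r) */
	fwrite(&r, sizeof(r), 1, stdout);
}

int main(void)
{
	emit_remap(0x0800, 0x0810, 123456);
	return 0;
}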
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c804e24f96f..37ba67e33265 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -225,7 +225,11 @@ static void ftrace_update_pid_func(void)
225 if (ftrace_trace_function == ftrace_stub) 225 if (ftrace_trace_function == ftrace_stub)
226 return; 226 return;
227 227
228#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
228 func = ftrace_trace_function; 229 func = ftrace_trace_function;
230#else
231 func = __ftrace_trace_function;
232#endif
229 233
230 if (ftrace_pid_trace) { 234 if (ftrace_pid_trace) {
231 set_ftrace_pid_function(func); 235 set_ftrace_pid_function(func);
@@ -1074,14 +1078,9 @@ static void ftrace_replace_code(int enable)
1074 failed = __ftrace_replace_code(rec, enable); 1078 failed = __ftrace_replace_code(rec, enable);
1075 if (failed) { 1079 if (failed) {
1076 rec->flags |= FTRACE_FL_FAILED; 1080 rec->flags |= FTRACE_FL_FAILED;
1077 if ((system_state == SYSTEM_BOOTING) || 1081 ftrace_bug(failed, rec->ip);
1078 !core_kernel_text(rec->ip)) { 1082 /* Stop processing */
1079 ftrace_free_rec(rec); 1083 return;
1080 } else {
1081 ftrace_bug(failed, rec->ip);
1082 /* Stop processing */
1083 return;
1084 }
1085 } 1084 }
1086 } while_for_each_ftrace_rec(); 1085 } while_for_each_ftrace_rec();
1087} 1086}
@@ -1323,11 +1322,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1323 1322
1324enum { 1323enum {
1325 FTRACE_ITER_FILTER = (1 << 0), 1324 FTRACE_ITER_FILTER = (1 << 0),
1326 FTRACE_ITER_CONT = (1 << 1), 1325 FTRACE_ITER_NOTRACE = (1 << 1),
1327 FTRACE_ITER_NOTRACE = (1 << 2), 1326 FTRACE_ITER_FAILURES = (1 << 2),
1328 FTRACE_ITER_FAILURES = (1 << 3), 1327 FTRACE_ITER_PRINTALL = (1 << 3),
1329 FTRACE_ITER_PRINTALL = (1 << 4), 1328 FTRACE_ITER_HASH = (1 << 4),
1330 FTRACE_ITER_HASH = (1 << 5),
1331}; 1329};
1332 1330
1333#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1331#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1337,8 +1335,7 @@ struct ftrace_iterator {
1337 int hidx; 1335 int hidx;
1338 int idx; 1336 int idx;
1339 unsigned flags; 1337 unsigned flags;
1340 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1338 struct trace_parser parser;
1341 unsigned buffer_idx;
1342}; 1339};
1343 1340
1344static void * 1341static void *
@@ -1407,7 +1404,7 @@ static int t_hash_show(struct seq_file *m, void *v)
1407 if (rec->ops->print) 1404 if (rec->ops->print)
1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1405 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1409 1406
1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); 1407 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1411 1408
1412 if (rec->data) 1409 if (rec->data)
1413 seq_printf(m, ":%p", rec->data); 1410 seq_printf(m, ":%p", rec->data);
@@ -1517,12 +1514,12 @@ static int t_show(struct seq_file *m, void *v)
1517 if (!rec) 1514 if (!rec)
1518 return 0; 1515 return 0;
1519 1516
1520 seq_printf(m, "%pf\n", (void *)rec->ip); 1517 seq_printf(m, "%ps\n", (void *)rec->ip);
1521 1518
1522 return 0; 1519 return 0;
1523} 1520}
1524 1521
1525static struct seq_operations show_ftrace_seq_ops = { 1522static const struct seq_operations show_ftrace_seq_ops = {
1526 .start = t_start, 1523 .start = t_start,
1527 .next = t_next, 1524 .next = t_next,
1528 .stop = t_stop, 1525 .stop = t_stop,
@@ -1604,6 +1601,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1604 if (!iter) 1601 if (!iter)
1605 return -ENOMEM; 1602 return -ENOMEM;
1606 1603
1604 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1605 kfree(iter);
1606 return -ENOMEM;
1607 }
1608
1607 mutex_lock(&ftrace_regex_lock); 1609 mutex_lock(&ftrace_regex_lock);
1608 if ((file->f_mode & FMODE_WRITE) && 1610 if ((file->f_mode & FMODE_WRITE) &&
1609 (file->f_flags & O_TRUNC)) 1611 (file->f_flags & O_TRUNC))
@@ -1618,8 +1620,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1618 if (!ret) { 1620 if (!ret) {
1619 struct seq_file *m = file->private_data; 1621 struct seq_file *m = file->private_data;
1620 m->private = iter; 1622 m->private = iter;
1621 } else 1623 } else {
1624 trace_parser_put(&iter->parser);
1622 kfree(iter); 1625 kfree(iter);
1626 }
1623 } else 1627 } else
1624 file->private_data = iter; 1628 file->private_data = iter;
1625 mutex_unlock(&ftrace_regex_lock); 1629 mutex_unlock(&ftrace_regex_lock);
@@ -2059,9 +2063,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2059 int i, len = 0; 2063 int i, len = 0;
2060 char *search; 2064 char *search;
2061 2065
2062 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2066 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2063 glob = NULL; 2067 glob = NULL;
2064 else { 2068 else if (glob) {
2065 int not; 2069 int not;
2066 2070
2067 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2071 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2196,11 +2200,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos, int enable) 2200 size_t cnt, loff_t *ppos, int enable)
2197{ 2201{
2198 struct ftrace_iterator *iter; 2202 struct ftrace_iterator *iter;
2199 char ch; 2203 struct trace_parser *parser;
2200 size_t read = 0; 2204 ssize_t ret, read;
2201 ssize_t ret;
2202 2205
2203 if (!cnt || cnt < 0) 2206 if (!cnt)
2204 return 0; 2207 return 0;
2205 2208
2206 mutex_lock(&ftrace_regex_lock); 2209 mutex_lock(&ftrace_regex_lock);
@@ -2211,72 +2214,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2211 } else 2214 } else
2212 iter = file->private_data; 2215 iter = file->private_data;
2213 2216
2214 if (!*ppos) { 2217 parser = &iter->parser;
2215 iter->flags &= ~FTRACE_ITER_CONT; 2218 read = trace_get_user(parser, ubuf, cnt, ppos);
2216 iter->buffer_idx = 0;
2217 }
2218
2219 ret = get_user(ch, ubuf++);
2220 if (ret)
2221 goto out;
2222 read++;
2223 cnt--;
2224
2225 /*
2226 * If the parser haven't finished with the last write,
2227 * continue reading the user input without skipping spaces.
2228 */
2229 if (!(iter->flags & FTRACE_ITER_CONT)) {
2230 /* skip white space */
2231 while (cnt && isspace(ch)) {
2232 ret = get_user(ch, ubuf++);
2233 if (ret)
2234 goto out;
2235 read++;
2236 cnt--;
2237 }
2238
2239 /* only spaces were written */
2240 if (isspace(ch)) {
2241 *ppos += read;
2242 ret = read;
2243 goto out;
2244 }
2245
2246 iter->buffer_idx = 0;
2247 }
2248 2219
2249 while (cnt && !isspace(ch)) { 2220 if (read >= 0 && trace_parser_loaded(parser) &&
2250 if (iter->buffer_idx < FTRACE_BUFF_MAX) 2221 !trace_parser_cont(parser)) {
2251 iter->buffer[iter->buffer_idx++] = ch; 2222 ret = ftrace_process_regex(parser->buffer,
2252 else { 2223 parser->idx, enable);
2253 ret = -EINVAL;
2254 goto out;
2255 }
2256 ret = get_user(ch, ubuf++);
2257 if (ret) 2224 if (ret)
2258 goto out; 2225 goto out;
2259 read++;
2260 cnt--;
2261 }
2262 2226
2263 if (isspace(ch)) { 2227 trace_parser_clear(parser);
2264 iter->buffer[iter->buffer_idx] = 0;
2265 ret = ftrace_process_regex(iter->buffer,
2266 iter->buffer_idx, enable);
2267 if (ret)
2268 goto out;
2269 iter->buffer_idx = 0;
2270 } else {
2271 iter->flags |= FTRACE_ITER_CONT;
2272 iter->buffer[iter->buffer_idx++] = ch;
2273 } 2228 }
2274 2229
2275 *ppos += read;
2276 ret = read; 2230 ret = read;
2277 out:
2278 mutex_unlock(&ftrace_regex_lock);
2279 2231
2232 mutex_unlock(&ftrace_regex_lock);
2233out:
2280 return ret; 2234 return ret;
2281} 2235}
2282 2236
@@ -2381,6 +2335,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2381{ 2335{
2382 struct seq_file *m = (struct seq_file *)file->private_data; 2336 struct seq_file *m = (struct seq_file *)file->private_data;
2383 struct ftrace_iterator *iter; 2337 struct ftrace_iterator *iter;
2338 struct trace_parser *parser;
2384 2339
2385 mutex_lock(&ftrace_regex_lock); 2340 mutex_lock(&ftrace_regex_lock);
2386 if (file->f_mode & FMODE_READ) { 2341 if (file->f_mode & FMODE_READ) {
@@ -2390,9 +2345,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2390 } else 2345 } else
2391 iter = file->private_data; 2346 iter = file->private_data;
2392 2347
2393 if (iter->buffer_idx) { 2348 parser = &iter->parser;
2394 iter->buffer[iter->buffer_idx] = 0; 2349 if (trace_parser_loaded(parser)) {
2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2350 parser->buffer[parser->idx] = 0;
2351 ftrace_match_records(parser->buffer, parser->idx, enable);
2396 } 2352 }
2397 2353
2398 mutex_lock(&ftrace_lock); 2354 mutex_lock(&ftrace_lock);
@@ -2400,7 +2356,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2400 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2356 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2401 mutex_unlock(&ftrace_lock); 2357 mutex_unlock(&ftrace_lock);
2402 2358
2359 trace_parser_put(parser);
2403 kfree(iter); 2360 kfree(iter);
2361
2404 mutex_unlock(&ftrace_regex_lock); 2362 mutex_unlock(&ftrace_regex_lock);
2405 return 0; 2363 return 0;
2406} 2364}
@@ -2457,11 +2415,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2457static void * 2415static void *
2458__g_next(struct seq_file *m, loff_t *pos) 2416__g_next(struct seq_file *m, loff_t *pos)
2459{ 2417{
2460 unsigned long *array = m->private;
2461
2462 if (*pos >= ftrace_graph_count) 2418 if (*pos >= ftrace_graph_count)
2463 return NULL; 2419 return NULL;
2464 return &array[*pos]; 2420 return &ftrace_graph_funcs[*pos];
2465} 2421}
2466 2422
2467static void * 2423static void *
@@ -2499,12 +2455,12 @@ static int g_show(struct seq_file *m, void *v)
2499 return 0; 2455 return 0;
2500 } 2456 }
2501 2457
2502 seq_printf(m, "%pf\n", v); 2458 seq_printf(m, "%ps\n", (void *)*ptr);
2503 2459
2504 return 0; 2460 return 0;
2505} 2461}
2506 2462
2507static struct seq_operations ftrace_graph_seq_ops = { 2463static const struct seq_operations ftrace_graph_seq_ops = {
2508 .start = g_start, 2464 .start = g_start,
2509 .next = g_next, 2465 .next = g_next,
2510 .stop = g_stop, 2466 .stop = g_stop,
@@ -2525,16 +2481,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2525 ftrace_graph_count = 0; 2481 ftrace_graph_count = 0;
2526 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2482 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2527 } 2483 }
2484 mutex_unlock(&graph_lock);
2528 2485
2529 if (file->f_mode & FMODE_READ) { 2486 if (file->f_mode & FMODE_READ)
2530 ret = seq_open(file, &ftrace_graph_seq_ops); 2487 ret = seq_open(file, &ftrace_graph_seq_ops);
2531 if (!ret) {
2532 struct seq_file *m = file->private_data;
2533 m->private = ftrace_graph_funcs;
2534 }
2535 } else
2536 file->private_data = ftrace_graph_funcs;
2537 mutex_unlock(&graph_lock);
2538 2488
2539 return ret; 2489 return ret;
2540} 2490}
@@ -2602,12 +2552,8 @@ static ssize_t
2602ftrace_graph_write(struct file *file, const char __user *ubuf, 2552ftrace_graph_write(struct file *file, const char __user *ubuf,
2603 size_t cnt, loff_t *ppos) 2553 size_t cnt, loff_t *ppos)
2604{ 2554{
2605 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2555 struct trace_parser parser;
2606 unsigned long *array; 2556 ssize_t read, ret;
2607 size_t read = 0;
2608 ssize_t ret;
2609 int index = 0;
2610 char ch;
2611 2557
2612 if (!cnt || cnt < 0) 2558 if (!cnt || cnt < 0)
2613 return 0; 2559 return 0;
@@ -2616,60 +2562,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2616 2562
2617 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { 2563 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2618 ret = -EBUSY; 2564 ret = -EBUSY;
2619 goto out; 2565 goto out_unlock;
2620 } 2566 }
2621 2567
2622 if (file->f_mode & FMODE_READ) { 2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2623 struct seq_file *m = file->private_data; 2569 ret = -ENOMEM;
2624 array = m->private; 2570 goto out_unlock;
2625 } else
2626 array = file->private_data;
2627
2628 ret = get_user(ch, ubuf++);
2629 if (ret)
2630 goto out;
2631 read++;
2632 cnt--;
2633
2634 /* skip white space */
2635 while (cnt && isspace(ch)) {
2636 ret = get_user(ch, ubuf++);
2637 if (ret)
2638 goto out;
2639 read++;
2640 cnt--;
2641 } 2571 }
2642 2572
2643 if (isspace(ch)) { 2573 read = trace_get_user(&parser, ubuf, cnt, ppos);
2644 *ppos += read;
2645 ret = read;
2646 goto out;
2647 }
2648 2574
2649 while (cnt && !isspace(ch)) { 2575 if (read >= 0 && trace_parser_loaded((&parser))) {
2650 if (index < FTRACE_BUFF_MAX) 2576 parser.buffer[parser.idx] = 0;
2651 buffer[index++] = ch; 2577
2652 else { 2578 /* we allow only one expression at a time */
2653 ret = -EINVAL; 2579 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2654 goto out; 2580 parser.buffer);
2655 }
2656 ret = get_user(ch, ubuf++);
2657 if (ret) 2581 if (ret)
2658 goto out; 2582 goto out_free;
2659 read++;
2660 cnt--;
2661 } 2583 }
2662 buffer[index] = 0;
2663
2664 /* we allow only one expression at a time */
2665 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2666 if (ret)
2667 goto out;
2668
2669 file->f_pos += read;
2670 2584
2671 ret = read; 2585 ret = read;
2672 out: 2586
2587out_free:
2588 trace_parser_put(&parser);
2589out_unlock:
2673 mutex_unlock(&graph_lock); 2590 mutex_unlock(&graph_lock);
2674 2591
2675 return ret; 2592 return ret;
@@ -2740,19 +2657,17 @@ static int ftrace_convert_nops(struct module *mod,
2740} 2657}
2741 2658
2742#ifdef CONFIG_MODULES 2659#ifdef CONFIG_MODULES
2743void ftrace_release(void *start, void *end) 2660void ftrace_release_mod(struct module *mod)
2744{ 2661{
2745 struct dyn_ftrace *rec; 2662 struct dyn_ftrace *rec;
2746 struct ftrace_page *pg; 2663 struct ftrace_page *pg;
2747 unsigned long s = (unsigned long)start;
2748 unsigned long e = (unsigned long)end;
2749 2664
2750 if (ftrace_disabled || !start || start == end) 2665 if (ftrace_disabled)
2751 return; 2666 return;
2752 2667
2753 mutex_lock(&ftrace_lock); 2668 mutex_lock(&ftrace_lock);
2754 do_for_each_ftrace_rec(pg, rec) { 2669 do_for_each_ftrace_rec(pg, rec) {
2755 if ((rec->ip >= s) && (rec->ip < e)) { 2670 if (within_module_core(rec->ip, mod)) {
2756 /* 2671 /*
2757 * rec->ip is changed in ftrace_free_rec() 2672 * rec->ip is changed in ftrace_free_rec()
 2758 * It should not be between s and e if the record was freed. 2673 * It should not be between s and e if the record was freed.
@@ -2784,9 +2699,7 @@ static int ftrace_module_notify(struct notifier_block *self,
2784 mod->num_ftrace_callsites); 2699 mod->num_ftrace_callsites);
2785 break; 2700 break;
2786 case MODULE_STATE_GOING: 2701 case MODULE_STATE_GOING:
2787 ftrace_release(mod->ftrace_callsites, 2702 ftrace_release_mod(mod);
2788 mod->ftrace_callsites +
2789 mod->num_ftrace_callsites);
2790 break; 2703 break;
2791 } 2704 }
2792 2705
@@ -3100,7 +3013,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3100 3013
3101int 3014int
3102ftrace_enable_sysctl(struct ctl_table *table, int write, 3015ftrace_enable_sysctl(struct ctl_table *table, int write,
3103 struct file *file, void __user *buffer, size_t *lenp, 3016 void __user *buffer, size_t *lenp,
3104 loff_t *ppos) 3017 loff_t *ppos)
3105{ 3018{
3106 int ret; 3019 int ret;
@@ -3110,7 +3023,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3110 3023
3111 mutex_lock(&ftrace_lock); 3024 mutex_lock(&ftrace_lock);
3112 3025
3113 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3026 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3114 3027
3115 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3028 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3116 goto out; 3029 goto out;
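One ftrace.c hunk above fixes a classic strcmp() pitfall in __unregister_ftrace_function_probe(): strcmp() returns 0 on a match, so "if (strcmp(glob, \"*\"))" was true for every glob except "*", the opposite of the intent; the fix compares against 0 explicitly and also guards the else branch against a NULL glob. A small standalone illustration of the pitfall follows (nothing here is ftrace code):

/* strcmp() returns 0 when the strings are equal, so using its result
 * directly as a boolean inverts the test. */
#include <stdio.h>
#include <string.h>

static int matches_all_buggy(const char *glob)
{
	/* wrong: true for everything EXCEPT "*" */
	return glob && (strcmp(glob, "*") || !strlen(glob));
}

static int matches_all_fixed(const char *glob)
{
	/* right: true only for "*" or an empty glob */
	return glob && (strcmp(glob, "*") == 0 || !strlen(glob));
}

int main(void)
{
	const char *globs[] = { "*", "sys_*", "" };

	for (unsigned i = 0; i < 3; i++)
		printf("%-6s buggy=%d fixed=%d\n", globs[i],
		       matches_all_buggy(globs[i]), matches_all_fixed(globs[i]));
	return 0;
}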
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 81b1645c8549..a91da69f153a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void)
501 return 1; 501 return 1;
502 } 502 }
503 503
504 if (!register_tracer(&kmem_tracer)) { 504 if (register_tracer(&kmem_tracer) != 0) {
505 pr_warning("Warning: could not register the kmem tracer\n"); 505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1; 506 return 1;
507 } 507 }
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
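power-traces.c follows the usual tracepoint-instantiation pattern: exactly one translation unit defines CREATE_TRACE_POINTS before including the event header, which turns the declarations in trace/events/power.h into real tracepoint definitions, and EXPORT_TRACEPOINT_SYMBOL_GPL() lets modules attach probes to them. The userspace sketch below mimics only the "declare in the header, define once behind a guard macro" part of that pattern; CREATE_POWER_HOOKS and hook_power_start() are invented names, not kernel API.

#include <stdio.h>

/* power-traces.c equivalent: define the guard, then "include" the header */
#define CREATE_POWER_HOOKS

/* ---- what a shared "power_hooks.h" would contain (inlined here) ------- */
#ifndef CREATE_POWER_HOOKS
void hook_power_start(int type, int state);		/* declaration only  */
#else
void hook_power_start(int type, int state)		/* emitted once here */
{
	/* a real tracepoint would iterate over registered probes instead */
	printf("power_start: type=%d state=%d\n", type, state);
}
#endif
/* ----------------------------------------------------------------------- */

int main(void)
{
	/* every other file sees only the declaration and simply calls it */
	hook_power_start(1, 3);
	return 0;
}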
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 454e74e718cf..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -701,8 +699,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
701 699
702 val &= ~RB_FLAG_MASK; 700 val &= ~RB_FLAG_MASK;
703 701
704 ret = (unsigned long)cmpxchg(&list->next, 702 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag); 703 val | old_flag, val | new_flag);
706 704
707 /* check if the reader took the page */ 705 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val) 706 if ((ret & ~RB_FLAG_MASK) != val)
@@ -794,7 +792,7 @@ static int rb_head_page_replace(struct buffer_page *old,
794 val = *ptr & ~RB_FLAG_MASK; 792 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD; 793 val |= RB_PAGE_HEAD;
796 794
797 ret = cmpxchg(ptr, val, &new->list); 795 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798 796
799 return ret == val; 797 return ret == val;
800} 798}
@@ -2997,15 +2995,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2997} 2995}
2998 2996
2999static struct ring_buffer_event * 2997static struct ring_buffer_event *
3000rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 2998rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3001{ 2999{
3002 struct ring_buffer_per_cpu *cpu_buffer;
3003 struct ring_buffer_event *event; 3000 struct ring_buffer_event *event;
3004 struct buffer_page *reader; 3001 struct buffer_page *reader;
3005 int nr_loops = 0; 3002 int nr_loops = 0;
3006 3003
3007 cpu_buffer = buffer->buffers[cpu];
3008
3009 again: 3004 again:
3010 /* 3005 /*
3011 * We repeat when a timestamp is encountered. It is possible 3006 * We repeat when a timestamp is encountered. It is possible
@@ -3049,7 +3044,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3049 case RINGBUF_TYPE_DATA: 3044 case RINGBUF_TYPE_DATA:
3050 if (ts) { 3045 if (ts) {
3051 *ts = cpu_buffer->read_stamp + event->time_delta; 3046 *ts = cpu_buffer->read_stamp + event->time_delta;
3052 ring_buffer_normalize_time_stamp(buffer, 3047 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3053 cpu_buffer->cpu, ts); 3048 cpu_buffer->cpu, ts);
3054 } 3049 }
3055 return event; 3050 return event;
@@ -3168,7 +3163,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3168 local_irq_save(flags); 3163 local_irq_save(flags);
3169 if (dolock) 3164 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock); 3165 spin_lock(&cpu_buffer->reader_lock);
3171 event = rb_buffer_peek(buffer, cpu, ts); 3166 event = rb_buffer_peek(cpu_buffer, ts);
3172 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3167 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer); 3168 rb_advance_reader(cpu_buffer);
3174 if (dolock) 3169 if (dolock)
@@ -3237,7 +3232,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3237 if (dolock) 3232 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock); 3233 spin_lock(&cpu_buffer->reader_lock);
3239 3234
3240 event = rb_buffer_peek(buffer, cpu, ts); 3235 event = rb_buffer_peek(cpu_buffer, ts);
3241 if (event) 3236 if (event)
3242 rb_advance_reader(cpu_buffer); 3237 rb_advance_reader(cpu_buffer);
3243 3238
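The ring_buffer.c hunks above only fix the argument and cast types handed to cmpxchg(); the underlying technique is a lock-free compare-and-swap on a pointer-sized word whose low bits (RB_FLAG_MASK) carry the head-page flags. Below is a userspace sketch of that tagged-pointer CAS using the GCC/Clang __sync_val_compare_and_swap() builtin; FLAG_MASK, set_flag() and the values are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK	3UL	/* analogue of RB_FLAG_MASK */
#define FLAG_NORMAL	0UL
#define FLAG_HEAD	1UL

static unsigned long set_flag(unsigned long *word,
			      unsigned long old_flag, unsigned long new_flag)
{
	unsigned long val = *word & ~FLAG_MASK;	/* pointer part, flags stripped */

	/* swap the flag only if the word still holds (pointer | old_flag) */
	return __sync_val_compare_and_swap(word, val | old_flag, val | new_flag);
}

int main(void)
{
	static int page;					/* stand-in "page" */
	unsigned long word = (unsigned long)&page | FLAG_HEAD;
	unsigned long ret;

	ret = set_flag(&word, FLAG_HEAD, FLAG_NORMAL);
	/* as in rb_head_page_set(): the caller checks that the pointer part
	 * of the returned old value is still the one it expected */
	printf("cas %s, flags now %lu\n",
	       (ret & ~FLAG_MASK) == (unsigned long)&page ? "won" : "lost",
	       word & FLAG_MASK);
	return 0;
}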
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe30..c820b0310a12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;
125 125
126static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
127 127
128#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
129static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
136 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
@@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
242static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
243 243
244/* 244/*
245 * max_tracer_type_len is used to simplify the allocating of
246 * buffers to read userspace tracer names. We keep track of
247 * the longest tracer name registered.
248 */
249static int max_tracer_type_len;
250
251/*
252 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
253 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
254 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
275 */ 268 */
276void trace_wake_up(void) 269void trace_wake_up(void)
277{ 270{
271 int cpu;
272
273 if (trace_flags & TRACE_ITER_BLOCK)
274 return;
278 /* 275 /*
279 * The runqueue_is_locked() can fail, but this is the best we 276 * The runqueue_is_locked() can fail, but this is the best we
280 * have for now: 277 * have for now:
281 */ 278 */
282 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) 279 cpu = get_cpu();
280 if (!runqueue_is_locked(cpu))
283 wake_up(&trace_wait); 281 wake_up(&trace_wait);
282 put_cpu();
284} 283}
285 284
286static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
@@ -339,6 +338,112 @@ static struct {
339 338
340int trace_clock_id; 339int trace_clock_id;
341 340
341/*
 342 * trace_parser_get_init - allocates the buffer for the trace parser
343 */
344int trace_parser_get_init(struct trace_parser *parser, int size)
345{
346 memset(parser, 0, sizeof(*parser));
347
348 parser->buffer = kmalloc(size, GFP_KERNEL);
349 if (!parser->buffer)
350 return 1;
351
352 parser->size = size;
353 return 0;
354}
355
356/*
 357 * trace_parser_put - frees the buffer for the trace parser
358 */
359void trace_parser_put(struct trace_parser *parser)
360{
361 kfree(parser->buffer);
362}
363
364/*
365 * trace_get_user - reads the user input string separated by space
366 * (matched by isspace(ch))
367 *
 368 * For each string found, the 'struct trace_parser' is updated,
369 * and the function returns.
370 *
371 * Returns number of bytes read.
372 *
373 * See kernel/trace/trace.h for 'struct trace_parser' details.
374 */
375int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
376 size_t cnt, loff_t *ppos)
377{
378 char ch;
379 size_t read = 0;
380 ssize_t ret;
381
382 if (!*ppos)
383 trace_parser_clear(parser);
384
385 ret = get_user(ch, ubuf++);
386 if (ret)
387 goto out;
388
389 read++;
390 cnt--;
391
392 /*
 393 * If the parser has not finished with the last write,
394 * continue reading the user input without skipping spaces.
395 */
396 if (!parser->cont) {
397 /* skip white space */
398 while (cnt && isspace(ch)) {
399 ret = get_user(ch, ubuf++);
400 if (ret)
401 goto out;
402 read++;
403 cnt--;
404 }
405
406 /* only spaces were written */
407 if (isspace(ch)) {
408 *ppos += read;
409 ret = read;
410 goto out;
411 }
412
413 parser->idx = 0;
414 }
415
416 /* read the non-space input */
417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size - 1)
419 parser->buffer[parser->idx++] = ch;
420 else {
421 ret = -EINVAL;
422 goto out;
423 }
424 ret = get_user(ch, ubuf++);
425 if (ret)
426 goto out;
427 read++;
428 cnt--;
429 }
430
431 /* We either got finished input or we have to wait for another call. */
432 if (isspace(ch)) {
433 parser->buffer[parser->idx] = 0;
434 parser->cont = false;
435 } else {
436 parser->cont = true;
437 parser->buffer[parser->idx++] = ch;
438 }
439
440 *ppos += read;
441 ret = read;
442
443out:
444 return ret;
445}
446
342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 447ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
343{ 448{
344 int len; 449 int len;
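trace_get_user() above is deliberately chunk-tolerant: user space may split "func_a func_b" across several write() calls, and parser->cont remembers that the last word was cut mid-token so the next chunk continues it instead of starting over. The userspace mimic below shows how a caller such as ftrace_regex_write() drives such a parser; struct parser and feed() are simplified stand-ins, not the kernel implementation.

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

struct parser { char buf[64]; unsigned idx; bool cont; };

/* consume one chunk; return true once buf holds a finished token */
static bool feed(struct parser *p, const char *chunk, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		char ch = chunk[i];

		if (isspace((unsigned char)ch)) {
			if (p->idx) {		/* terminating space found */
				p->buf[p->idx] = 0;
				p->cont = false;
				return true;
			}
			continue;		/* skip leading spaces */
		}
		if (p->idx < sizeof(p->buf) - 1)
			p->buf[p->idx++] = ch;
		p->cont = true;			/* word may continue next chunk */
	}
	return false;	/* partial word stays pending, like parser->cont */
}

int main(void)
{
	struct parser p = { .idx = 0, .cont = false };
	const char *chunks[] = { "sys_re", "ad sys_write " };	/* split mid-word */

	for (int i = 0; i < 2; i++) {
		for (const char *c = chunks[i]; *c; c++) {
			if (feed(&p, c, 1)) {	/* byte-at-a-time, worst case */
				printf("token: %s\n", p.buf);
				p.idx = 0;
			}
		}
	}
	return 0;
}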
@@ -513,7 +618,6 @@ __releases(kernel_lock)
513__acquires(kernel_lock) 618__acquires(kernel_lock)
514{ 619{
515 struct tracer *t; 620 struct tracer *t;
516 int len;
517 int ret = 0; 621 int ret = 0;
518 622
519 if (!type->name) { 623 if (!type->name) {
@@ -521,6 +625,11 @@ __acquires(kernel_lock)
521 return -1; 625 return -1;
522 } 626 }
523 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
524 /* 633 /*
525 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
526 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -535,7 +644,7 @@ __acquires(kernel_lock)
535 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
536 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
537 /* already found */ 646 /* already found */
538 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
539 type->name); 648 type->name);
540 ret = -1; 649 ret = -1;
541 goto out; 650 goto out;
@@ -586,9 +695,6 @@ __acquires(kernel_lock)
586 695
587 type->next = trace_types; 696 type->next = trace_types;
588 trace_types = type; 697 trace_types = type;
589 len = strlen(type->name);
590 if (len > max_tracer_type_len)
591 max_tracer_type_len = len;
592 698
593 out: 699 out:
594 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -597,7 +703,7 @@ __acquires(kernel_lock)
597 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
598 goto out_unlock; 704 goto out_unlock;
599 705
600 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
601 goto out_unlock; 707 goto out_unlock;
602 708
603 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -619,14 +725,13 @@ __acquires(kernel_lock)
619void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
620{ 726{
621 struct tracer **t; 727 struct tracer **t;
622 int len;
623 728
624 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
625 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
626 if (*t == type) 731 if (*t == type)
627 goto found; 732 goto found;
628 } 733 }
629 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
630 goto out; 735 goto out;
631 736
632 found: 737 found:
@@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type)
639 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
640 current_trace = &nop_trace; 745 current_trace = &nop_trace;
641 } 746 }
642 747out:
643 if (strlen(type->name) != max_tracer_type_len)
644 goto out;
645
646 max_tracer_type_len = 0;
647 for (t = &trace_types; *t; t = &(*t)->next) {
648 len = strlen((*t)->name);
649 if (len > max_tracer_type_len)
650 max_tracer_type_len = len;
651 }
652 out:
653 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
654} 749}
655 750
@@ -719,6 +814,11 @@ static void trace_init_cmdlines(void)
719 cmdline_idx = 0; 814 cmdline_idx = 0;
720} 815}
721 816
817int is_tracing_stopped(void)
818{
819 return trace_stop_count;
820}
821
722/** 822/**
723 * ftrace_off_permanent - disable all ftrace code permanently 823 * ftrace_off_permanent - disable all ftrace code permanently
724 * 824 *
@@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
886 986
887 entry->preempt_count = pc & 0xff; 987 entry->preempt_count = pc & 0xff;
888 entry->pid = (tsk) ? tsk->pid : 0; 988 entry->pid = (tsk) ? tsk->pid : 0;
889 entry->tgid = (tsk) ? tsk->tgid : 0; 989 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
890 entry->flags = 990 entry->flags =
891#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 991#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
892 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 992 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1068 return; 1168 return;
1069 entry = ring_buffer_event_data(event); 1169 entry = ring_buffer_event_data(event);
1070 1170
1171 entry->tgid = current->tgid;
1071 memset(&entry->caller, 0, sizeof(entry->caller)); 1172 memset(&entry->caller, 0, sizeof(entry->caller));
1072 1173
1073 trace.nr_entries = 0; 1174 trace.nr_entries = 0;
@@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr,
1094 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1195 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1095 int pc) 1196 int pc)
1096{ 1197{
1198 struct ftrace_event_call *call = &event_special;
1097 struct ring_buffer_event *event; 1199 struct ring_buffer_event *event;
1098 struct trace_array *tr = __tr; 1200 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer; 1201 struct ring_buffer *buffer = tr->buffer;
@@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr,
1107 entry->arg1 = arg1; 1209 entry->arg1 = arg1;
1108 entry->arg2 = arg2; 1210 entry->arg2 = arg2;
1109 entry->arg3 = arg3; 1211 entry->arg3 = arg3;
1110 trace_buffer_unlock_commit(buffer, event, 0, pc); 1212
1213 if (!filter_check_discard(call, entry, buffer, event))
1214 trace_buffer_unlock_commit(buffer, event, 0, pc);
1111} 1215}
1112 1216
1113void 1217void
@@ -1289,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr,
1289 1393
1290int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1291{ 1395{
1292 return trace_array_printk(&global_trace, ip, fmt, args); 1396 return trace_array_vprintk(&global_trace, ip, fmt, args);
1293} 1397}
1294EXPORT_SYMBOL_GPL(trace_vprintk); 1398EXPORT_SYMBOL_GPL(trace_vprintk);
1295 1399
@@ -1530,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m)
1530 seq_puts(m, "# | / _----=> need-resched \n"); 1634 seq_puts(m, "# | / _----=> need-resched \n");
1531 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1635 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1532 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1636 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1533 seq_puts(m, "# |||| / \n"); 1637 seq_puts(m, "# |||| /_--=> lock-depth \n");
1534 seq_puts(m, "# ||||| delay \n"); 1638 seq_puts(m, "# |||||/ delay \n");
1535 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1639 seq_puts(m, "# cmd pid |||||| time | caller \n");
1536 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1640 seq_puts(m, "# \\ / |||||| \\ | / \n");
1537} 1641}
1538 1642
1539static void print_func_help_header(struct seq_file *m) 1643static void print_func_help_header(struct seq_file *m)
@@ -1845,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
1845 return 0; 1949 return 0;
1846} 1950}
1847 1951
1848static struct seq_operations tracer_seq_ops = { 1952static const struct seq_operations tracer_seq_ops = {
1849 .start = s_start, 1953 .start = s_start,
1850 .next = s_next, 1954 .next = s_next,
1851 .stop = s_stop, 1955 .stop = s_stop,
@@ -1880,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file)
1880 if (current_trace) 1984 if (current_trace)
1881 *iter->trace = *current_trace; 1985 *iter->trace = *current_trace;
1882 1986
1883 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) 1987 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
1884 goto fail; 1988 goto fail;
1885 1989
1886 cpumask_clear(iter->started);
1887
1888 if (current_trace && current_trace->print_max) 1990 if (current_trace && current_trace->print_max)
1889 iter->tr = &max_tr; 1991 iter->tr = &max_tr;
1890 else 1992 else
@@ -2059,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v)
2059 return 0; 2161 return 0;
2060} 2162}
2061 2163
2062static struct seq_operations show_traces_seq_ops = { 2164static const struct seq_operations show_traces_seq_ops = {
2063 .start = t_start, 2165 .start = t_start,
2064 .next = t_next, 2166 .next = t_next,
2065 .stop = t_stop, 2167 .stop = t_stop,
@@ -2489,7 +2591,7 @@ static ssize_t
2489tracing_set_trace_read(struct file *filp, char __user *ubuf, 2591tracing_set_trace_read(struct file *filp, char __user *ubuf,
2490 size_t cnt, loff_t *ppos) 2592 size_t cnt, loff_t *ppos)
2491{ 2593{
2492 char buf[max_tracer_type_len+2]; 2594 char buf[MAX_TRACER_SIZE+2];
2493 int r; 2595 int r;
2494 2596
2495 mutex_lock(&trace_types_lock); 2597 mutex_lock(&trace_types_lock);
@@ -2639,15 +2741,15 @@ static ssize_t
2639tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2741tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2640 size_t cnt, loff_t *ppos) 2742 size_t cnt, loff_t *ppos)
2641{ 2743{
2642 char buf[max_tracer_type_len+1]; 2744 char buf[MAX_TRACER_SIZE+1];
2643 int i; 2745 int i;
2644 size_t ret; 2746 size_t ret;
2645 int err; 2747 int err;
2646 2748
2647 ret = cnt; 2749 ret = cnt;
2648 2750
2649 if (cnt > max_tracer_type_len) 2751 if (cnt > MAX_TRACER_SIZE)
2650 cnt = max_tracer_type_len; 2752 cnt = MAX_TRACER_SIZE;
2651 2753
2652 if (copy_from_user(&buf, ubuf, cnt)) 2754 if (copy_from_user(&buf, ubuf, cnt))
2653 return -EFAULT; 2755 return -EFAULT;
@@ -4285,7 +4387,7 @@ __init static int tracer_alloc_buffers(void)
4285 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4286 goto out_free_buffer_mask; 4388 goto out_free_buffer_mask;
4287 4389
4288 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) 4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4289 goto out_free_tracing_cpumask; 4391 goto out_free_tracing_cpumask;
4290 4392
4291 /* To save memory, keep the ring buffer size to its minimum */ 4393 /* To save memory, keep the ring buffer size to its minimum */
@@ -4296,7 +4398,6 @@ __init static int tracer_alloc_buffers(void)
4296 4398
4297 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4399 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4298 cpumask_copy(tracing_cpumask, cpu_all_mask); 4400 cpumask_copy(tracing_cpumask, cpu_all_mask);
4299 cpumask_clear(tracing_reader_cpumask);
4300 4401
4301 /* TODO: make the number of buffers hot pluggable with CPUS */ 4402 /* TODO: make the number of buffers hot pluggable with CPUS */
4302 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4403 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
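Two trace.c hunks above replace alloc_cpumask_var() plus cpumask_clear() with zalloc_cpumask_var(), which hands back an already-zeroed mask, so the separate clearing calls (and the chance of forgetting one) disappear. A small userspace analogy of the same allocate-and-zero preference, with calloc() standing in for zalloc_cpumask_var():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MASK_BYTES 16	/* stand-in for cpumask_size() */

static unsigned char *mask_alloc_then_clear(void)
{
	unsigned char *m = malloc(MASK_BYTES);	/* alloc_cpumask_var() analogue */

	if (!m)
		return NULL;
	memset(m, 0, MASK_BYTES);		/* the easy-to-forget step */
	return m;
}

static unsigned char *mask_zalloc(void)
{
	return calloc(1, MASK_BYTES);		/* zalloc_cpumask_var() analogue */
}

int main(void)
{
	unsigned char *a = mask_alloc_then_clear();
	unsigned char *b = mask_zalloc();

	printf("both zeroed: %d\n", a && b && a[0] == 0 && b[0] == 0);
	free(a);
	free(b);
	return 0;
}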
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fa1dccb579d5..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,10 +7,10 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 16#include <linux/ftrace_event.h>
@@ -36,163 +36,59 @@ enum trace_type {
36 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
37 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
39 TRACE_POWER,
40 TRACE_BLK, 39 TRACE_BLK,
41 40
42 __TRACE_LAST_TYPE, 41 __TRACE_LAST_TYPE,
43}; 42};
44 43
45/* 44enum kmemtrace_type_id {
46 * Function trace entry - function address and parent function addres: 45 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 */ 46 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48struct ftrace_entry { 47 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49 struct trace_entry ent;
50 unsigned long ip;
51 unsigned long parent_ip;
52};
53
54/* Function call entry */
55struct ftrace_graph_ent_entry {
56 struct trace_entry ent;
57 struct ftrace_graph_ent graph_ent;
58}; 48};
59 49
60/* Function return entry */
61struct ftrace_graph_ret_entry {
62 struct trace_entry ent;
63 struct ftrace_graph_ret ret;
64};
65extern struct tracer boot_tracer; 50extern struct tracer boot_tracer;
66 51
67/* 52#undef __field
68 * Context switch trace entry - which task (and prio) we switched from/to: 53#define __field(type, item) type item;
69 */
70struct ctx_switch_entry {
71 struct trace_entry ent;
72 unsigned int prev_pid;
73 unsigned char prev_prio;
74 unsigned char prev_state;
75 unsigned int next_pid;
76 unsigned char next_prio;
77 unsigned char next_state;
78 unsigned int next_cpu;
79};
80
81/*
82 * Special (free-form) trace entry:
83 */
84struct special_entry {
85 struct trace_entry ent;
86 unsigned long arg1;
87 unsigned long arg2;
88 unsigned long arg3;
89};
90
91/*
92 * Stack-trace entry:
93 */
94
95#define FTRACE_STACK_ENTRIES 8
96
97struct stack_entry {
98 struct trace_entry ent;
99 unsigned long caller[FTRACE_STACK_ENTRIES];
100};
101
102struct userstack_entry {
103 struct trace_entry ent;
104 unsigned long caller[FTRACE_STACK_ENTRIES];
105};
106
107/*
108 * trace_printk entry:
109 */
110struct bprint_entry {
111 struct trace_entry ent;
112 unsigned long ip;
113 const char *fmt;
114 u32 buf[];
115};
116 54
117struct print_entry { 55#undef __field_struct
118 struct trace_entry ent; 56#define __field_struct(type, item) __field(type, item)
119 unsigned long ip;
120 char buf[];
121};
122 57
123#define TRACE_OLD_SIZE 88 58#undef __field_desc
59#define __field_desc(type, container, item)
124 60
125struct trace_field_cont { 61#undef __array
126 unsigned char type; 62#define __array(type, item, size) type item[size];
127 /* Temporary till we get rid of this completely */
128 char buf[TRACE_OLD_SIZE - 1];
129};
130 63
131struct trace_mmiotrace_rw { 64#undef __array_desc
132 struct trace_entry ent; 65#define __array_desc(type, container, item, size)
133 struct mmiotrace_rw rw;
134};
135 66
136struct trace_mmiotrace_map { 67#undef __dynamic_array
137 struct trace_entry ent; 68#define __dynamic_array(type, item) type item[];
138 struct mmiotrace_map map;
139};
140 69
141struct trace_boot_call { 70#undef F_STRUCT
142 struct trace_entry ent; 71#define F_STRUCT(args...) args
143 struct boot_trace_call boot_call;
144};
145 72
146struct trace_boot_ret { 73#undef FTRACE_ENTRY
147 struct trace_entry ent; 74#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
148 struct boot_trace_ret boot_ret; 75 struct struct_name { \
149}; 76 struct trace_entry ent; \
150 77 tstruct \
151#define TRACE_FUNC_SIZE 30 78 }
152#define TRACE_FILE_SIZE 20
153struct trace_branch {
154 struct trace_entry ent;
155 unsigned line;
156 char func[TRACE_FUNC_SIZE+1];
157 char file[TRACE_FILE_SIZE+1];
158 char correct;
159};
160
161struct hw_branch_entry {
162 struct trace_entry ent;
163 u64 from;
164 u64 to;
165};
166
167struct trace_power {
168 struct trace_entry ent;
169 struct power_trace state_data;
170};
171 79
172enum kmemtrace_type_id { 80#undef TP_ARGS
173 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 81#define TP_ARGS(args...) args
174 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
175 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
176};
177 82
178struct kmemtrace_alloc_entry { 83#undef FTRACE_ENTRY_DUP
179 struct trace_entry ent; 84#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
180 enum kmemtrace_type_id type_id;
181 unsigned long call_site;
182 const void *ptr;
183 size_t bytes_req;
184 size_t bytes_alloc;
185 gfp_t gfp_flags;
186 int node;
187};
188 85
189struct kmemtrace_free_entry { 86#include "trace_entries.h"
190 struct trace_entry ent;
191 enum kmemtrace_type_id type_id;
192 unsigned long call_site;
193 const void *ptr;
194};
195 87
88/*
 89 * syscalls are special and need special handling; this is why
 90 * they are not included in trace_entries.h
91 */
196struct syscall_trace_enter { 92struct syscall_trace_enter {
197 struct trace_entry ent; 93 struct trace_entry ent;
198 int nr; 94 int nr;
@@ -205,13 +101,12 @@ struct syscall_trace_exit {
205 unsigned long ret; 101 unsigned long ret;
206}; 102};
207 103
208
209/* 104/*
210 * trace_flag_type is an enumeration that holds different 105 * trace_flag_type is an enumeration that holds different
211 * states when a trace occurs. These are: 106 * states when a trace occurs. These are:
212 * IRQS_OFF - interrupts were disabled 107 * IRQS_OFF - interrupts were disabled
213 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 108 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
214 * NEED_RESCED - reschedule is requested 109 * NEED_RESCHED - reschedule is requested
215 * HARDIRQ - inside an interrupt handler 110 * HARDIRQ - inside an interrupt handler
216 * SOFTIRQ - inside a softirq handler 111 * SOFTIRQ - inside a softirq handler
217 */ 112 */
@@ -310,7 +205,6 @@ extern void __ftrace_bad_type(void);
310 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 205 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
311 TRACE_GRAPH_RET); \ 206 TRACE_GRAPH_RET); \
312 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 207 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
313 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
314 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 208 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
315 TRACE_KMEM_ALLOC); \ 209 TRACE_KMEM_ALLOC); \
316 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -390,7 +284,6 @@ struct tracer {
390 struct tracer *next; 284 struct tracer *next;
391 int print_max; 285 int print_max;
392 struct tracer_flags *flags; 286 struct tracer_flags *flags;
393 struct tracer_stat *stats;
394}; 287};
395 288
396 289
@@ -469,6 +362,7 @@ void tracing_stop_sched_switch_record(void);
469void tracing_start_sched_switch_record(void); 362void tracing_start_sched_switch_record(void);
470int register_tracer(struct tracer *type); 363int register_tracer(struct tracer *type);
471void unregister_tracer(struct tracer *type); 364void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void);
472 366
473extern unsigned long nsecs_to_usecs(unsigned long nsecs); 367extern unsigned long nsecs_to_usecs(unsigned long nsecs);
474 368
@@ -509,20 +403,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
509 403
510extern cycle_t ftrace_now(int cpu); 404extern cycle_t ftrace_now(int cpu);
511 405
512#ifdef CONFIG_CONTEXT_SWITCH_TRACER
513typedef void
514(*tracer_switch_func_t)(void *private,
515 void *__rq,
516 struct task_struct *prev,
517 struct task_struct *next);
518
519struct tracer_switch_ops {
520 tracer_switch_func_t func;
521 void *private;
522 struct tracer_switch_ops *next;
523};
524#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
525
526extern void trace_find_cmdline(int pid, char comm[]); 406extern void trace_find_cmdline(int pid, char comm[]);
527 407
528#ifdef CONFIG_DYNAMIC_FTRACE 408#ifdef CONFIG_DYNAMIC_FTRACE
@@ -638,6 +518,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
638#endif 518#endif
639 519
640/* 520/*
 521 * struct trace_parser - helper for reading the user input separated by spaces
522 * @cont: set if the input is not complete - no final space char was found
523 * @buffer: holds the parsed user input
 524 * @idx: user input length
525 * @size: buffer size
526 */
527struct trace_parser {
528 bool cont;
529 char *buffer;
530 unsigned idx;
531 unsigned size;
532};
533
534static inline bool trace_parser_loaded(struct trace_parser *parser)
535{
536 return (parser->idx != 0);
537}
538
539static inline bool trace_parser_cont(struct trace_parser *parser)
540{
541 return parser->cont;
542}
543
544static inline void trace_parser_clear(struct trace_parser *parser)
545{
546 parser->cont = false;
547 parser->idx = 0;
548}
549
550extern int trace_parser_get_init(struct trace_parser *parser, int size);
551extern void trace_parser_put(struct trace_parser *parser);
552extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
553 size_t cnt, loff_t *ppos);
554
555/*
641 * trace_iterator_flags is an enumeration that defines bit 556 * trace_iterator_flags is an enumeration that defines bit
642 * positions into trace_flags that controls the output. 557 * positions into trace_flags that controls the output.
643 * 558 *
@@ -823,58 +738,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
823 return 0; 738 return 0;
824} 739}
825 740
826#define DEFINE_COMPARISON_PRED(type) \
827static int filter_pred_##type(struct filter_pred *pred, void *event, \
828 int val1, int val2) \
829{ \
830 type *addr = (type *)(event + pred->offset); \
831 type val = (type)pred->val; \
832 int match = 0; \
833 \
834 switch (pred->op) { \
835 case OP_LT: \
836 match = (*addr < val); \
837 break; \
838 case OP_LE: \
839 match = (*addr <= val); \
840 break; \
841 case OP_GT: \
842 match = (*addr > val); \
843 break; \
844 case OP_GE: \
845 match = (*addr >= val); \
846 break; \
847 default: \
848 break; \
849 } \
850 \
851 return match; \
852}
853
854#define DEFINE_EQUALITY_PRED(size) \
855static int filter_pred_##size(struct filter_pred *pred, void *event, \
856 int val1, int val2) \
857{ \
858 u##size *addr = (u##size *)(event + pred->offset); \
859 u##size val = (u##size)pred->val; \
860 int match; \
861 \
862 match = (val == *addr) ^ pred->not; \
863 \
864 return match; \
865}
866
867extern struct mutex event_mutex; 741extern struct mutex event_mutex;
868extern struct list_head ftrace_events; 742extern struct list_head ftrace_events;
869 743
870extern const char *__start___trace_bprintk_fmt[]; 744extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[]; 745extern const char *__stop___trace_bprintk_fmt[];
872 746
873#undef TRACE_EVENT_FORMAT 747#undef FTRACE_ENTRY
874#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 748#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
875 extern struct ftrace_event_call event_##call; 749 extern struct ftrace_event_call event_##call;
876#undef TRACE_EVENT_FORMAT_NOFILTER 750#undef FTRACE_ENTRY_DUP
877#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 751#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
878#include "trace_event_types.h" 752 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
753#include "trace_entries.h"
879 754
880#endif /* _LINUX_KERNEL_TRACE_H */ 755#endif /* _LINUX_KERNEL_TRACE_H */
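The trace_parser helpers added to trace.h above give debugfs write handlers a common way to pull one whitespace-separated token out of a user buffer. A minimal sketch of the intended call sequence, modeled on the ftrace_event_write() rewrite further down in this diff (example_write() and set_word() are illustrative names, not kernel functions):

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read, ret;

	/* buffer sized for one token plus the terminating NUL */
	if (trace_parser_get_init(&parser, 128))
		return -ENOMEM;

	/* copies user data until a space is hit or the buffer is full */
	read = trace_get_user(&parser, ubuf, cnt, ppos);

	ret = read;
	if (read >= 0 && trace_parser_loaded(&parser)) {
		parser.buffer[parser.idx] = 0;	/* NUL-terminate the token */
		ret = set_word(parser.buffer);	/* hypothetical consumer */
		if (!ret)
			ret = read;
	}

	trace_parser_put(&parser);
	return ret;
}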
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 19bfc75d467e..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =
129 129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
132 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
133 struct ring_buffer *buffer; 134 struct ring_buffer *buffer;
134 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
@@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(buffer, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
161 struct ring_buffer *buffer; 164 struct ring_buffer *buffer;
162 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
@@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
175 goto out; 178 goto out;
176 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
177 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
178 trace_buffer_unlock_commit(buffer, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
179 out: 183 out:
180 preempt_enable(); 184 preempt_enable();
181} 185}
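The trace_boot change above is one instance of a pattern this series applies to several tracers (branch, hw-branch and mmiotrace below as well): the tracer looks up its ftrace_event_call so the per-event filter can discard a reserved record before it is committed. Boiled down, the commit path becomes (shown with the boot-call event; argument values illustrative, other tracers substitute their own call and entry types):

	struct ftrace_event_call *call = &event_boot_call;
	struct ring_buffer_event *event;
	struct trace_boot_call *entry;

	event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
					  sizeof(*entry), 0, 0);
	if (!event)
		goto out;
	entry = ring_buffer_event_data(event);
	entry->boot_call = *bt;

	/* commit only if the event filter does not reject the record */
	if (!filter_check_discard(call, entry, buffer, event))
		trace_buffer_unlock_commit(buffer, event, 0, 0);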
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a9..4a194f08f88c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
34 struct trace_array *tr = branch_tracer; 34 struct trace_array *tr = branch_tracer;
35 struct ring_buffer_event *event; 35 struct ring_buffer_event *event;
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer;
37 unsigned long flags; 38 unsigned long flags;
38 int cpu, pc; 39 int cpu, pc;
39 const char *p; 40 const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
54 goto out; 55 goto out;
55 56
56 pc = preempt_count(); 57 pc = preempt_count();
57 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, 58 buffer = tr->buffer;
59 event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
58 sizeof(*entry), flags, pc); 60 sizeof(*entry), flags, pc);
59 if (!event) 61 if (!event)
60 goto out; 62 goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
74 entry->line = f->line; 76 entry->line = f->line;
75 entry->correct = val == expect; 77 entry->correct = val == expect;
76 78
77 if (!filter_check_discard(call, entry, tr->buffer, event)) 79 if (!filter_check_discard(call, entry, buffer, event))
78 ring_buffer_unlock_commit(tr->buffer, event); 80 ring_buffer_unlock_commit(buffer, event);
79 81
80 out: 82 out:
81 atomic_dec(&tr->data[cpu]->disabled); 83 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
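The trace_clock_global() rework above bundles the last-returned timestamp and its lock into one cacheline and clamps the clock so it never steps backwards across CPUs. The core of that technique, reduced to a sketch (not the exact kernel code; NMI handling and irq disabling are omitted):

static struct {
	u64		prev_time;
	raw_spinlock_t	lock;
} clk ____cacheline_aligned_in_smp = {
	.lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
};

static u64 global_monotonic(u64 now)
{
	__raw_spin_lock(&clk.lock);

	/* If the local clock lags the last global value, nudge it forward. */
	if ((s64)(now - clk.prev_time) < 0)
		now = clk.prev_time + 1;

	clk.prev_time = now;
	__raw_spin_unlock(&clk.lock);

	return now;
}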
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..ead3d724599d
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,366 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes to them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file; it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
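trace_entries.h is included several times by trace_export.c (further down in this diff) with different definitions of FTRACE_ENTRY and the __field*() helpers, so each entry description above expands into a structure sanity check, a format-file printer, a field-definition function and the event registration. As a rough hand expansion of the first pass over the function entry (simplified; see the FTRACE_ENTRY definition in trace_export.c for the real thing):

struct ____ftrace_function {
	unsigned long ip;
	unsigned long parent_ip;
};

static void __used ____ftrace_check_function(void)
{
	struct ____ftrace_function *__entry = NULL;

	/* forces a compile-time check that F_printk() matches the fields */
	printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip);
}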
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..8d5c171cc998 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,8 +5,65 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
11/*
12 * alloc_percpu() takes a type rather than a size, so create a
13 * dummy type that matches the desired buffer size.
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable();
49 if (!ret) {
50 total_profile_count++;
51 return 0;
52 }
53
54fail_buf_nmi:
55 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi);
57 free_percpu(trace_profile_buf);
58 trace_profile_buf_nmi = NULL;
59 trace_profile_buf = NULL;
60 }
61fail_buf:
62 atomic_dec(&event->profile_count);
63
64 return ret;
65}
66
10int ftrace_profile_enable(int event_id) 67int ftrace_profile_enable(int event_id)
11{ 68{
12 struct ftrace_event_call *event; 69 struct ftrace_event_call *event;
@@ -14,8 +71,9 @@ int ftrace_profile_enable(int event_id)
14 71
15 mutex_lock(&event_mutex); 72 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 73 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 74 if (event->id == event_id && event->profile_enable &&
18 ret = event->profile_enable(event); 75 try_module_get(event->mod)) {
76 ret = ftrace_profile_enable_event(event);
19 break; 77 break;
20 } 78 }
21 } 79 }
@@ -24,6 +82,33 @@ int ftrace_profile_enable(int event_id)
24 return ret; 82 return ret;
25} 83}
26 84
85static void ftrace_profile_disable_event(struct ftrace_event_call *event)
86{
87 char *buf, *nmi_buf;
88
89 if (!atomic_add_negative(-1, &event->profile_count))
90 return;
91
92 event->profile_disable();
93
94 if (!--total_profile_count) {
95 buf = trace_profile_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL);
97
98 nmi_buf = trace_profile_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
100
101 /*
102 * Ensure every events in profiling have finished before
103 * releasing the buffers
104 */
105 synchronize_sched();
106
107 free_percpu(buf);
108 free_percpu(nmi_buf);
109 }
110}
111
27void ftrace_profile_disable(int event_id) 112void ftrace_profile_disable(int event_id)
28{ 113{
29 struct ftrace_event_call *event; 114 struct ftrace_event_call *event;
@@ -31,7 +116,8 @@ void ftrace_profile_disable(int event_id)
31 mutex_lock(&event_mutex); 116 mutex_lock(&event_mutex);
32 list_for_each_entry(event, &ftrace_events, list) { 117 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 118 if (event->id == event_id) {
34 event->profile_disable(event); 119 ftrace_profile_disable_event(event);
120 module_put(event->mod);
35 break; 121 break;
36 } 122 }
37 } 123 }
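ftrace_profile_disable_event() above retires the shared per-cpu profiling buffers with the standard RCU publish/retire sequence: unpublish the pointer, wait for in-flight users to drain, then free. The bare pattern, on the assumption (as in the profiling path) that all readers run with preemption disabled:

	char *buf = trace_profile_buf;

	/* unpublish: new readers see NULL and bail out */
	rcu_assign_pointer(trace_profile_buf, NULL);

	/*
	 * Wait until every CPU has passed through a quiescent state,
	 * i.e. every pre-existing preempt-disabled reader is done.
	 */
	synchronize_sched();

	/* no reader can still hold the old pointer; safe to free */
	free_percpu(buf);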
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e12487..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO_CHAR(buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO_CHAR(buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
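The deleted trace_event_types.h and the new trace_entries.h describe the same set of events; only the macro vocabulary changes. The function entry illustrates the mapping from the old TRACE_EVENT_FORMAT form to the new FTRACE_ENTRY form:

/* old, trace_event_types.h */
TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
	TRACE_STRUCT(
		TRACE_FIELD(unsigned long, ip, ip)
		TRACE_FIELD(unsigned long, parent_ip, parent_ip)
	),
	TP_RAW_FMT(" %lx <-- %lx")
);

/* new, trace_entries.h */
FTRACE_ENTRY(function, ftrace_entry,

	TRACE_FN,

	F_STRUCT(
		__field(	unsigned long,	ip		)
		__field(	unsigned long,	parent_ip	)
	),

	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
);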
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 97e2c4d2e9eb..d128f65778e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,6 +21,7 @@
21 21
22#include "trace_output.h" 22#include "trace_output.h"
23 23
24#undef TRACE_SYSTEM
24#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
25 26
26DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
@@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
86 __common_field(unsigned char, flags); 87 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count); 88 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid); 89 __common_field(int, pid);
89 __common_field(int, tgid); 90 __common_field(int, lock_depth);
90 91
91 return ret; 92 return ret;
92} 93}
@@ -230,73 +231,38 @@ static ssize_t
230ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
231 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
232{ 233{
233 size_t read = 0; 234 struct trace_parser parser;
234 int i, set = 1; 235 ssize_t read, ret;
235 ssize_t ret;
236 char *buf;
237 char ch;
238 236
239 if (!cnt || cnt < 0) 237 if (!cnt)
240 return 0; 238 return 0;
241 239
242 ret = tracing_update_buffers(); 240 ret = tracing_update_buffers();
243 if (ret < 0) 241 if (ret < 0)
244 return ret; 242 return ret;
245 243
246 ret = get_user(ch, ubuf++); 244 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
247 if (ret)
248 return ret;
249 read++;
250 cnt--;
251
252 /* skip white space */
253 while (cnt && isspace(ch)) {
254 ret = get_user(ch, ubuf++);
255 if (ret)
256 return ret;
257 read++;
258 cnt--;
259 }
260
261 /* Only white space found? */
262 if (isspace(ch)) {
263 file->f_pos += read;
264 ret = read;
265 return ret;
266 }
267
268 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
269 if (!buf)
270 return -ENOMEM; 245 return -ENOMEM;
271 246
272 if (cnt > EVENT_BUF_SIZE) 247 read = trace_get_user(&parser, ubuf, cnt, ppos);
273 cnt = EVENT_BUF_SIZE;
274 248
275 i = 0; 249 if (read >= 0 && trace_parser_loaded((&parser))) {
276 while (cnt && !isspace(ch)) { 250 int set = 1;
277 if (!i && ch == '!') 251
252 if (*parser.buffer == '!')
278 set = 0; 253 set = 0;
279 else
280 buf[i++] = ch;
281 254
282 ret = get_user(ch, ubuf++); 255 parser.buffer[parser.idx] = 0;
256
257 ret = ftrace_set_clr_event(parser.buffer + !set, set);
283 if (ret) 258 if (ret)
284 goto out_free; 259 goto out_put;
285 read++;
286 cnt--;
287 } 260 }
288 buf[i] = 0;
289
290 file->f_pos += read;
291
292 ret = ftrace_set_clr_event(buf, set);
293 if (ret)
294 goto out_free;
295 261
296 ret = read; 262 ret = read;
297 263
298 out_free: 264 out_put:
299 kfree(buf); 265 trace_parser_put(&parser);
300 266
301 return ret; 267 return ret;
302} 268}
@@ -304,42 +270,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
304static void * 270static void *
305t_next(struct seq_file *m, void *v, loff_t *pos) 271t_next(struct seq_file *m, void *v, loff_t *pos)
306{ 272{
307 struct list_head *list = m->private; 273 struct ftrace_event_call *call = v;
308 struct ftrace_event_call *call;
309 274
310 (*pos)++; 275 (*pos)++;
311 276
312 for (;;) { 277 list_for_each_entry_continue(call, &ftrace_events, list) {
313 if (list == &ftrace_events)
314 return NULL;
315
316 call = list_entry(list, struct ftrace_event_call, list);
317
318 /* 278 /*
319 * The ftrace subsystem is for showing formats only. 279 * The ftrace subsystem is for showing formats only.
320 * They can not be enabled or disabled via the event files. 280 * They can not be enabled or disabled via the event files.
321 */ 281 */
322 if (call->regfunc) 282 if (call->regfunc)
323 break; 283 return call;
324
325 list = list->next;
326 } 284 }
327 285
328 m->private = list->next; 286 return NULL;
329
330 return call;
331} 287}
332 288
333static void *t_start(struct seq_file *m, loff_t *pos) 289static void *t_start(struct seq_file *m, loff_t *pos)
334{ 290{
335 struct ftrace_event_call *call = NULL; 291 struct ftrace_event_call *call;
336 loff_t l; 292 loff_t l;
337 293
338 mutex_lock(&event_mutex); 294 mutex_lock(&event_mutex);
339 295
340 m->private = ftrace_events.next; 296 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
341 for (l = 0; l <= *pos; ) { 297 for (l = 0; l <= *pos; ) {
342 call = t_next(m, NULL, &l); 298 call = t_next(m, call, &l);
343 if (!call) 299 if (!call)
344 break; 300 break;
345 } 301 }
@@ -349,37 +305,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
349static void * 305static void *
350s_next(struct seq_file *m, void *v, loff_t *pos) 306s_next(struct seq_file *m, void *v, loff_t *pos)
351{ 307{
352 struct list_head *list = m->private; 308 struct ftrace_event_call *call = v;
353 struct ftrace_event_call *call;
354 309
355 (*pos)++; 310 (*pos)++;
356 311
357 retry: 312 list_for_each_entry_continue(call, &ftrace_events, list) {
358 if (list == &ftrace_events) 313 if (call->enabled)
359 return NULL; 314 return call;
360
361 call = list_entry(list, struct ftrace_event_call, list);
362
363 if (!call->enabled) {
364 list = list->next;
365 goto retry;
366 } 315 }
367 316
368 m->private = list->next; 317 return NULL;
369
370 return call;
371} 318}
372 319
373static void *s_start(struct seq_file *m, loff_t *pos) 320static void *s_start(struct seq_file *m, loff_t *pos)
374{ 321{
375 struct ftrace_event_call *call = NULL; 322 struct ftrace_event_call *call;
376 loff_t l; 323 loff_t l;
377 324
378 mutex_lock(&event_mutex); 325 mutex_lock(&event_mutex);
379 326
380 m->private = ftrace_events.next; 327 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
381 for (l = 0; l <= *pos; ) { 328 for (l = 0; l <= *pos; ) {
382 call = s_next(m, NULL, &l); 329 call = s_next(m, call, &l);
383 if (!call) 330 if (!call)
384 break; 331 break;
385 } 332 }
@@ -578,7 +525,7 @@ static int trace_write_header(struct trace_seq *s)
578 FIELD(unsigned char, flags), 525 FIELD(unsigned char, flags),
579 FIELD(unsigned char, preempt_count), 526 FIELD(unsigned char, preempt_count),
580 FIELD(int, pid), 527 FIELD(int, pid),
581 FIELD(int, tgid)); 528 FIELD(int, lock_depth));
582} 529}
583 530
584static ssize_t 531static ssize_t
@@ -1187,7 +1134,7 @@ static int trace_module_notify(struct notifier_block *self,
1187} 1134}
1188#endif /* CONFIG_MODULES */ 1135#endif /* CONFIG_MODULES */
1189 1136
1190struct notifier_block trace_module_nb = { 1137static struct notifier_block trace_module_nb = {
1191 .notifier_call = trace_module_notify, 1138 .notifier_call = trace_module_notify,
1192 .priority = 0, 1139 .priority = 0,
1193}; 1140};
@@ -1359,6 +1306,18 @@ static __init void event_trace_self_tests(void)
1359 if (!call->regfunc) 1306 if (!call->regfunc)
1360 continue; 1307 continue;
1361 1308
1309/*
1310 * Testing syscall events here is pretty useless, but
1311 * we still do it if configured. But this is time consuming.
1312 * What we really need is a user thread to perform the
1313 * syscalls as we test.
1314 */
1315#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1316 if (call->system &&
1317 strcmp(call->system, "syscalls") == 0)
1318 continue;
1319#endif
1320
1362 pr_info("Testing event %s: ", call->name); 1321 pr_info("Testing event %s: ", call->name);
1363 1322
1364 /* 1323 /*
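The t_start()/t_next() and s_start()/s_next() rewrites above share one seq_file idiom: build a bogus "entry" positioned at the list head, so that list_for_each_entry_continue() in the ->next callback naturally resumes from the first real element. Condensed, the pair looks like this (a sketch of the code above, with error handling trimmed):

static void *t_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ftrace_event_call *call = v;

	(*pos)++;
	list_for_each_entry_continue(call, &ftrace_events, list) {
		if (call->regfunc)	/* skip format-only entries */
			return call;
	}
	return NULL;
}

static void *t_start(struct seq_file *m, loff_t *pos)
{
	struct ftrace_event_call *call;
	loff_t l;

	mutex_lock(&event_mutex);
	/* fake entry at the list head; _continue() steps past it */
	call = list_entry(&ftrace_events, struct ftrace_event_call, list);
	for (l = 0; l <= *pos; )
		if (!(call = t_next(m, call, &l)))
			break;
	return call;
}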
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fbbf629..98a6cc5c64ed 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
121 } operand; 121 } operand;
122}; 122};
123 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
124DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
@@ -892,8 +933,9 @@ static void postfix_clear(struct filter_parse_state *ps)
892 933
893 while (!list_empty(&ps->postfix)) { 934 while (!list_empty(&ps->postfix)) {
894 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 935 elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
895 kfree(elt->operand);
896 list_del(&elt->list); 936 list_del(&elt->list);
937 kfree(elt->operand);
938 kfree(elt);
897 } 939 }
898} 940}
899 941
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index df1bf6e48bb9..9753fcc61bc5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,146 +15,125 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
35 83
36#undef TRACE_FIELD_SPECIAL 84#undef __array
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 85#define __array(type, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 87 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
41 (unsigned int)sizeof(field.item)); \ 89 sizeof(field.item)); \
42 if (!ret) \ 90 if (!ret) \
43 return 0; 91 return 0;
44 92
45#undef TRACE_FIELD_ZERO_CHAR 93#undef __array_desc
46#define TRACE_FIELD_ZERO_CHAR(item) \ 94#define __array_desc(type, container, item, len) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
48 "offset:%u;\tsize:0;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 97 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \
50 if (!ret) \ 99 if (!ret) \
51 return 0; 100 return 0;
52 101
53#undef TRACE_FIELD_SIGN 102#undef __dynamic_array
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 103#define __dynamic_array(type, item) \
55 TRACE_FIELD(type, item, assign) 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \
106 offsetof(typeof(field), item)); \
107 if (!ret) \
108 return 0;
56 109
57#undef TP_RAW_FMT 110#undef F_printk
58#define TP_RAW_FMT(args...) args 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
59 112
60#undef TRACE_EVENT_FORMAT 113#undef __entry
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 114#define __entry REC
62static int \
63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
65{ \
66 struct args field; \
67 int ret; \
68 \
69 tstruct; \
70 \
71 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
72 \
73 return ret; \
74}
75 115
76#undef TRACE_EVENT_FORMAT_NOFILTER 116#undef FTRACE_ENTRY
77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
78 tpfmt) \
79static int \ 118static int \
80ftrace_format_##call(struct ftrace_event_call *unused, \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \ 120 struct trace_seq *s) \
82{ \ 121{ \
83 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
84 int ret; \ 123 int ret = 0; \
85 \ 124 \
86 tstruct; \ 125 tstruct; \
87 \ 126 \
88 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
89 \ 128 \
90 return ret; \ 129 return ret; \
91} 130}
92 131
93#include "trace_event_types.h" 132#include "trace_entries.h"
94
95#undef TRACE_ZERO_CHAR
96#define TRACE_ZERO_CHAR(arg)
97
98#undef TRACE_FIELD
99#define TRACE_FIELD(type, item, assign)\
100 entry->item = assign;
101
102#undef TRACE_FIELD
103#define TRACE_FIELD(type, item, assign)\
104 entry->item = assign;
105
106#undef TRACE_FIELD_SIGN
107#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
108 TRACE_FIELD(type, item, assign)
109
110#undef TP_CMD
111#define TP_CMD(cmd...) cmd
112
113#undef TRACE_ENTRY
114#define TRACE_ENTRY entry
115
116#undef TRACE_FIELD_SPECIAL
117#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
118 cmd;
119
120#undef TRACE_EVENT_FORMAT
121#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
122int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
123static int ftrace_raw_init_event_##call(void); \
124 \
125struct ftrace_event_call __used \
126__attribute__((__aligned__(4))) \
127__attribute__((section("_ftrace_events"))) event_##call = { \
128 .name = #call, \
129 .id = proto, \
130 .system = __stringify(TRACE_SYSTEM), \
131 .raw_init = ftrace_raw_init_event_##call, \
132 .show_format = ftrace_format_##call, \
133 .define_fields = ftrace_define_fields_##call, \
134}; \
135static int ftrace_raw_init_event_##call(void) \
136{ \
137 INIT_LIST_HEAD(&event_##call.fields); \
138 return 0; \
139} \
140
141#undef TRACE_EVENT_FORMAT_NOFILTER
142#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
143 tpfmt) \
144 \
145struct ftrace_event_call __used \
146__attribute__((__aligned__(4))) \
147__attribute__((section("_ftrace_events"))) event_##call = { \
148 .name = #call, \
149 .id = proto, \
150 .system = __stringify(TRACE_SYSTEM), \
151 .show_format = ftrace_format_##call, \
152};
153 133
154#include "trace_event_types.h"
155 134
156#undef TRACE_FIELD 135#undef __field
157#define TRACE_FIELD(type, item, assign) \ 136#define __field(type, item) \
158 ret = trace_define_field(event_call, #type, #item, \ 137 ret = trace_define_field(event_call, #type, #item, \
159 offsetof(typeof(field), item), \ 138 offsetof(typeof(field), item), \
160 sizeof(field.item), \ 139 sizeof(field.item), \
@@ -162,32 +141,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
162 if (ret) \ 141 if (ret) \
163 return ret; 142 return ret;
164 143
165#undef TRACE_FIELD_SPECIAL 144#undef __field_desc
166#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 145#define __field_desc(type, container, item) \
146 ret = trace_define_field(event_call, #type, #item, \
147 offsetof(typeof(field), \
148 container.item), \
149 sizeof(field.container.item), \
150 is_signed_type(type), FILTER_OTHER); \
151 if (ret) \
152 return ret;
153
154#undef __array
155#define __array(type, item, len) \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
167 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
168 offsetof(typeof(field), item), \ 158 offsetof(typeof(field), item), \
169 sizeof(field.item), 0, FILTER_OTHER); \ 159 sizeof(field.item), 0, FILTER_OTHER); \
170 if (ret) \ 160 if (ret) \
171 return ret; 161 return ret;
172 162
173#undef TRACE_FIELD_SIGN 163#undef __array_desc
174#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 164#define __array_desc(type, container, item, len) \
175 ret = trace_define_field(event_call, #type, #item, \ 165 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
176 offsetof(typeof(field), item), \ 166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
177 sizeof(field.item), is_signed, \ 167 offsetof(typeof(field), \
168 container.item), \
169 sizeof(field.container.item), 0, \
178 FILTER_OTHER); \ 170 FILTER_OTHER); \
179 if (ret) \ 171 if (ret) \
180 return ret; 172 return ret;
181 173
182#undef TRACE_FIELD_ZERO_CHAR 174#undef __dynamic_array
183#define TRACE_FIELD_ZERO_CHAR(item) 175#define __dynamic_array(type, item)
184 176
185#undef TRACE_EVENT_FORMAT 177#undef FTRACE_ENTRY
186#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
187int \ 179int \
188ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ 180ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
189{ \ 181{ \
190 struct args field; \ 182 struct struct_name field; \
191 int ret; \ 183 int ret; \
192 \ 184 \
193 ret = trace_define_common_fields(event_call); \ 185 ret = trace_define_common_fields(event_call); \
@@ -199,8 +191,42 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
199 return ret; \ 191 return ret; \
200} 192}
201 193
202#undef TRACE_EVENT_FORMAT_NOFILTER 194#include "trace_entries.h"
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 195
204 tpfmt) 196
197#undef __field
198#define __field(type, item)
199
200#undef __field_desc
201#define __field_desc(type, container, item)
202
203#undef __array
204#define __array(type, item, len)
205
206#undef __array_desc
207#define __array_desc(type, container, item, len)
208
209#undef __dynamic_array
210#define __dynamic_array(type, item)
211
212#undef FTRACE_ENTRY
213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
214static int ftrace_raw_init_event_##call(void); \
215 \
216struct ftrace_event_call __used \
217__attribute__((__aligned__(4))) \
218__attribute__((section("_ftrace_events"))) event_##call = { \
219 .name = #call, \
220 .id = type, \
221 .system = __stringify(TRACE_SYSTEM), \
222 .raw_init = ftrace_raw_init_event_##call, \
223 .show_format = ftrace_format_##call, \
224 .define_fields = ftrace_define_fields_##call, \
225}; \
226static int ftrace_raw_init_event_##call(void) \
227{ \
228 INIT_LIST_HEAD(&event_##call.fields); \
229 return 0; \
230} \
205 231
206#include "trace_event_types.h" 232#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b01b94518fc..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
290{ 290{
291 long count = (long)data; 291 long count = (long)data;
292 292
293 seq_printf(m, "%pf:", (void *)ip); 293 seq_printf(m, "%ps:", (void *)ip);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b3749a2c3132..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
364} 364}
365 365
366 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
367/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
368static enum print_line_t 377static enum print_line_t
369verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
521 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
522 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
523 } 532 }
533
524 /* Proc */ 534 /* Proc */
525 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
526 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
659 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
660 } 670 }
661 671
662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
663 if (!ret) 673 if (!ret)
664 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
665 675
@@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
702 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
703 } 713 }
704 714
705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
706 if (!ret) 716 if (!ret)
707 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
708 718
@@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
758 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
759 } 769 }
760 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
761 return 0; 778 return 0;
762} 779}
763 780
@@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
952 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
953} 970}
954 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
955static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
956{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
957 /* 1st line */ 1001 /* 1st line */
958 seq_printf(s, "# "); 1002 seq_printf(s, "#");
959 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
960 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
961 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
962 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
963 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
964 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
965 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
966 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
967 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
968 1014
969 /* 2nd line */ 1015 /* 2nd line */
970 seq_printf(s, "# "); 1016 seq_printf(s, "#");
971 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
972 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
973 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
974 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
975 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
976 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
977 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
978 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
979 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ca7d7c4d0c2a..69543a905cd5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
155 seq_print_ip_sym(seq, it->from, symflags) && 155 seq_print_ip_sym(seq, it->from, symflags) &&
156 trace_seq_printf(seq, "\n")) 156 trace_seq_printf(seq, "\n"))
157 return TRACE_TYPE_HANDLED; 157 return TRACE_TYPE_HANDLED;
158 return TRACE_TYPE_PARTIAL_LINE;; 158 return TRACE_TYPE_PARTIAL_LINE;
159 } 159 }
160 return TRACE_TYPE_UNHANDLED; 160 return TRACE_TYPE_UNHANDLED;
161} 161}
@@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to)
165 struct ftrace_event_call *call = &event_hw_branch; 165 struct ftrace_event_call *call = &event_hw_branch;
166 struct trace_array *tr = hw_branch_trace; 166 struct trace_array *tr = hw_branch_trace;
167 struct ring_buffer_event *event; 167 struct ring_buffer_event *event;
168 struct ring_buffer *buf;
168 struct hw_branch_entry *entry; 169 struct hw_branch_entry *entry;
169 unsigned long irq1; 170 unsigned long irq1;
170 int cpu; 171 int cpu;
@@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to)
180 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 181 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
181 goto out; 182 goto out;
182 183
183 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, 184 buf = tr->buffer;
185 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
184 sizeof(*entry), 0, 0); 186 sizeof(*entry), 0, 0);
185 if (!event) 187 if (!event)
186 goto out; 188 goto out;
@@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to)
189 entry->ent.type = TRACE_HW_BRANCHES; 191 entry->ent.type = TRACE_HW_BRANCHES;
190 entry->from = from; 192 entry->from = from;
191 entry->to = to; 193 entry->to = to;
192 if (!filter_check_discard(call, entry, tr->buffer, event)) 194 if (!filter_check_discard(call, entry, buf, event))
193 trace_buffer_unlock_commit(tr, event, 0, 0); 195 trace_buffer_unlock_commit(buf, event, 0, 0);
194 196
195 out: 197 out:
196 atomic_dec(&tr->data[cpu]->disabled); 198 atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d12..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d3..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
310 struct ring_buffer *buffer = tr->buffer; 311 struct ring_buffer *buffer = tr->buffer;
311 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
312 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
320 } 321 }
321 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
322 entry->rw = *rw; 323 entry->rw = *rw;
323 trace_buffer_unlock_commit(buffer, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
324} 327}
325 328
326void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
334 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
335 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
336{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
337 struct ring_buffer *buffer = tr->buffer; 341 struct ring_buffer *buffer = tr->buffer;
338 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
339 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
347 } 351 }
348 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
349 entry->map = *map; 353 entry->map = *map;
350 trace_buffer_unlock_commit(buffer, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
351} 357}
352 358
353void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..ed17565826b0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 407 * since individual threads might have already quit!
408 */ 408 */
409 rcu_read_lock(); 409 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
411 if (task) 411 if (task)
412 mm = get_task_mm(task); 412 mm = get_task_mm(task);
413 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 460 return ret;
461} 461}
462 462
463static int 463/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 472{
466 int hardirq, softirq; 473 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 474 int ret;
468 475
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 478
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 482 'X' : '.',
@@ -482,8 +487,31 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
482 return 0; 487 return 0;
483 488
484 if (entry->preempt_count) 489 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 490 ret = trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 491 else
492 ret = trace_seq_putc(s, '.');
493
494 if (!ret)
495 return 0;
496
497 if (entry->lock_depth < 0)
498 return trace_seq_putc(s, '.');
499
500 return trace_seq_printf(s, "%d", entry->lock_depth);
501}
502
503static int
504lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
505{
506 char comm[TASK_COMM_LEN];
507
508 trace_find_cmdline(entry->pid, comm);
509
510 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
511 comm, entry->pid, cpu))
512 return 0;
513
514 return trace_print_lat_fmt(s, entry);
487} 515}
488 516
489static unsigned long preempt_mark_thresh = 100; 517static unsigned long preempt_mark_thresh = 100;
@@ -857,7 +885,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
857 trace_assign_type(field, iter->ent); 885 trace_assign_type(field, iter->ent);
858 886
859 if (!S) 887 if (!S)
860 task_state_char(field->prev_state); 888 S = task_state_char(field->prev_state);
861 T = task_state_char(field->next_state); 889 T = task_state_char(field->next_state);
862 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", 890 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
863 field->prev_pid, 891 field->prev_pid,
@@ -892,7 +920,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
892 trace_assign_type(field, iter->ent); 920 trace_assign_type(field, iter->ent);
893 921
894 if (!S) 922 if (!S)
895 task_state_char(field->prev_state); 923 S = task_state_char(field->prev_state);
896 T = task_state_char(field->next_state); 924 T = task_state_char(field->next_state);
897 925
898 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); 926 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
42 struct trace_power *entry;
43 struct trace_array_cpu *data;
44 struct trace_array *tr = power_trace;
45
46 if (!trace_power_enabled)
47 return;
48
49 buffer = tr->buffer;
50
51 preempt_disable();
52 it->end = ktime_get();
53 data = tr->data[smp_processor_id()];
54
55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
56 sizeof(*entry), 0, 0);
57 if (!event)
58 goto out;
59 entry = ring_buffer_event_data(event);
60 entry->state_data = *it;
61 if (!filter_check_discard(call, entry, buffer, event))
62 trace_buffer_unlock_commit(buffer, event, 0, 0);
63 out:
64 preempt_enable();
65}
66
67static void probe_power_mark(struct power_trace *it, unsigned int type,
68 unsigned int level)
69{
70 struct ftrace_event_call *call = &event_power;
71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
73 struct trace_power *entry;
74 struct trace_array_cpu *data;
75 struct trace_array *tr = power_trace;
76
77 if (!trace_power_enabled)
78 return;
79
80 buffer = tr->buffer;
81
82 memset(it, 0, sizeof(struct power_trace));
83 it->state = level;
84 it->type = type;
85 it->stamp = ktime_get();
86 preempt_disable();
87 it->end = it->stamp;
88 data = tr->data[smp_processor_id()];
89
90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
91 sizeof(*entry), 0, 0);
92 if (!event)
93 goto out;
94 entry = ring_buffer_event_data(event);
95 entry->state_data = *it;
96 if (!filter_check_discard(call, entry, buffer, event))
97 trace_buffer_unlock_commit(buffer, event, 0, 0);
98 out:
99 preempt_enable();
100}
101
102static int tracing_power_register(void)
103{
104 int ret;
105
106 ret = register_trace_power_start(probe_power_start);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_start\n");
110 return ret;
111 }
112 ret = register_trace_power_end(probe_power_end);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_end\n");
116 goto fail_start;
117 }
118 ret = register_trace_power_mark(probe_power_mark);
119 if (ret) {
120 pr_info("power trace: Couldn't activate tracepoint"
121 " probe to trace_power_mark\n");
122 goto fail_end;
123 }
124 return ret;
125fail_end:
126 unregister_trace_power_end(probe_power_end);
127fail_start:
128 unregister_trace_power_start(probe_power_start);
129 return ret;
130}
131
132static void start_power_trace(struct trace_array *tr)
133{
134 trace_power_enabled = 1;
135}
136
137static void stop_power_trace(struct trace_array *tr)
138{
139 trace_power_enabled = 0;
140}
141
142static void power_trace_reset(struct trace_array *tr)
143{
144 trace_power_enabled = 0;
145 unregister_trace_power_start(probe_power_start);
146 unregister_trace_power_end(probe_power_end);
147 unregister_trace_power_mark(probe_power_mark);
148}
149
150
151static int power_trace_init(struct trace_array *tr)
152{
153 power_trace = tr;
154
155 trace_power_enabled = 1;
156 tracing_power_register();
157
158 tracing_reset_online_cpus(tr);
159 return 0;
160}
161
162static enum print_line_t power_print_line(struct trace_iterator *iter)
163{
164 int ret = 0;
165 struct trace_entry *entry = iter->ent;
166 struct trace_power *field ;
167 struct power_trace *it;
168 struct trace_seq *s = &iter->seq;
169 struct timespec stamp;
170 struct timespec duration;
171
172 trace_assign_type(field, entry);
173 it = &field->state_data;
174 stamp = ktime_to_timespec(it->stamp);
175 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
176
177 if (entry->type == TRACE_POWER) {
178 if (it->type == POWER_CSTATE)
179 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
180 stamp.tv_sec,
181 stamp.tv_nsec,
182 it->state, iter->cpu,
183 duration.tv_sec,
184 duration.tv_nsec);
185 if (it->type == POWER_PSTATE)
186 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
187 stamp.tv_sec,
188 stamp.tv_nsec,
189 it->state, iter->cpu);
190 if (!ret)
191 return TRACE_TYPE_PARTIAL_LINE;
192 return TRACE_TYPE_HANDLED;
193 }
194 return TRACE_TYPE_UNHANDLED;
195}
196
197static void power_print_header(struct seq_file *s)
198{
199 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
200 seq_puts(s, "# | | |\n");
201}
202
203static struct tracer power_tracer __read_mostly =
204{
205 .name = "power",
206 .init = power_trace_init,
207 .start = start_power_trace,
208 .stop = stop_power_trace,
209 .reset = power_trace_reset,
210 .print_line = power_print_line,
211 .print_header = power_print_header,
212};
213
214static int init_power_trace(void)
215{
216 return register_tracer(&power_tracer);
217}
218device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c6..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
244 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
245 236
246 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
247 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
248 240
249 wakeup_task = p; 241 wakeup_task = p;
@@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
293 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
294 } 286 }
295 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
296 wakeup_reset(tr); 295 wakeup_reset(tr);
297 296
298 /* 297 /*
@@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
328} 328}
329 329
330static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0f6facb050a1..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
296 296
297int 297int
298stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
299 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
300 loff_t *ppos) 300 loff_t *ppos)
301{ 301{
302 int ret; 302 int ret;
303 303
304 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
305 305
306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
307 307
308 if (ret || !write || 308 if (ret || !write ||
309 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..527e17eae575 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h> 4#include <linux/ftrace.h>
5#include <linux/perf_counter.h> 5#include <linux/perf_event.h>
6#include <asm/syscall.h> 6#include <asm/syscall.h>
7 7
8#include "trace_output.h" 8#include "trace_output.h"
@@ -166,7 +166,7 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" 166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", 167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr), 168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret)); 169 SYSCALL_FIELD(long, ret));
170 if (!ret) 170 if (!ret)
171 return 0; 171 return 0;
172 172
@@ -212,7 +212,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
212 if (ret) 212 if (ret)
213 return ret; 213 return ret;
214 214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, 215 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0,
216 FILTER_OTHER); 216 FILTER_OTHER);
217 217
218 return ret; 218 return ret;
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
384 384
385static void prof_syscall_enter(struct pt_regs *regs, long id) 385static void prof_syscall_enter(struct pt_regs *regs, long id)
386{ 386{
387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data; 387 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec;
389 unsigned long flags;
390 char *raw_data;
389 int syscall_nr; 391 int syscall_nr;
390 int size; 392 int size;
393 int cpu;
391 394
392 syscall_nr = syscall_get_nr(current, regs); 395 syscall_nr = syscall_get_nr(current, regs);
393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
402 size = ALIGN(size + sizeof(u32), sizeof(u64)); 405 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32); 406 size -= sizeof(u32);
404 407
405 do { 408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
406 char raw_data[size]; 409 "profile buffer not large enough"))
410 return;
411
412 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags);
407 414
408 /* zero the dead bytes from align to not leak stack to user */ 415 cpu = smp_processor_id();
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
410 424
411 rec = (struct syscall_trace_enter *) raw_data; 425 raw_data = per_cpu_ptr(raw_data, cpu);
412 tracing_generic_entry_update(&rec->ent, 0, 0); 426
413 rec->ent.type = sys_data->enter_id; 427 /* zero the dead bytes from align to not leak stack to user */
414 rec->nr = syscall_nr; 428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 429
416 (unsigned long *)&rec->args); 430 rec = (struct syscall_trace_enter *) raw_data;
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); 431 tracing_generic_entry_update(&rec->ent, 0, 0);
418 } while(0); 432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
437
438end:
439 local_irq_restore(flags);
419} 440}
420 441
421int reg_prof_syscall_enter(char *name) 442int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
460static void prof_syscall_exit(struct pt_regs *regs, long ret) 481static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{ 482{
462 struct syscall_metadata *sys_data; 483 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec; 484 struct syscall_trace_exit *rec;
485 unsigned long flags;
464 int syscall_nr; 486 int syscall_nr;
487 char *raw_data;
488 int size;
489 int cpu;
465 490
466 syscall_nr = syscall_get_nr(current, regs); 491 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
471 if (!sys_data) 496 if (!sys_data)
472 return; 497 return;
473 498
474 tracing_generic_entry_update(&rec.ent, 0, 0); 499 /* We can probably do that at build time */
475 rec.ent.type = sys_data->exit_id; 500 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
476 rec.nr = syscall_nr; 501 size -= sizeof(u32);
477 rec.ret = syscall_get_return_value(current, regs);
478 502
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); 503 /*
504 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime?
506 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
508 "exit event has grown above profile buffer size"))
509 return;
510
511 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags);
513 cpu = smp_processor_id();
514
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs);
534
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
536
537end:
538 local_irq_restore(flags);
480} 539}
481 540
482int reg_prof_syscall_exit(char *name) 541int reg_prof_syscall_exit(char *name)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9489a0a9b1be..cc89be5bc0f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
48 48
49/* 49/*
50 * Note about RCU : 50 * Note about RCU :
51 * It is used to to delay the free of multiple probes array until a quiescent 51 * It is used to delay the free of multiple probes array until a quiescent
52 * state is reached. 52 * state is reached.
53 * Tracepoint entries modifications are protected by the tracepoints_mutex. 53 * Tracepoint entries modifications are protected by the tracepoints_mutex.
54 */ 54 */
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h> 7#include <linux/mman.h>
9#include <linux/notifier.h> 8#include <linux/notifier.h>
10#include <linux/reboot.h> 9#include <linux/reboot.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 45static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{ 47{
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55} 55}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..47cdd7e76f2b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
640EXPORT_SYMBOL(schedule_delayed_work); 640EXPORT_SYMBOL(schedule_delayed_work);
641 641
642/** 642/**
643 * flush_delayed_work - block until a dwork_struct's callback has terminated
644 * @dwork: the delayed work which is to be flushed
645 *
646 * Any timeout is cancelled, and any pending work is run immediately.
647 */
648void flush_delayed_work(struct delayed_work *dwork)
649{
650 if (del_timer_sync(&dwork->timer)) {
651 struct cpu_workqueue_struct *cwq;
652 cwq = wq_per_cpu(keventd_wq, get_cpu());
653 __queue_work(cwq, &dwork->work);
654 put_cpu();
655 }
656 flush_work(&dwork->work);
657}
658EXPORT_SYMBOL(flush_delayed_work);
659
660/**
643 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay 661 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
644 * @cpu: cpu to use 662 * @cpu: cpu to use
645 * @dwork: job to be done 663 * @dwork: job to be done