aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile11
-rw-r--r--kernel/acct.c8
-rw-r--r--kernel/audit.c18
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditsc.c6
-rw-r--r--kernel/cgroup.c1129
-rw-r--r--kernel/cgroup_debug.c105
-rw-r--r--kernel/cgroup_freezer.c15
-rw-r--r--kernel/cpu.c15
-rw-r--r--kernel/cpuset.c66
-rw-r--r--kernel/cred.c315
-rw-r--r--kernel/delayacct.c1
-rw-r--r--kernel/dma-coherent.c176
-rw-r--r--kernel/exit.c169
-rw-r--r--kernel/fork.c86
-rw-r--r--kernel/futex.c180
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c150
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/chip.c74
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/internals.h13
-rw-r--r--kernel/irq/manage.c102
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/itimer.c169
-rw-r--r--kernel/kallsyms.c3
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/kmod.c9
-rw-r--r--kernel/kprobes.c36
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/lockdep.c795
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c130
-rw-r--r--kernel/marker.c930
-rw-r--r--kernel/module.c201
-rw-r--r--kernel/ns_cgroup.c16
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/params.c7
-rw-r--r--kernel/perf_counter.c4861
-rw-r--r--kernel/perf_event.c5000
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/pid_namespace.c2
-rw-r--r--kernel/posix-cpu-timers.c155
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/Kconfig14
-rw-r--r--kernel/power/console.c63
-rw-r--r--kernel/power/hibernate.c21
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/snapshot.c414
-rw-r--r--kernel/power/swap.c1
-rw-r--r--kernel/printk.c208
-rw-r--r--kernel/profile.c45
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/rcuclassic.c807
-rw-r--r--kernel/rcupdate.c92
-rw-r--r--kernel/rcupreempt.c1539
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c241
-rw-r--r--kernel/rcutree.c369
-rw-r--r--kernel/rcutree.h253
-rw-r--r--kernel/rcutree_plugin.h566
-rw-r--r--kernel/rcutree_trace.c100
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/res_counter.c3
-rw-r--r--kernel/resource.c23
-rw-r--r--kernel/sched.c1738
-rw-r--r--kernel/sched_clock.c122
-rw-r--r--kernel/sched_cpupri.c30
-rw-r--r--kernel/sched_debug.c5
-rw-r--r--kernel/sched_fair.c526
-rw-r--r--kernel/sched_features.h122
-rw-r--r--kernel/sched_idletask.c11
-rw-r--r--kernel/sched_rt.c82
-rw-r--r--kernel/signal.c168
-rw-r--r--kernel/slow-work.c12
-rw-r--r--kernel/smp.c76
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/spinlock.c230
-rw-r--r--kernel/sys.c46
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c188
-rw-r--r--kernel/taskstats.c10
-rw-r--r--kernel/time.c9
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clocksource.c529
-rw-r--r--kernel/time/jiffies.c6
-rw-r--r--kernel/time/ntp.c7
-rw-r--r--kernel/time/timeconv.c127
-rw-r--r--kernel/time/timekeeping.c535
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/timer.c67
-rw-r--r--kernel/trace/Kconfig43
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/blktrace.c12
-rw-r--r--kernel/trace/ftrace.c307
-rw-r--r--kernel/trace/kmemtrace.c149
-rw-r--r--kernel/trace/power-traces.c20
-rw-r--r--kernel/trace/ring_buffer.c1125
-rw-r--r--kernel/trace/trace.c850
-rw-r--r--kernel/trace/trace.h355
-rw-r--r--kernel/trace/trace_boot.c20
-rw-r--r--kernel/trace/trace_clock.c24
-rw-r--r--kernel/trace/trace_entries.h366
-rw-r--r--kernel/trace/trace_event_profile.c87
-rw-r--r--kernel/trace/trace_event_types.h178
-rw-r--r--kernel/trace/trace_events.c283
-rw-r--r--kernel/trace/trace_events_filter.c302
-rw-r--r--kernel/trace/trace_export.c290
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_functions_graph.c228
-rw-r--r--kernel/trace/trace_hw_branches.c2
-rw-r--r--kernel/trace/trace_irqsoff.c19
-rw-r--r--kernel/trace/trace_mmiotrace.c16
-rw-r--r--kernel/trace/trace_output.c42
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_power.c214
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_sched_switch.c59
-rw-r--r--kernel/trace/trace_sched_wakeup.c59
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c47
-rw-r--r--kernel/trace/trace_stat.c17
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_syscalls.c530
-rw-r--r--kernel/trace/trace_workqueue.c32
-rw-r--r--kernel/tracepoint.c52
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/utsname_sysctl.c4
-rw-r--r--kernel/workqueue.c9
135 files changed, 16116 insertions, 14469 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..b8d4cd8ac0b9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 58obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
59obj-$(CONFIG_COMPAT) += compat.o 59obj-$(CONFIG_COMPAT) += compat.o
60obj-$(CONFIG_CGROUPS) += cgroup.o 60obj-$(CONFIG_CGROUPS) += cgroup.o
61obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
62obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 61obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
63obj-$(CONFIG_CPUSETS) += cpuset.o 62obj-$(CONFIG_CPUSETS) += cpuset.o
64obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 63obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
@@ -80,26 +79,22 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 79obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
81obj-$(CONFIG_SECCOMP) += seccomp.o 80obj-$(CONFIG_SECCOMP) += seccomp.o
82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 81obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
84obj-$(CONFIG_TREE_RCU) += rcutree.o 82obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 83obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 84obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
88obj-$(CONFIG_RELAY) += relay.o 85obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 88obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
92obj-$(CONFIG_MARKERS) += marker.o
93obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 89obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
94obj-$(CONFIG_LATENCYTOP) += latencytop.o 90obj-$(CONFIG_LATENCYTOP) += latencytop.o
95obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
96obj-$(CONFIG_FUNCTION_TRACER) += trace/ 91obj-$(CONFIG_FUNCTION_TRACER) += trace/
97obj-$(CONFIG_TRACING) += trace/ 92obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/ 93obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/ 94obj-$(CONFIG_RING_BUFFER) += trace/
100obj-$(CONFIG_SMP) += sched_cpupri.o 95obj-$(CONFIG_SMP) += sched_cpupri.o
101obj-$(CONFIG_SLOW_WORK) += slow-work.o 96obj-$(CONFIG_SLOW_WORK) += slow-work.o
102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 97obj-$(CONFIG_PERF_EVENTS) += perf_event.o
103 98
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -119,7 +114,7 @@ $(obj)/config_data.gz: .config FORCE
119 $(call if_changed,gzip) 114 $(call if_changed,gzip)
120 115
121quiet_cmd_ikconfiggz = IKCFG $@ 116quiet_cmd_ikconfiggz = IKCFG $@
122 cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ 117 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
123targets += config_data.h 118targets += config_data.h
124$(obj)/config_data.h: $(obj)/config_data.gz FORCE 119$(obj)/config_data.h: $(obj)/config_data.gz FORCE
125 $(call if_changed,ikconfiggz) 120 $(call if_changed,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index 9f3391090b3e..9a4715a2f6bf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
491 u64 run_time; 491 u64 run_time;
492 struct timespec uptime; 492 struct timespec uptime;
493 struct tty_struct *tty; 493 struct tty_struct *tty;
494 const struct cred *orig_cred;
495
496 /* Perform file operations on behalf of whoever enabled accounting */
497 orig_cred = override_creds(file->f_cred);
494 498
495 /* 499 /*
496 * First check to see if there is enough free_space to continue 500 * First check to see if there is enough free_space to continue
497 * the process accounting system. 501 * the process accounting system.
498 */ 502 */
499 if (!check_free_space(acct, file)) 503 if (!check_free_space(acct, file))
500 return; 504 goto out;
501 505
502 /* 506 /*
503 * Fill the accounting struct with the needed info as recorded 507 * Fill the accounting struct with the needed info as recorded
@@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
578 sizeof(acct_t), &file->f_pos); 582 sizeof(acct_t), &file->f_pos);
579 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 583 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
580 set_fs(fs); 584 set_fs(fs);
585out:
586 revert_creds(orig_cred);
581} 587}
582 588
583/** 589/**
diff --git a/kernel/audit.c b/kernel/audit.c
index defc2e6f1e3b..5feed232be9d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
855 break; 855 break;
856 } 856 }
857 case AUDIT_SIGNAL_INFO: 857 case AUDIT_SIGNAL_INFO:
858 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); 858 len = 0;
859 if (err) 859 if (audit_sig_sid) {
860 return err; 860 err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
861 if (err)
862 return err;
863 }
861 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 864 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
862 if (!sig_data) { 865 if (!sig_data) {
863 security_release_secctx(ctx, len); 866 if (audit_sig_sid)
867 security_release_secctx(ctx, len);
864 return -ENOMEM; 868 return -ENOMEM;
865 } 869 }
866 sig_data->uid = audit_sig_uid; 870 sig_data->uid = audit_sig_uid;
867 sig_data->pid = audit_sig_pid; 871 sig_data->pid = audit_sig_pid;
868 memcpy(sig_data->ctx, ctx, len); 872 if (audit_sig_sid) {
869 security_release_secctx(ctx, len); 873 memcpy(sig_data->ctx, ctx, len);
874 security_release_secctx(ctx, len);
875 }
870 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 876 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
871 0, 0, sig_data, sizeof(*sig_data) + len); 877 0, 0, sig_data, sizeof(*sig_data) + len);
872 kfree(sig_data); 878 kfree(sig_data);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0e96dbc60ea9..cc7e87936cbc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -45,8 +45,8 @@
45 45
46struct audit_watch { 46struct audit_watch {
47 atomic_t count; /* reference count */ 47 atomic_t count; /* reference count */
48 char *path; /* insertion path */
49 dev_t dev; /* associated superblock device */ 48 dev_t dev; /* associated superblock device */
49 char *path; /* insertion path */
50 unsigned long ino; /* associated inode number */ 50 unsigned long ino; /* associated inode number */
51 struct audit_parent *parent; /* associated parent */ 51 struct audit_parent *parent; /* associated parent */
52 struct list_head wlist; /* entry in parent->watches list */ 52 struct list_head wlist; /* entry in parent->watches list */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 68d3c6a0ecd6..267e484f0198 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -168,12 +168,12 @@ struct audit_context {
168 int in_syscall; /* 1 if task is in a syscall */ 168 int in_syscall; /* 1 if task is in a syscall */
169 enum audit_state state, current_state; 169 enum audit_state state, current_state;
170 unsigned int serial; /* serial number for record */ 170 unsigned int serial; /* serial number for record */
171 struct timespec ctime; /* time of syscall entry */
172 int major; /* syscall number */ 171 int major; /* syscall number */
172 struct timespec ctime; /* time of syscall entry */
173 unsigned long argv[4]; /* syscall arguments */ 173 unsigned long argv[4]; /* syscall arguments */
174 int return_valid; /* return code is valid */
175 long return_code;/* syscall return code */ 174 long return_code;/* syscall return code */
176 u64 prio; 175 u64 prio;
176 int return_valid; /* return code is valid */
177 int name_count; 177 int name_count;
178 struct audit_names names[AUDIT_NAMES]; 178 struct audit_names names[AUDIT_NAMES];
179 char * filterkey; /* key for rule that triggered record */ 179 char * filterkey; /* key for rule that triggered record */
@@ -198,8 +198,8 @@ struct audit_context {
198 char target_comm[TASK_COMM_LEN]; 198 char target_comm[TASK_COMM_LEN];
199 199
200 struct audit_tree_refs *trees, *first_trees; 200 struct audit_tree_refs *trees, *first_trees;
201 int tree_count;
202 struct list_head killed_trees; 201 struct list_head killed_trees;
202 int tree_count;
203 203
204 int type; 204 int type;
205 union { 205 union {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..ca83b73fba19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/ctype.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
@@ -48,6 +49,8 @@
48#include <linux/namei.h> 49#include <linux/namei.h>
49#include <linux/smp_lock.h> 50#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h> 51#include <linux/pid_namespace.h>
52#include <linux/idr.h>
53#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
51 54
52#include <asm/atomic.h> 55#include <asm/atomic.h>
53 56
@@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = {
60#include <linux/cgroup_subsys.h> 63#include <linux/cgroup_subsys.h>
61}; 64};
62 65
66#define MAX_CGROUP_ROOT_NAMELEN 64
67
63/* 68/*
64 * A cgroupfs_root represents the root of a cgroup hierarchy, 69 * A cgroupfs_root represents the root of a cgroup hierarchy,
65 * and may be associated with a superblock to form an active 70 * and may be associated with a superblock to form an active
@@ -74,6 +79,9 @@ struct cgroupfs_root {
74 */ 79 */
75 unsigned long subsys_bits; 80 unsigned long subsys_bits;
76 81
82 /* Unique id for this hierarchy. */
83 int hierarchy_id;
84
77 /* The bitmask of subsystems currently attached to this hierarchy */ 85 /* The bitmask of subsystems currently attached to this hierarchy */
78 unsigned long actual_subsys_bits; 86 unsigned long actual_subsys_bits;
79 87
@@ -94,6 +102,9 @@ struct cgroupfs_root {
94 102
95 /* The path to use for release notifications. */ 103 /* The path to use for release notifications. */
96 char release_agent_path[PATH_MAX]; 104 char release_agent_path[PATH_MAX];
105
106 /* The name for this hierarchy - may be empty */
107 char name[MAX_CGROUP_ROOT_NAMELEN];
97}; 108};
98 109
99/* 110/*
@@ -141,6 +152,10 @@ struct css_id {
141static LIST_HEAD(roots); 152static LIST_HEAD(roots);
142static int root_count; 153static int root_count;
143 154
155static DEFINE_IDA(hierarchy_ida);
156static int next_hierarchy_id;
157static DEFINE_SPINLOCK(hierarchy_id_lock);
158
144/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 159/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
145#define dummytop (&rootnode.top_cgroup) 160#define dummytop (&rootnode.top_cgroup)
146 161
@@ -201,6 +216,7 @@ struct cg_cgroup_link {
201 * cgroup, anchored on cgroup->css_sets 216 * cgroup, anchored on cgroup->css_sets
202 */ 217 */
203 struct list_head cgrp_link_list; 218 struct list_head cgrp_link_list;
219 struct cgroup *cgrp;
204 /* 220 /*
205 * List running through cg_cgroup_links pointing at a 221 * List running through cg_cgroup_links pointing at a
206 * single css_set object, anchored on css_set->cg_links 222 * single css_set object, anchored on css_set->cg_links
@@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
227static DEFINE_RWLOCK(css_set_lock); 243static DEFINE_RWLOCK(css_set_lock);
228static int css_set_count; 244static int css_set_count;
229 245
230/* hash table for cgroup groups. This improves the performance to 246/*
231 * find an existing css_set */ 247 * hash table for cgroup groups. This improves the performance to find
248 * an existing css_set. This hash doesn't (currently) take into
249 * account cgroups in empty hierarchies.
250 */
232#define CSS_SET_HASH_BITS 7 251#define CSS_SET_HASH_BITS 7
233#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) 252#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
234static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; 253static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
@@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
248 return &css_set_table[index]; 267 return &css_set_table[index];
249} 268}
250 269
270static void free_css_set_rcu(struct rcu_head *obj)
271{
272 struct css_set *cg = container_of(obj, struct css_set, rcu_head);
273 kfree(cg);
274}
275
251/* We don't maintain the lists running through each css_set to its 276/* We don't maintain the lists running through each css_set to its
252 * task until after the first call to cgroup_iter_start(). This 277 * task until after the first call to cgroup_iter_start(). This
253 * reduces the fork()/exit() overhead for people who have cgroups 278 * reduces the fork()/exit() overhead for people who have cgroups
254 * compiled into their kernel but not actually in use */ 279 * compiled into their kernel but not actually in use */
255static int use_task_css_set_links __read_mostly; 280static int use_task_css_set_links __read_mostly;
256 281
257/* When we create or destroy a css_set, the operation simply 282static void __put_css_set(struct css_set *cg, int taskexit)
258 * takes/releases a reference count on all the cgroups referenced
259 * by subsystems in this css_set. This can end up multiple-counting
260 * some cgroups, but that's OK - the ref-count is just a
261 * busy/not-busy indicator; ensuring that we only count each cgroup
262 * once would require taking a global lock to ensure that no
263 * subsystems moved between hierarchies while we were doing so.
264 *
265 * Possible TODO: decide at boot time based on the number of
266 * registered subsystems and the number of CPUs or NUMA nodes whether
267 * it's better for performance to ref-count every subsystem, or to
268 * take a global lock and only add one ref count to each hierarchy.
269 */
270
271/*
272 * unlink a css_set from the list and free it
273 */
274static void unlink_css_set(struct css_set *cg)
275{ 283{
276 struct cg_cgroup_link *link; 284 struct cg_cgroup_link *link;
277 struct cg_cgroup_link *saved_link; 285 struct cg_cgroup_link *saved_link;
278
279 hlist_del(&cg->hlist);
280 css_set_count--;
281
282 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
283 cg_link_list) {
284 list_del(&link->cg_link_list);
285 list_del(&link->cgrp_link_list);
286 kfree(link);
287 }
288}
289
290static void __put_css_set(struct css_set *cg, int taskexit)
291{
292 int i;
293 /* 286 /*
294 * Ensure that the refcount doesn't hit zero while any readers 287 * Ensure that the refcount doesn't hit zero while any readers
295 * can see it. Similar to atomic_dec_and_lock(), but for an 288 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit)
302 write_unlock(&css_set_lock); 295 write_unlock(&css_set_lock);
303 return; 296 return;
304 } 297 }
305 unlink_css_set(cg);
306 write_unlock(&css_set_lock);
307 298
308 rcu_read_lock(); 299 /* This css_set is dead. unlink it and release cgroup refcounts */
309 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 300 hlist_del(&cg->hlist);
310 struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); 301 css_set_count--;
302
303 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
304 cg_link_list) {
305 struct cgroup *cgrp = link->cgrp;
306 list_del(&link->cg_link_list);
307 list_del(&link->cgrp_link_list);
311 if (atomic_dec_and_test(&cgrp->count) && 308 if (atomic_dec_and_test(&cgrp->count) &&
312 notify_on_release(cgrp)) { 309 notify_on_release(cgrp)) {
313 if (taskexit) 310 if (taskexit)
314 set_bit(CGRP_RELEASABLE, &cgrp->flags); 311 set_bit(CGRP_RELEASABLE, &cgrp->flags);
315 check_for_release(cgrp); 312 check_for_release(cgrp);
316 } 313 }
314
315 kfree(link);
317 } 316 }
318 rcu_read_unlock(); 317
319 kfree(cg); 318 write_unlock(&css_set_lock);
319 call_rcu(&cg->rcu_head, free_css_set_rcu);
320} 320}
321 321
322/* 322/*
@@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg)
338} 338}
339 339
340/* 340/*
341 * compare_css_sets - helper function for find_existing_css_set().
342 * @cg: candidate css_set being tested
343 * @old_cg: existing css_set for a task
344 * @new_cgrp: cgroup that's being entered by the task
345 * @template: desired set of css pointers in css_set (pre-calculated)
346 *
347 * Returns true if "cg" matches "old_cg" except for the hierarchy
348 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
349 */
350static bool compare_css_sets(struct css_set *cg,
351 struct css_set *old_cg,
352 struct cgroup *new_cgrp,
353 struct cgroup_subsys_state *template[])
354{
355 struct list_head *l1, *l2;
356
357 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
358 /* Not all subsystems matched */
359 return false;
360 }
361
362 /*
363 * Compare cgroup pointers in order to distinguish between
364 * different cgroups in heirarchies with no subsystems. We
365 * could get by with just this check alone (and skip the
366 * memcmp above) but on most setups the memcmp check will
367 * avoid the need for this more expensive check on almost all
368 * candidates.
369 */
370
371 l1 = &cg->cg_links;
372 l2 = &old_cg->cg_links;
373 while (1) {
374 struct cg_cgroup_link *cgl1, *cgl2;
375 struct cgroup *cg1, *cg2;
376
377 l1 = l1->next;
378 l2 = l2->next;
379 /* See if we reached the end - both lists are equal length. */
380 if (l1 == &cg->cg_links) {
381 BUG_ON(l2 != &old_cg->cg_links);
382 break;
383 } else {
384 BUG_ON(l2 == &old_cg->cg_links);
385 }
386 /* Locate the cgroups associated with these links. */
387 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
388 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
389 cg1 = cgl1->cgrp;
390 cg2 = cgl2->cgrp;
391 /* Hierarchies should be linked in the same order. */
392 BUG_ON(cg1->root != cg2->root);
393
394 /*
395 * If this hierarchy is the hierarchy of the cgroup
396 * that's changing, then we need to check that this
397 * css_set points to the new cgroup; if it's any other
398 * hierarchy, then this css_set should point to the
399 * same cgroup as the old css_set.
400 */
401 if (cg1->root == new_cgrp->root) {
402 if (cg1 != new_cgrp)
403 return false;
404 } else {
405 if (cg1 != cg2)
406 return false;
407 }
408 }
409 return true;
410}
411
412/*
341 * find_existing_css_set() is a helper for 413 * find_existing_css_set() is a helper for
342 * find_css_set(), and checks to see whether an existing 414 * find_css_set(), and checks to see whether an existing
343 * css_set is suitable. 415 * css_set is suitable.
@@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set(
378 450
379 hhead = css_set_hash(template); 451 hhead = css_set_hash(template);
380 hlist_for_each_entry(cg, node, hhead, hlist) { 452 hlist_for_each_entry(cg, node, hhead, hlist) {
381 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 453 if (!compare_css_sets(cg, oldcg, cgrp, template))
382 /* All subsystems matched */ 454 continue;
383 return cg; 455
384 } 456 /* This css_set matches what we need */
457 return cg;
385 } 458 }
386 459
387 /* No existing cgroup group matched */ 460 /* No existing cgroup group matched */
@@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links,
435 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 508 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
436 cgrp_link_list); 509 cgrp_link_list);
437 link->cg = cg; 510 link->cg = cg;
511 link->cgrp = cgrp;
512 atomic_inc(&cgrp->count);
438 list_move(&link->cgrp_link_list, &cgrp->css_sets); 513 list_move(&link->cgrp_link_list, &cgrp->css_sets);
439 list_add(&link->cg_link_list, &cg->cg_links); 514 /*
515 * Always add links to the tail of the list so that the list
516 * is sorted by order of hierarchy creation
517 */
518 list_add_tail(&link->cg_link_list, &cg->cg_links);
440} 519}
441 520
442/* 521/*
@@ -451,11 +530,11 @@ static struct css_set *find_css_set(
451{ 530{
452 struct css_set *res; 531 struct css_set *res;
453 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 532 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
454 int i;
455 533
456 struct list_head tmp_cg_links; 534 struct list_head tmp_cg_links;
457 535
458 struct hlist_head *hhead; 536 struct hlist_head *hhead;
537 struct cg_cgroup_link *link;
459 538
460 /* First see if we already have a cgroup group that matches 539 /* First see if we already have a cgroup group that matches
461 * the desired set */ 540 * the desired set */
@@ -489,20 +568,12 @@ static struct css_set *find_css_set(
489 568
490 write_lock(&css_set_lock); 569 write_lock(&css_set_lock);
491 /* Add reference counts and links from the new css_set. */ 570 /* Add reference counts and links from the new css_set. */
492 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 571 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
493 struct cgroup *cgrp = res->subsys[i]->cgroup; 572 struct cgroup *c = link->cgrp;
494 struct cgroup_subsys *ss = subsys[i]; 573 if (c->root == cgrp->root)
495 atomic_inc(&cgrp->count); 574 c = cgrp;
496 /* 575 link_css_set(&tmp_cg_links, res, c);
497 * We want to add a link once per cgroup, so we
498 * only do it for the first subsystem in each
499 * hierarchy
500 */
501 if (ss->root->subsys_list.next == &ss->sibling)
502 link_css_set(&tmp_cg_links, res, cgrp);
503 } 576 }
504 if (list_empty(&rootnode.subsys_list))
505 link_css_set(&tmp_cg_links, res, dummytop);
506 577
507 BUG_ON(!list_empty(&tmp_cg_links)); 578 BUG_ON(!list_empty(&tmp_cg_links));
508 579
@@ -518,6 +589,41 @@ static struct css_set *find_css_set(
518} 589}
519 590
520/* 591/*
592 * Return the cgroup for "task" from the given hierarchy. Must be
593 * called with cgroup_mutex held.
594 */
595static struct cgroup *task_cgroup_from_root(struct task_struct *task,
596 struct cgroupfs_root *root)
597{
598 struct css_set *css;
599 struct cgroup *res = NULL;
600
601 BUG_ON(!mutex_is_locked(&cgroup_mutex));
602 read_lock(&css_set_lock);
603 /*
604 * No need to lock the task - since we hold cgroup_mutex the
605 * task can't change groups, so the only thing that can happen
606 * is that it exits and its css is set back to init_css_set.
607 */
608 css = task->cgroups;
609 if (css == &init_css_set) {
610 res = &root->top_cgroup;
611 } else {
612 struct cg_cgroup_link *link;
613 list_for_each_entry(link, &css->cg_links, cg_link_list) {
614 struct cgroup *c = link->cgrp;
615 if (c->root == root) {
616 res = c;
617 break;
618 }
619 }
620 }
621 read_unlock(&css_set_lock);
622 BUG_ON(!res);
623 return res;
624}
625
626/*
521 * There is one global cgroup mutex. We also require taking 627 * There is one global cgroup mutex. We also require taking
522 * task_lock() when dereferencing a task's cgroup subsys pointers. 628 * task_lock() when dereferencing a task's cgroup subsys pointers.
523 * See "The task_lock() exception", at the end of this comment. 629 * See "The task_lock() exception", at the end of this comment.
@@ -596,10 +702,11 @@ void cgroup_unlock(void)
596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 702static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 703static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 704static int cgroup_populate_dir(struct cgroup *cgrp);
599static struct inode_operations cgroup_dir_inode_operations; 705static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 706static const struct file_operations proc_cgroupstats_operations;
601 707
602static struct backing_dev_info cgroup_backing_dev_info = { 708static struct backing_dev_info cgroup_backing_dev_info = {
709 .name = "cgroup",
603 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 710 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
604}; 711};
605 712
@@ -676,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
676 */ 783 */
677 deactivate_super(cgrp->root->sb); 784 deactivate_super(cgrp->root->sb);
678 785
786 /*
787 * if we're getting rid of the cgroup, refcount should ensure
788 * that there are no pidlists left.
789 */
790 BUG_ON(!list_empty(&cgrp->pidlists));
791
679 call_rcu(&cgrp->rcu_head, free_cgroup_rcu); 792 call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
680 } 793 }
681 iput(inode); 794 iput(inode);
@@ -840,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
840 seq_puts(seq, ",noprefix"); 953 seq_puts(seq, ",noprefix");
841 if (strlen(root->release_agent_path)) 954 if (strlen(root->release_agent_path))
842 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 955 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
956 if (strlen(root->name))
957 seq_printf(seq, ",name=%s", root->name);
843 mutex_unlock(&cgroup_mutex); 958 mutex_unlock(&cgroup_mutex);
844 return 0; 959 return 0;
845} 960}
@@ -848,6 +963,12 @@ struct cgroup_sb_opts {
848 unsigned long subsys_bits; 963 unsigned long subsys_bits;
849 unsigned long flags; 964 unsigned long flags;
850 char *release_agent; 965 char *release_agent;
966 char *name;
967 /* User explicitly requested empty subsystem */
968 bool none;
969
970 struct cgroupfs_root *new_root;
971
851}; 972};
852 973
853/* Convert a hierarchy specifier into a bitmask of subsystems and 974/* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -862,9 +983,7 @@ static int parse_cgroupfs_options(char *data,
862 mask = ~(1UL << cpuset_subsys_id); 983 mask = ~(1UL << cpuset_subsys_id);
863#endif 984#endif
864 985
865 opts->subsys_bits = 0; 986 memset(opts, 0, sizeof(*opts));
866 opts->flags = 0;
867 opts->release_agent = NULL;
868 987
869 while ((token = strsep(&o, ",")) != NULL) { 988 while ((token = strsep(&o, ",")) != NULL) {
870 if (!*token) 989 if (!*token)
@@ -878,17 +997,42 @@ static int parse_cgroupfs_options(char *data,
878 if (!ss->disabled) 997 if (!ss->disabled)
879 opts->subsys_bits |= 1ul << i; 998 opts->subsys_bits |= 1ul << i;
880 } 999 }
1000 } else if (!strcmp(token, "none")) {
1001 /* Explicitly have no subsystems */
1002 opts->none = true;
881 } else if (!strcmp(token, "noprefix")) { 1003 } else if (!strcmp(token, "noprefix")) {
882 set_bit(ROOT_NOPREFIX, &opts->flags); 1004 set_bit(ROOT_NOPREFIX, &opts->flags);
883 } else if (!strncmp(token, "release_agent=", 14)) { 1005 } else if (!strncmp(token, "release_agent=", 14)) {
884 /* Specifying two release agents is forbidden */ 1006 /* Specifying two release agents is forbidden */
885 if (opts->release_agent) 1007 if (opts->release_agent)
886 return -EINVAL; 1008 return -EINVAL;
887 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 1009 opts->release_agent =
1010 kstrndup(token + 14, PATH_MAX, GFP_KERNEL);
888 if (!opts->release_agent) 1011 if (!opts->release_agent)
889 return -ENOMEM; 1012 return -ENOMEM;
890 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 1013 } else if (!strncmp(token, "name=", 5)) {
891 opts->release_agent[PATH_MAX - 1] = 0; 1014 int i;
1015 const char *name = token + 5;
1016 /* Can't specify an empty name */
1017 if (!strlen(name))
1018 return -EINVAL;
1019 /* Must match [\w.-]+ */
1020 for (i = 0; i < strlen(name); i++) {
1021 char c = name[i];
1022 if (isalnum(c))
1023 continue;
1024 if ((c == '.') || (c == '-') || (c == '_'))
1025 continue;
1026 return -EINVAL;
1027 }
1028 /* Specifying two names is forbidden */
1029 if (opts->name)
1030 return -EINVAL;
1031 opts->name = kstrndup(name,
1032 MAX_CGROUP_ROOT_NAMELEN,
1033 GFP_KERNEL);
1034 if (!opts->name)
1035 return -ENOMEM;
892 } else { 1036 } else {
893 struct cgroup_subsys *ss; 1037 struct cgroup_subsys *ss;
894 int i; 1038 int i;
@@ -905,6 +1049,8 @@ static int parse_cgroupfs_options(char *data,
905 } 1049 }
906 } 1050 }
907 1051
1052 /* Consistency checks */
1053
908 /* 1054 /*
909 * Option noprefix was introduced just for backward compatibility 1055 * Option noprefix was introduced just for backward compatibility
910 * with the old cpuset, so we allow noprefix only if mounting just 1056 * with the old cpuset, so we allow noprefix only if mounting just
@@ -914,8 +1060,16 @@ static int parse_cgroupfs_options(char *data,
914 (opts->subsys_bits & mask)) 1060 (opts->subsys_bits & mask))
915 return -EINVAL; 1061 return -EINVAL;
916 1062
917 /* We can't have an empty hierarchy */ 1063
918 if (!opts->subsys_bits) 1064 /* Can't specify "none" and some subsystems */
1065 if (opts->subsys_bits && opts->none)
1066 return -EINVAL;
1067
1068 /*
1069 * We either have to specify by name or by subsystems. (So all
1070 * empty hierarchies must have a name).
1071 */
1072 if (!opts->subsys_bits && !opts->name)
919 return -EINVAL; 1073 return -EINVAL;
920 1074
921 return 0; 1075 return 0;
@@ -943,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
943 goto out_unlock; 1097 goto out_unlock;
944 } 1098 }
945 1099
1100 /* Don't allow name to change at remount */
1101 if (opts.name && strcmp(opts.name, root->name)) {
1102 ret = -EINVAL;
1103 goto out_unlock;
1104 }
1105
946 ret = rebind_subsystems(root, opts.subsys_bits); 1106 ret = rebind_subsystems(root, opts.subsys_bits);
947 if (ret) 1107 if (ret)
948 goto out_unlock; 1108 goto out_unlock;
@@ -954,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
954 strcpy(root->release_agent_path, opts.release_agent); 1114 strcpy(root->release_agent_path, opts.release_agent);
955 out_unlock: 1115 out_unlock:
956 kfree(opts.release_agent); 1116 kfree(opts.release_agent);
1117 kfree(opts.name);
957 mutex_unlock(&cgroup_mutex); 1118 mutex_unlock(&cgroup_mutex);
958 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1119 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
959 unlock_kernel(); 1120 unlock_kernel();
960 return ret; 1121 return ret;
961} 1122}
962 1123
963static struct super_operations cgroup_ops = { 1124static const struct super_operations cgroup_ops = {
964 .statfs = simple_statfs, 1125 .statfs = simple_statfs,
965 .drop_inode = generic_delete_inode, 1126 .drop_inode = generic_delete_inode,
966 .show_options = cgroup_show_options, 1127 .show_options = cgroup_show_options,
@@ -973,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
973 INIT_LIST_HEAD(&cgrp->children); 1134 INIT_LIST_HEAD(&cgrp->children);
974 INIT_LIST_HEAD(&cgrp->css_sets); 1135 INIT_LIST_HEAD(&cgrp->css_sets);
975 INIT_LIST_HEAD(&cgrp->release_list); 1136 INIT_LIST_HEAD(&cgrp->release_list);
976 INIT_LIST_HEAD(&cgrp->pids_list); 1137 INIT_LIST_HEAD(&cgrp->pidlists);
977 init_rwsem(&cgrp->pids_mutex); 1138 mutex_init(&cgrp->pidlist_mutex);
978} 1139}
1140
979static void init_cgroup_root(struct cgroupfs_root *root) 1141static void init_cgroup_root(struct cgroupfs_root *root)
980{ 1142{
981 struct cgroup *cgrp = &root->top_cgroup; 1143 struct cgroup *cgrp = &root->top_cgroup;
@@ -987,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root)
987 init_cgroup_housekeeping(cgrp); 1149 init_cgroup_housekeeping(cgrp);
988} 1150}
989 1151
1152static bool init_root_id(struct cgroupfs_root *root)
1153{
1154 int ret = 0;
1155
1156 do {
1157 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL))
1158 return false;
1159 spin_lock(&hierarchy_id_lock);
1160 /* Try to allocate the next unused ID */
1161 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id,
1162 &root->hierarchy_id);
1163 if (ret == -ENOSPC)
1164 /* Try again starting from 0 */
1165 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id);
1166 if (!ret) {
1167 next_hierarchy_id = root->hierarchy_id + 1;
1168 } else if (ret != -EAGAIN) {
1169 /* Can only get here if the 31-bit IDR is full ... */
1170 BUG_ON(ret);
1171 }
1172 spin_unlock(&hierarchy_id_lock);
1173 } while (ret);
1174 return true;
1175}
1176
990static int cgroup_test_super(struct super_block *sb, void *data) 1177static int cgroup_test_super(struct super_block *sb, void *data)
991{ 1178{
992 struct cgroupfs_root *new = data; 1179 struct cgroup_sb_opts *opts = data;
993 struct cgroupfs_root *root = sb->s_fs_info; 1180 struct cgroupfs_root *root = sb->s_fs_info;
994 1181
995 /* First check subsystems */ 1182 /* If we asked for a name then it must match */
996 if (new->subsys_bits != root->subsys_bits) 1183 if (opts->name && strcmp(opts->name, root->name))
997 return 0; 1184 return 0;
998 1185
999 /* Next check flags */ 1186 /*
1000 if (new->flags != root->flags) 1187 * If we asked for subsystems (or explicitly for no
1188 * subsystems) then they must match
1189 */
1190 if ((opts->subsys_bits || opts->none)
1191 && (opts->subsys_bits != root->subsys_bits))
1001 return 0; 1192 return 0;
1002 1193
1003 return 1; 1194 return 1;
1004} 1195}
1005 1196
1197static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
1198{
1199 struct cgroupfs_root *root;
1200
1201 if (!opts->subsys_bits && !opts->none)
1202 return NULL;
1203
1204 root = kzalloc(sizeof(*root), GFP_KERNEL);
1205 if (!root)
1206 return ERR_PTR(-ENOMEM);
1207
1208 if (!init_root_id(root)) {
1209 kfree(root);
1210 return ERR_PTR(-ENOMEM);
1211 }
1212 init_cgroup_root(root);
1213
1214 root->subsys_bits = opts->subsys_bits;
1215 root->flags = opts->flags;
1216 if (opts->release_agent)
1217 strcpy(root->release_agent_path, opts->release_agent);
1218 if (opts->name)
1219 strcpy(root->name, opts->name);
1220 return root;
1221}
1222
1223static void cgroup_drop_root(struct cgroupfs_root *root)
1224{
1225 if (!root)
1226 return;
1227
1228 BUG_ON(!root->hierarchy_id);
1229 spin_lock(&hierarchy_id_lock);
1230 ida_remove(&hierarchy_ida, root->hierarchy_id);
1231 spin_unlock(&hierarchy_id_lock);
1232 kfree(root);
1233}
1234
1006static int cgroup_set_super(struct super_block *sb, void *data) 1235static int cgroup_set_super(struct super_block *sb, void *data)
1007{ 1236{
1008 int ret; 1237 int ret;
1009 struct cgroupfs_root *root = data; 1238 struct cgroup_sb_opts *opts = data;
1239
1240 /* If we don't have a new root, we can't set up a new sb */
1241 if (!opts->new_root)
1242 return -EINVAL;
1243
1244 BUG_ON(!opts->subsys_bits && !opts->none);
1010 1245
1011 ret = set_anon_super(sb, NULL); 1246 ret = set_anon_super(sb, NULL);
1012 if (ret) 1247 if (ret)
1013 return ret; 1248 return ret;
1014 1249
1015 sb->s_fs_info = root; 1250 sb->s_fs_info = opts->new_root;
1016 root->sb = sb; 1251 opts->new_root->sb = sb;
1017 1252
1018 sb->s_blocksize = PAGE_CACHE_SIZE; 1253 sb->s_blocksize = PAGE_CACHE_SIZE;
1019 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1254 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -1050,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1050 void *data, struct vfsmount *mnt) 1285 void *data, struct vfsmount *mnt)
1051{ 1286{
1052 struct cgroup_sb_opts opts; 1287 struct cgroup_sb_opts opts;
1288 struct cgroupfs_root *root;
1053 int ret = 0; 1289 int ret = 0;
1054 struct super_block *sb; 1290 struct super_block *sb;
1055 struct cgroupfs_root *root; 1291 struct cgroupfs_root *new_root;
1056 struct list_head tmp_cg_links;
1057 1292
1058 /* First find the desired set of subsystems */ 1293 /* First find the desired set of subsystems */
1059 ret = parse_cgroupfs_options(data, &opts); 1294 ret = parse_cgroupfs_options(data, &opts);
1060 if (ret) { 1295 if (ret)
1061 kfree(opts.release_agent); 1296 goto out_err;
1062 return ret;
1063 }
1064
1065 root = kzalloc(sizeof(*root), GFP_KERNEL);
1066 if (!root) {
1067 kfree(opts.release_agent);
1068 return -ENOMEM;
1069 }
1070 1297
1071 init_cgroup_root(root); 1298 /*
1072 root->subsys_bits = opts.subsys_bits; 1299 * Allocate a new cgroup root. We may not need it if we're
1073 root->flags = opts.flags; 1300 * reusing an existing hierarchy.
1074 if (opts.release_agent) { 1301 */
1075 strcpy(root->release_agent_path, opts.release_agent); 1302 new_root = cgroup_root_from_opts(&opts);
1076 kfree(opts.release_agent); 1303 if (IS_ERR(new_root)) {
1304 ret = PTR_ERR(new_root);
1305 goto out_err;
1077 } 1306 }
1307 opts.new_root = new_root;
1078 1308
1079 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 1309 /* Locate an existing or new sb for this hierarchy */
1080 1310 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts);
1081 if (IS_ERR(sb)) { 1311 if (IS_ERR(sb)) {
1082 kfree(root); 1312 ret = PTR_ERR(sb);
1083 return PTR_ERR(sb); 1313 cgroup_drop_root(opts.new_root);
1314 goto out_err;
1084 } 1315 }
1085 1316
1086 if (sb->s_fs_info != root) { 1317 root = sb->s_fs_info;
1087 /* Reusing an existing superblock */ 1318 BUG_ON(!root);
1088 BUG_ON(sb->s_root == NULL); 1319 if (root == opts.new_root) {
1089 kfree(root); 1320 /* We used the new root structure, so this is a new hierarchy */
1090 root = NULL; 1321 struct list_head tmp_cg_links;
1091 } else {
1092 /* New superblock */
1093 struct cgroup *root_cgrp = &root->top_cgroup; 1322 struct cgroup *root_cgrp = &root->top_cgroup;
1094 struct inode *inode; 1323 struct inode *inode;
1324 struct cgroupfs_root *existing_root;
1095 int i; 1325 int i;
1096 1326
1097 BUG_ON(sb->s_root != NULL); 1327 BUG_ON(sb->s_root != NULL);
@@ -1104,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1104 mutex_lock(&inode->i_mutex); 1334 mutex_lock(&inode->i_mutex);
1105 mutex_lock(&cgroup_mutex); 1335 mutex_lock(&cgroup_mutex);
1106 1336
1337 if (strlen(root->name)) {
1338 /* Check for name clashes with existing mounts */
1339 for_each_active_root(existing_root) {
1340 if (!strcmp(existing_root->name, root->name)) {
1341 ret = -EBUSY;
1342 mutex_unlock(&cgroup_mutex);
1343 mutex_unlock(&inode->i_mutex);
1344 goto drop_new_super;
1345 }
1346 }
1347 }
1348
1107 /* 1349 /*
1108 * We're accessing css_set_count without locking 1350 * We're accessing css_set_count without locking
1109 * css_set_lock here, but that's OK - it can only be 1351 * css_set_lock here, but that's OK - it can only be
@@ -1122,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1122 if (ret == -EBUSY) { 1364 if (ret == -EBUSY) {
1123 mutex_unlock(&cgroup_mutex); 1365 mutex_unlock(&cgroup_mutex);
1124 mutex_unlock(&inode->i_mutex); 1366 mutex_unlock(&inode->i_mutex);
1125 goto free_cg_links; 1367 free_cg_links(&tmp_cg_links);
1368 goto drop_new_super;
1126 } 1369 }
1127 1370
1128 /* EBUSY should be the only error here */ 1371 /* EBUSY should be the only error here */
@@ -1154,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1154 BUG_ON(root->number_of_cgroups != 1); 1397 BUG_ON(root->number_of_cgroups != 1);
1155 1398
1156 cgroup_populate_dir(root_cgrp); 1399 cgroup_populate_dir(root_cgrp);
1157 mutex_unlock(&inode->i_mutex);
1158 mutex_unlock(&cgroup_mutex); 1400 mutex_unlock(&cgroup_mutex);
1401 mutex_unlock(&inode->i_mutex);
1402 } else {
1403 /*
1404 * We re-used an existing hierarchy - the new root (if
1405 * any) is not needed
1406 */
1407 cgroup_drop_root(opts.new_root);
1159 } 1408 }
1160 1409
1161 simple_set_mnt(mnt, sb); 1410 simple_set_mnt(mnt, sb);
1411 kfree(opts.release_agent);
1412 kfree(opts.name);
1162 return 0; 1413 return 0;
1163 1414
1164 free_cg_links:
1165 free_cg_links(&tmp_cg_links);
1166 drop_new_super: 1415 drop_new_super:
1167 deactivate_locked_super(sb); 1416 deactivate_locked_super(sb);
1417 out_err:
1418 kfree(opts.release_agent);
1419 kfree(opts.name);
1420
1168 return ret; 1421 return ret;
1169} 1422}
1170 1423
@@ -1210,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
1210 mutex_unlock(&cgroup_mutex); 1463 mutex_unlock(&cgroup_mutex);
1211 1464
1212 kill_litter_super(sb); 1465 kill_litter_super(sb);
1213 kfree(root); 1466 cgroup_drop_root(root);
1214} 1467}
1215 1468
1216static struct file_system_type cgroup_fs_type = { 1469static struct file_system_type cgroup_fs_type = {
@@ -1275,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1275 return 0; 1528 return 0;
1276} 1529}
1277 1530
1278/*
1279 * Return the first subsystem attached to a cgroup's hierarchy, and
1280 * its subsystem id.
1281 */
1282
1283static void get_first_subsys(const struct cgroup *cgrp,
1284 struct cgroup_subsys_state **css, int *subsys_id)
1285{
1286 const struct cgroupfs_root *root = cgrp->root;
1287 const struct cgroup_subsys *test_ss;
1288 BUG_ON(list_empty(&root->subsys_list));
1289 test_ss = list_entry(root->subsys_list.next,
1290 struct cgroup_subsys, sibling);
1291 if (css) {
1292 *css = cgrp->subsys[test_ss->subsys_id];
1293 BUG_ON(!*css);
1294 }
1295 if (subsys_id)
1296 *subsys_id = test_ss->subsys_id;
1297}
1298
1299/** 1531/**
1300 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1532 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1301 * @cgrp: the cgroup the task is attaching to 1533 * @cgrp: the cgroup the task is attaching to
@@ -1312,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1312 struct css_set *cg; 1544 struct css_set *cg;
1313 struct css_set *newcg; 1545 struct css_set *newcg;
1314 struct cgroupfs_root *root = cgrp->root; 1546 struct cgroupfs_root *root = cgrp->root;
1315 int subsys_id;
1316
1317 get_first_subsys(cgrp, NULL, &subsys_id);
1318 1547
1319 /* Nothing to do if the task is already in that cgroup */ 1548 /* Nothing to do if the task is already in that cgroup */
1320 oldcgrp = task_cgroup(tsk, subsys_id); 1549 oldcgrp = task_cgroup_from_root(tsk, root);
1321 if (cgrp == oldcgrp) 1550 if (cgrp == oldcgrp)
1322 return 0; 1551 return 0;
1323 1552
1324 for_each_subsys(root, ss) { 1553 for_each_subsys(root, ss) {
1325 if (ss->can_attach) { 1554 if (ss->can_attach) {
1326 retval = ss->can_attach(ss, cgrp, tsk); 1555 retval = ss->can_attach(ss, cgrp, tsk, false);
1327 if (retval) 1556 if (retval)
1328 return retval; 1557 return retval;
1329 } 1558 }
@@ -1361,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1361 1590
1362 for_each_subsys(root, ss) { 1591 for_each_subsys(root, ss) {
1363 if (ss->attach) 1592 if (ss->attach)
1364 ss->attach(ss, cgrp, oldcgrp, tsk); 1593 ss->attach(ss, cgrp, oldcgrp, tsk, false);
1365 } 1594 }
1366 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1595 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1367 synchronize_rcu(); 1596 synchronize_rcu();
@@ -1422,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1422 return ret; 1651 return ret;
1423} 1652}
1424 1653
1425/* The various types of files and directories in a cgroup file system */
1426enum cgroup_filetype {
1427 FILE_ROOT,
1428 FILE_DIR,
1429 FILE_TASKLIST,
1430 FILE_NOTIFY_ON_RELEASE,
1431 FILE_RELEASE_AGENT,
1432};
1433
1434/** 1654/**
1435 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 1655 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1436 * @cgrp: the cgroup to be checked for liveness 1656 * @cgrp: the cgroup to be checked for liveness
@@ -1643,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1643 return single_release(inode, file); 1863 return single_release(inode, file);
1644} 1864}
1645 1865
1646static struct file_operations cgroup_seqfile_operations = { 1866static const struct file_operations cgroup_seqfile_operations = {
1647 .read = seq_read, 1867 .read = seq_read,
1648 .write = cgroup_file_write, 1868 .write = cgroup_file_write,
1649 .llseek = seq_lseek, 1869 .llseek = seq_lseek,
@@ -1702,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1702 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1922 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1703} 1923}
1704 1924
1705static struct file_operations cgroup_file_operations = { 1925static const struct file_operations cgroup_file_operations = {
1706 .read = cgroup_file_read, 1926 .read = cgroup_file_read,
1707 .write = cgroup_file_write, 1927 .write = cgroup_file_write,
1708 .llseek = generic_file_llseek, 1928 .llseek = generic_file_llseek,
@@ -1710,7 +1930,7 @@ static struct file_operations cgroup_file_operations = {
1710 .release = cgroup_file_release, 1930 .release = cgroup_file_release,
1711}; 1931};
1712 1932
1713static struct inode_operations cgroup_dir_inode_operations = { 1933static const struct inode_operations cgroup_dir_inode_operations = {
1714 .lookup = simple_lookup, 1934 .lookup = simple_lookup,
1715 .mkdir = cgroup_mkdir, 1935 .mkdir = cgroup_mkdir,
1716 .rmdir = cgroup_rmdir, 1936 .rmdir = cgroup_rmdir,
@@ -1875,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
1875 * the start of a css_set 2095 * the start of a css_set
1876 */ 2096 */
1877static void cgroup_advance_iter(struct cgroup *cgrp, 2097static void cgroup_advance_iter(struct cgroup *cgrp,
1878 struct cgroup_iter *it) 2098 struct cgroup_iter *it)
1879{ 2099{
1880 struct list_head *l = it->cg_link; 2100 struct list_head *l = it->cg_link;
1881 struct cg_cgroup_link *link; 2101 struct cg_cgroup_link *link;
@@ -2128,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2128} 2348}
2129 2349
2130/* 2350/*
2131 * Stuff for reading the 'tasks' file. 2351 * Stuff for reading the 'tasks'/'procs' files.
2132 * 2352 *
2133 * Reading this file can return large amounts of data if a cgroup has 2353 * Reading this file can return large amounts of data if a cgroup has
2134 * *lots* of attached tasks. So it may need several calls to read(), 2354 * *lots* of attached tasks. So it may need several calls to read(),
@@ -2138,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2138 */ 2358 */
2139 2359
2140/* 2360/*
2141 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2361 * The following two functions "fix" the issue where there are more pids
2142 * 'cgrp'. Return actual number of pids loaded. No need to 2362 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2143 * task_lock(p) when reading out p->cgroup, since we're in an RCU 2363 * TODO: replace with a kernel-wide solution to this problem
2144 * read section, so the css_set can't go away, and is 2364 */
2145 * immutable after creation. 2365#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2366static void *pidlist_allocate(int count)
2367{
2368 if (PIDLIST_TOO_LARGE(count))
2369 return vmalloc(count * sizeof(pid_t));
2370 else
2371 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2372}
2373static void pidlist_free(void *p)
2374{
2375 if (is_vmalloc_addr(p))
2376 vfree(p);
2377 else
2378 kfree(p);
2379}
2380static void *pidlist_resize(void *p, int newcount)
2381{
2382 void *newlist;
2383 /* note: if new alloc fails, old p will still be valid either way */
2384 if (is_vmalloc_addr(p)) {
2385 newlist = vmalloc(newcount * sizeof(pid_t));
2386 if (!newlist)
2387 return NULL;
2388 memcpy(newlist, p, newcount * sizeof(pid_t));
2389 vfree(p);
2390 } else {
2391 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
2392 }
2393 return newlist;
2394}
2395
2396/*
2397 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2398 * If the new stripped list is sufficiently smaller and there's enough memory
2399 * to allocate a new buffer, will let go of the unneeded memory. Returns the
2400 * number of unique elements.
2401 */
2402/* is the size difference enough that we should re-allocate the array? */
2403#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
2404static int pidlist_uniq(pid_t **p, int length)
2405{
2406 int src, dest = 1;
2407 pid_t *list = *p;
2408 pid_t *newlist;
2409
2410 /*
2411 * we presume the 0th element is unique, so i starts at 1. trivial
2412 * edge cases first; no work needs to be done for either
2413 */
2414 if (length == 0 || length == 1)
2415 return length;
2416 /* src and dest walk down the list; dest counts unique elements */
2417 for (src = 1; src < length; src++) {
2418 /* find next unique element */
2419 while (list[src] == list[src-1]) {
2420 src++;
2421 if (src == length)
2422 goto after;
2423 }
2424 /* dest always points to where the next unique element goes */
2425 list[dest] = list[src];
2426 dest++;
2427 }
2428after:
2429 /*
2430 * if the length difference is large enough, we want to allocate a
2431 * smaller buffer to save memory. if this fails due to out of memory,
2432 * we'll just stay with what we've got.
2433 */
2434 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
2435 newlist = pidlist_resize(list, dest);
2436 if (newlist)
2437 *p = newlist;
2438 }
2439 return dest;
2440}
2441
2442static int cmppid(const void *a, const void *b)
2443{
2444 return *(pid_t *)a - *(pid_t *)b;
2445}
2446
2447/*
2448 * find the appropriate pidlist for our purpose (given procs vs tasks)
2449 * returns with the lock on that pidlist already held, and takes care
2450 * of the use count, or returns NULL with no locks held if we're out of
2451 * memory.
2146 */ 2452 */
2147static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2453static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2454 enum cgroup_filetype type)
2148{ 2455{
2149 int n = 0, pid; 2456 struct cgroup_pidlist *l;
2457 /* don't need task_nsproxy() if we're looking at ourself */
2458 struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns);
2459 /*
2460 * We can't drop the pidlist_mutex before taking the l->mutex in case
2461 * the last ref-holder is trying to remove l from the list at the same
2462 * time. Holding the pidlist_mutex precludes somebody taking whichever
2463 * list we find out from under us - compare release_pid_array().
2464 */
2465 mutex_lock(&cgrp->pidlist_mutex);
2466 list_for_each_entry(l, &cgrp->pidlists, links) {
2467 if (l->key.type == type && l->key.ns == ns) {
2468 /* found a matching list - drop the extra refcount */
2469 put_pid_ns(ns);
2470 /* make sure l doesn't vanish out from under us */
2471 down_write(&l->mutex);
2472 mutex_unlock(&cgrp->pidlist_mutex);
2473 l->use_count++;
2474 return l;
2475 }
2476 }
2477 /* entry not found; create a new one */
2478 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
2479 if (!l) {
2480 mutex_unlock(&cgrp->pidlist_mutex);
2481 put_pid_ns(ns);
2482 return l;
2483 }
2484 init_rwsem(&l->mutex);
2485 down_write(&l->mutex);
2486 l->key.type = type;
2487 l->key.ns = ns;
2488 l->use_count = 0; /* don't increment here */
2489 l->list = NULL;
2490 l->owner = cgrp;
2491 list_add(&l->links, &cgrp->pidlists);
2492 mutex_unlock(&cgrp->pidlist_mutex);
2493 return l;
2494}
2495
2496/*
2497 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
2498 */
2499static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
2500 struct cgroup_pidlist **lp)
2501{
2502 pid_t *array;
2503 int length;
2504 int pid, n = 0; /* used for populating the array */
2150 struct cgroup_iter it; 2505 struct cgroup_iter it;
2151 struct task_struct *tsk; 2506 struct task_struct *tsk;
2507 struct cgroup_pidlist *l;
2508
2509 /*
2510 * If cgroup gets more users after we read count, we won't have
2511 * enough space - tough. This race is indistinguishable to the
2512 * caller from the case that the additional cgroup users didn't
2513 * show up until sometime later on.
2514 */
2515 length = cgroup_task_count(cgrp);
2516 array = pidlist_allocate(length);
2517 if (!array)
2518 return -ENOMEM;
2519 /* now, populate the array */
2152 cgroup_iter_start(cgrp, &it); 2520 cgroup_iter_start(cgrp, &it);
2153 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2521 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2154 if (unlikely(n == npids)) 2522 if (unlikely(n == length))
2155 break; 2523 break;
2156 pid = task_pid_vnr(tsk); 2524 /* get tgid or pid for procs or tasks file respectively */
2157 if (pid > 0) 2525 if (type == CGROUP_FILE_PROCS)
2158 pidarray[n++] = pid; 2526 pid = task_tgid_vnr(tsk);
2527 else
2528 pid = task_pid_vnr(tsk);
2529 if (pid > 0) /* make sure to only use valid results */
2530 array[n++] = pid;
2159 } 2531 }
2160 cgroup_iter_end(cgrp, &it); 2532 cgroup_iter_end(cgrp, &it);
2161 return n; 2533 length = n;
2534 /* now sort & (if procs) strip out duplicates */
2535 sort(array, length, sizeof(pid_t), cmppid, NULL);
2536 if (type == CGROUP_FILE_PROCS)
2537 length = pidlist_uniq(&array, length);
2538 l = cgroup_pidlist_find(cgrp, type);
2539 if (!l) {
2540 pidlist_free(array);
2541 return -ENOMEM;
2542 }
2543 /* store array, freeing old if necessary - lock already held */
2544 pidlist_free(l->list);
2545 l->list = array;
2546 l->length = length;
2547 l->use_count++;
2548 up_write(&l->mutex);
2549 *lp = l;
2550 return 0;
2162} 2551}
2163 2552
2164/** 2553/**
@@ -2215,37 +2604,14 @@ err:
2215 return ret; 2604 return ret;
2216} 2605}
2217 2606
2218/*
2219 * Cache pids for all threads in the same pid namespace that are
2220 * opening the same "tasks" file.
2221 */
2222struct cgroup_pids {
2223 /* The node in cgrp->pids_list */
2224 struct list_head list;
2225 /* The cgroup those pids belong to */
2226 struct cgroup *cgrp;
2227 /* The namepsace those pids belong to */
2228 struct pid_namespace *ns;
2229 /* Array of process ids in the cgroup */
2230 pid_t *tasks_pids;
2231 /* How many files are using the this tasks_pids array */
2232 int use_count;
2233 /* Length of the current tasks_pids array */
2234 int length;
2235};
2236
2237static int cmppid(const void *a, const void *b)
2238{
2239 return *(pid_t *)a - *(pid_t *)b;
2240}
2241 2607
2242/* 2608/*
2243 * seq_file methods for the "tasks" file. The seq_file position is the 2609 * seq_file methods for the tasks/procs files. The seq_file position is the
2244 * next pid to display; the seq_file iterator is a pointer to the pid 2610 * next pid to display; the seq_file iterator is a pointer to the pid
2245 * in the cgroup->tasks_pids array. 2611 * in the cgroup->l->list array.
2246 */ 2612 */
2247 2613
2248static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) 2614static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
2249{ 2615{
2250 /* 2616 /*
2251 * Initially we receive a position value that corresponds to 2617 * Initially we receive a position value that corresponds to
@@ -2253,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2253 * after a seek to the start). Use a binary-search to find the 2619 * after a seek to the start). Use a binary-search to find the
2254 * next pid to display, if any 2620 * next pid to display, if any
2255 */ 2621 */
2256 struct cgroup_pids *cp = s->private; 2622 struct cgroup_pidlist *l = s->private;
2257 struct cgroup *cgrp = cp->cgrp;
2258 int index = 0, pid = *pos; 2623 int index = 0, pid = *pos;
2259 int *iter; 2624 int *iter;
2260 2625
2261 down_read(&cgrp->pids_mutex); 2626 down_read(&l->mutex);
2262 if (pid) { 2627 if (pid) {
2263 int end = cp->length; 2628 int end = l->length;
2264 2629
2265 while (index < end) { 2630 while (index < end) {
2266 int mid = (index + end) / 2; 2631 int mid = (index + end) / 2;
2267 if (cp->tasks_pids[mid] == pid) { 2632 if (l->list[mid] == pid) {
2268 index = mid; 2633 index = mid;
2269 break; 2634 break;
2270 } else if (cp->tasks_pids[mid] <= pid) 2635 } else if (l->list[mid] <= pid)
2271 index = mid + 1; 2636 index = mid + 1;
2272 else 2637 else
2273 end = mid; 2638 end = mid;
2274 } 2639 }
2275 } 2640 }
2276 /* If we're off the end of the array, we're done */ 2641 /* If we're off the end of the array, we're done */
2277 if (index >= cp->length) 2642 if (index >= l->length)
2278 return NULL; 2643 return NULL;
2279 /* Update the abstract position to be the actual pid that we found */ 2644 /* Update the abstract position to be the actual pid that we found */
2280 iter = cp->tasks_pids + index; 2645 iter = l->list + index;
2281 *pos = *iter; 2646 *pos = *iter;
2282 return iter; 2647 return iter;
2283} 2648}
2284 2649
2285static void cgroup_tasks_stop(struct seq_file *s, void *v) 2650static void cgroup_pidlist_stop(struct seq_file *s, void *v)
2286{ 2651{
2287 struct cgroup_pids *cp = s->private; 2652 struct cgroup_pidlist *l = s->private;
2288 struct cgroup *cgrp = cp->cgrp; 2653 up_read(&l->mutex);
2289 up_read(&cgrp->pids_mutex);
2290} 2654}
2291 2655
2292static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2656static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
2293{ 2657{
2294 struct cgroup_pids *cp = s->private; 2658 struct cgroup_pidlist *l = s->private;
2295 int *p = v; 2659 pid_t *p = v;
2296 int *end = cp->tasks_pids + cp->length; 2660 pid_t *end = l->list + l->length;
2297
2298 /* 2661 /*
2299 * Advance to the next pid in the array. If this goes off the 2662 * Advance to the next pid in the array. If this goes off the
2300 * end, we're done 2663 * end, we're done
@@ -2308,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2308 } 2671 }
2309} 2672}
2310 2673
2311static int cgroup_tasks_show(struct seq_file *s, void *v) 2674static int cgroup_pidlist_show(struct seq_file *s, void *v)
2312{ 2675{
2313 return seq_printf(s, "%d\n", *(int *)v); 2676 return seq_printf(s, "%d\n", *(int *)v);
2314} 2677}
2315 2678
2316static struct seq_operations cgroup_tasks_seq_operations = { 2679/*
2317 .start = cgroup_tasks_start, 2680 * seq_operations functions for iterating on pidlists through seq_file -
2318 .stop = cgroup_tasks_stop, 2681 * independent of whether it's tasks or procs
2319 .next = cgroup_tasks_next, 2682 */
2320 .show = cgroup_tasks_show, 2683static const struct seq_operations cgroup_pidlist_seq_operations = {
2684 .start = cgroup_pidlist_start,
2685 .stop = cgroup_pidlist_stop,
2686 .next = cgroup_pidlist_next,
2687 .show = cgroup_pidlist_show,
2321}; 2688};
2322 2689
2323static void release_cgroup_pid_array(struct cgroup_pids *cp) 2690static void cgroup_release_pid_array(struct cgroup_pidlist *l)
2324{ 2691{
2325 struct cgroup *cgrp = cp->cgrp; 2692 /*
2326 2693 * the case where we're the last user of this particular pidlist will
2327 down_write(&cgrp->pids_mutex); 2694 * have us remove it from the cgroup's list, which entails taking the
2328 BUG_ON(!cp->use_count); 2695 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
2329 if (!--cp->use_count) { 2696 * pidlist_mutex, we have to take pidlist_mutex first.
2330 list_del(&cp->list); 2697 */
2331 put_pid_ns(cp->ns); 2698 mutex_lock(&l->owner->pidlist_mutex);
2332 kfree(cp->tasks_pids); 2699 down_write(&l->mutex);
2333 kfree(cp); 2700 BUG_ON(!l->use_count);
2701 if (!--l->use_count) {
2702 /* we're the last user if refcount is 0; remove and free */
2703 list_del(&l->links);
2704 mutex_unlock(&l->owner->pidlist_mutex);
2705 pidlist_free(l->list);
2706 put_pid_ns(l->key.ns);
2707 up_write(&l->mutex);
2708 kfree(l);
2709 return;
2334 } 2710 }
2335 up_write(&cgrp->pids_mutex); 2711 mutex_unlock(&l->owner->pidlist_mutex);
2712 up_write(&l->mutex);
2336} 2713}
2337 2714
2338static int cgroup_tasks_release(struct inode *inode, struct file *file) 2715static int cgroup_pidlist_release(struct inode *inode, struct file *file)
2339{ 2716{
2340 struct seq_file *seq; 2717 struct cgroup_pidlist *l;
2341 struct cgroup_pids *cp;
2342
2343 if (!(file->f_mode & FMODE_READ)) 2718 if (!(file->f_mode & FMODE_READ))
2344 return 0; 2719 return 0;
2345 2720 /*
2346 seq = file->private_data; 2721 * the seq_file will only be initialized if the file was opened for
2347 cp = seq->private; 2722 * reading; hence we check if it's not null only in that case.
2348 2723 */
2349 release_cgroup_pid_array(cp); 2724 l = ((struct seq_file *)file->private_data)->private;
2725 cgroup_release_pid_array(l);
2350 return seq_release(inode, file); 2726 return seq_release(inode, file);
2351} 2727}
2352 2728
2353static struct file_operations cgroup_tasks_operations = { 2729static const struct file_operations cgroup_pidlist_operations = {
2354 .read = seq_read, 2730 .read = seq_read,
2355 .llseek = seq_lseek, 2731 .llseek = seq_lseek,
2356 .write = cgroup_file_write, 2732 .write = cgroup_file_write,
2357 .release = cgroup_tasks_release, 2733 .release = cgroup_pidlist_release,
2358}; 2734};
2359 2735
2360/* 2736/*
2361 * Handle an open on 'tasks' file. Prepare an array containing the 2737 * The following functions handle opens on a file that displays a pidlist
2362 * process id's of tasks currently attached to the cgroup being opened. 2738 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
2739 * in the cgroup.
2363 */ 2740 */
2364 2741/* helper function for the two below it */
2365static int cgroup_tasks_open(struct inode *unused, struct file *file) 2742static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
2366{ 2743{
2367 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2744 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2368 struct pid_namespace *ns = current->nsproxy->pid_ns; 2745 struct cgroup_pidlist *l;
2369 struct cgroup_pids *cp;
2370 pid_t *pidarray;
2371 int npids;
2372 int retval; 2746 int retval;
2373 2747
2374 /* Nothing to do for write-only files */ 2748 /* Nothing to do for write-only files */
2375 if (!(file->f_mode & FMODE_READ)) 2749 if (!(file->f_mode & FMODE_READ))
2376 return 0; 2750 return 0;
2377 2751
2378 /* 2752 /* have the array populated */
2379 * If cgroup gets more users after we read count, we won't have 2753 retval = pidlist_array_load(cgrp, type, &l);
2380 * enough space - tough. This race is indistinguishable to the 2754 if (retval)
2381 * caller from the case that the additional cgroup users didn't 2755 return retval;
2382 * show up until sometime later on. 2756 /* configure file information */
2383 */ 2757 file->f_op = &cgroup_pidlist_operations;
2384 npids = cgroup_task_count(cgrp);
2385 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2386 if (!pidarray)
2387 return -ENOMEM;
2388 npids = pid_array_load(pidarray, npids, cgrp);
2389 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2390
2391 /*
2392 * Store the array in the cgroup, freeing the old
2393 * array if necessary
2394 */
2395 down_write(&cgrp->pids_mutex);
2396
2397 list_for_each_entry(cp, &cgrp->pids_list, list) {
2398 if (ns == cp->ns)
2399 goto found;
2400 }
2401
2402 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2403 if (!cp) {
2404 up_write(&cgrp->pids_mutex);
2405 kfree(pidarray);
2406 return -ENOMEM;
2407 }
2408 cp->cgrp = cgrp;
2409 cp->ns = ns;
2410 get_pid_ns(ns);
2411 list_add(&cp->list, &cgrp->pids_list);
2412found:
2413 kfree(cp->tasks_pids);
2414 cp->tasks_pids = pidarray;
2415 cp->length = npids;
2416 cp->use_count++;
2417 up_write(&cgrp->pids_mutex);
2418
2419 file->f_op = &cgroup_tasks_operations;
2420 2758
2421 retval = seq_open(file, &cgroup_tasks_seq_operations); 2759 retval = seq_open(file, &cgroup_pidlist_seq_operations);
2422 if (retval) { 2760 if (retval) {
2423 release_cgroup_pid_array(cp); 2761 cgroup_release_pid_array(l);
2424 return retval; 2762 return retval;
2425 } 2763 }
2426 ((struct seq_file *)file->private_data)->private = cp; 2764 ((struct seq_file *)file->private_data)->private = l;
2427 return 0; 2765 return 0;
2428} 2766}
2767static int cgroup_tasks_open(struct inode *unused, struct file *file)
2768{
2769 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
2770}
2771static int cgroup_procs_open(struct inode *unused, struct file *file)
2772{
2773 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
2774}
2429 2775
2430static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2776static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2431 struct cftype *cft) 2777 struct cftype *cft)
@@ -2448,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2448/* 2794/*
2449 * for the common functions, 'private' gives the type of file 2795 * for the common functions, 'private' gives the type of file
2450 */ 2796 */
2797/* for hysterical raisins, we can't put this on the older files */
2798#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
2451static struct cftype files[] = { 2799static struct cftype files[] = {
2452 { 2800 {
2453 .name = "tasks", 2801 .name = "tasks",
2454 .open = cgroup_tasks_open, 2802 .open = cgroup_tasks_open,
2455 .write_u64 = cgroup_tasks_write, 2803 .write_u64 = cgroup_tasks_write,
2456 .release = cgroup_tasks_release, 2804 .release = cgroup_pidlist_release,
2457 .private = FILE_TASKLIST,
2458 .mode = S_IRUGO | S_IWUSR, 2805 .mode = S_IRUGO | S_IWUSR,
2459 }, 2806 },
2460 2807 {
2808 .name = CGROUP_FILE_GENERIC_PREFIX "procs",
2809 .open = cgroup_procs_open,
2810 /* .write_u64 = cgroup_procs_write, TODO */
2811 .release = cgroup_pidlist_release,
2812 .mode = S_IRUGO,
2813 },
2461 { 2814 {
2462 .name = "notify_on_release", 2815 .name = "notify_on_release",
2463 .read_u64 = cgroup_read_notify_on_release, 2816 .read_u64 = cgroup_read_notify_on_release,
2464 .write_u64 = cgroup_write_notify_on_release, 2817 .write_u64 = cgroup_write_notify_on_release,
2465 .private = FILE_NOTIFY_ON_RELEASE,
2466 }, 2818 },
2467}; 2819};
2468 2820
@@ -2471,7 +2823,6 @@ static struct cftype cft_release_agent = {
2471 .read_seq_string = cgroup_release_agent_show, 2823 .read_seq_string = cgroup_release_agent_show,
2472 .write_string = cgroup_release_agent_write, 2824 .write_string = cgroup_release_agent_write,
2473 .max_write_len = PATH_MAX, 2825 .max_write_len = PATH_MAX,
2474 .private = FILE_RELEASE_AGENT,
2475}; 2826};
2476 2827
2477static int cgroup_populate_dir(struct cgroup *cgrp) 2828static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -2878,6 +3229,7 @@ int __init cgroup_init_early(void)
2878 init_task.cgroups = &init_css_set; 3229 init_task.cgroups = &init_css_set;
2879 3230
2880 init_css_set_link.cg = &init_css_set; 3231 init_css_set_link.cg = &init_css_set;
3232 init_css_set_link.cgrp = dummytop;
2881 list_add(&init_css_set_link.cgrp_link_list, 3233 list_add(&init_css_set_link.cgrp_link_list,
2882 &rootnode.top_cgroup.css_sets); 3234 &rootnode.top_cgroup.css_sets);
2883 list_add(&init_css_set_link.cg_link_list, 3235 list_add(&init_css_set_link.cg_link_list,
@@ -2932,7 +3284,7 @@ int __init cgroup_init(void)
2932 /* Add init_css_set to the hash table */ 3284 /* Add init_css_set to the hash table */
2933 hhead = css_set_hash(init_css_set.subsys); 3285 hhead = css_set_hash(init_css_set.subsys);
2934 hlist_add_head(&init_css_set.hlist, hhead); 3286 hlist_add_head(&init_css_set.hlist, hhead);
2935 3287 BUG_ON(!init_root_id(&rootnode));
2936 err = register_filesystem(&cgroup_fs_type); 3288 err = register_filesystem(&cgroup_fs_type);
2937 if (err < 0) 3289 if (err < 0)
2938 goto out; 3290 goto out;
@@ -2985,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
2985 for_each_active_root(root) { 3337 for_each_active_root(root) {
2986 struct cgroup_subsys *ss; 3338 struct cgroup_subsys *ss;
2987 struct cgroup *cgrp; 3339 struct cgroup *cgrp;
2988 int subsys_id;
2989 int count = 0; 3340 int count = 0;
2990 3341
2991 seq_printf(m, "%lu:", root->subsys_bits); 3342 seq_printf(m, "%d:", root->hierarchy_id);
2992 for_each_subsys(root, ss) 3343 for_each_subsys(root, ss)
2993 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 3344 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
3345 if (strlen(root->name))
3346 seq_printf(m, "%sname=%s", count ? "," : "",
3347 root->name);
2994 seq_putc(m, ':'); 3348 seq_putc(m, ':');
2995 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 3349 cgrp = task_cgroup_from_root(tsk, root);
2996 cgrp = task_cgroup(tsk, subsys_id);
2997 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 3350 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2998 if (retval < 0) 3351 if (retval < 0)
2999 goto out_unlock; 3352 goto out_unlock;
@@ -3016,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
3016 return single_open(file, proc_cgroup_show, pid); 3369 return single_open(file, proc_cgroup_show, pid);
3017} 3370}
3018 3371
3019struct file_operations proc_cgroup_operations = { 3372const struct file_operations proc_cgroup_operations = {
3020 .open = cgroup_open, 3373 .open = cgroup_open,
3021 .read = seq_read, 3374 .read = seq_read,
3022 .llseek = seq_lseek, 3375 .llseek = seq_lseek,
@@ -3032,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
3032 mutex_lock(&cgroup_mutex); 3385 mutex_lock(&cgroup_mutex);
3033 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3386 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3034 struct cgroup_subsys *ss = subsys[i]; 3387 struct cgroup_subsys *ss = subsys[i];
3035 seq_printf(m, "%s\t%lu\t%d\t%d\n", 3388 seq_printf(m, "%s\t%d\t%d\t%d\n",
3036 ss->name, ss->root->subsys_bits, 3389 ss->name, ss->root->hierarchy_id,
3037 ss->root->number_of_cgroups, !ss->disabled); 3390 ss->root->number_of_cgroups, !ss->disabled);
3038 } 3391 }
3039 mutex_unlock(&cgroup_mutex); 3392 mutex_unlock(&cgroup_mutex);
@@ -3045,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
3045 return single_open(file, proc_cgroupstats_show, NULL); 3398 return single_open(file, proc_cgroupstats_show, NULL);
3046} 3399}
3047 3400
3048static struct file_operations proc_cgroupstats_operations = { 3401static const struct file_operations proc_cgroupstats_operations = {
3049 .open = cgroupstats_open, 3402 .open = cgroupstats_open,
3050 .read = seq_read, 3403 .read = seq_read,
3051 .llseek = seq_lseek, 3404 .llseek = seq_lseek,
@@ -3319,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
3319{ 3672{
3320 int ret; 3673 int ret;
3321 struct cgroup *target; 3674 struct cgroup *target;
3322 int subsys_id;
3323 3675
3324 if (cgrp == dummytop) 3676 if (cgrp == dummytop)
3325 return 1; 3677 return 1;
3326 3678
3327 get_first_subsys(cgrp, NULL, &subsys_id); 3679 target = task_cgroup_from_root(task, cgrp->root);
3328 target = task_cgroup(task, subsys_id);
3329 while (cgrp != target && cgrp!= cgrp->top_cgroup) 3680 while (cgrp != target && cgrp!= cgrp->top_cgroup)
3330 cgrp = cgrp->parent; 3681 cgrp = cgrp->parent;
3331 ret = (cgrp == target); 3682 ret = (cgrp == target);
@@ -3357,8 +3708,10 @@ static void check_for_release(struct cgroup *cgrp)
3357void __css_put(struct cgroup_subsys_state *css) 3708void __css_put(struct cgroup_subsys_state *css)
3358{ 3709{
3359 struct cgroup *cgrp = css->cgroup; 3710 struct cgroup *cgrp = css->cgroup;
3711 int val;
3360 rcu_read_lock(); 3712 rcu_read_lock();
3361 if (atomic_dec_return(&css->refcnt) == 1) { 3713 val = atomic_dec_return(&css->refcnt);
3714 if (val == 1) {
3362 if (notify_on_release(cgrp)) { 3715 if (notify_on_release(cgrp)) {
3363 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3716 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3364 check_for_release(cgrp); 3717 check_for_release(cgrp);
@@ -3366,6 +3719,7 @@ void __css_put(struct cgroup_subsys_state *css)
3366 cgroup_wakeup_rmdir_waiter(cgrp); 3719 cgroup_wakeup_rmdir_waiter(cgrp);
3367 } 3720 }
3368 rcu_read_unlock(); 3721 rcu_read_unlock();
3722 WARN_ON_ONCE(val < 1);
3369} 3723}
3370 3724
3371/* 3725/*
@@ -3692,3 +4046,154 @@ css_get_next(struct cgroup_subsys *ss, int id,
3692 return ret; 4046 return ret;
3693} 4047}
3694 4048
4049#ifdef CONFIG_CGROUP_DEBUG
4050static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
4051 struct cgroup *cont)
4052{
4053 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4054
4055 if (!css)
4056 return ERR_PTR(-ENOMEM);
4057
4058 return css;
4059}
4060
4061static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
4062{
4063 kfree(cont->subsys[debug_subsys_id]);
4064}
4065
4066static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
4067{
4068 return atomic_read(&cont->count);
4069}
4070
4071static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
4072{
4073 return cgroup_task_count(cont);
4074}
4075
4076static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
4077{
4078 return (u64)(unsigned long)current->cgroups;
4079}
4080
4081static u64 current_css_set_refcount_read(struct cgroup *cont,
4082 struct cftype *cft)
4083{
4084 u64 count;
4085
4086 rcu_read_lock();
4087 count = atomic_read(&current->cgroups->refcount);
4088 rcu_read_unlock();
4089 return count;
4090}
4091
4092static int current_css_set_cg_links_read(struct cgroup *cont,
4093 struct cftype *cft,
4094 struct seq_file *seq)
4095{
4096 struct cg_cgroup_link *link;
4097 struct css_set *cg;
4098
4099 read_lock(&css_set_lock);
4100 rcu_read_lock();
4101 cg = rcu_dereference(current->cgroups);
4102 list_for_each_entry(link, &cg->cg_links, cg_link_list) {
4103 struct cgroup *c = link->cgrp;
4104 const char *name;
4105
4106 if (c->dentry)
4107 name = c->dentry->d_name.name;
4108 else
4109 name = "?";
4110 seq_printf(seq, "Root %d group %s\n",
4111 c->root->hierarchy_id, name);
4112 }
4113 rcu_read_unlock();
4114 read_unlock(&css_set_lock);
4115 return 0;
4116}
4117
4118#define MAX_TASKS_SHOWN_PER_CSS 25
4119static int cgroup_css_links_read(struct cgroup *cont,
4120 struct cftype *cft,
4121 struct seq_file *seq)
4122{
4123 struct cg_cgroup_link *link;
4124
4125 read_lock(&css_set_lock);
4126 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
4127 struct css_set *cg = link->cg;
4128 struct task_struct *task;
4129 int count = 0;
4130 seq_printf(seq, "css_set %p\n", cg);
4131 list_for_each_entry(task, &cg->tasks, cg_list) {
4132 if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
4133 seq_puts(seq, " ...\n");
4134 break;
4135 } else {
4136 seq_printf(seq, " task %d\n",
4137 task_pid_vnr(task));
4138 }
4139 }
4140 }
4141 read_unlock(&css_set_lock);
4142 return 0;
4143}
4144
4145static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
4146{
4147 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
4148}
4149
4150static struct cftype debug_files[] = {
4151 {
4152 .name = "cgroup_refcount",
4153 .read_u64 = cgroup_refcount_read,
4154 },
4155 {
4156 .name = "taskcount",
4157 .read_u64 = debug_taskcount_read,
4158 },
4159
4160 {
4161 .name = "current_css_set",
4162 .read_u64 = current_css_set_read,
4163 },
4164
4165 {
4166 .name = "current_css_set_refcount",
4167 .read_u64 = current_css_set_refcount_read,
4168 },
4169
4170 {
4171 .name = "current_css_set_cg_links",
4172 .read_seq_string = current_css_set_cg_links_read,
4173 },
4174
4175 {
4176 .name = "cgroup_css_links",
4177 .read_seq_string = cgroup_css_links_read,
4178 },
4179
4180 {
4181 .name = "releasable",
4182 .read_u64 = releasable_read,
4183 },
4184};
4185
4186static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
4187{
4188 return cgroup_add_files(cont, ss, debug_files,
4189 ARRAY_SIZE(debug_files));
4190}
4191
4192struct cgroup_subsys debug_subsys = {
4193 .name = "debug",
4194 .create = debug_create,
4195 .destroy = debug_destroy,
4196 .populate = debug_populate,
4197 .subsys_id = debug_subsys_id,
4198};
4199#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
deleted file mode 100644
index 0c92d797baa6..000000000000
--- a/kernel/cgroup_debug.c
+++ /dev/null
@@ -1,105 +0,0 @@
1/*
2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info
4 *
5 * Copyright (C) Google Inc, 2007
6 *
7 * Developed by Paul Menage (menage@google.com)
8 *
9 */
10
11#include <linux/cgroup.h>
12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/rcupdate.h>
15
16#include <asm/atomic.h>
17
18static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont)
20{
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22
23 if (!css)
24 return ERR_PTR(-ENOMEM);
25
26 return css;
27}
28
29static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30{
31 kfree(cont->subsys[debug_subsys_id]);
32}
33
34static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35{
36 return atomic_read(&cont->count);
37}
38
39static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40{
41 u64 count;
42
43 count = cgroup_task_count(cont);
44 return count;
45}
46
47static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
48{
49 return (u64)(long)current->cgroups;
50}
51
52static u64 current_css_set_refcount_read(struct cgroup *cont,
53 struct cftype *cft)
54{
55 u64 count;
56
57 rcu_read_lock();
58 count = atomic_read(&current->cgroups->refcount);
59 rcu_read_unlock();
60 return count;
61}
62
63static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
64{
65 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
66}
67
68static struct cftype files[] = {
69 {
70 .name = "cgroup_refcount",
71 .read_u64 = cgroup_refcount_read,
72 },
73 {
74 .name = "taskcount",
75 .read_u64 = taskcount_read,
76 },
77
78 {
79 .name = "current_css_set",
80 .read_u64 = current_css_set_read,
81 },
82
83 {
84 .name = "current_css_set_refcount",
85 .read_u64 = current_css_set_refcount_read,
86 },
87
88 {
89 .name = "releasable",
90 .read_u64 = releasable_read,
91 },
92};
93
94static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
95{
96 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
97}
98
99struct cgroup_subsys debug_subsys = {
100 .name = "debug",
101 .create = debug_create,
102 .destroy = debug_destroy,
103 .populate = debug_populate,
104 .subsys_id = debug_subsys_id,
105};
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fb249e2bcada..59e9ef6aab40 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
159 */ 159 */
160static int freezer_can_attach(struct cgroup_subsys *ss, 160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup, 161 struct cgroup *new_cgroup,
162 struct task_struct *task) 162 struct task_struct *task, bool threadgroup)
163{ 163{
164 struct freezer *freezer; 164 struct freezer *freezer;
165 165
@@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
177 if (freezer->state == CGROUP_FROZEN) 177 if (freezer->state == CGROUP_FROZEN)
178 return -EBUSY; 178 return -EBUSY;
179 179
180 if (threadgroup) {
181 struct task_struct *c;
182
183 rcu_read_lock();
184 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
185 if (is_task_frozen_enough(c)) {
186 rcu_read_unlock();
187 return -EBUSY;
188 }
189 }
190 rcu_read_unlock();
191 }
192
180 return 0; 193 return 0;
181} 194}
182 195
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
401 break; 401 break;
402 } 402 }
403 } 403 }
404
404 if (!error) { 405 if (!error) {
405 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
406 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
413 return error; 414 return error;
414} 415}
415 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
416void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
417{ 426{
418 int cpu, error; 427 int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
424 goto out; 433 goto out;
425 434
426 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
427 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
428 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
429 if (!error) { 441 if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
432 } 444 }
433 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
434 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
435 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
436out: 451out:
437 cpu_maps_update_done(); 452 cpu_maps_update_done();
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7e75a41bd508..b5cb469d2545 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp)
1324static cpumask_var_t cpus_attach; 1324static cpumask_var_t cpus_attach;
1325 1325
1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1326/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1327static int cpuset_can_attach(struct cgroup_subsys *ss, 1327static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1328 struct cgroup *cont, struct task_struct *tsk) 1328 struct task_struct *tsk, bool threadgroup)
1329{ 1329{
1330 int ret;
1330 struct cpuset *cs = cgroup_cs(cont); 1331 struct cpuset *cs = cgroup_cs(cont);
1331 1332
1332 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1333 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1343 if (tsk->flags & PF_THREAD_BOUND) 1344 if (tsk->flags & PF_THREAD_BOUND)
1344 return -EINVAL; 1345 return -EINVAL;
1345 1346
1346 return security_task_setscheduler(tsk, 0, NULL); 1347 ret = security_task_setscheduler(tsk, 0, NULL);
1348 if (ret)
1349 return ret;
1350 if (threadgroup) {
1351 struct task_struct *c;
1352
1353 rcu_read_lock();
1354 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1355 ret = security_task_setscheduler(c, 0, NULL);
1356 if (ret) {
1357 rcu_read_unlock();
1358 return ret;
1359 }
1360 }
1361 rcu_read_unlock();
1362 }
1363 return 0;
1364}
1365
1366static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1367 struct cpuset *cs)
1368{
1369 int err;
1370 /*
1371 * can_attach beforehand should guarantee that this doesn't fail.
1372 * TODO: have a better way to handle failure here
1373 */
1374 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1375 WARN_ON_ONCE(err);
1376
1377 task_lock(tsk);
1378 cpuset_change_task_nodemask(tsk, to);
1379 task_unlock(tsk);
1380 cpuset_update_task_spread_flag(cs, tsk);
1381
1347} 1382}
1348 1383
1349static void cpuset_attach(struct cgroup_subsys *ss, 1384static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1350 struct cgroup *cont, struct cgroup *oldcont, 1385 struct cgroup *oldcont, struct task_struct *tsk,
1351 struct task_struct *tsk) 1386 bool threadgroup)
1352{ 1387{
1353 nodemask_t from, to; 1388 nodemask_t from, to;
1354 struct mm_struct *mm; 1389 struct mm_struct *mm;
1355 struct cpuset *cs = cgroup_cs(cont); 1390 struct cpuset *cs = cgroup_cs(cont);
1356 struct cpuset *oldcs = cgroup_cs(oldcont); 1391 struct cpuset *oldcs = cgroup_cs(oldcont);
1357 int err;
1358 1392
1359 if (cs == &top_cpuset) { 1393 if (cs == &top_cpuset) {
1360 cpumask_copy(cpus_attach, cpu_possible_mask); 1394 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1363 guarantee_online_cpus(cs, cpus_attach); 1397 guarantee_online_cpus(cs, cpus_attach);
1364 guarantee_online_mems(cs, &to); 1398 guarantee_online_mems(cs, &to);
1365 } 1399 }
1366 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1367 if (err)
1368 return;
1369 1400
1370 task_lock(tsk); 1401 /* do per-task migration stuff possibly for each in the threadgroup */
1371 cpuset_change_task_nodemask(tsk, &to); 1402 cpuset_attach_task(tsk, &to, cs);
1372 task_unlock(tsk); 1403 if (threadgroup) {
1373 cpuset_update_task_spread_flag(cs, tsk); 1404 struct task_struct *c;
1405 rcu_read_lock();
1406 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1407 cpuset_attach_task(c, &to, cs);
1408 }
1409 rcu_read_unlock();
1410 }
1374 1411
1412 /* change mm; only needs to be done once even if threadgroup */
1375 from = oldcs->mems_allowed; 1413 from = oldcs->mems_allowed;
1376 to = cs->mems_allowed; 1414 to = cs->mems_allowed;
1377 mm = get_task_mm(tsk); 1415 mm = get_task_mm(tsk);
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..dd76cfe5f5b0 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
18#include <linux/cn_proc.h> 18#include <linux/cn_proc.h>
19#include "cred-internals.h" 19#include "cred-internals.h"
20 20
21#if 0
22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif
32
21static struct kmem_cache *cred_jar; 33static struct kmem_cache *cred_jar;
22 34
23/* 35/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
36 */ 48 */
37struct cred init_cred = { 49struct cred init_cred = {
38 .usage = ATOMIC_INIT(4), 50 .usage = ATOMIC_INIT(4),
51#ifdef CONFIG_DEBUG_CREDENTIALS
52 .subscribers = ATOMIC_INIT(2),
53 .magic = CRED_MAGIC,
54#endif
39 .securebits = SECUREBITS_DEFAULT, 55 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET, 56 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET, 57 .cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
48#endif 64#endif
49}; 65};
50 66
67static inline void set_cred_subscribers(struct cred *cred, int n)
68{
69#ifdef CONFIG_DEBUG_CREDENTIALS
70 atomic_set(&cred->subscribers, n);
71#endif
72}
73
74static inline int read_cred_subscribers(const struct cred *cred)
75{
76#ifdef CONFIG_DEBUG_CREDENTIALS
77 return atomic_read(&cred->subscribers);
78#else
79 return 0;
80#endif
81}
82
83static inline void alter_cred_subscribers(const struct cred *_cred, int n)
84{
85#ifdef CONFIG_DEBUG_CREDENTIALS
86 struct cred *cred = (struct cred *) _cred;
87
88 atomic_add(n, &cred->subscribers);
89#endif
90}
91
51/* 92/*
52 * Dispose of the shared task group credentials 93 * Dispose of the shared task group credentials
53 */ 94 */
@@ -85,15 +126,29 @@ static void put_cred_rcu(struct rcu_head *rcu)
85{ 126{
86 struct cred *cred = container_of(rcu, struct cred, rcu); 127 struct cred *cred = container_of(rcu, struct cred, rcu);
87 128
129 kdebug("put_cred_rcu(%p)", cred);
130
131#ifdef CONFIG_DEBUG_CREDENTIALS
132 if (cred->magic != CRED_MAGIC_DEAD ||
133 atomic_read(&cred->usage) != 0 ||
134 read_cred_subscribers(cred) != 0)
135 panic("CRED: put_cred_rcu() sees %p with"
136 " mag %x, put %p, usage %d, subscr %d\n",
137 cred, cred->magic, cred->put_addr,
138 atomic_read(&cred->usage),
139 read_cred_subscribers(cred));
140#else
88 if (atomic_read(&cred->usage) != 0) 141 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n", 142 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage)); 143 cred, atomic_read(&cred->usage));
144#endif
91 145
92 security_cred_free(cred); 146 security_cred_free(cred);
93 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth); 148 key_put(cred->request_key_auth);
95 release_tgcred(cred); 149 release_tgcred(cred);
96 put_group_info(cred->group_info); 150 if (cred->group_info)
151 put_group_info(cred->group_info);
97 free_uid(cred->user); 152 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred); 153 kmem_cache_free(cred_jar, cred);
99} 154}
@@ -106,12 +161,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
106 */ 161 */
107void __put_cred(struct cred *cred) 162void __put_cred(struct cred *cred)
108{ 163{
164 kdebug("__put_cred(%p{%d,%d})", cred,
165 atomic_read(&cred->usage),
166 read_cred_subscribers(cred));
167
109 BUG_ON(atomic_read(&cred->usage) != 0); 168 BUG_ON(atomic_read(&cred->usage) != 0);
169#ifdef CONFIG_DEBUG_CREDENTIALS
170 BUG_ON(read_cred_subscribers(cred) != 0);
171 cred->magic = CRED_MAGIC_DEAD;
172 cred->put_addr = __builtin_return_address(0);
173#endif
174 BUG_ON(cred == current->cred);
175 BUG_ON(cred == current->real_cred);
110 176
111 call_rcu(&cred->rcu, put_cred_rcu); 177 call_rcu(&cred->rcu, put_cred_rcu);
112} 178}
113EXPORT_SYMBOL(__put_cred); 179EXPORT_SYMBOL(__put_cred);
114 180
181/*
182 * Clean up a task's credentials when it exits
183 */
184void exit_creds(struct task_struct *tsk)
185{
186 struct cred *cred;
187
188 kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
189 atomic_read(&tsk->cred->usage),
190 read_cred_subscribers(tsk->cred));
191
192 cred = (struct cred *) tsk->real_cred;
193 tsk->real_cred = NULL;
194 validate_creds(cred);
195 alter_cred_subscribers(cred, -1);
196 put_cred(cred);
197
198 cred = (struct cred *) tsk->cred;
199 tsk->cred = NULL;
200 validate_creds(cred);
201 alter_cred_subscribers(cred, -1);
202 put_cred(cred);
203
204 cred = (struct cred *) tsk->replacement_session_keyring;
205 if (cred) {
206 tsk->replacement_session_keyring = NULL;
207 validate_creds(cred);
208 put_cred(cred);
209 }
210}
211
212/*
213 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM.
215 */
216struct cred *cred_alloc_blank(void)
217{
218 struct cred *new;
219
220 new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
221 if (!new)
222 return NULL;
223
224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) {
227 kfree(new);
228 return NULL;
229 }
230 atomic_set(&new->tgcred->usage, 1);
231#endif
232
233 atomic_set(&new->usage, 1);
234
235 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
236 goto error;
237
238#ifdef CONFIG_DEBUG_CREDENTIALS
239 new->magic = CRED_MAGIC;
240#endif
241 return new;
242
243error:
244 abort_creds(new);
245 return NULL;
246}
247
115/** 248/**
116 * prepare_creds - Prepare a new set of credentials for modification 249 * prepare_creds - Prepare a new set of credentials for modification
117 * 250 *
@@ -132,16 +265,19 @@ struct cred *prepare_creds(void)
132 const struct cred *old; 265 const struct cred *old;
133 struct cred *new; 266 struct cred *new;
134 267
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1); 268 validate_process_creds();
136 269
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL); 270 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new) 271 if (!new)
139 return NULL; 272 return NULL;
140 273
274 kdebug("prepare_creds() alloc %p", new);
275
141 old = task->cred; 276 old = task->cred;
142 memcpy(new, old, sizeof(struct cred)); 277 memcpy(new, old, sizeof(struct cred));
143 278
144 atomic_set(&new->usage, 1); 279 atomic_set(&new->usage, 1);
280 set_cred_subscribers(new, 0);
145 get_group_info(new->group_info); 281 get_group_info(new->group_info);
146 get_uid(new->user); 282 get_uid(new->user);
147 283
@@ -157,6 +293,7 @@ struct cred *prepare_creds(void)
157 293
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 294 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error; 295 goto error;
296 validate_creds(new);
160 return new; 297 return new;
161 298
162error: 299error:
@@ -229,9 +366,12 @@ struct cred *prepare_usermodehelper_creds(void)
229 if (!new) 366 if (!new)
230 return NULL; 367 return NULL;
231 368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
232 memcpy(new, &init_cred, sizeof(struct cred)); 371 memcpy(new, &init_cred, sizeof(struct cred));
233 372
234 atomic_set(&new->usage, 1); 373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
235 get_group_info(new->group_info); 375 get_group_info(new->group_info);
236 get_uid(new->user); 376 get_uid(new->user);
237 377
@@ -250,6 +390,7 @@ struct cred *prepare_usermodehelper_creds(void)
250#endif 390#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) 391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error; 392 goto error;
393 validate_creds(new);
253 394
254 BUG_ON(atomic_read(&new->usage) != 1); 395 BUG_ON(atomic_read(&new->usage) != 1);
255 return new; 396 return new;
@@ -286,6 +427,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
286 ) { 427 ) {
287 p->real_cred = get_cred(p->cred); 428 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred); 429 get_cred(p->cred);
430 alter_cred_subscribers(p->cred, 2);
431 kdebug("share_creds(%p{%d,%d})",
432 p->cred, atomic_read(&p->cred->usage),
433 read_cred_subscribers(p->cred));
289 atomic_inc(&p->cred->user->processes); 434 atomic_inc(&p->cred->user->processes);
290 return 0; 435 return 0;
291 } 436 }
@@ -331,6 +476,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
331 476
332 atomic_inc(&new->user->processes); 477 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new); 478 p->cred = p->real_cred = get_cred(new);
479 alter_cred_subscribers(new, 2);
480 validate_creds(new);
334 return 0; 481 return 0;
335 482
336error_put: 483error_put:
@@ -355,13 +502,20 @@ error_put:
355int commit_creds(struct cred *new) 502int commit_creds(struct cred *new)
356{ 503{
357 struct task_struct *task = current; 504 struct task_struct *task = current;
358 const struct cred *old; 505 const struct cred *old = task->real_cred;
506
507 kdebug("commit_creds(%p{%d,%d})", new,
508 atomic_read(&new->usage),
509 read_cred_subscribers(new));
359 510
360 BUG_ON(task->cred != task->real_cred); 511 BUG_ON(task->cred != old);
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2); 512#ifdef CONFIG_DEBUG_CREDENTIALS
513 BUG_ON(read_cred_subscribers(old) < 2);
514 validate_creds(old);
515 validate_creds(new);
516#endif
362 BUG_ON(atomic_read(&new->usage) < 1); 517 BUG_ON(atomic_read(&new->usage) < 1);
363 518
364 old = task->real_cred;
365 security_commit_creds(new, old); 519 security_commit_creds(new, old);
366 520
367 get_cred(new); /* we will require a ref for the subj creds too */ 521 get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +544,14 @@ int commit_creds(struct cred *new)
390 * cheaply with the new uid cache, so if it matters 544 * cheaply with the new uid cache, so if it matters
391 * we should be checking for it. -DaveM 545 * we should be checking for it. -DaveM
392 */ 546 */
547 alter_cred_subscribers(new, 2);
393 if (new->user != old->user) 548 if (new->user != old->user)
394 atomic_inc(&new->user->processes); 549 atomic_inc(&new->user->processes);
395 rcu_assign_pointer(task->real_cred, new); 550 rcu_assign_pointer(task->real_cred, new);
396 rcu_assign_pointer(task->cred, new); 551 rcu_assign_pointer(task->cred, new);
397 if (new->user != old->user) 552 if (new->user != old->user)
398 atomic_dec(&old->user->processes); 553 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2);
399 555
400 sched_switch_user(task); 556 sched_switch_user(task);
401 557
@@ -428,6 +584,13 @@ EXPORT_SYMBOL(commit_creds);
428 */ 584 */
429void abort_creds(struct cred *new) 585void abort_creds(struct cred *new)
430{ 586{
587 kdebug("abort_creds(%p{%d,%d})", new,
588 atomic_read(&new->usage),
589 read_cred_subscribers(new));
590
591#ifdef CONFIG_DEBUG_CREDENTIALS
592 BUG_ON(read_cred_subscribers(new) != 0);
593#endif
431 BUG_ON(atomic_read(&new->usage) < 1); 594 BUG_ON(atomic_read(&new->usage) < 1);
432 put_cred(new); 595 put_cred(new);
433} 596}
@@ -444,7 +607,20 @@ const struct cred *override_creds(const struct cred *new)
444{ 607{
445 const struct cred *old = current->cred; 608 const struct cred *old = current->cred;
446 609
447 rcu_assign_pointer(current->cred, get_cred(new)); 610 kdebug("override_creds(%p{%d,%d})", new,
611 atomic_read(&new->usage),
612 read_cred_subscribers(new));
613
614 validate_creds(old);
615 validate_creds(new);
616 get_cred(new);
617 alter_cred_subscribers(new, 1);
618 rcu_assign_pointer(current->cred, new);
619 alter_cred_subscribers(old, -1);
620
621 kdebug("override_creds() = %p{%d,%d}", old,
622 atomic_read(&old->usage),
623 read_cred_subscribers(old));
448 return old; 624 return old;
449} 625}
450EXPORT_SYMBOL(override_creds); 626EXPORT_SYMBOL(override_creds);
@@ -460,7 +636,15 @@ void revert_creds(const struct cred *old)
460{ 636{
461 const struct cred *override = current->cred; 637 const struct cred *override = current->cred;
462 638
639 kdebug("revert_creds(%p{%d,%d})", old,
640 atomic_read(&old->usage),
641 read_cred_subscribers(old));
642
643 validate_creds(old);
644 validate_creds(override);
645 alter_cred_subscribers(old, 1);
463 rcu_assign_pointer(current->cred, old); 646 rcu_assign_pointer(current->cred, old);
647 alter_cred_subscribers(override, -1);
464 put_cred(override); 648 put_cred(override);
465} 649}
466EXPORT_SYMBOL(revert_creds); 650EXPORT_SYMBOL(revert_creds);
@@ -502,11 +686,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
502 if (!new) 686 if (!new)
503 return NULL; 687 return NULL;
504 688
689 kdebug("prepare_kernel_cred() alloc %p", new);
690
505 if (daemon) 691 if (daemon)
506 old = get_task_cred(daemon); 692 old = get_task_cred(daemon);
507 else 693 else
508 old = get_cred(&init_cred); 694 old = get_cred(&init_cred);
509 695
696 validate_creds(old);
697
510 *new = *old; 698 *new = *old;
511 get_uid(new->user); 699 get_uid(new->user);
512 get_group_info(new->group_info); 700 get_group_info(new->group_info);
@@ -526,7 +714,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
526 goto error; 714 goto error;
527 715
528 atomic_set(&new->usage, 1); 716 atomic_set(&new->usage, 1);
717 set_cred_subscribers(new, 0);
529 put_cred(old); 718 put_cred(old);
719 validate_creds(new);
530 return new; 720 return new;
531 721
532error: 722error:
@@ -589,3 +779,114 @@ int set_create_files_as(struct cred *new, struct inode *inode)
589 return security_kernel_create_files_as(new, inode); 779 return security_kernel_create_files_as(new, inode);
590} 780}
591EXPORT_SYMBOL(set_create_files_as); 781EXPORT_SYMBOL(set_create_files_as);
782
783#ifdef CONFIG_DEBUG_CREDENTIALS
784
785bool creds_are_invalid(const struct cred *cred)
786{
787 if (cred->magic != CRED_MAGIC)
788 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE)
794 return true;
795 if ((*(u32 *)cred->security & 0xffffff00) ==
796 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))
797 return true;
798 }
799#endif
800 return false;
801}
802EXPORT_SYMBOL(creds_are_invalid);
803
804/*
805 * dump invalid credentials
806 */
807static void dump_invalid_creds(const struct cred *cred, const char *label,
808 const struct task_struct *tsk)
809{
810 printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
811 label, cred,
812 cred == &init_cred ? "[init]" : "",
813 cred == tsk->real_cred ? "[real]" : "",
814 cred == tsk->cred ? "[eff]" : "");
815 printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
816 cred->magic, cred->put_addr);
817 printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
818 atomic_read(&cred->usage),
819 read_cred_subscribers(cred));
820 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
821 cred->uid, cred->euid, cred->suid, cred->fsuid);
822 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
823 cred->gid, cred->egid, cred->sgid, cred->fsgid);
824#ifdef CONFIG_SECURITY
825 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
826 if ((unsigned long) cred->security >= PAGE_SIZE &&
827 (((unsigned long) cred->security & 0xffffff00) !=
828 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
829 printk(KERN_ERR "CRED: ->security {%x, %x}\n",
830 ((u32*)cred->security)[0],
831 ((u32*)cred->security)[1]);
832#endif
833}
834
835/*
836 * report use of invalid credentials
837 */
838void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
839{
840 printk(KERN_ERR "CRED: Invalid credentials\n");
841 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
842 dump_invalid_creds(cred, "Specified", current);
843 BUG();
844}
845EXPORT_SYMBOL(__invalid_creds);
846
847/*
848 * check the credentials on a process
849 */
850void __validate_process_creds(struct task_struct *tsk,
851 const char *file, unsigned line)
852{
853 if (tsk->cred == tsk->real_cred) {
854 if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
855 creds_are_invalid(tsk->cred)))
856 goto invalid_creds;
857 } else {
858 if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
859 read_cred_subscribers(tsk->cred) < 1 ||
860 creds_are_invalid(tsk->real_cred) ||
861 creds_are_invalid(tsk->cred)))
862 goto invalid_creds;
863 }
864 return;
865
866invalid_creds:
867 printk(KERN_ERR "CRED: Invalid process credentials\n");
868 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
869
870 dump_invalid_creds(tsk->real_cred, "Real", tsk);
871 if (tsk->cred != tsk->real_cred)
872 dump_invalid_creds(tsk->cred, "Effective", tsk);
873 else
874 printk(KERN_ERR "CRED: Effective creds == Real creds\n");
875 BUG();
876}
877EXPORT_SYMBOL(__validate_process_creds);
878
879/*
880 * check creds for do_exit()
881 */
882void validate_creds_for_do_exit(struct task_struct *tsk)
883{
884 kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
885 tsk->real_cred, tsk->cred,
886 atomic_read(&tsk->cred->usage),
887 read_cred_subscribers(tsk->cred));
888
889 __validate_process_creds(tsk, __FILE__, __LINE__);
890}
891
892#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733e..5859f598c951 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,7 @@
47#include <linux/tracehook.h> 47#include <linux/tracehook.h>
48#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_counter.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 154{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 156
157#ifdef CONFIG_PERF_COUNTERS 157#ifdef CONFIG_PERF_EVENTS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp); 158 WARN_ON_ONCE(tsk->perf_event_ctxp);
159#endif 159#endif
160 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
161 put_task_struct(tsk); 161 put_task_struct(tsk);
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
359{ 359{
360 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
361 361
362 if (task_session(curr) != pid) 362 if (task_session(curr) != pid) {
363 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
364 366
365 if (task_pgrp(curr) != pid) 367 if (task_pgrp(curr) != pid)
366 change_pid(curr, PIDTYPE_PGID, pid); 368 change_pid(curr, PIDTYPE_PGID, pid);
@@ -901,6 +903,8 @@ NORET_TYPE void do_exit(long code)
901 903
902 tracehook_report_exit(&code); 904 tracehook_report_exit(&code);
903 905
906 validate_creds_for_do_exit(tsk);
907
904 /* 908 /*
905 * We're taking recursive faults here in do_exit. Safest is to just 909 * We're taking recursive faults here in do_exit. Safest is to just
906 * leave this task alone and wait for reboot. 910 * leave this task alone and wait for reboot.
@@ -943,6 +947,8 @@ NORET_TYPE void do_exit(long code)
943 if (group_dead) { 947 if (group_dead) {
944 hrtimer_cancel(&tsk->signal->real_timer); 948 hrtimer_cancel(&tsk->signal->real_timer);
945 exit_itimers(tsk->signal); 949 exit_itimers(tsk->signal);
950 if (tsk->mm)
951 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
946 } 952 }
947 acct_collect(code, group_dead); 953 acct_collect(code, group_dead);
948 if (group_dead) 954 if (group_dead)
@@ -970,8 +976,6 @@ NORET_TYPE void do_exit(long code)
970 disassociate_ctty(1); 976 disassociate_ctty(1);
971 977
972 module_put(task_thread_info(tsk)->exec_domain->module); 978 module_put(task_thread_info(tsk)->exec_domain->module);
973 if (tsk->binfmt)
974 module_put(tsk->binfmt->module);
975 979
976 proc_exit_connector(tsk); 980 proc_exit_connector(tsk);
977 981
@@ -979,7 +983,7 @@ NORET_TYPE void do_exit(long code)
979 * Flush inherited counters to the parent - before the parent 983 * Flush inherited counters to the parent - before the parent
980 * gets woken up by child-exit notifications. 984 * gets woken up by child-exit notifications.
981 */ 985 */
982 perf_counter_exit_task(tsk); 986 perf_event_exit_task(tsk);
983 987
984 exit_notify(tsk, group_dead); 988 exit_notify(tsk, group_dead);
985#ifdef CONFIG_NUMA 989#ifdef CONFIG_NUMA
@@ -1009,7 +1013,10 @@ NORET_TYPE void do_exit(long code)
1009 if (tsk->splice_pipe) 1013 if (tsk->splice_pipe)
1010 __free_pipe_info(tsk->splice_pipe); 1014 __free_pipe_info(tsk->splice_pipe);
1011 1015
1016 validate_creds_for_do_exit(tsk);
1017
1012 preempt_disable(); 1018 preempt_disable();
1019 exit_rcu();
1013 /* causes final put_task_struct in finish_task_switch(). */ 1020 /* causes final put_task_struct in finish_task_switch(). */
1014 tsk->state = TASK_DEAD; 1021 tsk->state = TASK_DEAD;
1015 schedule(); 1022 schedule();
@@ -1088,28 +1095,28 @@ struct wait_opts {
1088 int __user *wo_stat; 1095 int __user *wo_stat;
1089 struct rusage __user *wo_rusage; 1096 struct rusage __user *wo_rusage;
1090 1097
1098 wait_queue_t child_wait;
1091 int notask_error; 1099 int notask_error;
1092}; 1100};
1093 1101
1094static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) 1102static inline
1103struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
1095{ 1104{
1096 struct pid *pid = NULL; 1105 if (type != PIDTYPE_PID)
1097 if (type == PIDTYPE_PID) 1106 task = task->group_leader;
1098 pid = task->pids[type].pid; 1107 return task->pids[type].pid;
1099 else if (type < PIDTYPE_MAX)
1100 pid = task->group_leader->pids[type].pid;
1101 return pid;
1102} 1108}
1103 1109
1104static int eligible_child(struct wait_opts *wo, struct task_struct *p) 1110static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
1105{ 1111{
1106 int err; 1112 return wo->wo_type == PIDTYPE_MAX ||
1107 1113 task_pid_type(p, wo->wo_type) == wo->wo_pid;
1108 if (wo->wo_type < PIDTYPE_MAX) { 1114}
1109 if (task_pid_type(p, wo->wo_type) != wo->wo_pid)
1110 return 0;
1111 }
1112 1115
1116static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1117{
1118 if (!eligible_pid(wo, p))
1119 return 0;
1113 /* Wait for all children (clone and not) if __WALL is set; 1120 /* Wait for all children (clone and not) if __WALL is set;
1114 * otherwise, wait for clone children *only* if __WCLONE is 1121 * otherwise, wait for clone children *only* if __WCLONE is
1115 * set; otherwise, wait for non-clone children *only*. (Note: 1122 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1119,10 +1126,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p)
1119 && !(wo->wo_flags & __WALL)) 1126 && !(wo->wo_flags & __WALL))
1120 return 0; 1127 return 0;
1121 1128
1122 err = security_task_wait(p);
1123 if (err)
1124 return err;
1125
1126 return 1; 1129 return 1;
1127} 1130}
1128 1131
@@ -1135,18 +1138,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
1135 1138
1136 put_task_struct(p); 1139 put_task_struct(p);
1137 infop = wo->wo_info; 1140 infop = wo->wo_info;
1138 if (!retval) 1141 if (infop) {
1139 retval = put_user(SIGCHLD, &infop->si_signo); 1142 if (!retval)
1140 if (!retval) 1143 retval = put_user(SIGCHLD, &infop->si_signo);
1141 retval = put_user(0, &infop->si_errno); 1144 if (!retval)
1142 if (!retval) 1145 retval = put_user(0, &infop->si_errno);
1143 retval = put_user((short)why, &infop->si_code); 1146 if (!retval)
1144 if (!retval) 1147 retval = put_user((short)why, &infop->si_code);
1145 retval = put_user(pid, &infop->si_pid); 1148 if (!retval)
1146 if (!retval) 1149 retval = put_user(pid, &infop->si_pid);
1147 retval = put_user(uid, &infop->si_uid); 1150 if (!retval)
1148 if (!retval) 1151 retval = put_user(uid, &infop->si_uid);
1149 retval = put_user(status, &infop->si_status); 1152 if (!retval)
1153 retval = put_user(status, &infop->si_status);
1154 }
1150 if (!retval) 1155 if (!retval)
1151 retval = pid; 1156 retval = pid;
1152 return retval; 1157 return retval;
@@ -1203,6 +1208,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1203 if (likely(!traced) && likely(!task_detached(p))) { 1208 if (likely(!traced) && likely(!task_detached(p))) {
1204 struct signal_struct *psig; 1209 struct signal_struct *psig;
1205 struct signal_struct *sig; 1210 struct signal_struct *sig;
1211 unsigned long maxrss;
1206 1212
1207 /* 1213 /*
1208 * The resource counters for the group leader are in its 1214 * The resource counters for the group leader are in its
@@ -1251,6 +1257,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1251 psig->coublock += 1257 psig->coublock +=
1252 task_io_get_oublock(p) + 1258 task_io_get_oublock(p) +
1253 sig->oublock + sig->coublock; 1259 sig->oublock + sig->coublock;
1260 maxrss = max(sig->maxrss, sig->cmaxrss);
1261 if (psig->cmaxrss < maxrss)
1262 psig->cmaxrss = maxrss;
1254 task_io_accounting_add(&psig->ioac, &p->ioac); 1263 task_io_accounting_add(&psig->ioac, &p->ioac);
1255 task_io_accounting_add(&psig->ioac, &sig->ioac); 1264 task_io_accounting_add(&psig->ioac, &sig->ioac);
1256 spin_unlock_irq(&p->real_parent->sighand->siglock); 1265 spin_unlock_irq(&p->real_parent->sighand->siglock);
@@ -1472,13 +1481,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
1472 * then ->notask_error is 0 if @p is an eligible child, 1481 * then ->notask_error is 0 if @p is an eligible child,
1473 * or another error from security_task_wait(), or still -ECHILD. 1482 * or another error from security_task_wait(), or still -ECHILD.
1474 */ 1483 */
1475static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, 1484static int wait_consider_task(struct wait_opts *wo, int ptrace,
1476 int ptrace, struct task_struct *p) 1485 struct task_struct *p)
1477{ 1486{
1478 int ret = eligible_child(wo, p); 1487 int ret = eligible_child(wo, p);
1479 if (!ret) 1488 if (!ret)
1480 return ret; 1489 return ret;
1481 1490
1491 ret = security_task_wait(p);
1482 if (unlikely(ret < 0)) { 1492 if (unlikely(ret < 0)) {
1483 /* 1493 /*
1484 * If we have not yet seen any eligible child, 1494 * If we have not yet seen any eligible child,
@@ -1540,7 +1550,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1540 * Do not consider detached threads. 1550 * Do not consider detached threads.
1541 */ 1551 */
1542 if (!task_detached(p)) { 1552 if (!task_detached(p)) {
1543 int ret = wait_consider_task(wo, tsk, 0, p); 1553 int ret = wait_consider_task(wo, 0, p);
1544 if (ret) 1554 if (ret)
1545 return ret; 1555 return ret;
1546 } 1556 }
@@ -1554,7 +1564,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1554 struct task_struct *p; 1564 struct task_struct *p;
1555 1565
1556 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1566 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1557 int ret = wait_consider_task(wo, tsk, 1, p); 1567 int ret = wait_consider_task(wo, 1, p);
1558 if (ret) 1568 if (ret)
1559 return ret; 1569 return ret;
1560 } 1570 }
@@ -1562,15 +1572,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1562 return 0; 1572 return 0;
1563} 1573}
1564 1574
1575static int child_wait_callback(wait_queue_t *wait, unsigned mode,
1576 int sync, void *key)
1577{
1578 struct wait_opts *wo = container_of(wait, struct wait_opts,
1579 child_wait);
1580 struct task_struct *p = key;
1581
1582 if (!eligible_pid(wo, p))
1583 return 0;
1584
1585 if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
1586 return 0;
1587
1588 return default_wake_function(wait, mode, sync, key);
1589}
1590
1591void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
1592{
1593 __wake_up_sync_key(&parent->signal->wait_chldexit,
1594 TASK_INTERRUPTIBLE, 1, p);
1595}
1596
1565static long do_wait(struct wait_opts *wo) 1597static long do_wait(struct wait_opts *wo)
1566{ 1598{
1567 DECLARE_WAITQUEUE(wait, current);
1568 struct task_struct *tsk; 1599 struct task_struct *tsk;
1569 int retval; 1600 int retval;
1570 1601
1571 trace_sched_process_wait(wo->wo_pid); 1602 trace_sched_process_wait(wo->wo_pid);
1572 1603
1573 add_wait_queue(&current->signal->wait_chldexit,&wait); 1604 init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
1605 wo->child_wait.private = current;
1606 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1574repeat: 1607repeat:
1575 /* 1608 /*
1576 * If there is nothing that can match our criteria just get out. 1609 * If there is nothing that can match our criteria just get out.
@@ -1611,32 +1644,7 @@ notask:
1611 } 1644 }
1612end: 1645end:
1613 __set_current_state(TASK_RUNNING); 1646 __set_current_state(TASK_RUNNING);
1614 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1647 remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1615 if (wo->wo_info) {
1616 struct siginfo __user *infop = wo->wo_info;
1617
1618 if (retval > 0)
1619 retval = 0;
1620 else {
1621 /*
1622 * For a WNOHANG return, clear out all the fields
1623 * we would set so the user can easily tell the
1624 * difference.
1625 */
1626 if (!retval)
1627 retval = put_user(0, &infop->si_signo);
1628 if (!retval)
1629 retval = put_user(0, &infop->si_errno);
1630 if (!retval)
1631 retval = put_user(0, &infop->si_code);
1632 if (!retval)
1633 retval = put_user(0, &infop->si_pid);
1634 if (!retval)
1635 retval = put_user(0, &infop->si_uid);
1636 if (!retval)
1637 retval = put_user(0, &infop->si_status);
1638 }
1639 }
1640 return retval; 1648 return retval;
1641} 1649}
1642 1650
@@ -1681,6 +1689,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
1681 wo.wo_stat = NULL; 1689 wo.wo_stat = NULL;
1682 wo.wo_rusage = ru; 1690 wo.wo_rusage = ru;
1683 ret = do_wait(&wo); 1691 ret = do_wait(&wo);
1692
1693 if (ret > 0) {
1694 ret = 0;
1695 } else if (infop) {
1696 /*
1697 * For a WNOHANG return, clear out all the fields
1698 * we would set so the user can easily tell the
1699 * difference.
1700 */
1701 if (!ret)
1702 ret = put_user(0, &infop->si_signo);
1703 if (!ret)
1704 ret = put_user(0, &infop->si_errno);
1705 if (!ret)
1706 ret = put_user(0, &infop->si_code);
1707 if (!ret)
1708 ret = put_user(0, &infop->si_pid);
1709 if (!ret)
1710 ret = put_user(0, &infop->si_uid);
1711 if (!ret)
1712 ret = put_user(0, &infop->si_status);
1713 }
1714
1684 put_pid(pid); 1715 put_pid(pid);
1685 1716
1686 /* avoid REGPARM breakage on x86: */ 1717 /* avoid REGPARM breakage on x86: */
diff --git a/kernel/fork.c b/kernel/fork.c
index e6c04d462ab2..266c6af6ef1b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/ftrace.h> 49#include <linux/ftrace.h>
50#include <linux/profile.h> 50#include <linux/profile.h>
51#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
53#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
54#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -61,7 +62,8 @@
61#include <linux/blkdev.h> 62#include <linux/blkdev.h>
62#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
63#include <linux/magic.h> 64#include <linux/magic.h>
64#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h>
65 67
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
136/* SLAB cache for mm_struct structures (tsk->mm) */ 138/* SLAB cache for mm_struct structures (tsk->mm) */
137static struct kmem_cache *mm_cachep; 139static struct kmem_cache *mm_cachep;
138 140
141static void account_kernel_stack(struct thread_info *ti, int account)
142{
143 struct zone *zone = page_zone(virt_to_page(ti));
144
145 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
146}
147
139void free_task(struct task_struct *tsk) 148void free_task(struct task_struct *tsk)
140{ 149{
141 prop_local_destroy_single(&tsk->dirties); 150 prop_local_destroy_single(&tsk->dirties);
151 account_kernel_stack(tsk->stack, -1);
142 free_thread_info(tsk->stack); 152 free_thread_info(tsk->stack);
143 rt_mutex_debug_task_free(tsk); 153 rt_mutex_debug_task_free(tsk);
144 ftrace_graph_exit_task(tsk); 154 ftrace_graph_exit_task(tsk);
@@ -152,8 +162,7 @@ void __put_task_struct(struct task_struct *tsk)
152 WARN_ON(atomic_read(&tsk->usage)); 162 WARN_ON(atomic_read(&tsk->usage));
153 WARN_ON(tsk == current); 163 WARN_ON(tsk == current);
154 164
155 put_cred(tsk->real_cred); 165 exit_creds(tsk);
156 put_cred(tsk->cred);
157 delayacct_tsk_free(tsk); 166 delayacct_tsk_free(tsk);
158 167
159 if (!profile_handoff_task(tsk)) 168 if (!profile_handoff_task(tsk))
@@ -254,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
254 tsk->btrace_seq = 0; 263 tsk->btrace_seq = 0;
255#endif 264#endif
256 tsk->splice_pipe = NULL; 265 tsk->splice_pipe = NULL;
266
267 account_kernel_stack(ti, 1);
268
257 return tsk; 269 return tsk;
258 270
259out: 271out:
@@ -289,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
289 rb_link = &mm->mm_rb.rb_node; 301 rb_link = &mm->mm_rb.rb_node;
290 rb_parent = NULL; 302 rb_parent = NULL;
291 pprev = &mm->mmap; 303 pprev = &mm->mmap;
304 retval = ksm_fork(mm, oldmm);
305 if (retval)
306 goto out;
292 307
293 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 308 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
294 struct file *file; 309 struct file *file;
@@ -419,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup);
419 434
420#include <linux/init_task.h> 435#include <linux/init_task.h>
421 436
437static void mm_init_aio(struct mm_struct *mm)
438{
439#ifdef CONFIG_AIO
440 spin_lock_init(&mm->ioctx_lock);
441 INIT_HLIST_HEAD(&mm->ioctx_list);
442#endif
443}
444
422static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) 445static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
423{ 446{
424 atomic_set(&mm->mm_users, 1); 447 atomic_set(&mm->mm_users, 1);
425 atomic_set(&mm->mm_count, 1); 448 atomic_set(&mm->mm_count, 1);
426 init_rwsem(&mm->mmap_sem); 449 init_rwsem(&mm->mmap_sem);
427 INIT_LIST_HEAD(&mm->mmlist); 450 INIT_LIST_HEAD(&mm->mmlist);
428 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 451 mm->flags = (current->mm) ?
452 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
429 mm->core_state = NULL; 453 mm->core_state = NULL;
430 mm->nr_ptes = 0; 454 mm->nr_ptes = 0;
431 set_mm_counter(mm, file_rss, 0); 455 set_mm_counter(mm, file_rss, 0);
432 set_mm_counter(mm, anon_rss, 0); 456 set_mm_counter(mm, anon_rss, 0);
433 spin_lock_init(&mm->page_table_lock); 457 spin_lock_init(&mm->page_table_lock);
434 spin_lock_init(&mm->ioctx_lock);
435 INIT_HLIST_HEAD(&mm->ioctx_list);
436 mm->free_area_cache = TASK_UNMAPPED_BASE; 458 mm->free_area_cache = TASK_UNMAPPED_BASE;
437 mm->cached_hole_size = ~0UL; 459 mm->cached_hole_size = ~0UL;
460 mm_init_aio(mm);
438 mm_init_owner(mm, p); 461 mm_init_owner(mm, p);
439 462
440 if (likely(!mm_alloc_pgd(mm))) { 463 if (likely(!mm_alloc_pgd(mm))) {
@@ -486,6 +509,7 @@ void mmput(struct mm_struct *mm)
486 509
487 if (atomic_dec_and_test(&mm->mm_users)) { 510 if (atomic_dec_and_test(&mm->mm_users)) {
488 exit_aio(mm); 511 exit_aio(mm);
512 ksm_exit(mm);
489 exit_mmap(mm); 513 exit_mmap(mm);
490 set_mm_exe_file(mm, NULL); 514 set_mm_exe_file(mm, NULL);
491 if (!list_empty(&mm->mmlist)) { 515 if (!list_empty(&mm->mmlist)) {
@@ -494,6 +518,8 @@ void mmput(struct mm_struct *mm)
494 spin_unlock(&mmlist_lock); 518 spin_unlock(&mmlist_lock);
495 } 519 }
496 put_swap_token(mm); 520 put_swap_token(mm);
521 if (mm->binfmt)
522 module_put(mm->binfmt->module);
497 mmdrop(mm); 523 mmdrop(mm);
498 } 524 }
499} 525}
@@ -619,9 +645,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
619 mm->hiwater_rss = get_mm_rss(mm); 645 mm->hiwater_rss = get_mm_rss(mm);
620 mm->hiwater_vm = mm->total_vm; 646 mm->hiwater_vm = mm->total_vm;
621 647
648 if (mm->binfmt && !try_module_get(mm->binfmt->module))
649 goto free_pt;
650
622 return mm; 651 return mm;
623 652
624free_pt: 653free_pt:
654 /* don't put binfmt in mmput, we haven't got module yet */
655 mm->binfmt = NULL;
625 mmput(mm); 656 mmput(mm);
626 657
627fail_nomem: 658fail_nomem:
@@ -789,10 +820,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
789 thread_group_cputime_init(sig); 820 thread_group_cputime_init(sig);
790 821
791 /* Expiration times and increments. */ 822 /* Expiration times and increments. */
792 sig->it_virt_expires = cputime_zero; 823 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
793 sig->it_virt_incr = cputime_zero; 824 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
794 sig->it_prof_expires = cputime_zero; 825 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
795 sig->it_prof_incr = cputime_zero; 826 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
796 827
797 /* Cached expiration times. */ 828 /* Cached expiration times. */
798 sig->cputime_expires.prof_exp = cputime_zero; 829 sig->cputime_expires.prof_exp = cputime_zero;
@@ -850,6 +881,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
850 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 881 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
851 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 882 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
852 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 883 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
884 sig->maxrss = sig->cmaxrss = 0;
853 task_io_accounting_init(&sig->ioac); 885 task_io_accounting_init(&sig->ioac);
854 sig->sum_sched_runtime = 0; 886 sig->sum_sched_runtime = 0;
855 taskstats_tgid_init(sig); 887 taskstats_tgid_init(sig);
@@ -864,6 +896,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
864 896
865 tty_audit_fork(sig); 897 tty_audit_fork(sig);
866 898
899 sig->oom_adj = current->signal->oom_adj;
900
867 return 0; 901 return 0;
868} 902}
869 903
@@ -959,6 +993,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
959 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) 993 if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
960 return ERR_PTR(-EINVAL); 994 return ERR_PTR(-EINVAL);
961 995
996 /*
997 * Siblings of global init remain as zombies on exit since they are
998 * not reaped by their parent (swapper). To solve this and to avoid
999 * multi-rooted process trees, prevent global and container-inits
1000 * from creating siblings.
1001 */
1002 if ((clone_flags & CLONE_PARENT) &&
1003 current->signal->flags & SIGNAL_UNKILLABLE)
1004 return ERR_PTR(-EINVAL);
1005
962 retval = security_task_create(clone_flags); 1006 retval = security_task_create(clone_flags);
963 if (retval) 1007 if (retval)
964 goto fork_out; 1008 goto fork_out;
@@ -1000,18 +1044,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1000 if (!try_module_get(task_thread_info(p)->exec_domain->module)) 1044 if (!try_module_get(task_thread_info(p)->exec_domain->module))
1001 goto bad_fork_cleanup_count; 1045 goto bad_fork_cleanup_count;
1002 1046
1003 if (p->binfmt && !try_module_get(p->binfmt->module))
1004 goto bad_fork_cleanup_put_domain;
1005
1006 p->did_exec = 0; 1047 p->did_exec = 0;
1007 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ 1048 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
1008 copy_flags(clone_flags, p); 1049 copy_flags(clone_flags, p);
1009 INIT_LIST_HEAD(&p->children); 1050 INIT_LIST_HEAD(&p->children);
1010 INIT_LIST_HEAD(&p->sibling); 1051 INIT_LIST_HEAD(&p->sibling);
1011#ifdef CONFIG_PREEMPT_RCU 1052 rcu_copy_process(p);
1012 p->rcu_read_lock_nesting = 0;
1013 p->rcu_flipctr_idx = 0;
1014#endif /* #ifdef CONFIG_PREEMPT_RCU */
1015 p->vfork_done = NULL; 1053 p->vfork_done = NULL;
1016 spin_lock_init(&p->alloc_lock); 1054 spin_lock_init(&p->alloc_lock);
1017 1055
@@ -1079,10 +1117,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1079 1117
1080 p->bts = NULL; 1118 p->bts = NULL;
1081 1119
1120 p->stack_start = stack_start;
1121
1082 /* Perform scheduler related setup. Assign this task to a CPU. */ 1122 /* Perform scheduler related setup. Assign this task to a CPU. */
1083 sched_fork(p, clone_flags); 1123 sched_fork(p, clone_flags);
1084 1124
1085 retval = perf_counter_init_task(p); 1125 retval = perf_event_init_task(p);
1086 if (retval) 1126 if (retval)
1087 goto bad_fork_cleanup_policy; 1127 goto bad_fork_cleanup_policy;
1088 1128
@@ -1257,7 +1297,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1257 write_unlock_irq(&tasklist_lock); 1297 write_unlock_irq(&tasklist_lock);
1258 proc_fork_connector(p); 1298 proc_fork_connector(p);
1259 cgroup_post_fork(p); 1299 cgroup_post_fork(p);
1260 perf_counter_fork(p); 1300 perf_event_fork(p);
1261 return p; 1301 return p;
1262 1302
1263bad_fork_free_pid: 1303bad_fork_free_pid:
@@ -1284,21 +1324,17 @@ bad_fork_cleanup_semundo:
1284bad_fork_cleanup_audit: 1324bad_fork_cleanup_audit:
1285 audit_free(p); 1325 audit_free(p);
1286bad_fork_cleanup_policy: 1326bad_fork_cleanup_policy:
1287 perf_counter_free_task(p); 1327 perf_event_free_task(p);
1288#ifdef CONFIG_NUMA 1328#ifdef CONFIG_NUMA
1289 mpol_put(p->mempolicy); 1329 mpol_put(p->mempolicy);
1290bad_fork_cleanup_cgroup: 1330bad_fork_cleanup_cgroup:
1291#endif 1331#endif
1292 cgroup_exit(p, cgroup_callbacks_done); 1332 cgroup_exit(p, cgroup_callbacks_done);
1293 delayacct_tsk_free(p); 1333 delayacct_tsk_free(p);
1294 if (p->binfmt)
1295 module_put(p->binfmt->module);
1296bad_fork_cleanup_put_domain:
1297 module_put(task_thread_info(p)->exec_domain->module); 1334 module_put(task_thread_info(p)->exec_domain->module);
1298bad_fork_cleanup_count: 1335bad_fork_cleanup_count:
1299 atomic_dec(&p->cred->user->processes); 1336 atomic_dec(&p->cred->user->processes);
1300 put_cred(p->real_cred); 1337 exit_creds(p);
1301 put_cred(p->cred);
1302bad_fork_free: 1338bad_fork_free:
1303 free_task(p); 1339 free_task(p);
1304fork_out: 1340fork_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc7190..b911adceb2c4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,33 +89,36 @@ struct futex_pi_state {
89 union futex_key key; 89 union futex_key key;
90}; 90};
91 91
92/* 92/**
93 * We use this hashed waitqueue instead of a normal wait_queue_t, so 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on
97 * @pi_state: optional priority inheritance state
98 * @rt_waiter: rt_waiter storage for use with requeue_pi
99 * @requeue_pi_key: the requeue_pi target futex key
100 * @bitset: bitset for the optional bitmasked wakeup
101 *
102 * We use this hashed waitqueue, instead of a normal wait_queue_t, so
94 * we can wake only the relevant ones (hashed queues may be shared). 103 * we can wake only the relevant ones (hashed queues may be shared).
95 * 104 *
96 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 105 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
97 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
98 * The order of wakup is always to make the first condition true, then 107 * The order of wakup is always to make the first condition true, then
99 * wake up q->waiter, then make the second condition true. 108 * the second.
109 *
110 * PI futexes are typically woken before they are removed from the hash list via
111 * the rt_mutex code. See unqueue_me_pi().
100 */ 112 */
101struct futex_q { 113struct futex_q {
102 struct plist_node list; 114 struct plist_node list;
103 /* Waiter reference */
104 struct task_struct *task;
105 115
106 /* Which hash list lock to use: */ 116 struct task_struct *task;
107 spinlock_t *lock_ptr; 117 spinlock_t *lock_ptr;
108
109 /* Key which the futex is hashed on: */
110 union futex_key key; 118 union futex_key key;
111
112 /* Optional priority inheritance state: */
113 struct futex_pi_state *pi_state; 119 struct futex_pi_state *pi_state;
114
115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 120 struct rt_mutex_waiter *rt_waiter;
117 121 union futex_key *requeue_pi_key;
118 /* Bitset for the optional bitmasked wakeup */
119 u32 bitset; 122 u32 bitset;
120}; 123};
121 124
@@ -195,11 +198,12 @@ static void drop_futex_key_refs(union futex_key *key)
195} 198}
196 199
197/** 200/**
198 * get_futex_key - Get parameters which are the keys for a futex. 201 * get_futex_key() - Get parameters which are the keys for a futex
199 * @uaddr: virtual address of the futex 202 * @uaddr: virtual address of the futex
200 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 203 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
201 * @key: address where result is stored. 204 * @key: address where result is stored.
202 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE) 205 * @rw: mapping needs to be read/write (values: VERIFY_READ,
206 * VERIFY_WRITE)
203 * 207 *
204 * Returns a negative error code or 0 208 * Returns a negative error code or 0
205 * The key words are stored in *key on success. 209 * The key words are stored in *key on success.
@@ -285,8 +289,8 @@ void put_futex_key(int fshared, union futex_key *key)
285 drop_futex_key_refs(key); 289 drop_futex_key_refs(key);
286} 290}
287 291
288/* 292/**
289 * fault_in_user_writeable - fault in user address and verify RW access 293 * fault_in_user_writeable() - Fault in user address and verify RW access
290 * @uaddr: pointer to faulting user space address 294 * @uaddr: pointer to faulting user space address
291 * 295 *
292 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
@@ -306,8 +310,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
306 310
307/** 311/**
308 * futex_top_waiter() - Return the highest priority waiter on a futex 312 * futex_top_waiter() - Return the highest priority waiter on a futex
309 * @hb: the hash bucket the futex_q's reside in 313 * @hb: the hash bucket the futex_q's reside in
310 * @key: the futex key (to distinguish it from other futex futex_q's) 314 * @key: the futex key (to distinguish it from other futex futex_q's)
311 * 315 *
312 * Must be called with the hb lock held. 316 * Must be called with the hb lock held.
313 */ 317 */
@@ -585,7 +589,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
585} 589}
586 590
587/** 591/**
588 * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex 592 * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
589 * @uaddr: the pi futex user address 593 * @uaddr: the pi futex user address
590 * @hb: the pi futex hash bucket 594 * @hb: the pi futex hash bucket
591 * @key: the futex key associated with uaddr and hb 595 * @key: the futex key associated with uaddr and hb
@@ -1008,9 +1012,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1008 1012
1009/** 1013/**
1010 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1014 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1011 * q: the futex_q 1015 * @q: the futex_q
1012 * key: the key of the requeue target futex 1016 * @key: the key of the requeue target futex
1013 * hb: the hash_bucket of the requeue target futex 1017 * @hb: the hash_bucket of the requeue target futex
1014 * 1018 *
1015 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1019 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1016 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1020 * target futex if it is uncontended or via a lock steal. Set the futex_q key
@@ -1089,6 +1093,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1089 if (!top_waiter) 1093 if (!top_waiter)
1090 return 0; 1094 return 0;
1091 1095
1096 /* Ensure we requeue to the expected futex. */
1097 if (!match_futex(top_waiter->requeue_pi_key, key2))
1098 return -EINVAL;
1099
1092 /* 1100 /*
1093 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1101 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1094 * the contended case or if set_waiters is 1. The pi_state is returned 1102 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1284,12 @@ retry_private:
1276 continue; 1284 continue;
1277 } 1285 }
1278 1286
1287 /* Ensure we requeue to the expected futex for requeue_pi. */
1288 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1289 ret = -EINVAL;
1290 break;
1291 }
1292
1279 /* 1293 /*
1280 * Requeue nr_requeue waiters and possibly one more in the case 1294 * Requeue nr_requeue waiters and possibly one more in the case
1281 * of requeue_pi if we couldn't acquire the lock atomically. 1295 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1337,6 +1351,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1337 return hb; 1351 return hb;
1338} 1352}
1339 1353
1354static inline void
1355queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1356{
1357 spin_unlock(&hb->lock);
1358 drop_futex_key_refs(&q->key);
1359}
1360
1361/**
1362 * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1363 * @q: The futex_q to enqueue
1364 * @hb: The destination hash bucket
1365 *
1366 * The hb->lock must be held by the caller, and is released here. A call to
1367 * queue_me() is typically paired with exactly one call to unqueue_me(). The
1368 * exceptions involve the PI related operations, which may use unqueue_me_pi()
1369 * or nothing if the unqueue is done as part of the wake process and the unqueue
1370 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1371 * an example).
1372 */
1340static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1373static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1341{ 1374{
1342 int prio; 1375 int prio;
@@ -1360,19 +1393,17 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1360 spin_unlock(&hb->lock); 1393 spin_unlock(&hb->lock);
1361} 1394}
1362 1395
1363static inline void 1396/**
1364queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1397 * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1365{ 1398 * @q: The futex_q to unqueue
1366 spin_unlock(&hb->lock); 1399 *
1367 drop_futex_key_refs(&q->key); 1400 * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1368} 1401 * be paired with exactly one earlier call to queue_me().
1369 1402 *
1370/* 1403 * Returns:
1371 * queue_me and unqueue_me must be called as a pair, each 1404 * 1 - if the futex_q was still queued (and we removed unqueued it)
1372 * exactly once. They are called with the hashed spinlock held. 1405 * 0 - if the futex_q was already removed by the waking thread
1373 */ 1406 */
1374
1375/* Return 1 if we were still queued (ie. 0 means we were woken) */
1376static int unqueue_me(struct futex_q *q) 1407static int unqueue_me(struct futex_q *q)
1377{ 1408{
1378 spinlock_t *lock_ptr; 1409 spinlock_t *lock_ptr;
@@ -1625,17 +1656,14 @@ out:
1625static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, 1656static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1626 struct hrtimer_sleeper *timeout) 1657 struct hrtimer_sleeper *timeout)
1627{ 1658{
1628 queue_me(q, hb);
1629
1630 /* 1659 /*
1631 * There might have been scheduling since the queue_me(), as we 1660 * The task state is guaranteed to be set before another task can
1632 * cannot hold a spinlock across the get_user() in case it 1661 * wake it. set_current_state() is implemented using set_mb() and
1633 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1662 * queue_me() calls spin_unlock() upon completion, both serializing
1634 * queueing ourselves into the futex hash. This code thus has to 1663 * access to the hash list and forcing another memory barrier.
1635 * rely on the futex_wake() code removing us from hash when it
1636 * wakes us up.
1637 */ 1664 */
1638 set_current_state(TASK_INTERRUPTIBLE); 1665 set_current_state(TASK_INTERRUPTIBLE);
1666 queue_me(q, hb);
1639 1667
1640 /* Arm the timer */ 1668 /* Arm the timer */
1641 if (timeout) { 1669 if (timeout) {
@@ -1645,8 +1673,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1645 } 1673 }
1646 1674
1647 /* 1675 /*
1648 * !plist_node_empty() is safe here without any lock. 1676 * If we have been removed from the hash list, then another task
1649 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1677 * has tried to wake us, and we can skip the call to schedule().
1650 */ 1678 */
1651 if (likely(!plist_node_empty(&q->list))) { 1679 if (likely(!plist_node_empty(&q->list))) {
1652 /* 1680 /*
@@ -1751,6 +1779,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1751 q.pi_state = NULL; 1779 q.pi_state = NULL;
1752 q.bitset = bitset; 1780 q.bitset = bitset;
1753 q.rt_waiter = NULL; 1781 q.rt_waiter = NULL;
1782 q.requeue_pi_key = NULL;
1754 1783
1755 if (abs_time) { 1784 if (abs_time) {
1756 to = &timeout; 1785 to = &timeout;
@@ -1858,6 +1887,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1858 1887
1859 q.pi_state = NULL; 1888 q.pi_state = NULL;
1860 q.rt_waiter = NULL; 1889 q.rt_waiter = NULL;
1890 q.requeue_pi_key = NULL;
1861retry: 1891retry:
1862 q.key = FUTEX_KEY_INIT; 1892 q.key = FUTEX_KEY_INIT;
1863 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1893 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2099,12 +2129,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2099 2129
2100/** 2130/**
2101 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2131 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2102 * @uaddr: the futex we initialyl wait on (non-pi) 2132 * @uaddr: the futex we initially wait on (non-pi)
2103 * @fshared: whether the futexes are shared (1) or not (0). They must be 2133 * @fshared: whether the futexes are shared (1) or not (0). They must be
2104 * the same type, no requeueing from private to shared, etc. 2134 * the same type, no requeueing from private to shared, etc.
2105 * @val: the expected value of uaddr 2135 * @val: the expected value of uaddr
2106 * @abs_time: absolute timeout 2136 * @abs_time: absolute timeout
2107 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. 2137 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
2108 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) 2138 * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
2109 * @uaddr2: the pi futex we will take prior to returning to user-space 2139 * @uaddr2: the pi futex we will take prior to returning to user-space
2110 * 2140 *
@@ -2118,11 +2148,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2118 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2148 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2119 * via the following: 2149 * via the following:
2120 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2150 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2121 * 2) wakeup on uaddr2 after a requeue and subsequent unlock 2151 * 2) wakeup on uaddr2 after a requeue
2122 * 3) signal (before or after requeue) 2152 * 3) signal
2123 * 4) timeout (before or after requeue) 2153 * 4) timeout
2124 * 2154 *
2125 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. 2155 * If 3, cleanup and return -ERESTARTNOINTR.
2126 * 2156 *
2127 * If 2, we may then block on trying to take the rt_mutex and return via: 2157 * If 2, we may then block on trying to take the rt_mutex and return via:
2128 * 5) successful lock 2158 * 5) successful lock
@@ -2130,7 +2160,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2130 * 7) timeout 2160 * 7) timeout
2131 * 8) other lock acquisition failure 2161 * 8) other lock acquisition failure
2132 * 2162 *
2133 * If 6, we setup a restart_block with futex_lock_pi() as the function. 2163 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2134 * 2164 *
2135 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2165 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2136 * 2166 *
@@ -2169,15 +2199,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2169 debug_rt_mutex_init_waiter(&rt_waiter); 2199 debug_rt_mutex_init_waiter(&rt_waiter);
2170 rt_waiter.task = NULL; 2200 rt_waiter.task = NULL;
2171 2201
2172 q.pi_state = NULL;
2173 q.bitset = bitset;
2174 q.rt_waiter = &rt_waiter;
2175
2176 key2 = FUTEX_KEY_INIT; 2202 key2 = FUTEX_KEY_INIT;
2177 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2203 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2178 if (unlikely(ret != 0)) 2204 if (unlikely(ret != 0))
2179 goto out; 2205 goto out;
2180 2206
2207 q.pi_state = NULL;
2208 q.bitset = bitset;
2209 q.rt_waiter = &rt_waiter;
2210 q.requeue_pi_key = &key2;
2211
2181 /* Prepare to wait on uaddr. */ 2212 /* Prepare to wait on uaddr. */
2182 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2213 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2183 if (ret) 2214 if (ret)
@@ -2230,7 +2261,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2230 res = fixup_owner(uaddr2, fshared, &q, !ret); 2261 res = fixup_owner(uaddr2, fshared, &q, !ret);
2231 /* 2262 /*
2232 * If fixup_owner() returned an error, proprogate that. If it 2263 * If fixup_owner() returned an error, proprogate that. If it
2233 * acquired the lock, clear our -ETIMEDOUT or -EINTR. 2264 * acquired the lock, clear -ETIMEDOUT or -EINTR.
2234 */ 2265 */
2235 if (res) 2266 if (res)
2236 ret = (res < 0) ? res : 0; 2267 ret = (res < 0) ? res : 0;
@@ -2248,14 +2279,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2248 rt_mutex_unlock(pi_mutex); 2279 rt_mutex_unlock(pi_mutex);
2249 } else if (ret == -EINTR) { 2280 } else if (ret == -EINTR) {
2250 /* 2281 /*
2251 * We've already been requeued, but we have no way to 2282 * We've already been requeued, but cannot restart by calling
2252 * restart by calling futex_lock_pi() directly. We 2283 * futex_lock_pi() directly. We could restart this syscall, but
2253 * could restart the syscall, but that will look at 2284 * it would detect that the user space "val" changed and return
2254 * the user space value and return right away. So we 2285 * -EWOULDBLOCK. Save the overhead of the restart and return
2255 * drop back with EWOULDBLOCK to tell user space that 2286 * -EWOULDBLOCK directly.
2256 * "val" has been changed. That's the same what the
2257 * restart of the syscall would do in
2258 * futex_wait_setup().
2259 */ 2287 */
2260 ret = -EWOULDBLOCK; 2288 ret = -EWOULDBLOCK;
2261 } 2289 }
@@ -2289,9 +2317,9 @@ out:
2289 */ 2317 */
2290 2318
2291/** 2319/**
2292 * sys_set_robust_list - set the robust-futex list head of a task 2320 * sys_set_robust_list() - Set the robust-futex list head of a task
2293 * @head: pointer to the list-head 2321 * @head: pointer to the list-head
2294 * @len: length of the list-head, as userspace expects 2322 * @len: length of the list-head, as userspace expects
2295 */ 2323 */
2296SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, 2324SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2297 size_t, len) 2325 size_t, len)
@@ -2310,10 +2338,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2310} 2338}
2311 2339
2312/** 2340/**
2313 * sys_get_robust_list - get the robust-futex list head of a task 2341 * sys_get_robust_list() - Get the robust-futex list head of a task
2314 * @pid: pid of the process [zero for current task] 2342 * @pid: pid of the process [zero for current task]
2315 * @head_ptr: pointer to a list-head pointer, the kernel fills it in 2343 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
2316 * @len_ptr: pointer to a length field, the kernel fills in the header size 2344 * @len_ptr: pointer to a length field, the kernel fills in the header size
2317 */ 2345 */
2318SYSCALL_DEFINE3(get_robust_list, int, pid, 2346SYSCALL_DEFINE3(get_robust_list, int, pid,
2319 struct robust_list_head __user * __user *, head_ptr, 2347 struct robust_list_head __user * __user *, head_ptr,
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..70a298d6da71 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 37 depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab8486..6d7020490f94 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,36 +48,7 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/** 51#include <trace/events/timer.h>
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81 52
82/* 53/*
83 * The timer bases: 54 * The timer bases:
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 77 }
107}; 78};
108 79
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 80/*
135 * Get the coarse grained time at the softirq based on xtime and 81 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 82 * wall_to_monotonic.
@@ -485,6 +431,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
485 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 431 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
486 __hrtimer_init(timer, clock_id, mode); 432 __hrtimer_init(timer, clock_id, mode);
487} 433}
434EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
488 435
489void destroy_hrtimer_on_stack(struct hrtimer *timer) 436void destroy_hrtimer_on_stack(struct hrtimer *timer)
490{ 437{
@@ -497,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
497static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 444static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
498#endif 445#endif
499 446
447static inline void
448debug_init(struct hrtimer *timer, clockid_t clockid,
449 enum hrtimer_mode mode)
450{
451 debug_hrtimer_init(timer);
452 trace_hrtimer_init(timer, clockid, mode);
453}
454
455static inline void debug_activate(struct hrtimer *timer)
456{
457 debug_hrtimer_activate(timer);
458 trace_hrtimer_start(timer);
459}
460
461static inline void debug_deactivate(struct hrtimer *timer)
462{
463 debug_hrtimer_deactivate(timer);
464 trace_hrtimer_cancel(timer);
465}
466
500/* High resolution timer related functions */ 467/* High resolution timer related functions */
501#ifdef CONFIG_HIGH_RES_TIMERS 468#ifdef CONFIG_HIGH_RES_TIMERS
502 469
@@ -542,13 +509,14 @@ static inline int hrtimer_hres_active(void)
542 * next event 509 * next event
543 * Called with interrupts disabled and base->lock held 510 * Called with interrupts disabled and base->lock held
544 */ 511 */
545static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) 512static void
513hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
546{ 514{
547 int i; 515 int i;
548 struct hrtimer_clock_base *base = cpu_base->clock_base; 516 struct hrtimer_clock_base *base = cpu_base->clock_base;
549 ktime_t expires; 517 ktime_t expires, expires_next;
550 518
551 cpu_base->expires_next.tv64 = KTIME_MAX; 519 expires_next.tv64 = KTIME_MAX;
552 520
553 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 521 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
554 struct hrtimer *timer; 522 struct hrtimer *timer;
@@ -564,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
564 */ 532 */
565 if (expires.tv64 < 0) 533 if (expires.tv64 < 0)
566 expires.tv64 = 0; 534 expires.tv64 = 0;
567 if (expires.tv64 < cpu_base->expires_next.tv64) 535 if (expires.tv64 < expires_next.tv64)
568 cpu_base->expires_next = expires; 536 expires_next = expires;
569 } 537 }
570 538
539 if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
540 return;
541
542 cpu_base->expires_next.tv64 = expires_next.tv64;
543
571 if (cpu_base->expires_next.tv64 != KTIME_MAX) 544 if (cpu_base->expires_next.tv64 != KTIME_MAX)
572 tick_program_event(cpu_base->expires_next, 1); 545 tick_program_event(cpu_base->expires_next, 1);
573} 546}
@@ -650,7 +623,7 @@ static void retrigger_next_event(void *arg)
650 base->clock_base[CLOCK_REALTIME].offset = 623 base->clock_base[CLOCK_REALTIME].offset =
651 timespec_to_ktime(realtime_offset); 624 timespec_to_ktime(realtime_offset);
652 625
653 hrtimer_force_reprogram(base); 626 hrtimer_force_reprogram(base, 0);
654 spin_unlock(&base->lock); 627 spin_unlock(&base->lock);
655} 628}
656 629
@@ -763,7 +736,8 @@ static int hrtimer_switch_to_hres(void)
763static inline int hrtimer_hres_active(void) { return 0; } 736static inline int hrtimer_hres_active(void) { return 0; }
764static inline int hrtimer_is_hres_enabled(void) { return 0; } 737static inline int hrtimer_is_hres_enabled(void) { return 0; }
765static inline int hrtimer_switch_to_hres(void) { return 0; } 738static inline int hrtimer_switch_to_hres(void) { return 0; }
766static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 739static inline void
740hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
767static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 741static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
768 struct hrtimer_clock_base *base, 742 struct hrtimer_clock_base *base,
769 int wakeup) 743 int wakeup)
@@ -853,7 +827,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
853 struct hrtimer *entry; 827 struct hrtimer *entry;
854 int leftmost = 1; 828 int leftmost = 1;
855 829
856 debug_hrtimer_activate(timer); 830 debug_activate(timer);
857 831
858 /* 832 /*
859 * Find the right place in the rbtree: 833 * Find the right place in the rbtree:
@@ -906,19 +880,29 @@ static void __remove_hrtimer(struct hrtimer *timer,
906 struct hrtimer_clock_base *base, 880 struct hrtimer_clock_base *base,
907 unsigned long newstate, int reprogram) 881 unsigned long newstate, int reprogram)
908{ 882{
909 if (timer->state & HRTIMER_STATE_ENQUEUED) { 883 if (!(timer->state & HRTIMER_STATE_ENQUEUED))
910 /* 884 goto out;
911 * Remove the timer from the rbtree and replace the 885
912 * first entry pointer if necessary. 886 /*
913 */ 887 * Remove the timer from the rbtree and replace the first
914 if (base->first == &timer->node) { 888 * entry pointer if necessary.
915 base->first = rb_next(&timer->node); 889 */
916 /* Reprogram the clock event device. if enabled */ 890 if (base->first == &timer->node) {
917 if (reprogram && hrtimer_hres_active()) 891 base->first = rb_next(&timer->node);
918 hrtimer_force_reprogram(base->cpu_base); 892#ifdef CONFIG_HIGH_RES_TIMERS
893 /* Reprogram the clock event device. if enabled */
894 if (reprogram && hrtimer_hres_active()) {
895 ktime_t expires;
896
897 expires = ktime_sub(hrtimer_get_expires(timer),
898 base->offset);
899 if (base->cpu_base->expires_next.tv64 == expires.tv64)
900 hrtimer_force_reprogram(base->cpu_base, 1);
919 } 901 }
920 rb_erase(&timer->node, &base->active); 902#endif
921 } 903 }
904 rb_erase(&timer->node, &base->active);
905out:
922 timer->state = newstate; 906 timer->state = newstate;
923} 907}
924 908
@@ -939,7 +923,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
939 * reprogramming happens in the interrupt handler. This is a 923 * reprogramming happens in the interrupt handler. This is a
940 * rare case and less expensive than a smp call. 924 * rare case and less expensive than a smp call.
941 */ 925 */
942 debug_hrtimer_deactivate(timer); 926 debug_deactivate(timer);
943 timer_stats_hrtimer_clear_start_info(timer); 927 timer_stats_hrtimer_clear_start_info(timer);
944 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 928 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
945 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 929 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -1154,7 +1138,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1154 clock_id = CLOCK_MONOTONIC; 1138 clock_id = CLOCK_MONOTONIC;
1155 1139
1156 timer->base = &cpu_base->clock_base[clock_id]; 1140 timer->base = &cpu_base->clock_base[clock_id];
1157 INIT_LIST_HEAD(&timer->cb_entry);
1158 hrtimer_init_timer_hres(timer); 1141 hrtimer_init_timer_hres(timer);
1159 1142
1160#ifdef CONFIG_TIMER_STATS 1143#ifdef CONFIG_TIMER_STATS
@@ -1173,7 +1156,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1173void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1156void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1174 enum hrtimer_mode mode) 1157 enum hrtimer_mode mode)
1175{ 1158{
1176 debug_hrtimer_init(timer); 1159 debug_init(timer, clock_id, mode);
1177 __hrtimer_init(timer, clock_id, mode); 1160 __hrtimer_init(timer, clock_id, mode);
1178} 1161}
1179EXPORT_SYMBOL_GPL(hrtimer_init); 1162EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -1197,7 +1180,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1197} 1180}
1198EXPORT_SYMBOL_GPL(hrtimer_get_res); 1181EXPORT_SYMBOL_GPL(hrtimer_get_res);
1199 1182
1200static void __run_hrtimer(struct hrtimer *timer) 1183static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1201{ 1184{
1202 struct hrtimer_clock_base *base = timer->base; 1185 struct hrtimer_clock_base *base = timer->base;
1203 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1186 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
@@ -1206,7 +1189,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1206 1189
1207 WARN_ON(!irqs_disabled()); 1190 WARN_ON(!irqs_disabled());
1208 1191
1209 debug_hrtimer_deactivate(timer); 1192 debug_deactivate(timer);
1210 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1193 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1211 timer_stats_account_hrtimer(timer); 1194 timer_stats_account_hrtimer(timer);
1212 fn = timer->function; 1195 fn = timer->function;
@@ -1217,7 +1200,9 @@ static void __run_hrtimer(struct hrtimer *timer)
1217 * the timer base. 1200 * the timer base.
1218 */ 1201 */
1219 spin_unlock(&cpu_base->lock); 1202 spin_unlock(&cpu_base->lock);
1203 trace_hrtimer_expire_entry(timer, now);
1220 restart = fn(timer); 1204 restart = fn(timer);
1205 trace_hrtimer_expire_exit(timer);
1221 spin_lock(&cpu_base->lock); 1206 spin_lock(&cpu_base->lock);
1222 1207
1223 /* 1208 /*
@@ -1328,7 +1313,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1328 break; 1313 break;
1329 } 1314 }
1330 1315
1331 __run_hrtimer(timer); 1316 __run_hrtimer(timer, &basenow);
1332 } 1317 }
1333 base++; 1318 base++;
1334 } 1319 }
@@ -1450,7 +1435,7 @@ void hrtimer_run_queues(void)
1450 hrtimer_get_expires_tv64(timer)) 1435 hrtimer_get_expires_tv64(timer))
1451 break; 1436 break;
1452 1437
1453 __run_hrtimer(timer); 1438 __run_hrtimer(timer, &base->softirq_time);
1454 } 1439 }
1455 spin_unlock(&cpu_base->lock); 1440 spin_unlock(&cpu_base->lock);
1456 } 1441 }
@@ -1477,6 +1462,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1477 sl->timer.function = hrtimer_wakeup; 1462 sl->timer.function = hrtimer_wakeup;
1478 sl->task = task; 1463 sl->task = task;
1479} 1464}
1465EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1480 1466
1481static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1467static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1482{ 1468{
@@ -1626,7 +1612,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1626 while ((node = rb_first(&old_base->active))) { 1612 while ((node = rb_first(&old_base->active))) {
1627 timer = rb_entry(node, struct hrtimer, node); 1613 timer = rb_entry(node, struct hrtimer, node);
1628 BUG_ON(hrtimer_callback_running(timer)); 1614 BUG_ON(hrtimer_callback_running(timer));
1629 debug_hrtimer_deactivate(timer); 1615 debug_deactivate(timer);
1630 1616
1631 /* 1617 /*
1632 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1618 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 022a4927b785..d4e841747400 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout)
171 * Process updating of timeout sysctl 171 * Process updating of timeout sysctl
172 */ 172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, 173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer, 174 void __user *buffer,
175 size_t *lenp, loff_t *ppos) 175 size_t *lenp, loff_t *ppos)
176{ 176{
177 int ret; 177 int ret;
178 178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); 179 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
180 180
181 if (ret || !write) 181 if (ret || !write)
182 goto out; 182 goto out;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
222} 222}
223EXPORT_SYMBOL(set_irq_chip_data); 223EXPORT_SYMBOL(set_irq_chip_data);
224 224
225/**
226 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
227 *
228 * @irq: Interrupt number
229 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
230 *
231 * The IRQ_NESTED_THREAD flag indicates that on
232 * request_threaded_irq() no separate interrupt thread should be
233 * created for the irq as the handler are called nested in the
234 * context of a demultiplexing interrupt handler thread.
235 */
236void set_irq_nested_thread(unsigned int irq, int nest)
237{
238 struct irq_desc *desc = irq_to_desc(irq);
239 unsigned long flags;
240
241 if (!desc)
242 return;
243
244 spin_lock_irqsave(&desc->lock, flags);
245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD;
247 else
248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags);
250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252
225/* 253/*
226 * default enable function 254 * default enable function
227 */ 255 */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
299 } 327 }
300} 328}
301 329
330/*
331 * handle_nested_irq - Handle a nested irq from a irq thread
332 * @irq: the interrupt number
333 *
334 * Handle interrupts which are nested into a threaded interrupt
335 * handler. The handler function is called inside the calling
336 * threads context.
337 */
338void handle_nested_irq(unsigned int irq)
339{
340 struct irq_desc *desc = irq_to_desc(irq);
341 struct irqaction *action;
342 irqreturn_t action_ret;
343
344 might_sleep();
345
346 spin_lock_irq(&desc->lock);
347
348 kstat_incr_irqs_this_cpu(irq, desc);
349
350 action = desc->action;
351 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
352 goto out_unlock;
353
354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock);
356
357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret);
360
361 spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS;
363
364out_unlock:
365 spin_unlock_irq(&desc->lock);
366}
367EXPORT_SYMBOL_GPL(handle_nested_irq);
368
302/** 369/**
303 * handle_simple_irq - Simple and software-decoded IRQs. 370 * handle_simple_irq - Simple and software-decoded IRQs.
304 * @irq: the interrupt number 371 * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
382 449
383 spin_lock(&desc->lock); 450 spin_lock(&desc->lock);
384 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
385 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 452
453 if (unlikely(desc->status & IRQ_ONESHOT))
454 desc->status |= IRQ_MASKED;
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
386 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
387out_unlock: 457out_unlock:
388 spin_unlock(&desc->lock); 458 spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
572 desc->chip = &dummy_irq_chip; 642 desc->chip = &dummy_irq_chip;
573 } 643 }
574 644
645 chip_bus_lock(irq, desc);
575 spin_lock_irqsave(&desc->lock, flags); 646 spin_lock_irqsave(&desc->lock, flags);
576 647
577 /* Uninstall? */ 648 /* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
591 desc->chip->startup(irq); 662 desc->chip->startup(irq);
592 } 663 }
593 spin_unlock_irqrestore(&desc->lock, flags); 664 spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc);
594} 666}
595EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
596 668
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
161 161
162 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
163 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node; 164 node = first_online_node;
165 165
166 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
172 172
173 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
174 desc[i].irq = i; 174 desc[i].irq = i;
175#ifdef CONFIG_SMP
176 desc[i].node = node;
177#endif
175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 178 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 179 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
177 alloc_desc_masks(&desc[i], node, true); 180 alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb9..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void irq_set_thread_affinity(struct irq_desc *desc); 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46 46
47/* Inline functions for support of irq chips on slow busses */
48static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
49{
50 if (unlikely(desc->chip->bus_lock))
51 desc->chip->bus_lock(irq);
52}
53
54static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
55{
56 if (unlikely(desc->chip->bus_sync_unlock))
57 desc->chip->bus_sync_unlock(irq);
58}
59
47/* 60/*
48 * Debugging printout: 61 * Debugging printout:
49 */ 62 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0ec9ed831737..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
230 if (!desc) 230 if (!desc)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc);
233 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
234 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
235 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
236} 238}
237EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
238 240
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
294 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
295 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
296 * 298 *
297 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
298 */ 301 */
299void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
300{ 303{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
304 if (!desc) 307 if (!desc)
305 return; 308 return;
306 309
310 chip_bus_lock(irq, desc);
307 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
308 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
309 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
310} 315}
311EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
312 317
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 return ret; 441 return ret;
437} 442}
438 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
439static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
440{ 465{
441 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
451 return -1; 476 return -1;
452} 477}
453 478
479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler finished. unmask if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
454#ifdef CONFIG_SMP 496#ifdef CONFIG_SMP
455/* 497/*
456 * Check whether we need to change the affinity of the interrupt thread. 498 * Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
492 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
493 struct irqaction *action = data; 535 struct irqaction *action = data;
494 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
495 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
496 538
497 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
498 current->irqaction = action; 540 current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
518 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
519 561
520 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
521 } 566 }
522 567
523 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
565 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
566 const char *old_name = NULL; 611 const char *old_name = NULL;
567 unsigned long flags; 612 unsigned long flags;
568 int shared = 0; 613 int nested, shared = 0;
569 int ret; 614 int ret;
570 615
571 if (!desc) 616 if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
590 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
591 } 636 }
592 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided from
652 * the driver for non nested interrupt handling by the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
593 /* 658 /*
594 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
595 */ 662 */
596 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
597 struct task_struct *t; 664 struct task_struct *t;
598 665
599 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
662 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
663#endif 730#endif
664 731
665 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
666 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
667 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
668 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
669 desc->depth = 0; 739 desc->depth = 0;
670 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
875 */ 945 */
876void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
877{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
878 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
879} 956}
880EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
881 958
@@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
884 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
885 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
886 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
887 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
888 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
889 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
963 1042
964 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
965 return -EINVAL; 1044 return -EINVAL;
966 if (!handler) 1045
967 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
968 1051
969 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
970 if (!action) 1053 if (!action)
@@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
976 action->name = devname; 1059 action->name = devname;
977 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
978 1061
1062 chip_bus_lock(irq, desc);
979 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
980 if (retval) 1066 if (retval)
981 kfree(action); 1067 kfree(action);
982 1068
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..b03451ede528 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/posix-timers.h> 13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
41 return ktime_to_timeval(rem); 42 return ktime_to_timeval(rem);
42} 43}
43 44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime);
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cputime_le(cval, t))
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cputime_sub(cval, t);
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
44int do_getitimer(int which, struct itimerval *value) 79int do_getitimer(int which, struct itimerval *value)
45{ 80{
46 struct task_struct *tsk = current; 81 struct task_struct *tsk = current;
47 cputime_t cinterval, cval;
48 82
49 switch (which) { 83 switch (which) {
50 case ITIMER_REAL: 84 case ITIMER_REAL:
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 89 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 90 break;
57 case ITIMER_VIRTUAL: 91 case ITIMER_VIRTUAL:
58 spin_lock_irq(&tsk->sighand->siglock); 92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
59 cval = tsk->signal->it_virt_expires;
60 cinterval = tsk->signal->it_virt_incr;
61 if (!cputime_eq(cval, cputime_zero)) {
62 struct task_cputime cputime;
63 cputime_t utime;
64
65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1);
69 } else {
70 cval = cputime_sub(cval, utime);
71 }
72 }
73 spin_unlock_irq(&tsk->sighand->siglock);
74 cputime_to_timeval(cval, &value->it_value);
75 cputime_to_timeval(cinterval, &value->it_interval);
76 break; 93 break;
77 case ITIMER_PROF: 94 case ITIMER_PROF:
78 spin_lock_irq(&tsk->sighand->siglock); 95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
79 cval = tsk->signal->it_prof_expires;
80 cinterval = tsk->signal->it_prof_incr;
81 if (!cputime_eq(cval, cputime_zero)) {
82 struct task_cputime times;
83 cputime_t ptime;
84
85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1);
89 } else {
90 cval = cputime_sub(cval, ptime);
91 }
92 }
93 spin_unlock_irq(&tsk->sighand->siglock);
94 cputime_to_timeval(cval, &value->it_value);
95 cputime_to_timeval(cinterval, &value->it_interval);
96 break; 96 break;
97 default: 97 default:
98 return(-EINVAL); 98 return(-EINVAL);
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
123 struct signal_struct *sig = 123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer); 124 container_of(timer, struct signal_struct, real_timer);
125 125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
126 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); 127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
127 128
128 return HRTIMER_NORESTART; 129 return HRTIMER_NORESTART;
129} 130}
130 131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150
151 nval = timeval_to_cputime(&value->it_value);
152 ns_nval = timeval_to_ns(&value->it_value);
153 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval);
155
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
157 it->error = cputime_sub_ns(nval, ns_nval);
158
159 spin_lock_irq(&tsk->sighand->siglock);
160
161 cval = it->expires;
162 cinterval = it->incr;
163 if (!cputime_eq(cval, cputime_zero) ||
164 !cputime_eq(nval, cputime_zero)) {
165 if (cputime_gt(nval, cputime_zero))
166 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173
174 spin_unlock_irq(&tsk->sighand->siglock);
175
176 if (ovalue) {
177 cputime_to_timeval(cval, &ovalue->it_value);
178 cputime_to_timeval(cinterval, &ovalue->it_interval);
179 }
180}
181
131/* 182/*
132 * Returns true if the timeval is in canonical form 183 * Returns true if the timeval is in canonical form
133 */ 184 */
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
139 struct task_struct *tsk = current; 190 struct task_struct *tsk = current;
140 struct hrtimer *timer; 191 struct hrtimer *timer;
141 ktime_t expires; 192 ktime_t expires;
142 cputime_t cval, cinterval, nval, ninterval;
143 193
144 /* 194 /*
145 * Validate the timevals in value. 195 * Validate the timevals in value.
@@ -171,51 +221,14 @@ again:
171 } else 221 } else
172 tsk->signal->it_real_incr.tv64 = 0; 222 tsk->signal->it_real_incr.tv64 = 0;
173 223
224 trace_itimer_state(ITIMER_REAL, value, 0);
174 spin_unlock_irq(&tsk->sighand->siglock); 225 spin_unlock_irq(&tsk->sighand->siglock);
175 break; 226 break;
176 case ITIMER_VIRTUAL: 227 case ITIMER_VIRTUAL:
177 nval = timeval_to_cputime(&value->it_value); 228 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
178 ninterval = timeval_to_cputime(&value->it_interval);
179 spin_lock_irq(&tsk->sighand->siglock);
180 cval = tsk->signal->it_virt_expires;
181 cinterval = tsk->signal->it_virt_incr;
182 if (!cputime_eq(cval, cputime_zero) ||
183 !cputime_eq(nval, cputime_zero)) {
184 if (cputime_gt(nval, cputime_zero))
185 nval = cputime_add(nval,
186 jiffies_to_cputime(1));
187 set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
188 &nval, &cval);
189 }
190 tsk->signal->it_virt_expires = nval;
191 tsk->signal->it_virt_incr = ninterval;
192 spin_unlock_irq(&tsk->sighand->siglock);
193 if (ovalue) {
194 cputime_to_timeval(cval, &ovalue->it_value);
195 cputime_to_timeval(cinterval, &ovalue->it_interval);
196 }
197 break; 229 break;
198 case ITIMER_PROF: 230 case ITIMER_PROF:
199 nval = timeval_to_cputime(&value->it_value); 231 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
200 ninterval = timeval_to_cputime(&value->it_interval);
201 spin_lock_irq(&tsk->sighand->siglock);
202 cval = tsk->signal->it_prof_expires;
203 cinterval = tsk->signal->it_prof_incr;
204 if (!cputime_eq(cval, cputime_zero) ||
205 !cputime_eq(nval, cputime_zero)) {
206 if (cputime_gt(nval, cputime_zero))
207 nval = cputime_add(nval,
208 jiffies_to_cputime(1));
209 set_process_cpu_timer(tsk, CPUCLOCK_PROF,
210 &nval, &cval);
211 }
212 tsk->signal->it_prof_expires = nval;
213 tsk->signal->it_prof_incr = ninterval;
214 spin_unlock_irq(&tsk->sighand->siglock);
215 if (ovalue) {
216 cputime_to_timeval(cval, &ovalue->it_value);
217 cputime_to_timeval(cinterval, &ovalue->it_interval);
218 }
219 break; 232 break;
220 default: 233 default:
221 return -EINVAL; 234 return -EINVAL;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3a29dbe7898e..8b6b8b697c68 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
59 59
60static inline int is_kernel_text(unsigned long addr) 60static inline int is_kernel_text(unsigned long addr)
61{ 61{
62 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 62 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
63 arch_is_kernel_text(addr))
63 return 1; 64 return 1;
64 return in_gate_area_no_task(addr); 65 return in_gate_area_no_task(addr);
65} 66}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 385c31a1bdbf..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
37#include <linux/suspend.h> 37#include <linux/suspend.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39 39
40#include <trace/events/module.h>
41
40extern int max_threads; 42extern int max_threads;
41 43
42static struct workqueue_struct *khelper_wq; 44static struct workqueue_struct *khelper_wq;
@@ -78,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
78#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
79 static int kmod_loop_msg; 81 static int kmod_loop_msg;
80 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
81 va_start(args, fmt); 87 va_start(args, fmt);
82 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
83 va_end(args); 89 va_end(args);
@@ -108,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
108 return -ENOMEM; 114 return -ENOMEM;
109 } 115 }
110 116
117 trace_module_request(module_name, wait, _RET_IP_);
118
111 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper(modprobe_path, argv, envp,
112 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
113 atomic_dec(&kmod_concurrent); 121 atomic_dec(&kmod_concurrent);
@@ -462,6 +470,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
462 int retval = 0; 470 int retval = 0;
463 471
464 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
465 474
466 helper_lock(); 475 helper_lock();
467 if (sub_info->path[0] == '\0') 476 if (sub_info->path[0] == '\0')
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29ab..5240d75f4c60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 234
241 /* Ensure no-one is preepmted on the garbages */ 235 /* Ensure no-one is preepmted on the garbages */
242 if (check_safety()) 236 if (check_safety())
243 return -EAGAIN; 237 return -EAGAIN;
244 238
245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
246 int i; 240 int i;
247 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
248 continue; 242 continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
260void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
261{ 255{
262 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
263 struct hlist_node *pos;
264 257
265 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
266 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
267 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
268 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
269 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
270 if (dirty) { 263 if (dirty) {
271 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
272 kip->ngarbage++; 265 kip->ngarbage++;
273 } else { 266 } else
274 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
275 }
276 break; 268 break;
277 } 269 }
278 } 270 }
@@ -1329,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1329 return 0; 1321 return 0;
1330} 1322}
1331 1323
1332static struct seq_operations kprobes_seq_ops = { 1324static const struct seq_operations kprobes_seq_ops = {
1333 .start = kprobe_seq_start, 1325 .start = kprobe_seq_start,
1334 .next = kprobe_seq_next, 1326 .next = kprobe_seq_next,
1335 .stop = kprobe_seq_stop, 1327 .stop = kprobe_seq_stop,
@@ -1341,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
1341 return seq_open(filp, &kprobes_seq_ops); 1333 return seq_open(filp, &kprobes_seq_ops);
1342} 1334}
1343 1335
1344static struct file_operations debugfs_kprobes_operations = { 1336static const struct file_operations debugfs_kprobes_operations = {
1345 .open = kprobes_open, 1337 .open = kprobes_open,
1346 .read = seq_read, 1338 .read = seq_read,
1347 .llseek = seq_lseek, 1339 .llseek = seq_lseek,
@@ -1523,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
1523 return count; 1515 return count;
1524} 1516}
1525 1517
1526static struct file_operations fops_kp = { 1518static const struct file_operations fops_kp = {
1527 .read = read_enabled_file_bool, 1519 .read = read_enabled_file_bool,
1528 .write = write_enabled_file_bool, 1520 .write = write_enabled_file_bool,
1529}; 1521};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 220 ignore_signals(tsk);
224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
227 223
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..3815ac1d58b2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
371 * Some daft arches put -1 at the end to indicate its a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -636,6 +578,9 @@ static int static_obj(void *obj)
636 if ((addr >= start) && (addr < end)) 578 if ((addr >= start) && (addr < end))
637 return 1; 579 return 1;
638 580
581 if (arch_is_kernel_data(addr))
582 return 1;
583
639#ifdef CONFIG_SMP 584#ifdef CONFIG_SMP
640 /* 585 /*
641 * percpu var? 586 * percpu var?
@@ -898,22 +843,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 843}
899 844
900/* 845/*
846 * For good efficiency of modular, we use power of 2
847 */
848#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
849#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
850
851/*
852 * The circular_queue and helpers is used to implement the
853 * breadth-first search(BFS)algorithem, by which we can build
854 * the shortest path from the next lock to be acquired to the
855 * previous held lock if there is a circular between them.
856 */
857struct circular_queue {
858 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
859 unsigned int front, rear;
860};
861
862static struct circular_queue lock_cq;
863
864unsigned int max_bfs_queue_depth;
865
866static unsigned int lockdep_dependency_gen_id;
867
868static inline void __cq_init(struct circular_queue *cq)
869{
870 cq->front = cq->rear = 0;
871 lockdep_dependency_gen_id++;
872}
873
874static inline int __cq_empty(struct circular_queue *cq)
875{
876 return (cq->front == cq->rear);
877}
878
879static inline int __cq_full(struct circular_queue *cq)
880{
881 return ((cq->rear + 1) & CQ_MASK) == cq->front;
882}
883
884static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
885{
886 if (__cq_full(cq))
887 return -1;
888
889 cq->element[cq->rear] = elem;
890 cq->rear = (cq->rear + 1) & CQ_MASK;
891 return 0;
892}
893
894static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
895{
896 if (__cq_empty(cq))
897 return -1;
898
899 *elem = cq->element[cq->front];
900 cq->front = (cq->front + 1) & CQ_MASK;
901 return 0;
902}
903
904static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
905{
906 return (cq->rear - cq->front) & CQ_MASK;
907}
908
909static inline void mark_lock_accessed(struct lock_list *lock,
910 struct lock_list *parent)
911{
912 unsigned long nr;
913
914 nr = lock - list_entries;
915 WARN_ON(nr >= nr_list_entries);
916 lock->parent = parent;
917 lock->class->dep_gen_id = lockdep_dependency_gen_id;
918}
919
920static inline unsigned long lock_accessed(struct lock_list *lock)
921{
922 unsigned long nr;
923
924 nr = lock - list_entries;
925 WARN_ON(nr >= nr_list_entries);
926 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
927}
928
929static inline struct lock_list *get_lock_parent(struct lock_list *child)
930{
931 return child->parent;
932}
933
934static inline int get_lock_depth(struct lock_list *child)
935{
936 int depth = 0;
937 struct lock_list *parent;
938
939 while ((parent = get_lock_parent(child))) {
940 child = parent;
941 depth++;
942 }
943 return depth;
944}
945
946static int __bfs(struct lock_list *source_entry,
947 void *data,
948 int (*match)(struct lock_list *entry, void *data),
949 struct lock_list **target_entry,
950 int forward)
951{
952 struct lock_list *entry;
953 struct list_head *head;
954 struct circular_queue *cq = &lock_cq;
955 int ret = 1;
956
957 if (match(source_entry, data)) {
958 *target_entry = source_entry;
959 ret = 0;
960 goto exit;
961 }
962
963 if (forward)
964 head = &source_entry->class->locks_after;
965 else
966 head = &source_entry->class->locks_before;
967
968 if (list_empty(head))
969 goto exit;
970
971 __cq_init(cq);
972 __cq_enqueue(cq, (unsigned long)source_entry);
973
974 while (!__cq_empty(cq)) {
975 struct lock_list *lock;
976
977 __cq_dequeue(cq, (unsigned long *)&lock);
978
979 if (!lock->class) {
980 ret = -2;
981 goto exit;
982 }
983
984 if (forward)
985 head = &lock->class->locks_after;
986 else
987 head = &lock->class->locks_before;
988
989 list_for_each_entry(entry, head, entry) {
990 if (!lock_accessed(entry)) {
991 unsigned int cq_depth;
992 mark_lock_accessed(entry, lock);
993 if (match(entry, data)) {
994 *target_entry = entry;
995 ret = 0;
996 goto exit;
997 }
998
999 if (__cq_enqueue(cq, (unsigned long)entry)) {
1000 ret = -1;
1001 goto exit;
1002 }
1003 cq_depth = __cq_get_elem_count(cq);
1004 if (max_bfs_queue_depth < cq_depth)
1005 max_bfs_queue_depth = cq_depth;
1006 }
1007 }
1008 }
1009exit:
1010 return ret;
1011}
1012
1013static inline int __bfs_forwards(struct lock_list *src_entry,
1014 void *data,
1015 int (*match)(struct lock_list *entry, void *data),
1016 struct lock_list **target_entry)
1017{
1018 return __bfs(src_entry, data, match, target_entry, 1);
1019
1020}
1021
1022static inline int __bfs_backwards(struct lock_list *src_entry,
1023 void *data,
1024 int (*match)(struct lock_list *entry, void *data),
1025 struct lock_list **target_entry)
1026{
1027 return __bfs(src_entry, data, match, target_entry, 0);
1028
1029}
1030
1031/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1032 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1033 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1034 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1035 */
909static struct held_lock *check_source, *check_target;
910 1036
911/* 1037/*
912 * Print a dependency chain entry (this is only done when a deadlock 1038 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1039 * has been detected):
914 */ 1040 */
915static noinline int 1041static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1042print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1043{
918 if (debug_locks_silent) 1044 if (debug_locks_silent)
919 return 0; 1045 return 0;
@@ -930,11 +1056,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1056 * header first:
931 */ 1057 */
932static noinline int 1058static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1059print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1060 struct held_lock *check_src,
1061 struct held_lock *check_tgt)
934{ 1062{
935 struct task_struct *curr = current; 1063 struct task_struct *curr = current;
936 1064
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1065 if (debug_locks_silent)
938 return 0; 1066 return 0;
939 1067
940 printk("\n=======================================================\n"); 1068 printk("\n=======================================================\n");
@@ -943,9 +1071,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1071 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1072 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1073 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1074 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1075 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1076 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1077 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1078 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1079
@@ -954,19 +1082,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1082 return 0;
955} 1083}
956 1084
957static noinline int print_circular_bug_tail(void) 1085static inline int class_equal(struct lock_list *entry, void *data)
1086{
1087 return entry->class == data;
1088}
1089
1090static noinline int print_circular_bug(struct lock_list *this,
1091 struct lock_list *target,
1092 struct held_lock *check_src,
1093 struct held_lock *check_tgt)
958{ 1094{
959 struct task_struct *curr = current; 1095 struct task_struct *curr = current;
960 struct lock_list this; 1096 struct lock_list *parent;
1097 int depth;
961 1098
962 if (debug_locks_silent) 1099 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1100 return 0;
964 1101
965 this.class = hlock_class(check_source); 1102 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1103 return 0;
968 1104
969 print_circular_bug_entry(&this, 0); 1105 depth = get_lock_depth(target);
1106
1107 print_circular_bug_header(target, depth, check_src, check_tgt);
1108
1109 parent = get_lock_parent(target);
1110
1111 while (parent) {
1112 print_circular_bug_entry(parent, --depth);
1113 parent = get_lock_parent(parent);
1114 }
970 1115
971 printk("\nother info that might help us debug this:\n\n"); 1116 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1117 lockdep_print_held_locks(curr);
@@ -977,73 +1122,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1122 return 0;
978} 1123}
979 1124
980#define RECURSION_LIMIT 40 1125static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1126{
984 if (!debug_locks_off_graph_unlock()) 1127 if (!debug_locks_off_graph_unlock())
985 return 0; 1128 return 0;
986 1129
987 WARN_ON(1); 1130 WARN(1, "lockdep bfs error:%d\n", ret);
988 1131
989 return 0; 1132 return 0;
990} 1133}
991 1134
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1135static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1136{
995 struct lock_list *entry; 1137 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1138 return 0;
1139}
997 1140
998 if (lockdep_dependency_visit(class, depth)) 1141unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1142{
1143 unsigned long count = 0;
1144 struct lock_list *uninitialized_var(target_entry);
1000 1145
1001 /* 1146 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1147
1007 return ret; 1148 return count;
1008} 1149}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1150unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1151{
1012 unsigned long ret, flags; 1152 unsigned long ret, flags;
1153 struct lock_list this;
1154
1155 this.parent = NULL;
1156 this.class = class;
1013 1157
1014 local_irq_save(flags); 1158 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1159 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1160 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1161 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1162 local_irq_restore(flags);
1019 1163
1020 return ret; 1164 return ret;
1021} 1165}
1022 1166
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1167unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1168{
1026 struct lock_list *entry; 1169 unsigned long count = 0;
1027 unsigned long ret = 1; 1170 struct lock_list *uninitialized_var(target_entry);
1028 1171
1029 if (lockdep_dependency_visit(class, depth)) 1172 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1173
1037 return ret; 1174 return count;
1038} 1175}
1039 1176
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1177unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1178{
1042 unsigned long ret, flags; 1179 unsigned long ret, flags;
1180 struct lock_list this;
1181
1182 this.parent = NULL;
1183 this.class = class;
1043 1184
1044 local_irq_save(flags); 1185 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1186 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1187 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1188 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1189 local_irq_restore(flags);
1049 1190
@@ -1055,29 +1196,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1196 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1197 */
1057static noinline int 1198static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1199check_noncircular(struct lock_list *root, struct lock_class *target,
1200 struct lock_list **target_entry)
1059{ 1201{
1060 struct lock_list *entry; 1202 int result;
1061 1203
1062 if (lockdep_dependency_visit(source, depth)) 1204 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1205
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1206 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1207
1067 max_recursion_depth = depth; 1208 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1209}
1082 1210
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1211#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1214,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1214 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1215 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1216 */
1089static enum lock_usage_bit find_usage_bit; 1217
1090static struct lock_class *forwards_match, *backwards_match; 1218static inline int usage_match(struct lock_list *entry, void *bit)
1219{
1220 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1221}
1222
1223
1091 1224
1092/* 1225/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1226 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1227 * at @root->class that matches @bit.
1095 * 1228 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1229 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1230 * into *@target_entry.
1098 * 1231 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1232 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1233 * Return <0 on error.
1101 */ 1234 */
1102static noinline int 1235static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1236find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1237 struct lock_list **target_entry)
1104{ 1238{
1105 struct lock_list *entry; 1239 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1240
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1241 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1242
1122 /* 1243 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1244
1124 */ 1245 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1246}
1133 1247
1134/* 1248/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1249 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1250 * at @root->class that matches @bit.
1137 * 1251 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1252 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1253 * into *@target_entry.
1140 * 1254 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1255 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1256 * Return <0 on error.
1143 */ 1257 */
1144static noinline int 1258static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1259find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1260 struct lock_list **target_entry)
1146{ 1261{
1147 struct lock_list *entry; 1262 int result;
1148 int ret;
1149 1263
1150 if (lockdep_dependency_visit(source, depth)) 1264 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1265
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1266 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1267
1156 if (depth > max_recursion_depth) 1268 return result;
1157 max_recursion_depth = depth; 1269}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1270
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1271static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1272{
1163 backwards_match = source; 1273 int bit;
1164 return 2;
1165 }
1166 1274
1167 if (!source && debug_locks_off_graph_unlock()) { 1275 printk("%*s->", depth, "");
1168 WARN_ON(1); 1276 print_lock_name(class);
1169 return 0; 1277 printk(" ops: %lu", class->ops);
1170 } 1278 printk(" {\n");
1171 1279
1172 /* 1280 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1281 if (class->usage_mask & (1 << bit)) {
1174 */ 1282 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1283
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1284 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1285 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1286 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1287 }
1180 } 1288 }
1181 return 1; 1289 printk("%*s }\n", depth, "");
1290
1291 printk("%*s ... key at: ",depth,"");
1292 print_ip_sym((unsigned long)class->key);
1293}
1294
1295/*
1296 * printk the shortest lock dependencies from @start to @end in reverse order:
1297 */
1298static void __used
1299print_shortest_lock_dependencies(struct lock_list *leaf,
1300 struct lock_list *root)
1301{
1302 struct lock_list *entry = leaf;
1303 int depth;
1304
1305 /*compute depth from generated tree by BFS*/
1306 depth = get_lock_depth(leaf);
1307
1308 do {
1309 print_lock_class_header(entry->class, depth);
1310 printk("%*s ... acquired at:\n", depth, "");
1311 print_stack_trace(&entry->trace, 2);
1312 printk("\n");
1313
1314 if (depth == 0 && (entry != root)) {
1315 printk("lockdep:%s bad BFS generated tree\n", __func__);
1316 break;
1317 }
1318
1319 entry = get_lock_parent(entry);
1320 depth--;
1321 } while (entry && (depth >= 0));
1322
1323 return;
1182} 1324}
1183 1325
1184static int 1326static int
1185print_bad_irq_dependency(struct task_struct *curr, 1327print_bad_irq_dependency(struct task_struct *curr,
1328 struct lock_list *prev_root,
1329 struct lock_list *next_root,
1330 struct lock_list *backwards_entry,
1331 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1332 struct held_lock *prev,
1187 struct held_lock *next, 1333 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1334 enum lock_usage_bit bit1,
@@ -1215,26 +1361,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1361
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1362 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1363 irqclass);
1218 print_lock_name(backwards_match); 1364 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1365 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1366
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1367 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1368
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1369 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1370 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1371 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1372 printk("...");
1227 1373
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1374 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1375
1230 printk("\nother info that might help us debug this:\n\n"); 1376 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1377 lockdep_print_held_locks(curr);
1232 1378
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1379 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1380 printk(" and the holding lock:\n");
1381 if (!save_trace(&prev_root->trace))
1382 return 0;
1383 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1384
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1385 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1386 printk(" and %s-irq-unsafe lock:\n", irqclass);
1387 if (!save_trace(&next_root->trace))
1388 return 0;
1389 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1390
1239 printk("\nstack backtrace:\n"); 1391 printk("\nstack backtrace:\n");
1240 dump_stack(); 1392 dump_stack();
@@ -1248,19 +1400,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1400 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1401{
1250 int ret; 1402 int ret;
1403 struct lock_list this, that;
1404 struct lock_list *uninitialized_var(target_entry);
1405 struct lock_list *uninitialized_var(target_entry1);
1406
1407 this.parent = NULL;
1251 1408
1252 find_usage_bit = bit_backwards; 1409 this.class = hlock_class(prev);
1253 /* fills in <backwards_match> */ 1410 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1411 if (ret < 0)
1255 if (!ret || ret == 1) 1412 return print_bfs_bug(ret);
1413 if (ret == 1)
1256 return ret; 1414 return ret;
1257 1415
1258 find_usage_bit = bit_forwards; 1416 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1417 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1418 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1419 if (ret < 0)
1420 return print_bfs_bug(ret);
1421 if (ret == 1)
1261 return ret; 1422 return ret;
1262 /* ret == 2 */ 1423
1263 return print_bad_irq_dependency(curr, prev, next, 1424 return print_bad_irq_dependency(curr, &this, &that,
1425 target_entry, target_entry1,
1426 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1427 bit_backwards, bit_forwards, irqclass);
1265} 1428}
1266 1429
@@ -1472,6 +1635,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1635{
1473 struct lock_list *entry; 1636 struct lock_list *entry;
1474 int ret; 1637 int ret;
1638 struct lock_list this;
1639 struct lock_list *uninitialized_var(target_entry);
1475 1640
1476 /* 1641 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1642 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1647,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1647 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1648 * keep the stackframe size of the recursive functions low:
1484 */ 1649 */
1485 check_source = next; 1650 this.class = hlock_class(next);
1486 check_target = prev; 1651 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1652 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1653 if (unlikely(!ret))
1654 return print_circular_bug(&this, target_entry, next, prev);
1655 else if (unlikely(ret < 0))
1656 return print_bfs_bug(ret);
1489 1657
1490 if (!check_prev_add_irq(curr, prev, next)) 1658 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1659 return 0;
@@ -1884,7 +2052,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2052 * print irq inversion bug:
1885 */ 2053 */
1886static int 2054static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2055print_irq_inversion_bug(struct task_struct *curr,
2056 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2057 struct held_lock *this, int forwards,
1889 const char *irqclass) 2058 const char *irqclass)
1890{ 2059{
@@ -1902,17 +2071,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2071 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2072 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2073 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2074 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2075 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2076
1908 printk("\nother info that might help us debug this:\n"); 2077 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2078 lockdep_print_held_locks(curr);
1910 2079
1911 printk("\nthe first lock's dependencies:\n"); 2080 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2081 if (!save_trace(&root->trace))
1913 2082 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2083 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2084
1917 printk("\nstack backtrace:\n"); 2085 printk("\nstack backtrace:\n");
1918 dump_stack(); 2086 dump_stack();
@@ -1929,14 +2097,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2097 enum lock_usage_bit bit, const char *irqclass)
1930{ 2098{
1931 int ret; 2099 int ret;
1932 2100 struct lock_list root;
1933 find_usage_bit = bit; 2101 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2102
1935 ret = find_usage_forwards(hlock_class(this), 0); 2103 root.parent = NULL;
1936 if (!ret || ret == 1) 2104 root.class = hlock_class(this);
2105 ret = find_usage_forwards(&root, bit, &target_entry);
2106 if (ret < 0)
2107 return print_bfs_bug(ret);
2108 if (ret == 1)
1937 return ret; 2109 return ret;
1938 2110
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2111 return print_irq_inversion_bug(curr, &root, target_entry,
2112 this, 1, irqclass);
1940} 2113}
1941 2114
1942/* 2115/*
@@ -1948,14 +2121,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2121 enum lock_usage_bit bit, const char *irqclass)
1949{ 2122{
1950 int ret; 2123 int ret;
1951 2124 struct lock_list root;
1952 find_usage_bit = bit; 2125 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2126
1954 ret = find_usage_backwards(hlock_class(this), 0); 2127 root.parent = NULL;
1955 if (!ret || ret == 1) 2128 root.class = hlock_class(this);
2129 ret = find_usage_backwards(&root, bit, &target_entry);
2130 if (ret < 0)
2131 return print_bfs_bug(ret);
2132 if (ret == 1)
1956 return ret; 2133 return ret;
1957 2134
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2135 return print_irq_inversion_bug(curr, &root, target_entry,
2136 this, 1, irqclass);
1959} 2137}
1960 2138
1961void print_irqtrace_events(struct task_struct *curr) 2139void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2708,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2708 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2709static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2710 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2711 struct lockdep_map *nest_lock, unsigned long ip,
2712 int references)
2534{ 2713{
2535 struct task_struct *curr = current; 2714 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2715 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2716 struct held_lock *hlock;
2538 unsigned int depth, id; 2717 unsigned int depth, id;
2539 int chain_head = 0; 2718 int chain_head = 0;
2719 int class_idx;
2540 u64 chain_key; 2720 u64 chain_key;
2541 2721
2542 if (!prove_locking) 2722 if (!prove_locking)
@@ -2584,10 +2764,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2764 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2765 return 0;
2586 2766
2767 class_idx = class - lock_classes + 1;
2768
2769 if (depth) {
2770 hlock = curr->held_locks + depth - 1;
2771 if (hlock->class_idx == class_idx && nest_lock) {
2772 if (hlock->references)
2773 hlock->references++;
2774 else
2775 hlock->references = 2;
2776
2777 return 1;
2778 }
2779 }
2780
2587 hlock = curr->held_locks + depth; 2781 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2782 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2783 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2784 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2785 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2786 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2787 hlock->nest_lock = nest_lock;
@@ -2595,6 +2789,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2789 hlock->read = read;
2596 hlock->check = check; 2790 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2791 hlock->hardirqs_off = !!hardirqs_off;
2792 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2793#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2794 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2795 hlock->holdtime_stamp = sched_clock();
@@ -2703,6 +2898,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2898 return 1;
2704} 2899}
2705 2900
2901static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2902{
2903 if (hlock->instance == lock)
2904 return 1;
2905
2906 if (hlock->references) {
2907 struct lock_class *class = lock->class_cache;
2908
2909 if (!class)
2910 class = look_up_lock_class(lock, 0);
2911
2912 if (DEBUG_LOCKS_WARN_ON(!class))
2913 return 0;
2914
2915 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2916 return 0;
2917
2918 if (hlock->class_idx == class - lock_classes + 1)
2919 return 1;
2920 }
2921
2922 return 0;
2923}
2924
2706static int 2925static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2926__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2927 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2945,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2945 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2946 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2947 break;
2729 if (hlock->instance == lock) 2948 if (match_held_lock(hlock, lock))
2730 goto found_it; 2949 goto found_it;
2731 prev_hlock = hlock; 2950 prev_hlock = hlock;
2732 } 2951 }
@@ -2745,7 +2964,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2964 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2965 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2966 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2967 hlock->nest_lock, hlock->acquire_ip,
2968 hlock->references))
2749 return 0; 2969 return 0;
2750 } 2970 }
2751 2971
@@ -2784,20 +3004,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3004 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3005 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3006 break;
2787 if (hlock->instance == lock) 3007 if (match_held_lock(hlock, lock))
2788 goto found_it; 3008 goto found_it;
2789 prev_hlock = hlock; 3009 prev_hlock = hlock;
2790 } 3010 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3011 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3012
2793found_it: 3013found_it:
2794 lock_release_holdtime(hlock); 3014 if (hlock->instance == lock)
3015 lock_release_holdtime(hlock);
3016
3017 if (hlock->references) {
3018 hlock->references--;
3019 if (hlock->references) {
3020 /*
3021 * We had, and after removing one, still have
3022 * references, the current lock stack is still
3023 * valid. We're done!
3024 */
3025 return 1;
3026 }
3027 }
2795 3028
2796 /* 3029 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3030 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3031 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3032 * entries (if any), recalculating the hash along the way:
2800 */ 3033 */
3034
2801 curr->lockdep_depth = i; 3035 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3036 curr->curr_chain_key = hlock->prev_chain_key;
2803 3037
@@ -2806,7 +3040,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3040 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3041 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3042 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3043 hlock->nest_lock, hlock->acquire_ip,
3044 hlock->references))
2810 return 0; 3045 return 0;
2811 } 3046 }
2812 3047
@@ -2836,7 +3071,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3071 /*
2837 * Is the unlock non-nested: 3072 * Is the unlock non-nested:
2838 */ 3073 */
2839 if (hlock->instance != lock) 3074 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3075 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3076 curr->lockdep_depth--;
2842 3077
@@ -2881,6 +3116,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3116 check_chain_key(curr);
2882} 3117}
2883 3118
3119static int __lock_is_held(struct lockdep_map *lock)
3120{
3121 struct task_struct *curr = current;
3122 int i;
3123
3124 for (i = 0; i < curr->lockdep_depth; i++) {
3125 struct held_lock *hlock = curr->held_locks + i;
3126
3127 if (match_held_lock(hlock, lock))
3128 return 1;
3129 }
3130
3131 return 0;
3132}
3133
2884/* 3134/*
2885 * Check whether we follow the irq-flags state precisely: 3135 * Check whether we follow the irq-flags state precisely:
2886 */ 3136 */
@@ -2957,7 +3207,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3207
2958 current->lockdep_recursion = 1; 3208 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3209 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3210 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3211 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3212 raw_local_irq_restore(flags);
2963} 3213}
@@ -2982,6 +3232,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3232}
2983EXPORT_SYMBOL_GPL(lock_release); 3233EXPORT_SYMBOL_GPL(lock_release);
2984 3234
3235int lock_is_held(struct lockdep_map *lock)
3236{
3237 unsigned long flags;
3238 int ret = 0;
3239
3240 if (unlikely(current->lockdep_recursion))
3241 return ret;
3242
3243 raw_local_irq_save(flags);
3244 check_flags(flags);
3245
3246 current->lockdep_recursion = 1;
3247 ret = __lock_is_held(lock);
3248 current->lockdep_recursion = 0;
3249 raw_local_irq_restore(flags);
3250
3251 return ret;
3252}
3253EXPORT_SYMBOL_GPL(lock_is_held);
3254
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3255void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3256{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3257 current->lockdep_reclaim_gfp = gfp_mask;
@@ -3041,7 +3311,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3311 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3312 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3313 break;
3044 if (hlock->instance == lock) 3314 if (match_held_lock(hlock, lock))
3045 goto found_it; 3315 goto found_it;
3046 prev_hlock = hlock; 3316 prev_hlock = hlock;
3047 } 3317 }
@@ -3049,6 +3319,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3319 return;
3050 3320
3051found_it: 3321found_it:
3322 if (hlock->instance != lock)
3323 return;
3324
3052 hlock->waittime_stamp = sched_clock(); 3325 hlock->waittime_stamp = sched_clock();
3053 3326
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3327 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3361,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3361 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3362 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3363 break;
3091 if (hlock->instance == lock) 3364 if (match_held_lock(hlock, lock))
3092 goto found_it; 3365 goto found_it;
3093 prev_hlock = hlock; 3366 prev_hlock = hlock;
3094 } 3367 }
@@ -3096,6 +3369,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3369 return;
3097 3370
3098found_it: 3371found_it:
3372 if (hlock->instance != lock)
3373 return;
3374
3099 cpu = smp_processor_id(); 3375 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3376 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3377 now = sched_clock();
@@ -3326,7 +3602,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3602 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3603 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3604 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3605 sizeof(struct list_head) * CHAINHASH_SIZE
3606#ifdef CONFIG_PROVE_LOCKING
3607 + sizeof(struct circular_queue)
3608#endif
3609 ) / 1024
3610 );
3330 3611
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3612 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3613 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e94caa666dba..d4aba4f3584c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -670,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
670 return 0; 594 return 0;
671} 595}
672 596
673static struct seq_operations lockstat_ops = { 597static const struct seq_operations lockstat_ops = {
674 .start = ls_start, 598 .start = ls_start,
675 .next = ls_next, 599 .next = ls_next,
676 .stop = ls_stop, 600 .stop = ls_stop,
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by markers_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * rcu_read_lock_sched does two things : disabling preemption to make
108 * sure the teardown of the callbacks can be done correctly when they
109 * are in modules and they insure RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137 * we must insure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171 /* Must read the ptype before ptr. They are not data dependant,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176 * dependant, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191 * we must insure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
426
427/*
428 * Remove the marker from the marker hash table. Must be called with mutex_lock
429 * held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509 * Sanity check :
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: only waiting an RCU period after setting elem->call to the empty
573 * function insures that the original callback is not used anymore. This insured
574 * by rcu_read_lock_sched around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * Internal callback only changed before the first probe is connected to it.
640 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomicity of the callback/private data updates valid.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Site effect : marker_set_format may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * private data must be a valid allocated memory address, or NULL.
670 * Returns 0 if ok, error value on error.
671 * The probe address must at least be aligned on the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
724
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns the private data given to marker_probe_register, or an ERR_PTR().
732 * We do not need to call a synchronize_sched to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which insures that every preempt disabled section
735 * have finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776 unsigned int i;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791 for (i = 0; closure[i].func; i++) {
792 if (closure[i].func == probe &&
793 closure[i].probe_private
794 == probe_private)
795 return entry;
796 }
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first marker found in hash table.
810 * Return 0 on success or error value.
811 * We do not need to call a synchronize_sched to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which insures that every preempt disabled section
814 * have finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) matching, or an
859 * ERR_PTR.
860 * Returns the private data pointer, or an ERR_PTR.
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 2d537186191f..8b7d8805819d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
47#include <linux/rculist.h> 47#include <linux/rculist.h>
48#include <asm/uaccess.h> 48#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/mmu_context.h>
50#include <linux/license.h> 51#include <linux/license.h>
51#include <asm/sections.h> 52#include <asm/sections.h>
52#include <linux/tracepoint.h> 53#include <linux/tracepoint.h>
@@ -55,6 +56,11 @@
55#include <linux/percpu.h> 56#include <linux/percpu.h>
56#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
57 58
59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h>
61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
58#if 0 64#if 0
59#define DEBUGP printk 65#define DEBUGP printk
60#else 66#else
@@ -364,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module);
364 370
365#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
366 372
367#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
368 374
369static void *percpu_modalloc(unsigned long size, unsigned long align, 375static void *percpu_modalloc(unsigned long size, unsigned long align,
370 const char *name) 376 const char *name)
@@ -389,7 +395,7 @@ static void percpu_modfree(void *freeme)
389 free_percpu(freeme); 395 free_percpu(freeme);
390} 396}
391 397
392#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
393 399
394/* Number of blocks used and allocated. */ 400/* Number of blocks used and allocated. */
395static unsigned int pcpu_num_used, pcpu_num_allocated; 401static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +541,7 @@ static int percpu_modinit(void)
535} 541}
536__initcall(percpu_modinit); 542__initcall(percpu_modinit);
537 543
538#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
539 545
540static unsigned int find_pcpusec(Elf_Ehdr *hdr, 546static unsigned int find_pcpusec(Elf_Ehdr *hdr,
541 Elf_Shdr *sechdrs, 547 Elf_Shdr *sechdrs,
@@ -942,6 +948,8 @@ void module_put(struct module *module)
942 if (module) { 948 if (module) {
943 unsigned int cpu = get_cpu(); 949 unsigned int cpu = get_cpu();
944 local_dec(__module_ref_addr(module, cpu)); 950 local_dec(__module_ref_addr(module, cpu));
951 trace_module_put(module, _RET_IP_,
952 local_read(__module_ref_addr(module, cpu)));
945 /* Maybe they're waiting for us to drop reference? */ 953 /* Maybe they're waiting for us to drop reference? */
946 if (unlikely(!module_is_live(module))) 954 if (unlikely(!module_is_live(module)))
947 wake_up_process(module->waiter); 955 wake_up_process(module->waiter);
@@ -1497,6 +1505,8 @@ static int __unlink_module(void *_mod)
1497/* Free a module, remove from lists, etc (must hold module_mutex). */ 1505/* Free a module, remove from lists, etc (must hold module_mutex). */
1498static void free_module(struct module *mod) 1506static void free_module(struct module *mod)
1499{ 1507{
1508 trace_module_free(mod);
1509
1500 /* Delete from various lists */ 1510 /* Delete from various lists */
1501 stop_machine(__unlink_module, mod, NULL); 1511 stop_machine(__unlink_module, mod, NULL);
1502 remove_notes_attrs(mod); 1512 remove_notes_attrs(mod);
@@ -1526,6 +1536,10 @@ static void free_module(struct module *mod)
1526 1536
1527 /* Finally, free the core (containing the module structure) */ 1537 /* Finally, free the core (containing the module structure) */
1528 module_free(mod, mod->module_core); 1538 module_free(mod, mod->module_core);
1539
1540#ifdef CONFIG_MPU
1541 update_protections(current->mm);
1542#endif
1529} 1543}
1530 1544
1531void *__symbol_get(const char *symbol) 1545void *__symbol_get(const char *symbol)
@@ -1783,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1783 } 1797 }
1784} 1798}
1785 1799
1800static void free_modinfo(struct module *mod)
1801{
1802 struct module_attribute *attr;
1803 int i;
1804
1805 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1806 if (attr->free)
1807 attr->free(mod);
1808 }
1809}
1810
1786#ifdef CONFIG_KALLSYMS 1811#ifdef CONFIG_KALLSYMS
1787 1812
1788/* lookup symbol in given range of kernel_symbols */ 1813/* lookup symbol in given range of kernel_symbols */
@@ -1848,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym,
1848 return '?'; 1873 return '?';
1849} 1874}
1850 1875
1876static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
1877 unsigned int shnum)
1878{
1879 const Elf_Shdr *sec;
1880
1881 if (src->st_shndx == SHN_UNDEF
1882 || src->st_shndx >= shnum
1883 || !src->st_name)
1884 return false;
1885
1886 sec = sechdrs + src->st_shndx;
1887 if (!(sec->sh_flags & SHF_ALLOC)
1888#ifndef CONFIG_KALLSYMS_ALL
1889 || !(sec->sh_flags & SHF_EXECINSTR)
1890#endif
1891 || (sec->sh_entsize & INIT_OFFSET_MASK))
1892 return false;
1893
1894 return true;
1895}
1896
1897static unsigned long layout_symtab(struct module *mod,
1898 Elf_Shdr *sechdrs,
1899 unsigned int symindex,
1900 unsigned int strindex,
1901 const Elf_Ehdr *hdr,
1902 const char *secstrings,
1903 unsigned long *pstroffs,
1904 unsigned long *strmap)
1905{
1906 unsigned long symoffs;
1907 Elf_Shdr *symsect = sechdrs + symindex;
1908 Elf_Shdr *strsect = sechdrs + strindex;
1909 const Elf_Sym *src;
1910 const char *strtab;
1911 unsigned int i, nsrc, ndst;
1912
1913 /* Put symbol section at end of init part of module. */
1914 symsect->sh_flags |= SHF_ALLOC;
1915 symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
1916 symindex) | INIT_OFFSET_MASK;
1917 DEBUGP("\t%s\n", secstrings + symsect->sh_name);
1918
1919 src = (void *)hdr + symsect->sh_offset;
1920 nsrc = symsect->sh_size / sizeof(*src);
1921 strtab = (void *)hdr + strsect->sh_offset;
1922 for (ndst = i = 1; i < nsrc; ++i, ++src)
1923 if (is_core_symbol(src, sechdrs, hdr->e_shnum)) {
1924 unsigned int j = src->st_name;
1925
1926 while(!__test_and_set_bit(j, strmap) && strtab[j])
1927 ++j;
1928 ++ndst;
1929 }
1930
1931 /* Append room for core symbols at end of core part. */
1932 symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
1933 mod->core_size = symoffs + ndst * sizeof(Elf_Sym);
1934
1935 /* Put string table section at end of init part of module. */
1936 strsect->sh_flags |= SHF_ALLOC;
1937 strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
1938 strindex) | INIT_OFFSET_MASK;
1939 DEBUGP("\t%s\n", secstrings + strsect->sh_name);
1940
1941 /* Append room for core symbols' strings at end of core part. */
1942 *pstroffs = mod->core_size;
1943 __set_bit(0, strmap);
1944 mod->core_size += bitmap_weight(strmap, strsect->sh_size);
1945
1946 return symoffs;
1947}
1948
1851static void add_kallsyms(struct module *mod, 1949static void add_kallsyms(struct module *mod,
1852 Elf_Shdr *sechdrs, 1950 Elf_Shdr *sechdrs,
1951 unsigned int shnum,
1853 unsigned int symindex, 1952 unsigned int symindex,
1854 unsigned int strindex, 1953 unsigned int strindex,
1855 const char *secstrings) 1954 unsigned long symoffs,
1955 unsigned long stroffs,
1956 const char *secstrings,
1957 unsigned long *strmap)
1856{ 1958{
1857 unsigned int i; 1959 unsigned int i, ndst;
1960 const Elf_Sym *src;
1961 Elf_Sym *dst;
1962 char *s;
1858 1963
1859 mod->symtab = (void *)sechdrs[symindex].sh_addr; 1964 mod->symtab = (void *)sechdrs[symindex].sh_addr;
1860 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1965 mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1864,13 +1969,46 @@ static void add_kallsyms(struct module *mod,
1864 for (i = 0; i < mod->num_symtab; i++) 1969 for (i = 0; i < mod->num_symtab; i++)
1865 mod->symtab[i].st_info 1970 mod->symtab[i].st_info
1866 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); 1971 = elf_type(&mod->symtab[i], sechdrs, secstrings, mod);
1972
1973 mod->core_symtab = dst = mod->module_core + symoffs;
1974 src = mod->symtab;
1975 *dst = *src;
1976 for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
1977 if (!is_core_symbol(src, sechdrs, shnum))
1978 continue;
1979 dst[ndst] = *src;
1980 dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name);
1981 ++ndst;
1982 }
1983 mod->core_num_syms = ndst;
1984
1985 mod->core_strtab = s = mod->module_core + stroffs;
1986 for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i)
1987 if (test_bit(i, strmap))
1988 *++s = mod->strtab[i];
1867} 1989}
1868#else 1990#else
1991static inline unsigned long layout_symtab(struct module *mod,
1992 Elf_Shdr *sechdrs,
1993 unsigned int symindex,
1994 unsigned int strindex,
1995 const Elf_Ehdr *hdr,
1996 const char *secstrings,
1997 unsigned long *pstroffs,
1998 unsigned long *strmap)
1999{
2000 return 0;
2001}
2002
1869static inline void add_kallsyms(struct module *mod, 2003static inline void add_kallsyms(struct module *mod,
1870 Elf_Shdr *sechdrs, 2004 Elf_Shdr *sechdrs,
2005 unsigned int shnum,
1871 unsigned int symindex, 2006 unsigned int symindex,
1872 unsigned int strindex, 2007 unsigned int strindex,
1873 const char *secstrings) 2008 unsigned long symoffs,
2009 unsigned long stroffs,
2010 const char *secstrings,
2011 const unsigned long *strmap)
1874{ 2012{
1875} 2013}
1876#endif /* CONFIG_KALLSYMS */ 2014#endif /* CONFIG_KALLSYMS */
@@ -1945,6 +2083,8 @@ static noinline struct module *load_module(void __user *umod,
1945 struct module *mod; 2083 struct module *mod;
1946 long err = 0; 2084 long err = 0;
1947 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2085 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
2086 unsigned long symoffs, stroffs, *strmap;
2087
1948 mm_segment_t old_fs; 2088 mm_segment_t old_fs;
1949 2089
1950 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 2090 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2026,11 +2166,6 @@ static noinline struct module *load_module(void __user *umod,
2026 /* Don't keep modinfo and version sections. */ 2166 /* Don't keep modinfo and version sections. */
2027 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2167 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2028 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2168 sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2029#ifdef CONFIG_KALLSYMS
2030 /* Keep symbol and string tables for decoding later. */
2031 sechdrs[symindex].sh_flags |= SHF_ALLOC;
2032 sechdrs[strindex].sh_flags |= SHF_ALLOC;
2033#endif
2034 2169
2035 /* Check module struct version now, before we try to use module. */ 2170 /* Check module struct version now, before we try to use module. */
2036 if (!check_modstruct_version(sechdrs, versindex, mod)) { 2171 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -2066,6 +2201,13 @@ static noinline struct module *load_module(void __user *umod,
2066 goto free_hdr; 2201 goto free_hdr;
2067 } 2202 }
2068 2203
2204 strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size)
2205 * sizeof(long), GFP_KERNEL);
2206 if (!strmap) {
2207 err = -ENOMEM;
2208 goto free_mod;
2209 }
2210
2069 if (find_module(mod->name)) { 2211 if (find_module(mod->name)) {
2070 err = -EEXIST; 2212 err = -EEXIST;
2071 goto free_mod; 2213 goto free_mod;
@@ -2095,6 +2237,8 @@ static noinline struct module *load_module(void __user *umod,
2095 this is done generically; there doesn't appear to be any 2237 this is done generically; there doesn't appear to be any
2096 special cases for the architectures. */ 2238 special cases for the architectures. */
2097 layout_sections(mod, hdr, sechdrs, secstrings); 2239 layout_sections(mod, hdr, sechdrs, secstrings);
2240 symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr,
2241 secstrings, &stroffs, strmap);
2098 2242
2099 /* Do the allocs. */ 2243 /* Do the allocs. */
2100 ptr = module_alloc_update_bounds(mod->core_size); 2244 ptr = module_alloc_update_bounds(mod->core_size);
@@ -2228,10 +2372,6 @@ static noinline struct module *load_module(void __user *umod,
2228 sizeof(*mod->ctors), &mod->num_ctors); 2372 sizeof(*mod->ctors), &mod->num_ctors);
2229#endif 2373#endif
2230 2374
2231#ifdef CONFIG_MARKERS
2232 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2233 sizeof(*mod->markers), &mod->num_markers);
2234#endif
2235#ifdef CONFIG_TRACEPOINTS 2375#ifdef CONFIG_TRACEPOINTS
2236 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2376 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2237 "__tracepoints", 2377 "__tracepoints",
@@ -2303,7 +2443,10 @@ static noinline struct module *load_module(void __user *umod,
2303 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2443 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
2304 sechdrs[pcpuindex].sh_size); 2444 sechdrs[pcpuindex].sh_size);
2305 2445
2306 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2446 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
2447 symoffs, stroffs, secstrings, strmap);
2448 kfree(strmap);
2449 strmap = NULL;
2307 2450
2308 if (!mod->taints) { 2451 if (!mod->taints) {
2309 struct _ddebug *debug; 2452 struct _ddebug *debug;
@@ -2364,6 +2507,8 @@ static noinline struct module *load_module(void __user *umod,
2364 /* Get rid of temporary copy */ 2507 /* Get rid of temporary copy */
2365 vfree(hdr); 2508 vfree(hdr);
2366 2509
2510 trace_module_load(mod);
2511
2367 /* Done! */ 2512 /* Done! */
2368 return mod; 2513 return mod;
2369 2514
@@ -2373,13 +2518,14 @@ static noinline struct module *load_module(void __user *umod,
2373 synchronize_sched(); 2518 synchronize_sched();
2374 module_arch_cleanup(mod); 2519 module_arch_cleanup(mod);
2375 cleanup: 2520 cleanup:
2521 free_modinfo(mod);
2376 kobject_del(&mod->mkobj.kobj); 2522 kobject_del(&mod->mkobj.kobj);
2377 kobject_put(&mod->mkobj.kobj); 2523 kobject_put(&mod->mkobj.kobj);
2378 free_unload: 2524 free_unload:
2379 module_unload_free(mod); 2525 module_unload_free(mod);
2380#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2526#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2381 free_init:
2382 percpu_modfree(mod->refptr); 2527 percpu_modfree(mod->refptr);
2528 free_init:
2383#endif 2529#endif
2384 module_free(mod, mod->module_init); 2530 module_free(mod, mod->module_init);
2385 free_core: 2531 free_core:
@@ -2390,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod,
2390 percpu_modfree(percpu); 2536 percpu_modfree(percpu);
2391 free_mod: 2537 free_mod:
2392 kfree(args); 2538 kfree(args);
2539 kfree(strmap);
2393 free_hdr: 2540 free_hdr:
2394 vfree(hdr); 2541 vfree(hdr);
2395 return ERR_PTR(err); 2542 return ERR_PTR(err);
@@ -2479,6 +2626,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2479 /* Drop initial reference. */ 2626 /* Drop initial reference. */
2480 module_put(mod); 2627 module_put(mod);
2481 trim_init_extable(mod); 2628 trim_init_extable(mod);
2629#ifdef CONFIG_KALLSYMS
2630 mod->num_symtab = mod->core_num_syms;
2631 mod->symtab = mod->core_symtab;
2632 mod->strtab = mod->core_strtab;
2633#endif
2482 module_free(mod, mod->module_init); 2634 module_free(mod, mod->module_init);
2483 mod->module_init = NULL; 2635 mod->module_init = NULL;
2484 mod->init_size = 0; 2636 mod->init_size = 0;
@@ -2940,27 +3092,12 @@ void module_layout(struct module *mod,
2940 struct modversion_info *ver, 3092 struct modversion_info *ver,
2941 struct kernel_param *kp, 3093 struct kernel_param *kp,
2942 struct kernel_symbol *ks, 3094 struct kernel_symbol *ks,
2943 struct marker *marker,
2944 struct tracepoint *tp) 3095 struct tracepoint *tp)
2945{ 3096{
2946} 3097}
2947EXPORT_SYMBOL(module_layout); 3098EXPORT_SYMBOL(module_layout);
2948#endif 3099#endif
2949 3100
2950#ifdef CONFIG_MARKERS
2951void module_update_markers(void)
2952{
2953 struct module *mod;
2954
2955 mutex_lock(&module_mutex);
2956 list_for_each_entry(mod, &modules, list)
2957 if (!mod->taints)
2958 marker_update_probe_range(mod->markers,
2959 mod->markers + mod->num_markers);
2960 mutex_unlock(&module_mutex);
2961}
2962#endif
2963
2964#ifdef CONFIG_TRACEPOINTS 3101#ifdef CONFIG_TRACEPOINTS
2965void module_update_tracepoints(void) 3102void module_update_tracepoints(void)
2966{ 3103{
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 5aa854f9e5ae..2a5dfec8efe0 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
42 * (hence either you are in the same cgroup as task, or in an 42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof) 43 * ancestor cgroup thereof)
44 */ 44 */
45static int ns_can_attach(struct cgroup_subsys *ss, 45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct cgroup *new_cgroup, struct task_struct *task) 46 struct task_struct *task, bool threadgroup)
47{ 47{
48 if (current != task) { 48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN)) 49 if (!capable(CAP_SYS_ADMIN))
@@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss,
56 if (!cgroup_is_descendant(new_cgroup, task)) 56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM; 57 return -EPERM;
58 58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
59 return 0; 71 return 0;
60} 72}
61 73
diff --git a/kernel/panic.c b/kernel/panic.c
index 512ab73b0ca3..bcdef26e3332 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,7 +177,7 @@ static const struct tnt tnts[] = {
177 * 'W' - Taint on warning. 177 * 'W' - Taint on warning.
178 * 'C' - modules from drivers/staging are loaded. 178 * 'C' - modules from drivers/staging are loaded.
179 * 179 *
180 * The string is overwritten by the next call to print_taint(). 180 * The string is overwritten by the next call to print_tainted().
181 */ 181 */
182const char *print_tainted(void) 182const char *print_tainted(void)
183{ 183{
diff --git a/kernel/params.c b/kernel/params.c
index 7f6912ced2ba..9da58eabdcb2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/device.h> 23#include <linux/device.h>
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
@@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val)
87 } 88 }
88 89
89 for (i = 0; args[i]; i++) { 90 for (i = 0; args[i]; i++) {
90 if (args[i] == ' ' && !in_quote) 91 if (isspace(args[i]) && !in_quote)
91 break; 92 break;
92 if (equals == 0) { 93 if (equals == 0) {
93 if (args[i] == '=') 94 if (args[i] == '=')
@@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i; 122 next = args + i;
122 123
123 /* Chew up trailing spaces. */ 124 /* Chew up trailing spaces. */
124 while (*next == ' ') 125 while (isspace(*next))
125 next++; 126 next++;
126 return next; 127 return next;
127} 128}
@@ -138,7 +139,7 @@ int parse_args(const char *name,
138 DEBUGP("Parsing ARGS: %s\n", args); 139 DEBUGP("Parsing ARGS: %s\n", args);
139 140
140 /* Chew leading spaces */ 141 /* Chew leading spaces */
141 while (*args == ' ') 142 while (isspace(*args))
142 args++; 143 args++;
143 144
144 while (*args) { 145 while (*args) {
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index d7cbc579fc80..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,4861 +0,0 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * 0 - not paranoid
50 * 1 - disallow cpu counters to unpriv
51 * 2 - disallow kernel profiling to unpriv
52 */
53int sysctl_perf_counter_paranoid __read_mostly = 1;
54
55static inline bool perf_paranoid_cpu(void)
56{
57 return sysctl_perf_counter_paranoid > 0;
58}
59
60static inline bool perf_paranoid_kernel(void)
61{
62 return sysctl_perf_counter_paranoid > 1;
63}
64
65int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
66
67/*
68 * max perf counter sample rate
69 */
70int sysctl_perf_counter_sample_rate __read_mostly = 100000;
71
72static atomic64_t perf_counter_id;
73
74/*
75 * Lock for (sysadmin-configurable) counter reservations:
76 */
77static DEFINE_SPINLOCK(perf_resource_lock);
78
79/*
80 * Architecture provided APIs - weak aliases:
81 */
82extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
83{
84 return NULL;
85}
86
87void __weak hw_perf_disable(void) { barrier(); }
88void __weak hw_perf_enable(void) { barrier(); }
89
90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
92
93int __weak
94hw_perf_group_sched_in(struct perf_counter *group_leader,
95 struct perf_cpu_context *cpuctx,
96 struct perf_counter_context *ctx, int cpu)
97{
98 return 0;
99}
100
101void __weak perf_counter_print_debug(void) { }
102
103static DEFINE_PER_CPU(int, disable_count);
104
105void __perf_disable(void)
106{
107 __get_cpu_var(disable_count)++;
108}
109
110bool __perf_enable(void)
111{
112 return !--__get_cpu_var(disable_count);
113}
114
/* Record one more level of disable nesting, then call the hardware hook. */
void perf_disable(void)
{
	__perf_disable();

	hw_perf_disable();
}
120
/*
 * Undo one perf_disable(); the hardware hook is only called once the
 * nesting count has dropped back to zero.
 */
void perf_enable(void)
{
	if (!__perf_enable())
		return;

	hw_perf_enable();
}
126
127static void get_ctx(struct perf_counter_context *ctx)
128{
129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
130}
131
132static void free_ctx(struct rcu_head *head)
133{
134 struct perf_counter_context *ctx;
135
136 ctx = container_of(head, struct perf_counter_context, rcu_head);
137 kfree(ctx);
138}
139
140static void put_ctx(struct perf_counter_context *ctx)
141{
142 if (atomic_dec_and_test(&ctx->refcount)) {
143 if (ctx->parent_ctx)
144 put_ctx(ctx->parent_ctx);
145 if (ctx->task)
146 put_task_struct(ctx->task);
147 call_rcu(&ctx->rcu_head, free_ctx);
148 }
149}
150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
173/*
174 * Get the perf_counter_context for a task and lock it.
175 * This has to cope with with the fact that until it is locked,
176 * the context could get moved to another task.
177 */
178static struct perf_counter_context *
179perf_lock_task_context(struct task_struct *task, unsigned long *flags)
180{
181 struct perf_counter_context *ctx;
182
183 rcu_read_lock();
184 retry:
185 ctx = rcu_dereference(task->perf_counter_ctxp);
186 if (ctx) {
187 /*
188 * If this context is a clone of another, it might
189 * get swapped for another underneath us by
190 * perf_counter_task_sched_out, though the
191 * rcu_read_lock() protects us from any context
192 * getting freed. Lock the context and check if it
193 * got swapped before we could get the lock, and retry
194 * if so. If we locked the right context, then it
195 * can't get swapped on us any more.
196 */
197 spin_lock_irqsave(&ctx->lock, *flags);
198 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
199 spin_unlock_irqrestore(&ctx->lock, *flags);
200 goto retry;
201 }
202
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
206 }
207 }
208 rcu_read_unlock();
209 return ctx;
210}
211
212/*
213 * Get the context for a task and increment its pin_count so it
214 * can't get swapped to another task. This also increments its
215 * reference count so that the context can't get freed.
216 */
217static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
218{
219 struct perf_counter_context *ctx;
220 unsigned long flags;
221
222 ctx = perf_lock_task_context(task, &flags);
223 if (ctx) {
224 ++ctx->pin_count;
225 spin_unlock_irqrestore(&ctx->lock, flags);
226 }
227 return ctx;
228}
229
230static void perf_unpin_context(struct perf_counter_context *ctx)
231{
232 unsigned long flags;
233
234 spin_lock_irqsave(&ctx->lock, flags);
235 --ctx->pin_count;
236 spin_unlock_irqrestore(&ctx->lock, flags);
237 put_ctx(ctx);
238}
239
240/*
241 * Add a counter from the lists for its context.
242 * Must be called with ctx->mutex and ctx->lock held.
243 */
244static void
245list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246{
247 struct perf_counter *group_leader = counter->group_leader;
248
249 /*
250 * Depending on whether it is a standalone or sibling counter,
251 * add it straight to the context's counter list, or to the group
252 * leader's sibling list:
253 */
254 if (group_leader == counter)
255 list_add_tail(&counter->list_entry, &ctx->counter_list);
256 else {
257 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
258 group_leader->nr_siblings++;
259 }
260
261 list_add_rcu(&counter->event_entry, &ctx->event_list);
262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
265}
266
267/*
268 * Remove a counter from the lists for its context.
269 * Must be called with ctx->mutex and ctx->lock held.
270 */
271static void
272list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
273{
274 struct perf_counter *sibling, *tmp;
275
276 if (list_empty(&counter->list_entry))
277 return;
278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
281
282 list_del_init(&counter->list_entry);
283 list_del_rcu(&counter->event_entry);
284
285 if (counter->group_leader != counter)
286 counter->group_leader->nr_siblings--;
287
288 /*
289 * If this was a group counter with sibling counters then
290 * upgrade the siblings to singleton counters by adding them
291 * to the context list directly:
292 */
293 list_for_each_entry_safe(sibling, tmp,
294 &counter->sibling_list, list_entry) {
295
296 list_move_tail(&sibling->list_entry, &ctx->counter_list);
297 sibling->group_leader = sibling;
298 }
299}
300
301static void
302counter_sched_out(struct perf_counter *counter,
303 struct perf_cpu_context *cpuctx,
304 struct perf_counter_context *ctx)
305{
306 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
307 return;
308
309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
314 counter->tstamp_stopped = ctx->time;
315 counter->pmu->disable(counter);
316 counter->oncpu = -1;
317
318 if (!is_software_counter(counter))
319 cpuctx->active_oncpu--;
320 ctx->nr_active--;
321 if (counter->attr.exclusive || !cpuctx->active_oncpu)
322 cpuctx->exclusive = 0;
323}
324
325static void
326group_sched_out(struct perf_counter *group_counter,
327 struct perf_cpu_context *cpuctx,
328 struct perf_counter_context *ctx)
329{
330 struct perf_counter *counter;
331
332 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
333 return;
334
335 counter_sched_out(group_counter, cpuctx, ctx);
336
337 /*
338 * Schedule out siblings (if any):
339 */
340 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
341 counter_sched_out(counter, cpuctx, ctx);
342
343 if (group_counter->attr.exclusive)
344 cpuctx->exclusive = 0;
345}
346
347/*
348 * Cross CPU call to remove a performance counter
349 *
350 * We disable the counter on the hardware level first. After that we
351 * remove it from the context list.
352 */
353static void __perf_counter_remove_from_context(void *info)
354{
355 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
356 struct perf_counter *counter = info;
357 struct perf_counter_context *ctx = counter->ctx;
358
359 /*
360 * If this is a task context, we need to check whether it is
361 * the current task context of this cpu. If not it has been
362 * scheduled out before the smp call arrived.
363 */
364 if (ctx->task && cpuctx->task_ctx != ctx)
365 return;
366
367 spin_lock(&ctx->lock);
368 /*
369 * Protect the list operation against NMI by disabling the
370 * counters on a global level.
371 */
372 perf_disable();
373
374 counter_sched_out(counter, cpuctx, ctx);
375
376 list_del_counter(counter, ctx);
377
378 if (!ctx->task) {
379 /*
380 * Allow more per task counters with respect to the
381 * reservation:
382 */
383 cpuctx->max_pertask =
384 min(perf_max_counters - ctx->nr_counters,
385 perf_max_counters - perf_reserved_percpu);
386 }
387
388 perf_enable();
389 spin_unlock(&ctx->lock);
390}
391
392
393/*
394 * Remove the counter from a task's (or a CPU's) list of counters.
395 *
396 * Must be called with ctx->mutex held.
397 *
398 * CPU counters are removed with a smp call. For task counters we only
399 * call when the task is on a CPU.
400 *
401 * If counter->ctx is a cloned context, callers must make sure that
402 * every task struct that counter->ctx->task could possibly point to
403 * remains valid. This is OK when called from perf_release since
404 * that only calls us on the top-level context, which can't be a clone.
405 * When called from perf_counter_exit_task, it's OK because the
406 * context has been detached from its task.
407 */
408static void perf_counter_remove_from_context(struct perf_counter *counter)
409{
410 struct perf_counter_context *ctx = counter->ctx;
411 struct task_struct *task = ctx->task;
412
413 if (!task) {
414 /*
415 * Per cpu counters are removed via an smp call and
416 * the removal is always sucessful.
417 */
418 smp_call_function_single(counter->cpu,
419 __perf_counter_remove_from_context,
420 counter, 1);
421 return;
422 }
423
424retry:
425 task_oncpu_function_call(task, __perf_counter_remove_from_context,
426 counter);
427
428 spin_lock_irq(&ctx->lock);
429 /*
430 * If the context is active we need to retry the smp call.
431 */
432 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
433 spin_unlock_irq(&ctx->lock);
434 goto retry;
435 }
436
437 /*
438 * The lock prevents that this context is scheduled in so we
439 * can remove the counter safely, if the call above did not
440 * succeed.
441 */
442 if (!list_empty(&counter->list_entry)) {
443 list_del_counter(counter, ctx);
444 }
445 spin_unlock_irq(&ctx->lock);
446}
447
448static inline u64 perf_clock(void)
449{
450 return cpu_clock(smp_processor_id());
451}
452
453/*
454 * Update the record of the current time in a context.
455 */
456static void update_context_time(struct perf_counter_context *ctx)
457{
458 u64 now = perf_clock();
459
460 ctx->time += now - ctx->timestamp;
461 ctx->timestamp = now;
462}
463
464/*
465 * Update the total_time_enabled and total_time_running fields for a counter.
466 */
467static void update_counter_times(struct perf_counter *counter)
468{
469 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end;
471
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
473 return;
474
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
476
477 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
478 run_end = counter->tstamp_stopped;
479 else
480 run_end = ctx->time;
481
482 counter->total_time_running = run_end - counter->tstamp_running;
483}
484
485/*
486 * Update total_time_enabled and total_time_running for all counters in a group.
487 */
488static void update_group_times(struct perf_counter *leader)
489{
490 struct perf_counter *counter;
491
492 update_counter_times(leader);
493 list_for_each_entry(counter, &leader->sibling_list, list_entry)
494 update_counter_times(counter);
495}
496
497/*
498 * Cross CPU call to disable a performance counter
499 */
500static void __perf_counter_disable(void *info)
501{
502 struct perf_counter *counter = info;
503 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
504 struct perf_counter_context *ctx = counter->ctx;
505
506 /*
507 * If this is a per-task counter, need to check whether this
508 * counter's task is the current task on this cpu.
509 */
510 if (ctx->task && cpuctx->task_ctx != ctx)
511 return;
512
513 spin_lock(&ctx->lock);
514
515 /*
516 * If the counter is on, turn it off.
517 * If it is in error state, leave it in error state.
518 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx);
521 update_counter_times(counter);
522 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx);
524 else
525 counter_sched_out(counter, cpuctx, ctx);
526 counter->state = PERF_COUNTER_STATE_OFF;
527 }
528
529 spin_unlock(&ctx->lock);
530}
531
532/*
533 * Disable a counter.
534 *
535 * If counter->ctx is a cloned context, callers must make sure that
536 * every task struct that counter->ctx->task could possibly point to
537 * remains valid. This condition is satisifed when called through
538 * perf_counter_for_each_child or perf_counter_for_each because they
539 * hold the top-level counter's child_mutex, so any descendant that
540 * goes to exit will block in sync_child_counter.
541 * When called from perf_pending_counter it's OK because counter->ctx
542 * is the current context on this CPU and preemption is disabled,
543 * hence we can't get into perf_counter_task_sched_out for this context.
544 */
545static void perf_counter_disable(struct perf_counter *counter)
546{
547 struct perf_counter_context *ctx = counter->ctx;
548 struct task_struct *task = ctx->task;
549
550 if (!task) {
551 /*
552 * Disable the counter on the cpu that it's on
553 */
554 smp_call_function_single(counter->cpu, __perf_counter_disable,
555 counter, 1);
556 return;
557 }
558
559 retry:
560 task_oncpu_function_call(task, __perf_counter_disable, counter);
561
562 spin_lock_irq(&ctx->lock);
563 /*
564 * If the counter is still active, we need to retry the cross-call.
565 */
566 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
567 spin_unlock_irq(&ctx->lock);
568 goto retry;
569 }
570
571 /*
572 * Since we have the lock this context can't be scheduled
573 * in, so we can change the state safely.
574 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF;
578 }
579
580 spin_unlock_irq(&ctx->lock);
581}
582
583static int
584counter_sched_in(struct perf_counter *counter,
585 struct perf_cpu_context *cpuctx,
586 struct perf_counter_context *ctx,
587 int cpu)
588{
589 if (counter->state <= PERF_COUNTER_STATE_OFF)
590 return 0;
591
592 counter->state = PERF_COUNTER_STATE_ACTIVE;
593 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
594 /*
595 * The new state must be visible before we turn it on in the hardware:
596 */
597 smp_wmb();
598
599 if (counter->pmu->enable(counter)) {
600 counter->state = PERF_COUNTER_STATE_INACTIVE;
601 counter->oncpu = -1;
602 return -EAGAIN;
603 }
604
605 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
606
607 if (!is_software_counter(counter))
608 cpuctx->active_oncpu++;
609 ctx->nr_active++;
610
611 if (counter->attr.exclusive)
612 cpuctx->exclusive = 1;
613
614 return 0;
615}
616
617static int
618group_sched_in(struct perf_counter *group_counter,
619 struct perf_cpu_context *cpuctx,
620 struct perf_counter_context *ctx,
621 int cpu)
622{
623 struct perf_counter *counter, *partial_group;
624 int ret;
625
626 if (group_counter->state == PERF_COUNTER_STATE_OFF)
627 return 0;
628
629 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
630 if (ret)
631 return ret < 0 ? ret : 0;
632
633 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
634 return -EAGAIN;
635
636 /*
637 * Schedule in siblings as one group (if any):
638 */
639 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
640 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
641 partial_group = counter;
642 goto group_error;
643 }
644 }
645
646 return 0;
647
648group_error:
649 /*
650 * Groups can be scheduled in as one unit only, so undo any
651 * partial group before returning:
652 */
653 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
654 if (counter == partial_group)
655 break;
656 counter_sched_out(counter, cpuctx, ctx);
657 }
658 counter_sched_out(group_counter, cpuctx, ctx);
659
660 return -EAGAIN;
661}
662
663/*
664 * Return 1 for a group consisting entirely of software counters,
665 * 0 if the group contains any hardware counters.
666 */
667static int is_software_only_group(struct perf_counter *leader)
668{
669 struct perf_counter *counter;
670
671 if (!is_software_counter(leader))
672 return 0;
673
674 list_for_each_entry(counter, &leader->sibling_list, list_entry)
675 if (!is_software_counter(counter))
676 return 0;
677
678 return 1;
679}
680
681/*
682 * Work out whether we can put this counter group on the CPU now.
683 */
684static int group_can_go_on(struct perf_counter *counter,
685 struct perf_cpu_context *cpuctx,
686 int can_add_hw)
687{
688 /*
689 * Groups consisting entirely of software counters can always go on.
690 */
691 if (is_software_only_group(counter))
692 return 1;
693 /*
694 * If an exclusive group is already on, no other hardware
695 * counters can go on.
696 */
697 if (cpuctx->exclusive)
698 return 0;
699 /*
700 * If this group is exclusive and there are already
701 * counters on the CPU, it can't go on.
702 */
703 if (counter->attr.exclusive && cpuctx->active_oncpu)
704 return 0;
705 /*
706 * Otherwise, try to add it if all previous groups were able
707 * to go on.
708 */
709 return can_add_hw;
710}
711
712static void add_counter_to_ctx(struct perf_counter *counter,
713 struct perf_counter_context *ctx)
714{
715 list_add_counter(counter, ctx);
716 counter->tstamp_enabled = ctx->time;
717 counter->tstamp_running = ctx->time;
718 counter->tstamp_stopped = ctx->time;
719}
720
721/*
722 * Cross CPU call to install and enable a performance counter
723 *
724 * Must be called with ctx->mutex held
725 */
726static void __perf_install_in_context(void *info)
727{
728 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
729 struct perf_counter *counter = info;
730 struct perf_counter_context *ctx = counter->ctx;
731 struct perf_counter *leader = counter->group_leader;
732 int cpu = smp_processor_id();
733 int err;
734
735 /*
736 * If this is a task context, we need to check whether it is
737 * the current task context of this cpu. If not it has been
738 * scheduled out before the smp call arrived.
739 * Or possibly this is the right context but it isn't
740 * on this cpu because it had no counters.
741 */
742 if (ctx->task && cpuctx->task_ctx != ctx) {
743 if (cpuctx->task_ctx || ctx->task != current)
744 return;
745 cpuctx->task_ctx = ctx;
746 }
747
748 spin_lock(&ctx->lock);
749 ctx->is_active = 1;
750 update_context_time(ctx);
751
752 /*
753 * Protect the list operation against NMI by disabling the
754 * counters on a global level. NOP for non NMI based counters.
755 */
756 perf_disable();
757
758 add_counter_to_ctx(counter, ctx);
759
760 /*
761 * Don't put the counter on if it is disabled or if
762 * it is in a group and the group isn't on.
763 */
764 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
765 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
766 goto unlock;
767
768 /*
769 * An exclusive counter can't go on if there are already active
770 * hardware counters, and no hardware counter can go on if there
771 * is already an exclusive counter on.
772 */
773 if (!group_can_go_on(counter, cpuctx, 1))
774 err = -EEXIST;
775 else
776 err = counter_sched_in(counter, cpuctx, ctx, cpu);
777
778 if (err) {
779 /*
780 * This counter couldn't go on. If it is in a group
781 * then we have to pull the whole group off.
782 * If the counter group is pinned then put it in error state.
783 */
784 if (leader != counter)
785 group_sched_out(leader, cpuctx, ctx);
786 if (leader->attr.pinned) {
787 update_group_times(leader);
788 leader->state = PERF_COUNTER_STATE_ERROR;
789 }
790 }
791
792 if (!err && !ctx->task && cpuctx->max_pertask)
793 cpuctx->max_pertask--;
794
795 unlock:
796 perf_enable();
797
798 spin_unlock(&ctx->lock);
799}
800
801/*
802 * Attach a performance counter to a context
803 *
804 * First we add the counter to the list with the hardware enable bit
805 * in counter->hw_config cleared.
806 *
807 * If the counter is attached to a task which is on a CPU we use a smp
808 * call to enable it in the task context. The task might have been
809 * scheduled away, but we check this in the smp call again.
810 *
811 * Must be called with ctx->mutex held.
812 */
813static void
814perf_install_in_context(struct perf_counter_context *ctx,
815 struct perf_counter *counter,
816 int cpu)
817{
818 struct task_struct *task = ctx->task;
819
820 if (!task) {
821 /*
822 * Per cpu counters are installed via an smp call and
823 * the install is always sucessful.
824 */
825 smp_call_function_single(cpu, __perf_install_in_context,
826 counter, 1);
827 return;
828 }
829
830retry:
831 task_oncpu_function_call(task, __perf_install_in_context,
832 counter);
833
834 spin_lock_irq(&ctx->lock);
835 /*
836 * we need to retry the smp call.
837 */
838 if (ctx->is_active && list_empty(&counter->list_entry)) {
839 spin_unlock_irq(&ctx->lock);
840 goto retry;
841 }
842
843 /*
844 * The lock prevents that this context is scheduled in so we
845 * can add the counter safely, if it the call above did not
846 * succeed.
847 */
848 if (list_empty(&counter->list_entry))
849 add_counter_to_ctx(counter, ctx);
850 spin_unlock_irq(&ctx->lock);
851}
852
853/*
854 * Cross CPU call to enable a performance counter
855 */
856static void __perf_counter_enable(void *info)
857{
858 struct perf_counter *counter = info;
859 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
860 struct perf_counter_context *ctx = counter->ctx;
861 struct perf_counter *leader = counter->group_leader;
862 int err;
863
864 /*
865 * If this is a per-task counter, need to check whether this
866 * counter's task is the current task on this cpu.
867 */
868 if (ctx->task && cpuctx->task_ctx != ctx) {
869 if (cpuctx->task_ctx || ctx->task != current)
870 return;
871 cpuctx->task_ctx = ctx;
872 }
873
874 spin_lock(&ctx->lock);
875 ctx->is_active = 1;
876 update_context_time(ctx);
877
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE;
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882
883 /*
884 * If the counter is in a group and isn't the group leader,
885 * then don't put it on unless the group is on.
886 */
887 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
888 goto unlock;
889
890 if (!group_can_go_on(counter, cpuctx, 1)) {
891 err = -EEXIST;
892 } else {
893 perf_disable();
894 if (counter == leader)
895 err = group_sched_in(counter, cpuctx, ctx,
896 smp_processor_id());
897 else
898 err = counter_sched_in(counter, cpuctx, ctx,
899 smp_processor_id());
900 perf_enable();
901 }
902
903 if (err) {
904 /*
905 * If this counter can't go on and it's part of a
906 * group, then the whole group has to come off.
907 */
908 if (leader != counter)
909 group_sched_out(leader, cpuctx, ctx);
910 if (leader->attr.pinned) {
911 update_group_times(leader);
912 leader->state = PERF_COUNTER_STATE_ERROR;
913 }
914 }
915
916 unlock:
917 spin_unlock(&ctx->lock);
918}
919
920/*
921 * Enable a counter.
922 *
923 * If counter->ctx is a cloned context, callers must make sure that
924 * every task struct that counter->ctx->task could possibly point to
925 * remains valid. This condition is satisfied when called through
926 * perf_counter_for_each_child or perf_counter_for_each as described
927 * for perf_counter_disable.
928 */
929static void perf_counter_enable(struct perf_counter *counter)
930{
931 struct perf_counter_context *ctx = counter->ctx;
932 struct task_struct *task = ctx->task;
933
934 if (!task) {
935 /*
936 * Enable the counter on the cpu that it's on
937 */
938 smp_call_function_single(counter->cpu, __perf_counter_enable,
939 counter, 1);
940 return;
941 }
942
943 spin_lock_irq(&ctx->lock);
944 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
945 goto out;
946
947 /*
948 * If the counter is in error state, clear that first.
949 * That way, if we see the counter in error state below, we
950 * know that it has gone back into error state, as distinct
951 * from the task having been scheduled away before the
952 * cross-call arrived.
953 */
954 if (counter->state == PERF_COUNTER_STATE_ERROR)
955 counter->state = PERF_COUNTER_STATE_OFF;
956
957 retry:
958 spin_unlock_irq(&ctx->lock);
959 task_oncpu_function_call(task, __perf_counter_enable, counter);
960
961 spin_lock_irq(&ctx->lock);
962
963 /*
964 * If the context is active and the counter is still off,
965 * we need to retry the cross-call.
966 */
967 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
968 goto retry;
969
970 /*
971 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely.
973 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) {
975 counter->state = PERF_COUNTER_STATE_INACTIVE;
976 counter->tstamp_enabled =
977 ctx->time - counter->total_time_enabled;
978 }
979 out:
980 spin_unlock_irq(&ctx->lock);
981}
982
983static int perf_counter_refresh(struct perf_counter *counter, int refresh)
984{
985 /*
986 * not supported on inherited counters
987 */
988 if (counter->attr.inherit)
989 return -EINVAL;
990
991 atomic_add(refresh, &counter->event_limit);
992 perf_counter_enable(counter);
993
994 return 0;
995}
996
997void __perf_counter_sched_out(struct perf_counter_context *ctx,
998 struct perf_cpu_context *cpuctx)
999{
1000 struct perf_counter *counter;
1001
1002 spin_lock(&ctx->lock);
1003 ctx->is_active = 0;
1004 if (likely(!ctx->nr_counters))
1005 goto out;
1006 update_context_time(ctx);
1007
1008 perf_disable();
1009 if (ctx->nr_active) {
1010 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1011 if (counter != counter->group_leader)
1012 counter_sched_out(counter, cpuctx, ctx);
1013 else
1014 group_sched_out(counter, cpuctx, ctx);
1015 }
1016 }
1017 perf_enable();
1018 out:
1019 spin_unlock(&ctx->lock);
1020}
1021
1022/*
1023 * Test whether two contexts are equivalent, i.e. whether they
1024 * have both been cloned from the same version of the same context
1025 * and they both have the same number of enabled counters.
1026 * If the number of enabled counters is the same, then the set
1027 * of enabled counters should be the same, because these are both
1028 * inherited contexts, therefore we can't access individual counters
1029 * in them directly with an fd; we can only enable/disable all
1030 * counters via prctl, or enable/disable all counters in a family
1031 * via ioctl, which will have the same effect on both contexts.
1032 */
1033static int context_equiv(struct perf_counter_context *ctx1,
1034 struct perf_counter_context *ctx2)
1035{
1036 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1037 && ctx1->parent_gen == ctx2->parent_gen
1038 && !ctx1->pin_count && !ctx2->pin_count;
1039}
1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
/*
 * list_next_entry - step from @pos to the entry following it.
 * @pos:	pointer to the current entry
 * @member:	name of the list_head field inside the entry type
 *
 * @pos is parenthesized in the expansion so that an argument which is
 * itself an expression (e.g. a cast or pointer arithmetic) binds
 * correctly to the '->' and '*' operators.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1116/*
1117 * Called from scheduler to remove the counters of the current task,
1118 * with interrupts disabled.
1119 *
1120 * We stop each counter and update the counter value in counter->count.
1121 *
1122 * This does not protect us against NMI, but disable()
1123 * sets the disabled bit in the control field of counter _before_
1124 * accessing the counter control register. If a NMI hits, then it will
1125 * not restart the counter.
1126 */
1127void perf_counter_task_sched_out(struct task_struct *task,
1128 struct task_struct *next, int cpu)
1129{
1130 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1131 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1132 struct perf_counter_context *next_ctx;
1133 struct perf_counter_context *parent;
1134 struct pt_regs *regs;
1135 int do_switch = 1;
1136
1137 regs = task_pt_regs(task);
1138 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1139
1140 if (likely(!ctx || !cpuctx->task_ctx))
1141 return;
1142
1143 update_context_time(ctx);
1144
1145 rcu_read_lock();
1146 parent = rcu_dereference(ctx->parent_ctx);
1147 next_ctx = next->perf_counter_ctxp;
1148 if (parent && next_ctx &&
1149 rcu_dereference(next_ctx->parent_ctx) == parent) {
1150 /*
1151 * Looks like the two contexts are clones, so we might be
1152 * able to optimize the context switch. We lock both
1153 * contexts and check that they are clones under the
1154 * lock (including re-checking that neither has been
1155 * uncloned in the meantime). It doesn't matter which
1156 * order we take the locks because no other cpu could
1157 * be trying to lock both of these tasks.
1158 */
1159 spin_lock(&ctx->lock);
1160 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1161 if (context_equiv(ctx, next_ctx)) {
1162 /*
1163 * XXX do we need a memory barrier of sorts
1164 * wrt to rcu_dereference() of perf_counter_ctxp
1165 */
1166 task->perf_counter_ctxp = next_ctx;
1167 next->perf_counter_ctxp = ctx;
1168 ctx->task = next;
1169 next_ctx->task = task;
1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1173 }
1174 spin_unlock(&next_ctx->lock);
1175 spin_unlock(&ctx->lock);
1176 }
1177 rcu_read_unlock();
1178
1179 if (do_switch) {
1180 __perf_counter_sched_out(ctx, cpuctx);
1181 cpuctx->task_ctx = NULL;
1182 }
1183}
1184
1185/*
1186 * Called with IRQs disabled
1187 */
1188static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1189{
1190 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1191
1192 if (!cpuctx->task_ctx)
1193 return;
1194
1195 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1196 return;
1197
1198 __perf_counter_sched_out(ctx, cpuctx);
1199 cpuctx->task_ctx = NULL;
1200}
1201
1202/*
1203 * Called with IRQs disabled
1204 */
1205static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1206{
1207 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1208}
1209
1210static void
1211__perf_counter_sched_in(struct perf_counter_context *ctx,
1212 struct perf_cpu_context *cpuctx, int cpu)
1213{
1214 struct perf_counter *counter;
1215 int can_add_hw = 1;
1216
1217 spin_lock(&ctx->lock);
1218 ctx->is_active = 1;
1219 if (likely(!ctx->nr_counters))
1220 goto out;
1221
1222 ctx->timestamp = perf_clock();
1223
1224 perf_disable();
1225
1226 /*
1227 * First go through the list and put on any pinned groups
1228 * in order to give them the best chance of going on.
1229 */
1230 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1231 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1232 !counter->attr.pinned)
1233 continue;
1234 if (counter->cpu != -1 && counter->cpu != cpu)
1235 continue;
1236
1237 if (counter != counter->group_leader)
1238 counter_sched_in(counter, cpuctx, ctx, cpu);
1239 else {
1240 if (group_can_go_on(counter, cpuctx, 1))
1241 group_sched_in(counter, cpuctx, ctx, cpu);
1242 }
1243
1244 /*
1245 * If this pinned group hasn't been scheduled,
1246 * put it in error state.
1247 */
1248 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1249 update_group_times(counter);
1250 counter->state = PERF_COUNTER_STATE_ERROR;
1251 }
1252 }
1253
1254 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1255 /*
1256 * Ignore counters in OFF or ERROR state, and
1257 * ignore pinned counters since we did them already.
1258 */
1259 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1260 counter->attr.pinned)
1261 continue;
1262
1263 /*
1264 * Listen to the 'cpu' scheduling filter constraint
1265 * of counters:
1266 */
1267 if (counter->cpu != -1 && counter->cpu != cpu)
1268 continue;
1269
1270 if (counter != counter->group_leader) {
1271 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1272 can_add_hw = 0;
1273 } else {
1274 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1275 if (group_sched_in(counter, cpuctx, ctx, cpu))
1276 can_add_hw = 0;
1277 }
1278 }
1279 }
1280 perf_enable();
1281 out:
1282 spin_unlock(&ctx->lock);
1283}
1284
1285/*
1286 * Called from scheduler to add the counters of the current task
1287 * with interrupts disabled.
1288 *
1289 * We restore the counter value and then enable it.
1290 *
1291 * This does not protect us against NMI, but enable()
1292 * sets the enabled bit in the control field of counter _before_
1293 * accessing the counter control register. If a NMI hits, then it will
1294 * keep the counter running.
1295 */
1296void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1297{
1298 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1299 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1300
1301 if (likely(!ctx))
1302 return;
1303 if (cpuctx->task_ctx == ctx)
1304 return;
1305 __perf_counter_sched_in(ctx, cpuctx, cpu);
1306 cpuctx->task_ctx = ctx;
1307}
1308
1309static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1310{
1311 struct perf_counter_context *ctx = &cpuctx->ctx;
1312
1313 __perf_counter_sched_in(ctx, cpuctx, cpu);
1314}
1315
1316#define MAX_INTERRUPTS (~0ULL)
1317
1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1319
1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1321{
1322 struct hw_perf_counter *hwc = &counter->hw;
1323 u64 period, sample_period;
1324 s64 delta;
1325
1326 events *= hwc->sample_period;
1327 period = div64_u64(events, counter->attr.sample_freq);
1328
1329 delta = (s64)(period - hwc->sample_period);
1330 delta = (delta + 7) / 8; /* low pass filter */
1331
1332 sample_period = hwc->sample_period + delta;
1333
1334 if (!sample_period)
1335 sample_period = 1;
1336
1337 hwc->sample_period = sample_period;
1338}
1339
1340static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1341{
1342 struct perf_counter *counter;
1343 struct hw_perf_counter *hwc;
1344 u64 interrupts, freq;
1345
1346 spin_lock(&ctx->lock);
1347 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1348 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1349 continue;
1350
1351 hwc = &counter->hw;
1352
1353 interrupts = hwc->interrupts;
1354 hwc->interrupts = 0;
1355
1356 /*
1357 * unthrottle counters on the tick
1358 */
1359 if (interrupts == MAX_INTERRUPTS) {
1360 perf_log_throttle(counter, 1);
1361 counter->pmu->unthrottle(counter);
1362 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1363 }
1364
1365 if (!counter->attr.freq || !counter->attr.sample_freq)
1366 continue;
1367
1368 /*
1369 * if the specified freq < HZ then we need to skip ticks
1370 */
1371 if (counter->attr.sample_freq < HZ) {
1372 freq = counter->attr.sample_freq;
1373
1374 hwc->freq_count += freq;
1375 hwc->freq_interrupts += interrupts;
1376
1377 if (hwc->freq_count < HZ)
1378 continue;
1379
1380 interrupts = hwc->freq_interrupts;
1381 hwc->freq_interrupts = 0;
1382 hwc->freq_count -= HZ;
1383 } else
1384 freq = HZ;
1385
1386 perf_adjust_period(counter, freq * interrupts);
1387
1388 /*
1389 * In order to avoid being stalled by an (accidental) huge
1390 * sample period, force reset the sample period if we didn't
1391 * get any events in this freq period.
1392 */
1393 if (!interrupts) {
1394 perf_disable();
1395 counter->pmu->disable(counter);
1396 atomic64_set(&hwc->period_left, 0);
1397 counter->pmu->enable(counter);
1398 perf_enable();
1399 }
1400 }
1401 spin_unlock(&ctx->lock);
1402}
1403
1404/*
1405 * Round-robin a context's counters:
1406 */
1407static void rotate_ctx(struct perf_counter_context *ctx)
1408{
1409 struct perf_counter *counter;
1410
1411 if (!ctx->nr_counters)
1412 return;
1413
1414 spin_lock(&ctx->lock);
1415 /*
1416 * Rotate the first entry last (works just fine for group counters too):
1417 */
1418 perf_disable();
1419 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1420 list_move_tail(&counter->list_entry, &ctx->counter_list);
1421 break;
1422 }
1423 perf_enable();
1424
1425 spin_unlock(&ctx->lock);
1426}
1427
1428void perf_counter_task_tick(struct task_struct *curr, int cpu)
1429{
1430 struct perf_cpu_context *cpuctx;
1431 struct perf_counter_context *ctx;
1432
1433 if (!atomic_read(&nr_counters))
1434 return;
1435
1436 cpuctx = &per_cpu(perf_cpu_context, cpu);
1437 ctx = curr->perf_counter_ctxp;
1438
1439 perf_ctx_adjust_freq(&cpuctx->ctx);
1440 if (ctx)
1441 perf_ctx_adjust_freq(ctx);
1442
1443 perf_counter_cpu_sched_out(cpuctx);
1444 if (ctx)
1445 __perf_counter_task_sched_out(ctx);
1446
1447 rotate_ctx(&cpuctx->ctx);
1448 if (ctx)
1449 rotate_ctx(ctx);
1450
1451 perf_counter_cpu_sched_in(cpuctx, cpu);
1452 if (ctx)
1453 perf_counter_task_sched_in(curr, cpu);
1454}
1455
1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1502 * Cross CPU call to read the hardware counter
1503 */
1504static void __perf_counter_read(void *info)
1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1507 struct perf_counter *counter = info;
1508 struct perf_counter_context *ctx = counter->ctx;
1509 unsigned long flags;
1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1521 local_irq_save(flags);
1522 if (ctx->is_active)
1523 update_context_time(ctx);
1524 counter->pmu->read(counter);
1525 update_counter_times(counter);
1526 local_irq_restore(flags);
1527}
1528
1529static u64 perf_counter_read(struct perf_counter *counter)
1530{
1531 /*
1532 * If counter is enabled and currently active on a CPU, update the
1533 * value in the counter structure:
1534 */
1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1536 smp_call_function_single(counter->oncpu,
1537 __perf_counter_read, counter, 1);
1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1539 update_counter_times(counter);
1540 }
1541
1542 return atomic64_read(&counter->count);
1543}
1544
1545/*
1546 * Initialize the perf_counter context in a task_struct:
1547 */
1548static void
1549__perf_counter_init_context(struct perf_counter_context *ctx,
1550 struct task_struct *task)
1551{
1552 memset(ctx, 0, sizeof(*ctx));
1553 spin_lock_init(&ctx->lock);
1554 mutex_init(&ctx->mutex);
1555 INIT_LIST_HEAD(&ctx->counter_list);
1556 INIT_LIST_HEAD(&ctx->event_list);
1557 atomic_set(&ctx->refcount, 1);
1558 ctx->task = task;
1559}
1560
1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1562{
1563 struct perf_counter_context *ctx;
1564 struct perf_cpu_context *cpuctx;
1565 struct task_struct *task;
1566 unsigned long flags;
1567 int err;
1568
1569 /*
1570 * If cpu is not a wildcard then this is a percpu counter:
1571 */
1572 if (cpu != -1) {
1573 /* Must be root to operate on a CPU counter: */
1574 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1575 return ERR_PTR(-EACCES);
1576
1577 if (cpu < 0 || cpu > num_possible_cpus())
1578 return ERR_PTR(-EINVAL);
1579
1580 /*
1581 * We could be clever and allow to attach a counter to an
1582 * offline CPU and activate it when the CPU comes up, but
1583 * that's for later.
1584 */
1585 if (!cpu_isset(cpu, cpu_online_map))
1586 return ERR_PTR(-ENODEV);
1587
1588 cpuctx = &per_cpu(perf_cpu_context, cpu);
1589 ctx = &cpuctx->ctx;
1590 get_ctx(ctx);
1591
1592 return ctx;
1593 }
1594
1595 rcu_read_lock();
1596 if (!pid)
1597 task = current;
1598 else
1599 task = find_task_by_vpid(pid);
1600 if (task)
1601 get_task_struct(task);
1602 rcu_read_unlock();
1603
1604 if (!task)
1605 return ERR_PTR(-ESRCH);
1606
1607 /*
1608 * Can't attach counters to a dying task.
1609 */
1610 err = -ESRCH;
1611 if (task->flags & PF_EXITING)
1612 goto errout;
1613
1614 /* Reuse ptrace permission checks for now. */
1615 err = -EACCES;
1616 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1617 goto errout;
1618
1619 retry:
1620 ctx = perf_lock_task_context(task, &flags);
1621 if (ctx) {
1622 unclone_ctx(ctx);
1623 spin_unlock_irqrestore(&ctx->lock, flags);
1624 }
1625
1626 if (!ctx) {
1627 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1628 err = -ENOMEM;
1629 if (!ctx)
1630 goto errout;
1631 __perf_counter_init_context(ctx, task);
1632 get_ctx(ctx);
1633 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1634 /*
1635 * We raced with some other task; use
1636 * the context they set.
1637 */
1638 kfree(ctx);
1639 goto retry;
1640 }
1641 get_task_struct(task);
1642 }
1643
1644 put_task_struct(task);
1645 return ctx;
1646
1647 errout:
1648 put_task_struct(task);
1649 return ERR_PTR(err);
1650}
1651
1652static void free_counter_rcu(struct rcu_head *head)
1653{
1654 struct perf_counter *counter;
1655
1656 counter = container_of(head, struct perf_counter, rcu_head);
1657 if (counter->ns)
1658 put_pid_ns(counter->ns);
1659 kfree(counter);
1660}
1661
1662static void perf_pending_sync(struct perf_counter *counter);
1663
1664static void free_counter(struct perf_counter *counter)
1665{
1666 perf_pending_sync(counter);
1667
1668 if (!counter->parent) {
1669 atomic_dec(&nr_counters);
1670 if (counter->attr.mmap)
1671 atomic_dec(&nr_mmap_counters);
1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1677
1678 if (counter->destroy)
1679 counter->destroy(counter);
1680
1681 put_ctx(counter->ctx);
1682 call_rcu(&counter->rcu_head, free_counter_rcu);
1683}
1684
1685/*
1686 * Called when the last reference to the file is gone.
1687 */
1688static int perf_release(struct inode *inode, struct file *file)
1689{
1690 struct perf_counter *counter = file->private_data;
1691 struct perf_counter_context *ctx = counter->ctx;
1692
1693 file->private_data = NULL;
1694
1695 WARN_ON_ONCE(ctx->parent_ctx);
1696 mutex_lock(&ctx->mutex);
1697 perf_counter_remove_from_context(counter);
1698 mutex_unlock(&ctx->mutex);
1699
1700 mutex_lock(&counter->owner->perf_counter_mutex);
1701 list_del_init(&counter->owner_entry);
1702 mutex_unlock(&counter->owner->perf_counter_mutex);
1703 put_task_struct(counter->owner);
1704
1705 free_counter(counter);
1706
1707 return 0;
1708}
1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
1828
1829/*
1830 * Read the performance counter - simple non blocking version for now
1831 */
1832static ssize_t
1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1834{
1835 u64 read_format = counter->attr.read_format;
1836 int ret;
1837
1838 /*
1839 * Return end-of-file for a read on a counter that is in
1840 * error state (i.e. because it was pinned but it couldn't be
1841 * scheduled on to the CPU at some point).
1842 */
1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1844 return 0;
1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1850 mutex_lock(&counter->child_mutex);
1851 if (read_format & PERF_FORMAT_GROUP)
1852 ret = perf_counter_read_group(counter, read_format, buf);
1853 else
1854 ret = perf_counter_read_one(counter, read_format, buf);
1855 mutex_unlock(&counter->child_mutex);
1856
1857 return ret;
1858}
1859
1860static ssize_t
1861perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1862{
1863 struct perf_counter *counter = file->private_data;
1864
1865 return perf_read_hw(counter, buf, count);
1866}
1867
1868static unsigned int perf_poll(struct file *file, poll_table *wait)
1869{
1870 struct perf_counter *counter = file->private_data;
1871 struct perf_mmap_data *data;
1872 unsigned int events = POLL_HUP;
1873
1874 rcu_read_lock();
1875 data = rcu_dereference(counter->data);
1876 if (data)
1877 events = atomic_xchg(&data->poll, 0);
1878 rcu_read_unlock();
1879
1880 poll_wait(file, &counter->waitq, wait);
1881
1882 return events;
1883}
1884
1885static void perf_counter_reset(struct perf_counter *counter)
1886{
1887 (void)perf_counter_read(counter);
1888 atomic64_set(&counter->count, 0);
1889 perf_counter_update_userpage(counter);
1890}
1891
1892/*
1893 * Holding the top-level counter's child_mutex means that any
1894 * descendant process that has inherited this counter will block
1895 * in sync_child_counter if it goes to exit, thus satisfying the
1896 * task existence requirements of perf_counter_enable/disable.
1897 */
1898static void perf_counter_for_each_child(struct perf_counter *counter,
1899 void (*func)(struct perf_counter *))
1900{
1901 struct perf_counter *child;
1902
1903 WARN_ON_ONCE(counter->ctx->parent_ctx);
1904 mutex_lock(&counter->child_mutex);
1905 func(counter);
1906 list_for_each_entry(child, &counter->child_list, child_list)
1907 func(child);
1908 mutex_unlock(&counter->child_mutex);
1909}
1910
1911static void perf_counter_for_each(struct perf_counter *counter,
1912 void (*func)(struct perf_counter *))
1913{
1914 struct perf_counter_context *ctx = counter->ctx;
1915 struct perf_counter *sibling;
1916
1917 WARN_ON_ONCE(ctx->parent_ctx);
1918 mutex_lock(&ctx->mutex);
1919 counter = counter->group_leader;
1920
1921 perf_counter_for_each_child(counter, func);
1922 func(counter);
1923 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1924 perf_counter_for_each_child(counter, func);
1925 mutex_unlock(&ctx->mutex);
1926}
1927
1928static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1929{
1930 struct perf_counter_context *ctx = counter->ctx;
1931 unsigned long size;
1932 int ret = 0;
1933 u64 value;
1934
1935 if (!counter->attr.sample_period)
1936 return -EINVAL;
1937
1938 size = copy_from_user(&value, arg, sizeof(value));
1939 if (size != sizeof(value))
1940 return -EFAULT;
1941
1942 if (!value)
1943 return -EINVAL;
1944
1945 spin_lock_irq(&ctx->lock);
1946 if (counter->attr.freq) {
1947 if (value > sysctl_perf_counter_sample_rate) {
1948 ret = -EINVAL;
1949 goto unlock;
1950 }
1951
1952 counter->attr.sample_freq = value;
1953 } else {
1954 counter->attr.sample_period = value;
1955 counter->hw.sample_period = value;
1956 }
1957unlock:
1958 spin_unlock_irq(&ctx->lock);
1959
1960 return ret;
1961}
1962
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{
1965 struct perf_counter *counter = file->private_data;
1966 void (*func)(struct perf_counter *);
1967 u32 flags = arg;
1968
1969 switch (cmd) {
1970 case PERF_COUNTER_IOC_ENABLE:
1971 func = perf_counter_enable;
1972 break;
1973 case PERF_COUNTER_IOC_DISABLE:
1974 func = perf_counter_disable;
1975 break;
1976 case PERF_COUNTER_IOC_RESET:
1977 func = perf_counter_reset;
1978 break;
1979
1980 case PERF_COUNTER_IOC_REFRESH:
1981 return perf_counter_refresh(counter, arg);
1982
1983 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg);
1985
1986 default:
1987 return -ENOTTY;
1988 }
1989
1990 if (flags & PERF_IOC_FLAG_GROUP)
1991 perf_counter_for_each(counter, func);
1992 else
1993 perf_counter_for_each_child(counter, func);
1994
1995 return 0;
1996}
1997
1998int perf_counter_task_enable(void)
1999{
2000 struct perf_counter *counter;
2001
2002 mutex_lock(&current->perf_counter_mutex);
2003 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2004 perf_counter_for_each_child(counter, perf_counter_enable);
2005 mutex_unlock(&current->perf_counter_mutex);
2006
2007 return 0;
2008}
2009
2010int perf_counter_task_disable(void)
2011{
2012 struct perf_counter *counter;
2013
2014 mutex_lock(&current->perf_counter_mutex);
2015 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2016 perf_counter_for_each_child(counter, perf_counter_disable);
2017 mutex_unlock(&current->perf_counter_mutex);
2018
2019 return 0;
2020}
2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
2034/*
2035 * Callers need to ensure there can be no nesting of this function, otherwise
2036 * the seqlock logic goes bad. We can not serialize this because the arch
2037 * code calls this from NMI context.
2038 */
2039void perf_counter_update_userpage(struct perf_counter *counter)
2040{
2041 struct perf_counter_mmap_page *userpg;
2042 struct perf_mmap_data *data;
2043
2044 rcu_read_lock();
2045 data = rcu_dereference(counter->data);
2046 if (!data)
2047 goto unlock;
2048
2049 userpg = data->user_page;
2050
2051 /*
2052 * Disable preemption so as to not let the corresponding user-space
2053 * spin too long if we get preempted.
2054 */
2055 preempt_disable();
2056 ++userpg->lock;
2057 barrier();
2058 userpg->index = perf_counter_index(counter);
2059 userpg->offset = atomic64_read(&counter->count);
2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
2069 barrier();
2070 ++userpg->lock;
2071 preempt_enable();
2072unlock:
2073 rcu_read_unlock();
2074}
2075
2076static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2077{
2078 struct perf_counter *counter = vma->vm_file->private_data;
2079 struct perf_mmap_data *data;
2080 int ret = VM_FAULT_SIGBUS;
2081
2082 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2083 if (vmf->pgoff == 0)
2084 ret = 0;
2085 return ret;
2086 }
2087
2088 rcu_read_lock();
2089 data = rcu_dereference(counter->data);
2090 if (!data)
2091 goto unlock;
2092
2093 if (vmf->pgoff == 0) {
2094 vmf->page = virt_to_page(data->user_page);
2095 } else {
2096 int nr = vmf->pgoff - 1;
2097
2098 if ((unsigned)nr > data->nr_pages)
2099 goto unlock;
2100
2101 if (vmf->flags & FAULT_FLAG_WRITE)
2102 goto unlock;
2103
2104 vmf->page = virt_to_page(data->data_pages[nr]);
2105 }
2106
2107 get_page(vmf->page);
2108 vmf->page->mapping = vma->vm_file->f_mapping;
2109 vmf->page->index = vmf->pgoff;
2110
2111 ret = 0;
2112unlock:
2113 rcu_read_unlock();
2114
2115 return ret;
2116}
2117
2118static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2119{
2120 struct perf_mmap_data *data;
2121 unsigned long size;
2122 int i;
2123
2124 WARN_ON(atomic_read(&counter->mmap_count));
2125
2126 size = sizeof(struct perf_mmap_data);
2127 size += nr_pages * sizeof(void *);
2128
2129 data = kzalloc(size, GFP_KERNEL);
2130 if (!data)
2131 goto fail;
2132
2133 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2134 if (!data->user_page)
2135 goto fail_user_page;
2136
2137 for (i = 0; i < nr_pages; i++) {
2138 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2139 if (!data->data_pages[i])
2140 goto fail_data_pages;
2141 }
2142
2143 data->nr_pages = nr_pages;
2144 atomic_set(&data->lock, -1);
2145
2146 rcu_assign_pointer(counter->data, data);
2147
2148 return 0;
2149
2150fail_data_pages:
2151 for (i--; i >= 0; i--)
2152 free_page((unsigned long)data->data_pages[i]);
2153
2154 free_page((unsigned long)data->user_page);
2155
2156fail_user_page:
2157 kfree(data);
2158
2159fail:
2160 return -ENOMEM;
2161}
2162
2163static void perf_mmap_free_page(unsigned long addr)
2164{
2165 struct page *page = virt_to_page((void *)addr);
2166
2167 page->mapping = NULL;
2168 __free_page(page);
2169}
2170
2171static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2172{
2173 struct perf_mmap_data *data;
2174 int i;
2175
2176 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2177
2178 perf_mmap_free_page((unsigned long)data->user_page);
2179 for (i = 0; i < data->nr_pages; i++)
2180 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2181
2182 kfree(data);
2183}
2184
2185static void perf_mmap_data_free(struct perf_counter *counter)
2186{
2187 struct perf_mmap_data *data = counter->data;
2188
2189 WARN_ON(atomic_read(&counter->mmap_count));
2190
2191 rcu_assign_pointer(counter->data, NULL);
2192 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2193}
2194
2195static void perf_mmap_open(struct vm_area_struct *vma)
2196{
2197 struct perf_counter *counter = vma->vm_file->private_data;
2198
2199 atomic_inc(&counter->mmap_count);
2200}
2201
2202static void perf_mmap_close(struct vm_area_struct *vma)
2203{
2204 struct perf_counter *counter = vma->vm_file->private_data;
2205
2206 WARN_ON_ONCE(counter->ctx->parent_ctx);
2207 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2208 struct user_struct *user = current_user();
2209
2210 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2211 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2212 perf_mmap_data_free(counter);
2213 mutex_unlock(&counter->mmap_mutex);
2214 }
2215}
2216
2217static struct vm_operations_struct perf_mmap_vmops = {
2218 .open = perf_mmap_open,
2219 .close = perf_mmap_close,
2220 .fault = perf_mmap_fault,
2221 .page_mkwrite = perf_mmap_fault,
2222};
2223
2224static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2225{
2226 struct perf_counter *counter = file->private_data;
2227 unsigned long user_locked, user_lock_limit;
2228 struct user_struct *user = current_user();
2229 unsigned long locked, lock_limit;
2230 unsigned long vma_size;
2231 unsigned long nr_pages;
2232 long user_extra, extra;
2233 int ret = 0;
2234
2235 if (!(vma->vm_flags & VM_SHARED))
2236 return -EINVAL;
2237
2238 vma_size = vma->vm_end - vma->vm_start;
2239 nr_pages = (vma_size / PAGE_SIZE) - 1;
2240
2241 /*
2242 * If we have data pages ensure they're a power-of-two number, so we
2243 * can do bitmasks instead of modulo.
2244 */
2245 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2246 return -EINVAL;
2247
2248 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2249 return -EINVAL;
2250
2251 if (vma->vm_pgoff != 0)
2252 return -EINVAL;
2253
2254 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex);
2256 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL;
2259 goto unlock;
2260 }
2261
2262 user_extra = nr_pages + 1;
2263 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2264
2265 /*
2266 * Increase the limit linearly with more CPUs:
2267 */
2268 user_lock_limit *= num_online_cpus();
2269
2270 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2271
2272 extra = 0;
2273 if (user_locked > user_lock_limit)
2274 extra = user_locked - user_lock_limit;
2275
2276 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2277 lock_limit >>= PAGE_SHIFT;
2278 locked = vma->vm_mm->locked_vm + extra;
2279
2280 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2281 ret = -EPERM;
2282 goto unlock;
2283 }
2284
2285 WARN_ON(counter->data);
2286 ret = perf_mmap_data_alloc(counter, nr_pages);
2287 if (ret)
2288 goto unlock;
2289
2290 atomic_set(&counter->mmap_count, 1);
2291 atomic_long_add(user_extra, &user->locked_vm);
2292 vma->vm_mm->locked_vm += extra;
2293 counter->data->nr_locked = extra;
2294 if (vma->vm_flags & VM_WRITE)
2295 counter->data->writable = 1;
2296
2297unlock:
2298 mutex_unlock(&counter->mmap_mutex);
2299
2300 vma->vm_flags |= VM_RESERVED;
2301 vma->vm_ops = &perf_mmap_vmops;
2302
2303 return ret;
2304}
2305
2306static int perf_fasync(int fd, struct file *filp, int on)
2307{
2308 struct inode *inode = filp->f_path.dentry->d_inode;
2309 struct perf_counter *counter = filp->private_data;
2310 int retval;
2311
2312 mutex_lock(&inode->i_mutex);
2313 retval = fasync_helper(fd, filp, on, &counter->fasync);
2314 mutex_unlock(&inode->i_mutex);
2315
2316 if (retval < 0)
2317 return retval;
2318
2319 return 0;
2320}
2321
2322static const struct file_operations perf_fops = {
2323 .release = perf_release,
2324 .read = perf_read,
2325 .poll = perf_poll,
2326 .unlocked_ioctl = perf_ioctl,
2327 .compat_ioctl = perf_ioctl,
2328 .mmap = perf_mmap,
2329 .fasync = perf_fasync,
2330};
2331
2332/*
2333 * Perf counter wakeup
2334 *
2335 * If there's data, ensure we set the poll() state and publish everything
2336 * to user-space before waking everybody up.
2337 */
2338
2339void perf_counter_wakeup(struct perf_counter *counter)
2340{
2341 wake_up_all(&counter->waitq);
2342
2343 if (counter->pending_kill) {
2344 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2345 counter->pending_kill = 0;
2346 }
2347}
2348
2349/*
2350 * Pending wakeups
2351 *
2352 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2353 *
2354 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2355 * single linked list and use cmpxchg() to add entries lockless.
2356 */
2357
2358static void perf_pending_counter(struct perf_pending_entry *entry)
2359{
2360 struct perf_counter *counter = container_of(entry,
2361 struct perf_counter, pending);
2362
2363 if (counter->pending_disable) {
2364 counter->pending_disable = 0;
2365 __perf_counter_disable(counter);
2366 }
2367
2368 if (counter->pending_wakeup) {
2369 counter->pending_wakeup = 0;
2370 perf_counter_wakeup(counter);
2371 }
2372}
2373
2374#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2375
2376static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2377 PENDING_TAIL,
2378};
2379
2380static void perf_pending_queue(struct perf_pending_entry *entry,
2381 void (*func)(struct perf_pending_entry *))
2382{
2383 struct perf_pending_entry **head;
2384
2385 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2386 return;
2387
2388 entry->func = func;
2389
2390 head = &get_cpu_var(perf_pending_head);
2391
2392 do {
2393 entry->next = *head;
2394 } while (cmpxchg(head, entry->next, entry) != entry->next);
2395
2396 set_perf_counter_pending();
2397
2398 put_cpu_var(perf_pending_head);
2399}
2400
2401static int __perf_pending_run(void)
2402{
2403 struct perf_pending_entry *list;
2404 int nr = 0;
2405
2406 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2407 while (list != PENDING_TAIL) {
2408 void (*func)(struct perf_pending_entry *);
2409 struct perf_pending_entry *entry = list;
2410
2411 list = list->next;
2412
2413 func = entry->func;
2414 entry->next = NULL;
2415 /*
2416 * Ensure we observe the unqueue before we issue the wakeup,
2417 * so that we won't be waiting forever.
2418 * -- see perf_not_pending().
2419 */
2420 smp_wmb();
2421
2422 func(entry);
2423 nr++;
2424 }
2425
2426 return nr;
2427}
2428
2429static inline int perf_not_pending(struct perf_counter *counter)
2430{
2431 /*
2432 * If we flush on whatever cpu we run, there is a chance we don't
2433 * need to wait.
2434 */
2435 get_cpu();
2436 __perf_pending_run();
2437 put_cpu();
2438
2439 /*
2440 * Ensure we see the proper queue state before going to sleep
2441 * so that we do not miss the wakeup. -- see perf_pending_handle()
2442 */
2443 smp_rmb();
2444 return counter->pending.next == NULL;
2445}
2446
2447static void perf_pending_sync(struct perf_counter *counter)
2448{
2449 wait_event(counter->waitq, perf_not_pending(counter));
2450}
2451
2452void perf_counter_do_pending(void)
2453{
2454 __perf_pending_run();
2455}
2456
2457/*
2458 * Callchain support -- arch specific
2459 */
2460
2461__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2462{
2463 return NULL;
2464}
2465
2466/*
2467 * Output
2468 */
2469
2470struct perf_output_handle {
2471 struct perf_counter *counter;
2472 struct perf_mmap_data *data;
2473 unsigned long head;
2474 unsigned long offset;
2475 int nmi;
2476 int sample;
2477 int locked;
2478 unsigned long flags;
2479};
2480
2481static bool perf_output_space(struct perf_mmap_data *data,
2482 unsigned int offset, unsigned int head)
2483{
2484 unsigned long tail;
2485 unsigned long mask;
2486
2487 if (!data->writable)
2488 return true;
2489
2490 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2491 /*
2492 * Userspace could choose to issue a mb() before updating the tail
2493 * pointer. So that all reads will be completed before the write is
2494 * issued.
2495 */
2496 tail = ACCESS_ONCE(data->user_page->data_tail);
2497 smp_rmb();
2498
2499 offset = (offset - tail) & mask;
2500 head = (head - tail) & mask;
2501
2502 if ((int)(head - offset) < 0)
2503 return false;
2504
2505 return true;
2506}
2507
2508static void perf_output_wakeup(struct perf_output_handle *handle)
2509{
2510 atomic_set(&handle->data->poll, POLL_IN);
2511
2512 if (handle->nmi) {
2513 handle->counter->pending_wakeup = 1;
2514 perf_pending_queue(&handle->counter->pending,
2515 perf_pending_counter);
2516 } else
2517 perf_counter_wakeup(handle->counter);
2518}
2519
2520/*
2521 * Curious locking construct.
2522 *
2523 * We need to ensure a later event doesn't publish a head when a former
2524 * event isn't done writing. However since we need to deal with NMIs we
2525 * cannot fully serialize things.
2526 *
2527 * What we do is serialize between CPUs so we only have to deal with NMI
2528 * nesting on a single CPU.
2529 *
2530 * We only publish the head (and generate a wakeup) when the outer-most
2531 * event completes.
2532 */
2533static void perf_output_lock(struct perf_output_handle *handle)
2534{
2535 struct perf_mmap_data *data = handle->data;
2536 int cpu;
2537
2538 handle->locked = 0;
2539
2540 local_irq_save(handle->flags);
2541 cpu = smp_processor_id();
2542
2543 if (in_nmi() && atomic_read(&data->lock) == cpu)
2544 return;
2545
2546 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2547 cpu_relax();
2548
2549 handle->locked = 1;
2550}
2551
2552static void perf_output_unlock(struct perf_output_handle *handle)
2553{
2554 struct perf_mmap_data *data = handle->data;
2555 unsigned long head;
2556 int cpu;
2557
2558 data->done_head = data->head;
2559
2560 if (!handle->locked)
2561 goto out;
2562
2563again:
2564 /*
2565 * The xchg implies a full barrier that ensures all writes are done
2566 * before we publish the new head, matched by a rmb() in userspace when
2567 * reading this position.
2568 */
2569 while ((head = atomic_long_xchg(&data->done_head, 0)))
2570 data->user_page->data_head = head;
2571
2572 /*
2573 * NMI can happen here, which means we can miss a done_head update.
2574 */
2575
2576 cpu = atomic_xchg(&data->lock, -1);
2577 WARN_ON_ONCE(cpu != smp_processor_id());
2578
2579 /*
2580 * Therefore we have to validate we did not indeed do so.
2581 */
2582 if (unlikely(atomic_long_read(&data->done_head))) {
2583 /*
2584 * Since we had it locked, we can lock it again.
2585 */
2586 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2587 cpu_relax();
2588
2589 goto again;
2590 }
2591
2592 if (atomic_xchg(&data->wakeup, 0))
2593 perf_output_wakeup(handle);
2594out:
2595 local_irq_restore(handle->flags);
2596}
2597
2598static void perf_output_copy(struct perf_output_handle *handle,
2599 const void *buf, unsigned int len)
2600{
2601 unsigned int pages_mask;
2602 unsigned int offset;
2603 unsigned int size;
2604 void **pages;
2605
2606 offset = handle->offset;
2607 pages_mask = handle->data->nr_pages - 1;
2608 pages = handle->data->data_pages;
2609
2610 do {
2611 unsigned int page_offset;
2612 int nr;
2613
2614 nr = (offset >> PAGE_SHIFT) & pages_mask;
2615 page_offset = offset & (PAGE_SIZE - 1);
2616 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2617
2618 memcpy(pages[nr] + page_offset, buf, size);
2619
2620 len -= size;
2621 buf += size;
2622 offset += size;
2623 } while (len);
2624
2625 handle->offset = offset;
2626
2627 /*
2628 * Check we didn't copy past our reservation window, taking the
2629 * possible unsigned int wrap into account.
2630 */
2631 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2632}
2633
2634#define perf_output_put(handle, x) \
2635 perf_output_copy((handle), &(x), sizeof(x))
2636
2637static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample)
2640{
2641 struct perf_mmap_data *data;
2642 unsigned int offset, head;
2643 int have_lost;
2644 struct {
2645 struct perf_event_header header;
2646 u64 id;
2647 u64 lost;
2648 } lost_event;
2649
2650 /*
2651 * For inherited counters we send all the output towards the parent.
2652 */
2653 if (counter->parent)
2654 counter = counter->parent;
2655
2656 rcu_read_lock();
2657 data = rcu_dereference(counter->data);
2658 if (!data)
2659 goto out;
2660
2661 handle->data = data;
2662 handle->counter = counter;
2663 handle->nmi = nmi;
2664 handle->sample = sample;
2665
2666 if (!data->nr_pages)
2667 goto fail;
2668
2669 have_lost = atomic_read(&data->lost);
2670 if (have_lost)
2671 size += sizeof(lost_event);
2672
2673 perf_output_lock(handle);
2674
2675 do {
2676 offset = head = atomic_long_read(&data->head);
2677 head += size;
2678 if (unlikely(!perf_output_space(data, offset, head)))
2679 goto fail;
2680 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2681
2682 handle->offset = offset;
2683 handle->head = head;
2684
2685 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2686 atomic_set(&data->wakeup, 1);
2687
2688 if (have_lost) {
2689 lost_event.header.type = PERF_EVENT_LOST;
2690 lost_event.header.misc = 0;
2691 lost_event.header.size = sizeof(lost_event);
2692 lost_event.id = counter->id;
2693 lost_event.lost = atomic_xchg(&data->lost, 0);
2694
2695 perf_output_put(handle, lost_event);
2696 }
2697
2698 return 0;
2699
2700fail:
2701 atomic_inc(&data->lost);
2702 perf_output_unlock(handle);
2703out:
2704 rcu_read_unlock();
2705
2706 return -ENOSPC;
2707}
2708
2709static void perf_output_end(struct perf_output_handle *handle)
2710{
2711 struct perf_counter *counter = handle->counter;
2712 struct perf_mmap_data *data = handle->data;
2713
2714 int wakeup_events = counter->attr.wakeup_events;
2715
2716 if (handle->sample && wakeup_events) {
2717 int events = atomic_inc_return(&data->events);
2718 if (events >= wakeup_events) {
2719 atomic_sub(wakeup_events, &data->events);
2720 atomic_set(&data->wakeup, 1);
2721 }
2722 }
2723
2724 perf_output_unlock(handle);
2725 rcu_read_unlock();
2726}
2727
2728static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2729{
2730 /*
2731 * only top level counters have the pid namespace they were created in
2732 */
2733 if (counter->parent)
2734 counter = counter->parent;
2735
2736 return task_tgid_nr_ns(p, counter->ns);
2737}
2738
2739static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2740{
2741 /*
2742 * only top level counters have the pid namespace they were created in
2743 */
2744 if (counter->parent)
2745 counter = counter->parent;
2746
2747 return task_pid_nr_ns(p, counter->ns);
2748}
2749
2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
/*
 * Write one PERF_EVENT_SAMPLE record for @counter into its output buffer.
 *
 * The function works in two passes over counter->attr.sample_type:
 *
 *  1) gather the values that need computing and accumulate the total record
 *     size in header.size;
 *  2) reserve that many bytes with perf_output_begin() and emit the fields.
 *
 * Both passes walk the PERF_SAMPLE_* bits in the same order (IP, TID, TIME,
 * ADDR, ID, STREAM_ID, CPU, PERIOD, READ, CALLCHAIN, RAW), so the size
 * reserved always matches what is written.  Any change to one pass must be
 * mirrored in the other.
 *
 * @counter: counter the sample belongs to
 * @nmi:     passed through to perf_output_begin()
 * @data:    sample data (regs, addr, period, raw) supplied by the caller
 *
 * If perf_output_begin() fails the sample is silently dropped.
 */
void perf_counter_output(struct perf_counter *counter, int nmi,
				struct perf_sample_data *data)
{
	int ret;
	u64 sample_type = counter->attr.sample_type;
	struct perf_output_handle handle;
	struct perf_event_header header;
	u64 ip;
	struct {
		u32 pid, tid;
	} tid_entry;
	struct perf_callchain_entry *callchain = NULL;
	int callchain_size = 0;
	u64 time;
	struct {
		u32 cpu, reserved;
	} cpu_entry;

	/* Pass 1: compute values and the total record size. */
	header.type = PERF_EVENT_SAMPLE;
	header.size = sizeof(header);

	header.misc = 0;
	header.misc |= perf_misc_flags(data->regs);

	if (sample_type & PERF_SAMPLE_IP) {
		ip = perf_instruction_pointer(data->regs);
		header.size += sizeof(ip);
	}

	if (sample_type & PERF_SAMPLE_TID) {
		/* namespace issues */
		tid_entry.pid = perf_counter_pid(counter, current);
		tid_entry.tid = perf_counter_tid(counter, current);

		header.size += sizeof(tid_entry);
	}

	if (sample_type & PERF_SAMPLE_TIME) {
		/*
		 * Maybe do better on x86 and provide cpu_clock_nmi()
		 */
		time = sched_clock();

		header.size += sizeof(u64);
	}

	if (sample_type & PERF_SAMPLE_ADDR)
		header.size += sizeof(u64);

	if (sample_type & PERF_SAMPLE_ID)
		header.size += sizeof(u64);

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		header.size += sizeof(u64);

	if (sample_type & PERF_SAMPLE_CPU) {
		header.size += sizeof(cpu_entry);

		cpu_entry.cpu = raw_smp_processor_id();
		cpu_entry.reserved = 0;
	}

	if (sample_type & PERF_SAMPLE_PERIOD)
		header.size += sizeof(u64);

	if (sample_type & PERF_SAMPLE_READ)
		header.size += perf_counter_read_size(counter);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		callchain = perf_callchain(data->regs);

		if (callchain) {
			/* one u64 for the entry count plus one per entry */
			callchain_size = (1 + callchain->nr) * sizeof(u64);
			header.size += callchain_size;
		} else
			/* no callchain available: a lone zero count is emitted */
			header.size += sizeof(u64);
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		/* u32 size field followed by the payload (or a u32 of zero) */
		int size = sizeof(u32);

		if (data->raw)
			size += data->raw->size;
		else
			size += sizeof(u32);

		/* the raw block is expected to keep the record u64 aligned */
		WARN_ON_ONCE(size & (sizeof(u64)-1));
		header.size += size;
	}

	/* Pass 2: reserve the space and emit the fields in the same order. */
	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
	if (ret)
		return;

	perf_output_put(&handle, header);

	if (sample_type & PERF_SAMPLE_IP)
		perf_output_put(&handle, ip);

	if (sample_type & PERF_SAMPLE_TID)
		perf_output_put(&handle, tid_entry);

	if (sample_type & PERF_SAMPLE_TIME)
		perf_output_put(&handle, time);

	if (sample_type & PERF_SAMPLE_ADDR)
		perf_output_put(&handle, data->addr);

	if (sample_type & PERF_SAMPLE_ID) {
		u64 id = primary_counter_id(counter);

		perf_output_put(&handle, id);
	}

	if (sample_type & PERF_SAMPLE_STREAM_ID)
		perf_output_put(&handle, counter->id);

	if (sample_type & PERF_SAMPLE_CPU)
		perf_output_put(&handle, cpu_entry);

	if (sample_type & PERF_SAMPLE_PERIOD)
		perf_output_put(&handle, data->period);

	if (sample_type & PERF_SAMPLE_READ)
		perf_output_read(&handle, counter);

	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
		if (callchain)
			perf_output_copy(&handle, callchain, callchain_size);
		else {
			u64 nr = 0;
			perf_output_put(&handle, nr);
		}
	}

	if (sample_type & PERF_SAMPLE_RAW) {
		if (data->raw) {
			perf_output_put(&handle, data->raw->size);
			perf_output_copy(&handle, data->raw->data, data->raw->size);
		} else {
			/* placeholder raw block: size 4, payload 0 */
			struct {
				u32 size;
				u32 data;
			} raw = {
				.size = sizeof(u32),
				.data = 0,
			};
			perf_output_put(&handle, raw);
		}
	}

	perf_output_end(&handle);
}
2976
2977/*
2978 * read event
2979 */
2980
/*
 * Fixed part of a PERF_EVENT_READ record as written by
 * perf_counter_read_event(); the counter's read values follow it in the
 * output buffer.
 */
struct perf_read_event {
	struct perf_event_header header;

	u32 pid;	/* from perf_counter_pid() */
	u32 tid;	/* from perf_counter_tid() */
};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
3010
3011 perf_output_end(&handle);
3012}
3013
3014/*
3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
3018 */
3019
/*
 * Bookkeeping for one fork/exit notification.  'task' and 'task_ctx' are
 * kernel-side state; only the embedded 'event' is copied to the output
 * buffer (see perf_counter_task_output()).
 */
struct perf_task_event {
	struct task_struct *task;		/* task the event is about */
	struct perf_counter_context *task_ctx;	/* optional context; may be NULL */

	/* On-buffer record; the ids are filled in per receiving counter. */
	struct {
		struct perf_event_header header;

		u32 pid;
		u32 ppid;
		u32 tid;
		u32 ptid;
	} event;
};
3033
3034static void perf_counter_task_output(struct perf_counter *counter,
3035 struct perf_task_event *task_event)
3036{
3037 struct perf_output_handle handle;
3038 int size = task_event->event.header.size;
3039 struct task_struct *task = task_event->task;
3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3041
3042 if (ret)
3043 return;
3044
3045 task_event->event.pid = perf_counter_pid(counter, task);
3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
3050
3051 perf_output_put(&handle, task_event->event);
3052 perf_output_end(&handle);
3053}
3054
3055static int perf_counter_task_match(struct perf_counter *counter)
3056{
3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3058 return 1;
3059
3060 return 0;
3061}
3062
3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3064 struct perf_task_event *task_event)
3065{
3066 struct perf_counter *counter;
3067
3068 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3069 return;
3070
3071 rcu_read_lock();
3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3073 if (perf_counter_task_match(counter))
3074 perf_counter_task_output(counter, task_event);
3075 }
3076 rcu_read_unlock();
3077}
3078
3079static void perf_counter_task_event(struct perf_task_event *task_event)
3080{
3081 struct perf_cpu_context *cpuctx;
3082 struct perf_counter_context *ctx = task_event->task_ctx;
3083
3084 cpuctx = &get_cpu_var(perf_cpu_context);
3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3086 put_cpu_var(perf_cpu_context);
3087
3088 rcu_read_lock();
3089 if (!ctx)
3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3091 if (ctx)
3092 perf_counter_task_ctx(ctx, task_event);
3093 rcu_read_unlock();
3094}
3095
3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
3099{
3100 struct perf_task_event task_event;
3101
3102 if (!atomic_read(&nr_comm_counters) &&
3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
3105 return;
3106
3107 task_event = (struct perf_task_event){
3108 .task = task,
3109 .task_ctx = task_ctx,
3110 .event = {
3111 .header = {
3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3113 .misc = 0,
3114 .size = sizeof(task_event.event),
3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
3120 },
3121 };
3122
3123 perf_counter_task_event(&task_event);
3124}
3125
/*
 * Report @task as newly forked: emits a PERF_EVENT_FORK record with no
 * explicit task context, so the context attached to @task is looked up.
 */
void perf_counter_fork(struct task_struct *task)
{
	perf_counter_task(task, NULL, 1);
}
3130
3131/*
3132 * comm tracking
3133 */
3134
/*
 * Bookkeeping for one comm (task name) notification.  The embedded 'event'
 * is written to the output buffer, followed by comm_size bytes from 'comm'.
 */
struct perf_comm_event {
	struct task_struct *task;	/* task whose name is reported */
	char *comm;			/* name buffer, set by perf_counter_comm_event() */
	int comm_size;			/* name length incl. NUL, rounded up to u64 */

	/* On-buffer record; pid/tid are filled in per receiving counter. */
	struct {
		struct perf_event_header header;

		u32 pid;
		u32 tid;
	} event;
};
3147
3148static void perf_counter_comm_output(struct perf_counter *counter,
3149 struct perf_comm_event *comm_event)
3150{
3151 struct perf_output_handle handle;
3152 int size = comm_event->event.header.size;
3153 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3154
3155 if (ret)
3156 return;
3157
3158 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3159 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3160
3161 perf_output_put(&handle, comm_event->event);
3162 perf_output_copy(&handle, comm_event->comm,
3163 comm_event->comm_size);
3164 perf_output_end(&handle);
3165}
3166
3167static int perf_counter_comm_match(struct perf_counter *counter)
3168{
3169 if (counter->attr.comm)
3170 return 1;
3171
3172 return 0;
3173}
3174
3175static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3176 struct perf_comm_event *comm_event)
3177{
3178 struct perf_counter *counter;
3179
3180 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3181 return;
3182
3183 rcu_read_lock();
3184 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3185 if (perf_counter_comm_match(counter))
3186 perf_counter_comm_output(counter, comm_event);
3187 }
3188 rcu_read_unlock();
3189}
3190
3191static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3192{
3193 struct perf_cpu_context *cpuctx;
3194 struct perf_counter_context *ctx;
3195 unsigned int size;
3196 char comm[TASK_COMM_LEN];
3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
3201
3202 comm_event->comm = comm;
3203 comm_event->comm_size = size;
3204
3205 comm_event->event.header.size = sizeof(comm_event->event) + size;
3206
3207 cpuctx = &get_cpu_var(perf_cpu_context);
3208 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3209 put_cpu_var(perf_cpu_context);
3210
3211 rcu_read_lock();
3212 /*
3213 * doesn't really matter which of the child contexts the
3214 * events ends up in.
3215 */
3216 ctx = rcu_dereference(current->perf_counter_ctxp);
3217 if (ctx)
3218 perf_counter_comm_ctx(ctx, comm_event);
3219 rcu_read_unlock();
3220}
3221
3222void perf_counter_comm(struct task_struct *task)
3223{
3224 struct perf_comm_event comm_event;
3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
3229 if (!atomic_read(&nr_comm_counters))
3230 return;
3231
3232 comm_event = (struct perf_comm_event){
3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
3236 .event = {
3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
3244 },
3245 };
3246
3247 perf_counter_comm_event(&comm_event);
3248}
3249
3250/*
3251 * mmap tracking
3252 */
3253
/*
 * Bookkeeping for one mmap notification.  The embedded 'event' is written to
 * the output buffer, followed by file_size bytes from 'file_name'.
 */
struct perf_mmap_event {
	struct vm_area_struct *vma;	/* mapping being reported */

	const char *file_name;		/* set by perf_counter_mmap_event() */
	int file_size;			/* name length incl. NUL, rounded up to u64 */

	/* On-buffer record; pid/tid are filled in per receiving counter. */
	struct {
		struct perf_event_header header;

		u32 pid;
		u32 tid;
		u64 start;		/* vma->vm_start */
		u64 len;		/* vma->vm_end - vma->vm_start */
		u64 pgoff;		/* vma->vm_pgoff */
	} event;
};
3270
3271static void perf_counter_mmap_output(struct perf_counter *counter,
3272 struct perf_mmap_event *mmap_event)
3273{
3274 struct perf_output_handle handle;
3275 int size = mmap_event->event.header.size;
3276 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3277
3278 if (ret)
3279 return;
3280
3281 mmap_event->event.pid = perf_counter_pid(counter, current);
3282 mmap_event->event.tid = perf_counter_tid(counter, current);
3283
3284 perf_output_put(&handle, mmap_event->event);
3285 perf_output_copy(&handle, mmap_event->file_name,
3286 mmap_event->file_size);
3287 perf_output_end(&handle);
3288}
3289
3290static int perf_counter_mmap_match(struct perf_counter *counter,
3291 struct perf_mmap_event *mmap_event)
3292{
3293 if (counter->attr.mmap)
3294 return 1;
3295
3296 return 0;
3297}
3298
3299static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3300 struct perf_mmap_event *mmap_event)
3301{
3302 struct perf_counter *counter;
3303
3304 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3305 return;
3306
3307 rcu_read_lock();
3308 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3309 if (perf_counter_mmap_match(counter, mmap_event))
3310 perf_counter_mmap_output(counter, mmap_event);
3311 }
3312 rcu_read_unlock();
3313}
3314
3315static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_cpu_context *cpuctx;
3318 struct perf_counter_context *ctx;
3319 struct vm_area_struct *vma = mmap_event->vma;
3320 struct file *file = vma->vm_file;
3321 unsigned int size;
3322 char tmp[16];
3323 char *buf = NULL;
3324 const char *name;
3325
3326 memset(tmp, 0, sizeof(tmp));
3327
3328 if (file) {
3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3335 if (!buf) {
3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
3337 goto got_name;
3338 }
3339 name = d_path(&file->f_path, buf, PATH_MAX);
3340 if (IS_ERR(name)) {
3341 name = strncpy(tmp, "//toolong", sizeof(tmp));
3342 goto got_name;
3343 }
3344 } else {
3345 if (arch_vma_name(mmap_event->vma)) {
3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
3348 goto got_name;
3349 }
3350
3351 if (!vma->vm_mm) {
3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3353 goto got_name;
3354 }
3355
3356 name = strncpy(tmp, "//anon", sizeof(tmp));
3357 goto got_name;
3358 }
3359
3360got_name:
3361 size = ALIGN(strlen(name)+1, sizeof(u64));
3362
3363 mmap_event->file_name = name;
3364 mmap_event->file_size = size;
3365
3366 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3367
3368 cpuctx = &get_cpu_var(perf_cpu_context);
3369 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3370 put_cpu_var(perf_cpu_context);
3371
3372 rcu_read_lock();
3373 /*
3374 * doesn't really matter which of the child contexts the
3375 * events ends up in.
3376 */
3377 ctx = rcu_dereference(current->perf_counter_ctxp);
3378 if (ctx)
3379 perf_counter_mmap_ctx(ctx, mmap_event);
3380 rcu_read_unlock();
3381
3382 kfree(buf);
3383}
3384
3385void __perf_counter_mmap(struct vm_area_struct *vma)
3386{
3387 struct perf_mmap_event mmap_event;
3388
3389 if (!atomic_read(&nr_mmap_counters))
3390 return;
3391
3392 mmap_event = (struct perf_mmap_event){
3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
3396 .event = {
3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
3404 .start = vma->vm_start,
3405 .len = vma->vm_end - vma->vm_start,
3406 .pgoff = vma->vm_pgoff,
3407 },
3408 };
3409
3410 perf_counter_mmap_event(&mmap_event);
3411}
3412
3413/*
3414 * IRQ throttle logging
3415 */
3416
3417static void perf_log_throttle(struct perf_counter *counter, int enable)
3418{
3419 struct perf_output_handle handle;
3420 int ret;
3421
3422 struct {
3423 struct perf_event_header header;
3424 u64 time;
3425 u64 id;
3426 u64 stream_id;
3427 } throttle_event = {
3428 .header = {
3429 .type = PERF_EVENT_THROTTLE,
3430 .misc = 0,
3431 .size = sizeof(throttle_event),
3432 },
3433 .time = sched_clock(),
3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
3436 };
3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3442 if (ret)
3443 return;
3444
3445 perf_output_put(&handle, throttle_event);
3446 perf_output_end(&handle);
3447}
3448
3449/*
3450 * Generic counter overflow handling, sampling.
3451 */
3452
3453int perf_counter_overflow(struct perf_counter *counter, int nmi,
3454 struct perf_sample_data *data)
3455{
3456 int events = atomic_read(&counter->event_limit);
3457 int throttle = counter->pmu->unthrottle != NULL;
3458 struct hw_perf_counter *hwc = &counter->hw;
3459 int ret = 0;
3460
3461 if (!throttle) {
3462 hwc->interrupts++;
3463 } else {
3464 if (hwc->interrupts != MAX_INTERRUPTS) {
3465 hwc->interrupts++;
3466 if (HZ * hwc->interrupts >
3467 (u64)sysctl_perf_counter_sample_rate) {
3468 hwc->interrupts = MAX_INTERRUPTS;
3469 perf_log_throttle(counter, 0);
3470 ret = 1;
3471 }
3472 } else {
3473 /*
3474 * Keep re-disabling counters even though on the previous
3475 * pass we disabled it - just in case we raced with a
3476 * sched-in and the counter got enabled again:
3477 */
3478 ret = 1;
3479 }
3480 }
3481
3482 if (counter->attr.freq) {
3483 u64 now = sched_clock();
3484 s64 delta = now - hwc->freq_stamp;
3485
3486 hwc->freq_stamp = now;
3487
3488 if (delta > 0 && delta < TICK_NSEC)
3489 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3490 }
3491
3492 /*
3493 * XXX event_limit might not quite work as expected on inherited
3494 * counters
3495 */
3496
3497 counter->pending_kill = POLL_IN;
3498 if (events && atomic_dec_and_test(&counter->event_limit)) {
3499 ret = 1;
3500 counter->pending_kill = POLL_HUP;
3501 if (nmi) {
3502 counter->pending_disable = 1;
3503 perf_pending_queue(&counter->pending,
3504 perf_pending_counter);
3505 } else
3506 perf_counter_disable(counter);
3507 }
3508
3509 perf_counter_output(counter, nmi, data);
3510 return ret;
3511}
3512
3513/*
3514 * Generic software counter infrastructure
3515 */
3516
3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
3521 * sign as trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3525{
3526 struct hw_perf_counter *hwc = &counter->hw;
3527 u64 period = hwc->last_period;
3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3532
3533again:
3534 old = val = atomic64_read(&hwc->period_left);
3535 if (val < 0)
3536 return 0;
3537
3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3543
3544 return nr;
3545}
3546
3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3549{
3550 struct hw_perf_counter *hwc = &counter->hw;
3551 u64 overflow;
3552
3553 data->period = counter->hw.last_period;
3554 overflow = perf_swcounter_set_period(counter);
3555
3556 if (hwc->interrupts == MAX_INTERRUPTS)
3557 return;
3558
3559 for (; overflow; overflow--) {
3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3568}
3569
/*
 * ->unthrottle callback for generic software counters.  Intentionally empty;
 * its presence is what makes perf_counter_overflow() apply throttling to
 * these counters (it tests pmu->unthrottle != NULL).
 */
static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
	/*
	 * Nothing to do, we already reset hwc->interrupts.
	 */
}
3576
3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3578 int nmi, struct perf_sample_data *data)
3579{
3580 struct hw_perf_counter *hwc = &counter->hw;
3581
3582 atomic64_add(nr, &counter->count);
3583
3584 if (!hwc->sample_period)
3585 return;
3586
3587 if (!data->regs)
3588 return;
3589
3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3591 perf_swcounter_overflow(counter, nmi, data);
3592}
3593
3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3595{
3596 /*
3597 * The counter is active, we're good!
3598 */
3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3600 return 1;
3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3606 return 0;
3607
3608 /*
3609 * The counter is inactive, if the context is active
3610 * we're part of a group that didn't make it on the 'pmu',
3611 * not counting.
3612 */
3613 if (counter->ctx->is_active)
3614 return 0;
3615
3616 /*
3617 * We're inactive and the context is too, this means the
3618 * task is scheduled out, we're counting events that happen
3619 * to us, like migration events.
3620 */
3621 return 1;
3622}
3623
3624static int perf_swcounter_match(struct perf_counter *counter,
3625 enum perf_type_id type,
3626 u32 event, struct pt_regs *regs)
3627{
3628 if (!perf_swcounter_is_counting(counter))
3629 return 0;
3630
3631 if (counter->attr.type != type)
3632 return 0;
3633 if (counter->attr.config != event)
3634 return 0;
3635
3636 if (regs) {
3637 if (counter->attr.exclude_user && user_mode(regs))
3638 return 0;
3639
3640 if (counter->attr.exclude_kernel && !user_mode(regs))
3641 return 0;
3642 }
3643
3644 return 1;
3645}
3646
3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3648 enum perf_type_id type,
3649 u32 event, u64 nr, int nmi,
3650 struct perf_sample_data *data)
3651{
3652 struct perf_counter *counter;
3653
3654 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3655 return;
3656
3657 rcu_read_lock();
3658 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3659 if (perf_swcounter_match(counter, type, event, data->regs))
3660 perf_swcounter_add(counter, nr, nmi, data);
3661 }
3662 rcu_read_unlock();
3663}
3664
3665static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3666{
3667 if (in_nmi())
3668 return &cpuctx->recursion[3];
3669
3670 if (in_irq())
3671 return &cpuctx->recursion[2];
3672
3673 if (in_softirq())
3674 return &cpuctx->recursion[1];
3675
3676 return &cpuctx->recursion[0];
3677}
3678
3679static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3680 u64 nr, int nmi,
3681 struct perf_sample_data *data)
3682{
3683 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3684 int *recursion = perf_swcounter_recursion_context(cpuctx);
3685 struct perf_counter_context *ctx;
3686
3687 if (*recursion)
3688 goto out;
3689
3690 (*recursion)++;
3691 barrier();
3692
3693 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3694 nr, nmi, data);
3695 rcu_read_lock();
3696 /*
3697 * doesn't really matter which of the child contexts the
3698 * events ends up in.
3699 */
3700 ctx = rcu_dereference(current->perf_counter_ctxp);
3701 if (ctx)
3702 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3703 rcu_read_unlock();
3704
3705 barrier();
3706 (*recursion)--;
3707
3708out:
3709 put_cpu_var(perf_cpu_context);
3710}
3711
3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3713 struct pt_regs *regs, u64 addr)
3714{
3715 struct perf_sample_data data = {
3716 .regs = regs,
3717 .addr = addr,
3718 };
3719
3720 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3721}
3722
/*
 * ->read callback for generic software counters.  Intentionally empty:
 * perf_swcounter_add() updates counter->count directly, so there is nothing
 * to refresh here.
 */
static void perf_swcounter_read(struct perf_counter *counter)
{
}
3726
3727static int perf_swcounter_enable(struct perf_counter *counter)
3728{
3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3735 return 0;
3736}
3737
/*
 * ->disable callback for generic software counters.  Intentionally empty.
 */
static void perf_swcounter_disable(struct perf_counter *counter)
{
}
3741
/*
 * PMU operations shared by the event-driven software counters and by
 * tracepoint counters.  Providing ->unthrottle opts these counters into the
 * throttling logic of perf_counter_overflow().
 */
static const struct pmu perf_ops_generic = {
	.enable = perf_swcounter_enable,
	.disable = perf_swcounter_disable,
	.read = perf_swcounter_read,
	.unthrottle = perf_swcounter_unthrottle,
};
3748
3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3785 * Software counter: cpu wall time clock
3786 */
3787
3788static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3789{
3790 int cpu = raw_smp_processor_id();
3791 s64 prev;
3792 u64 now;
3793
3794 now = cpu_clock(cpu);
3795 prev = atomic64_read(&counter->hw.prev_count);
3796 atomic64_set(&counter->hw.prev_count, now);
3797 atomic64_add(now - prev, &counter->count);
3798}
3799
3800static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3801{
3802 struct hw_perf_counter *hwc = &counter->hw;
3803 int cpu = raw_smp_processor_id();
3804
3805 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3806 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3807 hwc->hrtimer.function = perf_swcounter_hrtimer;
3808 if (hwc->sample_period) {
3809 u64 period = max_t(u64, 10000, hwc->sample_period);
3810 __hrtimer_start_range_ns(&hwc->hrtimer,
3811 ns_to_ktime(period), 0,
3812 HRTIMER_MODE_REL, 0);
3813 }
3814
3815 return 0;
3816}
3817
3818static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3819{
3820 if (counter->hw.sample_period)
3821 hrtimer_cancel(&counter->hw.hrtimer);
3822 cpu_clock_perf_counter_update(counter);
3823}
3824
/*
 * ->read callback for the cpu clock counter: brings counter->count up to
 * date with the current cpu_clock().
 */
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}
3829
/*
 * PMU operations for the cpu wall time clock counter.  No ->unthrottle is
 * provided, so perf_counter_overflow() never throttles these counters.
 */
static const struct pmu perf_ops_cpu_clock = {
	.enable = cpu_clock_perf_counter_enable,
	.disable = cpu_clock_perf_counter_disable,
	.read = cpu_clock_perf_counter_read,
};
3835
3836/*
3837 * Software counter: task time clock
3838 */
3839
3840static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3841{
3842 u64 prev;
3843 s64 delta;
3844
3845 prev = atomic64_xchg(&counter->hw.prev_count, now);
3846 delta = now - prev;
3847 atomic64_add(delta, &counter->count);
3848}
3849
3850static int task_clock_perf_counter_enable(struct perf_counter *counter)
3851{
3852 struct hw_perf_counter *hwc = &counter->hw;
3853 u64 now;
3854
3855 now = counter->ctx->time;
3856
3857 atomic64_set(&hwc->prev_count, now);
3858 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3859 hwc->hrtimer.function = perf_swcounter_hrtimer;
3860 if (hwc->sample_period) {
3861 u64 period = max_t(u64, 10000, hwc->sample_period);
3862 __hrtimer_start_range_ns(&hwc->hrtimer,
3863 ns_to_ktime(period), 0,
3864 HRTIMER_MODE_REL, 0);
3865 }
3866
3867 return 0;
3868}
3869
3870static void task_clock_perf_counter_disable(struct perf_counter *counter)
3871{
3872 if (counter->hw.sample_period)
3873 hrtimer_cancel(&counter->hw.hrtimer);
3874 task_clock_perf_counter_update(counter, counter->ctx->time);
3875
3876}
3877
3878static void task_clock_perf_counter_read(struct perf_counter *counter)
3879{
3880 u64 time;
3881
3882 if (!in_nmi()) {
3883 update_context_time(counter->ctx);
3884 time = counter->ctx->time;
3885 } else {
3886 u64 now = perf_clock();
3887 u64 delta = now - counter->ctx->timestamp;
3888 time = counter->ctx->time + delta;
3889 }
3890
3891 task_clock_perf_counter_update(counter, time);
3892}
3893
/*
 * PMU operations for the task time clock counter.  No ->unthrottle is
 * provided, so perf_counter_overflow() never throttles these counters.
 */
static const struct pmu perf_ops_task_clock = {
	.enable = task_clock_perf_counter_enable,
	.disable = task_clock_perf_counter_disable,
	.read = task_clock_perf_counter_read,
};
3899
3900#ifdef CONFIG_EVENT_PROFILE
3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3903{
3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3909 struct perf_sample_data data = {
3910 .regs = get_irq_regs(),
3911 .addr = addr,
3912 .raw = &raw,
3913 };
3914
3915 if (!data.regs)
3916 data.regs = task_pt_regs(current);
3917
3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3919}
3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3921
3922extern int ftrace_profile_enable(int);
3923extern void ftrace_profile_disable(int);
3924
/*
 * ->destroy callback for tracepoint counters: undoes the
 * ftrace_profile_enable() done in tp_perf_counter_init() for the tracepoint
 * id stored in attr.config.
 */
static void tp_perf_counter_destroy(struct perf_counter *counter)
{
	ftrace_profile_disable(counter->attr.config);
}
3929
3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3931{
3932 /*
3933 * Raw tracepoint data is a severe data leak, only allow root to
3934 * have these.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3939
3940 if (ftrace_profile_enable(counter->attr.config))
3941 return NULL;
3942
3943 counter->destroy = tp_perf_counter_destroy;
3944
3945 return &perf_ops_generic;
3946}
3947#else
/*
 * Without CONFIG_EVENT_PROFILE tracepoint counters are unavailable: always
 * returns NULL, which perf_counter_alloc() turns into -EINVAL.
 */
static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
{
	return NULL;
}
3952#endif
3953
/*
 * Per software event reference counts, indexed by the PERF_COUNT_SW_* id.
 * Incremented in sw_perf_counter_init() for top-level generic software
 * counters and decremented again in sw_perf_counter_destroy().
 */
atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3966{
3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3969
3970 /*
3971 * Software counters (currently) can't in general distinguish
3972 * between user, kernel and hypervisor events.
3973 * However, context switches and cpu migrations are considered
3974 * to be kernel events, and page faults are never hypervisor
3975 * events.
3976 */
3977 switch (event) {
3978 case PERF_COUNT_SW_CPU_CLOCK:
3979 pmu = &perf_ops_cpu_clock;
3980
3981 break;
3982 case PERF_COUNT_SW_TASK_CLOCK:
3983 /*
3984 * If the user instantiates this as a per-cpu counter,
3985 * use the cpu_clock counter instead.
3986 */
3987 if (counter->ctx->task)
3988 pmu = &perf_ops_task_clock;
3989 else
3990 pmu = &perf_ops_cpu_clock;
3991
3992 break;
3993 case PERF_COUNT_SW_PAGE_FAULTS:
3994 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
4002 pmu = &perf_ops_generic;
4003 break;
4004 }
4005
4006 return pmu;
4007}
4008
4009/*
4010 * Allocate and initialize a counter structure
4011 */
4012static struct perf_counter *
4013perf_counter_alloc(struct perf_counter_attr *attr,
4014 int cpu,
4015 struct perf_counter_context *ctx,
4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
4018 gfp_t gfpflags)
4019{
4020 const struct pmu *pmu;
4021 struct perf_counter *counter;
4022 struct hw_perf_counter *hwc;
4023 long err;
4024
4025 counter = kzalloc(sizeof(*counter), gfpflags);
4026 if (!counter)
4027 return ERR_PTR(-ENOMEM);
4028
4029 /*
4030 * Single counters are their own group leaders, with an
4031 * empty sibling list:
4032 */
4033 if (!group_leader)
4034 group_leader = counter;
4035
4036 mutex_init(&counter->child_mutex);
4037 INIT_LIST_HEAD(&counter->child_list);
4038
4039 INIT_LIST_HEAD(&counter->list_entry);
4040 INIT_LIST_HEAD(&counter->event_entry);
4041 INIT_LIST_HEAD(&counter->sibling_list);
4042 init_waitqueue_head(&counter->waitq);
4043
4044 mutex_init(&counter->mmap_mutex);
4045
4046 counter->cpu = cpu;
4047 counter->attr = *attr;
4048 counter->group_leader = group_leader;
4049 counter->pmu = NULL;
4050 counter->ctx = ctx;
4051 counter->oncpu = -1;
4052
4053 counter->parent = parent_counter;
4054
4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4056 counter->id = atomic64_inc_return(&perf_counter_id);
4057
4058 counter->state = PERF_COUNTER_STATE_INACTIVE;
4059
4060 if (attr->disabled)
4061 counter->state = PERF_COUNTER_STATE_OFF;
4062
4063 pmu = NULL;
4064
4065 hwc = &counter->hw;
4066 hwc->sample_period = attr->sample_period;
4067 if (attr->freq && attr->sample_freq)
4068 hwc->sample_period = 1;
4069 hwc->last_period = hwc->sample_period;
4070
4071 atomic64_set(&hwc->period_left, hwc->sample_period);
4072
4073 /*
4074 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4075 */
4076 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4077 goto done;
4078
4079 switch (attr->type) {
4080 case PERF_TYPE_RAW:
4081 case PERF_TYPE_HARDWARE:
4082 case PERF_TYPE_HW_CACHE:
4083 pmu = hw_perf_counter_init(counter);
4084 break;
4085
4086 case PERF_TYPE_SOFTWARE:
4087 pmu = sw_perf_counter_init(counter);
4088 break;
4089
4090 case PERF_TYPE_TRACEPOINT:
4091 pmu = tp_perf_counter_init(counter);
4092 break;
4093
4094 default:
4095 break;
4096 }
4097done:
4098 err = 0;
4099 if (!pmu)
4100 err = -EINVAL;
4101 else if (IS_ERR(pmu))
4102 err = PTR_ERR(pmu);
4103
4104 if (err) {
4105 if (counter->ns)
4106 put_pid_ns(counter->ns);
4107 kfree(counter);
4108 return ERR_PTR(err);
4109 }
4110
4111 counter->pmu = pmu;
4112
4113 if (!counter->parent) {
4114 atomic_inc(&nr_counters);
4115 if (counter->attr.mmap)
4116 atomic_inc(&nr_mmap_counters);
4117 if (counter->attr.comm)
4118 atomic_inc(&nr_comm_counters);
4119 if (counter->attr.task)
4120 atomic_inc(&nr_task_counters);
4121 }
4122
4123 return counter;
4124}
4125
4126static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4127 struct perf_counter_attr *attr)
4128{
4129 int ret;
4130 u32 size;
4131
4132 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4133 return -EFAULT;
4134
4135 /*
4136 * zero the full structure, so that a short copy will be nice.
4137 */
4138 memset(attr, 0, sizeof(*attr));
4139
4140 ret = get_user(size, &uattr->size);
4141 if (ret)
4142 return ret;
4143
4144 if (size > PAGE_SIZE) /* silly large */
4145 goto err_size;
4146
4147 if (!size) /* abi compat */
4148 size = PERF_ATTR_SIZE_VER0;
4149
4150 if (size < PERF_ATTR_SIZE_VER0)
4151 goto err_size;
4152
4153 /*
4154 * If we're handed a bigger struct than we know of,
4155 * ensure all the unknown bits are 0.
4156 */
4157 if (size > sizeof(*attr)) {
4158 unsigned long val;
4159 unsigned long __user *addr;
4160 unsigned long __user *end;
4161
4162 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4163 sizeof(unsigned long));
4164 end = PTR_ALIGN((void __user *)uattr + size,
4165 sizeof(unsigned long));
4166
4167 for (; addr < end; addr += sizeof(unsigned long)) {
4168 ret = get_user(val, addr);
4169 if (ret)
4170 return ret;
4171 if (val)
4172 goto err_size;
4173 }
4174 }
4175
4176 ret = copy_from_user(attr, uattr, size);
4177 if (ret)
4178 return -EFAULT;
4179
4180 /*
4181 * If the type exists, the corresponding creation will verify
4182 * the attr->config.
4183 */
4184 if (attr->type >= PERF_TYPE_MAX)
4185 return -EINVAL;
4186
4187 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4188 return -EINVAL;
4189
4190 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4191 return -EINVAL;
4192
4193 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4194 return -EINVAL;
4195
4196out:
4197 return ret;
4198
4199err_size:
4200 put_user(sizeof(*attr), &uattr->size);
4201 ret = -E2BIG;
4202 goto out;
4203}
4204
4205/**
4206 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4207 *
4208 * @attr_uptr: event type attributes for monitoring/sampling
4209 * @pid: target pid
4210 * @cpu: target cpu
4211 * @group_fd: group leader counter fd
4212 */
4213SYSCALL_DEFINE5(perf_counter_open,
4214 struct perf_counter_attr __user *, attr_uptr,
4215 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4216{
4217 struct perf_counter *counter, *group_leader;
4218 struct perf_counter_attr attr;
4219 struct perf_counter_context *ctx;
4220 struct file *counter_file = NULL;
4221 struct file *group_file = NULL;
4222 int fput_needed = 0;
4223 int fput_needed2 = 0;
4224 int ret;
4225
4226 /* for future expandability... */
4227 if (flags)
4228 return -EINVAL;
4229
4230 ret = perf_copy_attr(attr_uptr, &attr);
4231 if (ret)
4232 return ret;
4233
/* Counting kernel-mode events may require CAP_SYS_ADMIN, depending on the paranoia setting. */
4234 if (!attr.exclude_kernel) {
4235 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4236 return -EACCES;
4237 }
4238
/* Frequency mode: the requested rate is capped by the sample-rate sysctl. */
4239 if (attr.freq) {
4240 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4241 return -EINVAL;
4242 }
4243
4244 /*
4245 * Get the target context (task or percpu):
4246 */
4247 ctx = find_get_context(pid, cpu);
4248 if (IS_ERR(ctx))
4249 return PTR_ERR(ctx);
4250
4251 /*
4252 * Look up the group leader (we will attach this counter to it):
4253 */
4254 group_leader = NULL;
4255 if (group_fd != -1) {
/* Every failure inside this block returns -EINVAL. */
4256 ret = -EINVAL;
4257 group_file = fget_light(group_fd, &fput_needed);
4258 if (!group_file)
4259 goto err_put_context;
/* The fd must refer to a perf counter file. */
4260 if (group_file->f_op != &perf_fops)
4261 goto err_put_context;
4262
4263 group_leader = group_file->private_data;
4264 /*
4265 * Do not allow a recursive hierarchy (this new sibling
4266 * becoming part of another group-sibling):
4267 */
4268 if (group_leader->group_leader != group_leader)
4269 goto err_put_context;
4270 /*
4271 * Do not allow to attach to a group in a different
4272 * task or CPU context:
4273 */
4274 if (group_leader->ctx != ctx)
4275 goto err_put_context;
4276 /*
4277 * Only a group leader can be exclusive or pinned
4278 */
4279 if (attr.exclusive || attr.pinned)
4280 goto err_put_context;
4281 }
4282
4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4284 NULL, GFP_KERNEL);
4285 ret = PTR_ERR(counter);
4286 if (IS_ERR(counter))
4287 goto err_put_context;
4288
/* On success ret is the new file descriptor, which is also the syscall's return value. */
4289 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4290 if (ret < 0)
4291 goto err_free_put_context;
4292
/*
 * NOTE(review): if this lookup fails, ret still holds the non-negative fd
 * from anon_inode_getfd(), so the error path below would return it as if
 * the call had succeeded.  The counter is also kfree()d there although the
 * file just installed was handed the counter pointer and presumably still
 * references it - confirm against perf_release().
 */
4293 counter_file = fget_light(ret, &fput_needed2);
4294 if (!counter_file)
4295 goto err_free_put_context;
4296
4297 counter->filp = counter_file;
4298 WARN_ON_ONCE(ctx->parent_ctx);
4299 mutex_lock(&ctx->mutex);
4300 perf_install_in_context(ctx, counter, cpu);
4301 ++ctx->generation;
4302 mutex_unlock(&ctx->mutex);
4303
/* Record the creating task as owner and link the counter into its list. */
4304 counter->owner = current;
4305 get_task_struct(current);
4306 mutex_lock(&current->perf_counter_mutex);
4307 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4308 mutex_unlock(&current->perf_counter_mutex);
4309
4310 fput_light(counter_file, fput_needed2);
4311
4312out_fput:
4313 fput_light(group_file, fput_needed);
4314
4315 return ret;
4316
/*
 * NOTE(review): perf_counter_alloc() took a pid namespace reference, bumped
 * the global nr_*_counters counts and may have installed counter->destroy;
 * a bare kfree() undoes none of that.  Check whether free_counter() (or an
 * equivalent) should be used here, and what it does to the ctx reference.
 */
4317err_free_put_context:
4318 kfree(counter);
4319
4320err_put_context:
4321 put_ctx(ctx);
4322
4323 goto out_fput;
4324}
4325
4326/*
4327 * inherit a counter from parent task to child task:
4328 */
4329static struct perf_counter *
4330inherit_counter(struct perf_counter *parent_counter,
4331 struct task_struct *parent,
4332 struct perf_counter_context *parent_ctx,
4333 struct task_struct *child,
4334 struct perf_counter *group_leader,
4335 struct perf_counter_context *child_ctx)
4336{
4337 struct perf_counter *child_counter;
4338
4339 /*
4340 * Instead of creating recursive hierarchies of counters,
4341 * we link inherited counters back to the original parent,
4342 * which has a filp for sure, which we use as the reference
4343 * count:
4344 */
4345 if (parent_counter->parent)
4346 parent_counter = parent_counter->parent;
4347
4348 child_counter = perf_counter_alloc(&parent_counter->attr,
4349 parent_counter->cpu, child_ctx,
4350 group_leader, parent_counter,
4351 GFP_KERNEL);
4352 if (IS_ERR(child_counter))
4353 return child_counter;
4354 get_ctx(child_ctx);
4355
4356 /*
4357 * Make the child state follow the state of the parent counter,
4358 * not its attr.disabled bit. We hold the parent's mutex,
4359 * so we won't race with perf_counter_{en, dis}able_family.
4360 */
4361 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4362 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4363 else
4364 child_counter->state = PERF_COUNTER_STATE_OFF;
4365
4366 if (parent_counter->attr.freq)
4367 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4368
4369 /*
4370 * Link it up in the child's context:
4371 */
4372 add_counter_to_ctx(child_counter, child_ctx);
4373
4374 /*
4375 * Get a reference to the parent filp - we will fput it
4376 * when the child counter exits. This is safe to do because
4377 * we are in the parent and we know that the filp still
4378 * exists and has a nonzero count:
4379 */
4380 atomic_long_inc(&parent_counter->filp->f_count);
4381
4382 /*
4383 * Link this into the parent counter's child list
4384 */
4385 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4386 mutex_lock(&parent_counter->child_mutex);
4387 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4388 mutex_unlock(&parent_counter->child_mutex);
4389
4390 return child_counter;
4391}
4392
4393static int inherit_group(struct perf_counter *parent_counter,
4394 struct task_struct *parent,
4395 struct perf_counter_context *parent_ctx,
4396 struct task_struct *child,
4397 struct perf_counter_context *child_ctx)
4398{
4399 struct perf_counter *leader;
4400 struct perf_counter *sub;
4401 struct perf_counter *child_ctr;
4402
4403 leader = inherit_counter(parent_counter, parent, parent_ctx,
4404 child, NULL, child_ctx);
4405 if (IS_ERR(leader))
4406 return PTR_ERR(leader);
4407 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4408 child_ctr = inherit_counter(sub, parent, parent_ctx,
4409 child, leader, child_ctx);
4410 if (IS_ERR(child_ctr))
4411 return PTR_ERR(child_ctr);
4412 }
4413 return 0;
4414}
4415
4416static void sync_child_counter(struct perf_counter *child_counter,
4417 struct task_struct *child)
4418{
4419 struct perf_counter *parent_counter = child_counter->parent;
4420 u64 child_val;
4421
4422 if (child_counter->attr.inherit_stat)
4423 perf_counter_read_event(child_counter, child);
4424
4425 child_val = atomic64_read(&child_counter->count);
4426
4427 /*
4428 * Add back the child's count to the parent's count:
4429 */
4430 atomic64_add(child_val, &parent_counter->count);
4431 atomic64_add(child_counter->total_time_enabled,
4432 &parent_counter->child_total_time_enabled);
4433 atomic64_add(child_counter->total_time_running,
4434 &parent_counter->child_total_time_running);
4435
4436 /*
4437 * Remove this counter from the parent's list
4438 */
4439 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4440 mutex_lock(&parent_counter->child_mutex);
4441 list_del_init(&child_counter->child_list);
4442 mutex_unlock(&parent_counter->child_mutex);
4443
4444 /*
4445 * Release the parent counter, if this was the last
4446 * reference to it.
4447 */
4448 fput(parent_counter->filp);
4449}
4450
4451static void
4452__perf_counter_exit_task(struct perf_counter *child_counter,
4453 struct perf_counter_context *child_ctx,
4454 struct task_struct *child)
4455{
4456 struct perf_counter *parent_counter;
4457
4458 update_counter_times(child_counter);
4459 perf_counter_remove_from_context(child_counter);
4460
4461 parent_counter = child_counter->parent;
4462 /*
4463 * It can happen that parent exits first, and has counters
4464 * that are still around due to the child reference. These
4465 * counters need to be zapped - but otherwise linger.
4466 */
4467 if (parent_counter) {
4468 sync_child_counter(child_counter, child);
4469 free_counter(child_counter);
4470 }
4471}
4472
4473/*
4474 * When a child task exits, feed back counter values to parent counters.
4475 */
4476void perf_counter_exit_task(struct task_struct *child)
4477{
4478 struct perf_counter *child_counter, *tmp;
4479 struct perf_counter_context *child_ctx;
4480 unsigned long flags;
4481
/*
 * Common case: the task has no counter context, so there is nothing to
 * tear down (presumably perf_counter_task() only reports the exit here -
 * confirm against its definition).
 */
4482 if (likely(!child->perf_counter_ctxp)) {
4483 perf_counter_task(child, NULL, 0);
4484 return;
4485 }
4486
4487 local_irq_save(flags);
4488 /*
4489 * We can't reschedule here because interrupts are disabled,
4490 * and either child is current or it is a task that can't be
4491 * scheduled, so we are now safe from rescheduling changing
4492 * our context.
4493 */
4494 child_ctx = child->perf_counter_ctxp;
4495 __perf_counter_task_sched_out(child_ctx);
4496
4497 /*
4498 * Take the context lock here so that if find_get_context is
4499 * reading child->perf_counter_ctxp, we wait until it has
4500 * incremented the context's refcount before we do put_ctx below.
4501 */
4502 spin_lock(&child_ctx->lock);
4503 child->perf_counter_ctxp = NULL;
4504 /*
4505 * If this context is a clone; unclone it so it can't get
4506 * swapped to another process while we're removing all
4507 * the counters from it.
4508 */
4509 unclone_ctx(child_ctx);
/* Releases the lock and restores the interrupt state saved by local_irq_save() above. */
4510 spin_unlock_irqrestore(&child_ctx->lock, flags);
4511
4512 /*
4513 * Report the task dead after unscheduling the counters so that we
4514 * won't get any samples after PERF_EVENT_EXIT. We can however still
4515 * get a few PERF_EVENT_READ events.
4516 */
4517 perf_counter_task(child, child_ctx, 0);
4518
4519 /*
4520 * We can recurse on the same lock type through:
4521 *
4522 * __perf_counter_exit_task()
4523 * sync_child_counter()
4524 * fput(parent_counter->filp)
4525 * perf_release()
4526 * mutex_lock(&ctx->mutex)
4527 *
4528 * But since its the parent context it won't be the same instance.
4529 */
4530 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4531
4532again:
4533 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4534 list_entry)
4535 __perf_counter_exit_task(child_counter, child_ctx, child);
4536
4537 /*
4538 * If the last counter was a group counter, it will have appended all
4539 * its siblings to the list, but we obtained 'tmp' before that which
4540 * will still point to the list head terminating the iteration.
4541 */
4542 if (!list_empty(&child_ctx->counter_list))
4543 goto again;
4544
4545 mutex_unlock(&child_ctx->mutex);
4546
/* Presumably drops the reference that child->perf_counter_ctxp (cleared above) held - confirm. */
4547 put_ctx(child_ctx);
4548}
4549
4550/*
4551 * free an unexposed, unused context as created by inheritance by
4552 * init_task below, used by fork() in case of fail.
4553 */
4554void perf_counter_free_task(struct task_struct *task)
4555{
4556 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4557 struct perf_counter *counter, *tmp;
4558
4559 if (!ctx)
4560 return;
4561
4562 mutex_lock(&ctx->mutex);
4563again:
4564 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4565 struct perf_counter *parent = counter->parent;
4566
4567 if (WARN_ON_ONCE(!parent))
4568 continue;
4569
4570 mutex_lock(&parent->child_mutex);
4571 list_del_init(&counter->child_list);
4572 mutex_unlock(&parent->child_mutex);
4573
4574 fput(parent->filp);
4575
4576 list_del_counter(counter, ctx);
4577 free_counter(counter);
4578 }
4579
4580 if (!list_empty(&ctx->counter_list))
4581 goto again;
4582
4583 mutex_unlock(&ctx->mutex);
4584
4585 put_ctx(ctx);
4586}
4587
4588/*
4589 * Initialize the perf_counter context in task_struct
4590 */
/*
 * Called on fork for the new task @child (current is the parent): sets up
 * the child's perf fields and, if the parent has a counter context, creates
 * a child context and inherits every group whose leader has attr.inherit
 * set.  Returns 0 on success, -ENOMEM if the context cannot be allocated,
 * or the error from inherit_group().
 */
4591int perf_counter_init_task(struct task_struct *child)
4592{
4593 struct perf_counter_context *child_ctx, *parent_ctx;
4594 struct perf_counter_context *cloned_ctx;
4595 struct perf_counter *counter;
4596 struct task_struct *parent = current;
4597 int inherited_all = 1;
4598 int ret = 0;
4599
4600 child->perf_counter_ctxp = NULL;
4601
4602 mutex_init(&child->perf_counter_mutex);
4603 INIT_LIST_HEAD(&child->perf_counter_list);
4604
/* Common case: the parent has no counters, so there is nothing to inherit. */
4605 if (likely(!parent->perf_counter_ctxp))
4606 return 0;
4607
4608 /*
4609 * This is executed from the parent task context, so inherit
4610 * counters that have been marked for cloning.
4611 * First allocate and initialize a context for the child.
4612 */
4613
4614 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4615 if (!child_ctx)
4616 return -ENOMEM;
4617
4618 __perf_counter_init_context(child_ctx, child);
4619 child->perf_counter_ctxp = child_ctx;
/* Presumably pairs with the put_task_struct(ctx->task) done when the context is released - confirm. */
4620 get_task_struct(child);
4621
4622 /*
4623 * If the parent's context is a clone, pin it so it won't get
4624 * swapped under us.
4625 */
4626 parent_ctx = perf_pin_task_context(parent);
4627
4628 /*
4629 * No need to check if parent_ctx != NULL here; since we saw
4630 * it non-NULL earlier, the only reason for it to become NULL
4631 * is if we exit, and since we're currently in the middle of
4632 * a fork we can't be exiting at the same time.
4633 */
4634
4635 /*
4636 * Lock the parent list. No need to lock the child - not PID
4637 * hashed yet and not running, so nobody can access it.
4638 */
4639 mutex_lock(&parent_ctx->mutex);
4640
4641 /*
4642 * We dont have to disable NMIs - we are only looking at
4643 * the list, not manipulating it:
4644 */
4645 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
/* Siblings are inherited together with their leader by inherit_group(). */
4646 if (counter != counter->group_leader)
4647 continue;
4648
4649 if (!counter->attr.inherit) {
4650 inherited_all = 0;
4651 continue;
4652 }
4653
4654 ret = inherit_group(counter, parent, parent_ctx,
4655 child, child_ctx);
/*
 * On failure the partially populated child_ctx stays attached to the child
 * and ret is returned; presumably the fork failure path cleans it up via
 * perf_counter_free_task() - verify against the caller.
 */
4656 if (ret) {
4657 inherited_all = 0;
4658 break;
4659 }
4660 }
4661
4662 if (inherited_all) {
4663 /*
4664 * Mark the child context as a clone of the parent
4665 * context, or of whatever the parent is a clone of.
4666 * Note that if the parent is a clone, it could get
4667 * uncloned at any point, but that doesn't matter
4668 * because the list of counters and the generation
4669 * count can't have changed since we took the mutex.
4670 */
4671 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4672 if (cloned_ctx) {
4673 child_ctx->parent_ctx = cloned_ctx;
4674 child_ctx->parent_gen = parent_ctx->parent_gen;
4675 } else {
4676 child_ctx->parent_ctx = parent_ctx;
4677 child_ctx->parent_gen = parent_ctx->generation;
4678 }
4679 get_ctx(child_ctx->parent_ctx);
4680 }
4681
4682 mutex_unlock(&parent_ctx->mutex);
4683
/* Undo the pin (and the reference) taken by perf_pin_task_context() above. */
4684 perf_unpin_context(parent_ctx);
4685
4686 return ret;
4687}
4688
4689static void __cpuinit perf_counter_init_cpu(int cpu)
4690{
4691 struct perf_cpu_context *cpuctx;
4692
4693 cpuctx = &per_cpu(perf_cpu_context, cpu);
4694 __perf_counter_init_context(&cpuctx->ctx, NULL);
4695
4696 spin_lock(&perf_resource_lock);
4697 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4698 spin_unlock(&perf_resource_lock);
4699
4700 hw_perf_counter_setup(cpu);
4701}
4702
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Cross-call handler: remove every counter that is on the per-cpu
 * context of the CPU this runs on.  @info is unused.
 */
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_counter_context *ctx = &__get_cpu_var(perf_cpu_context).ctx;
	struct perf_counter *pos, *next;

	list_for_each_entry_safe(pos, next, &ctx->counter_list, list_entry)
		__perf_counter_remove_from_context(pos);
}

/*
 * Empty the per-cpu counter context of @cpu by running the handler above
 * on that CPU, with the context mutex held around the call.
 */
static void perf_counter_exit_cpu(int cpu)
{
	struct perf_counter_context *ctx = &per_cpu(perf_cpu_context, cpu).ctx;

	mutex_lock(&ctx->mutex);
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
	mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }
#endif
4725
4726static int __cpuinit
4727perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4728{
4729 unsigned int cpu = (long)hcpu;
4730
4731 switch (action) {
4732
4733 case CPU_UP_PREPARE:
4734 case CPU_UP_PREPARE_FROZEN:
4735 perf_counter_init_cpu(cpu);
4736 break;
4737
4738 case CPU_ONLINE:
4739 case CPU_ONLINE_FROZEN:
4740 hw_perf_counter_setup_online(cpu);
4741 break;
4742
4743 case CPU_DOWN_PREPARE:
4744 case CPU_DOWN_PREPARE_FROZEN:
4745 perf_counter_exit_cpu(cpu);
4746 break;
4747
4748 default:
4749 break;
4750 }
4751
4752 return NOTIFY_OK;
4753}
4754
4755/*
4756 * This has to have a higher priority than migration_notifier in sched.c.
4757 */
4758static struct notifier_block __cpuinitdata perf_cpu_nb = {
4759 .notifier_call = perf_cpu_notify,
/* Chosen to satisfy the ordering requirement stated above this definition. */
4760 .priority = 20,
4761};
4762
4763void __init perf_counter_init(void)
4764{
4765 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4766 (void *)(long)smp_processor_id());
4767 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4768 (void *)(long)smp_processor_id());
4769 register_cpu_notifier(&perf_cpu_nb);
4770}
4771
4772static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4773{
4774 return sprintf(buf, "%d\n", perf_reserved_percpu);
4775}
4776
4777static ssize_t
4778perf_set_reserve_percpu(struct sysdev_class *class,
4779 const char *buf,
4780 size_t count)
4781{
4782 struct perf_cpu_context *cpuctx;
4783 unsigned long val;
4784 int err, cpu, mpt;
4785
4786 err = strict_strtoul(buf, 10, &val);
4787 if (err)
4788 return err;
4789 if (val > perf_max_counters)
4790 return -EINVAL;
4791
4792 spin_lock(&perf_resource_lock);
4793 perf_reserved_percpu = val;
4794 for_each_online_cpu(cpu) {
4795 cpuctx = &per_cpu(perf_cpu_context, cpu);
4796 spin_lock_irq(&cpuctx->ctx.lock);
4797 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4798 perf_max_counters - perf_reserved_percpu);
4799 cpuctx->max_pertask = mpt;
4800 spin_unlock_irq(&cpuctx->ctx.lock);
4801 }
4802 spin_unlock(&perf_resource_lock);
4803
4804 return count;
4805}
4806
4807static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4808{
4809 return sprintf(buf, "%d\n", perf_overcommit);
4810}
4811
4812static ssize_t
4813perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4814{
4815 unsigned long val;
4816 int err;
4817
4818 err = strict_strtoul(buf, 10, &val);
4819 if (err)
4820 return err;
4821 if (val > 1)
4822 return -EINVAL;
4823
4824 spin_lock(&perf_resource_lock);
4825 perf_overcommit = val;
4826 spin_unlock(&perf_resource_lock);
4827
4828 return count;
4829}
4830
/* "reserve_percpu" attribute, mode 0644, backed by the show/set handlers named below. */
4831static SYSDEV_CLASS_ATTR(
4832 reserve_percpu,
4833 0644,
4834 perf_show_reserve_percpu,
4835 perf_set_reserve_percpu
4836 );
4837
/* "overcommit" attribute, mode 0644, backed by the show/set handlers named below. */
4838static SYSDEV_CLASS_ATTR(
4839 overcommit,
4840 0644,
4841 perf_show_overcommit,
4842 perf_set_overcommit
4843 );
4844
/* NULL-terminated list of the attributes that make up the group. */
4845static struct attribute *perfclass_attrs[] = {
4846 &attr_reserve_percpu.attr,
4847 &attr_overcommit.attr,
4848 NULL
4849};
4850
/* Attribute group named "perf_counters"; registered by perf_counter_sysfs_init(). */
4851static struct attribute_group perfclass_attr_group = {
4852 .attrs = perfclass_attrs,
4853 .name = "perf_counters",
4854};
4855
4856static int __init perf_counter_sysfs_init(void)
4857{
4858 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4859 &perfclass_attr_group);
4860}
4861device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..0f86feb6db0c
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_event.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU events:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
/* NOTE(review): presumably the maximum number of events per context, raised by arch code - confirm. */
38int perf_max_events __read_mostly = 1;
/* Events held back for per-cpu use; subtracted from perf_max_events when max_pertask is computed. */
39static int perf_reserved_percpu __read_mostly;
/* NOTE(review): overcommit switch, defaults to on; its consumers are outside this part of the file. */
40static int perf_overcommit __read_mostly = 1;
41
/*
 * NOTE(review): global event counts; presumably the total number of events
 * and the numbers with attr.mmap / attr.comm / attr.task set - confirm
 * against the allocation and free paths.
 */
42static atomic_t nr_events __read_mostly;
43static atomic_t nr_mmap_events __read_mostly;
44static atomic_t nr_comm_events __read_mostly;
45static atomic_t nr_task_events __read_mostly;
46
47/*
48 * perf event paranoia level:
49 * -1 - not paranoid at all
50 * 0 - disallow raw tracepoint access for unpriv
51 * 1 - disallow cpu events for unpriv
52 * 2 - disallow kernel profiling for unpriv
53 */
/* Defaults to level 1; tested by the perf_paranoid_*() helpers. */
54int sysctl_perf_event_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_event_paranoid > -1;
59}
60
61static inline bool perf_paranoid_cpu(void)
62{
63 return sysctl_perf_event_paranoid > 0;
64}
65
66static inline bool perf_paranoid_kernel(void)
67{
68 return sysctl_perf_event_paranoid > 1;
69}
70
71int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
72
73/*
74 * max perf event sample rate
75 */
76int sysctl_perf_event_sample_rate __read_mostly = 100000;
77
/* NOTE(review): presumably the source of unique event ids (see event->id) - confirm at the allocation site. */
78static atomic64_t perf_event_id;
79
80/*
81 * Lock for (sysadmin-configurable) event reservations:
82 */
83static DEFINE_SPINLOCK(perf_resource_lock);
84
85/*
86 * Architecture provided APIs - weak aliases:
87 */
/*
 * Default when the architecture provides no implementation: no pmu.
 * NOTE(review): 'extern' on a function definition is redundant here.
 */
88extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
89{
90 return NULL;
91}
92
/* Default global disable/enable hooks: only a compiler barrier. */
93void __weak hw_perf_disable(void) { barrier(); }
94void __weak hw_perf_enable(void) { barrier(); }
95
/* Default per-cpu setup hooks: only a compiler barrier. */
96void __weak hw_perf_event_setup(int cpu) { barrier(); }
97void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
98
/*
 * Default group scheduling hook: does nothing and returns 0.
 * NOTE(review): the meaning of the return value is defined by the caller,
 * which is outside this part of the file.
 */
99int __weak
100hw_perf_group_sched_in(struct perf_event *group_leader,
101 struct perf_cpu_context *cpuctx,
102 struct perf_event_context *ctx, int cpu)
103{
104 return 0;
105}
106
/* Default debug dump hook: prints nothing. */
107void __weak perf_event_print_debug(void) { }
108
109static DEFINE_PER_CPU(int, perf_disable_count);
110
111void __perf_disable(void)
112{
113 __get_cpu_var(perf_disable_count)++;
114}
115
116bool __perf_enable(void)
117{
118 return !--__get_cpu_var(perf_disable_count);
119}
120
/*
 * Disable perf on this CPU: bump the nesting count, then call the
 * hardware hook.  The hook runs on every call, nested or not.
 */
void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}
126
/*
 * Undo one perf_disable().  The hardware hook is only called when the
 * outermost disable is undone.
 */
void perf_enable(void)
{
	if (!__perf_enable())
		return;

	hw_perf_enable();
}
132
133static void get_ctx(struct perf_event_context *ctx)
134{
135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
136}
137
138static void free_ctx(struct rcu_head *head)
139{
140 struct perf_event_context *ctx;
141
142 ctx = container_of(head, struct perf_event_context, rcu_head);
143 kfree(ctx);
144}
145
146static void put_ctx(struct perf_event_context *ctx)
147{
148 if (atomic_dec_and_test(&ctx->refcount)) {
149 if (ctx->parent_ctx)
150 put_ctx(ctx->parent_ctx);
151 if (ctx->task)
152 put_task_struct(ctx->task);
153 call_rcu(&ctx->rcu_head, free_ctx);
154 }
155}
156
157static void unclone_ctx(struct perf_event_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit events we want to return the parent event id
167 * to userspace.
168 */
169static u64 primary_event_id(struct perf_event *event)
170{
171 u64 id = event->id;
172
173 if (event->parent)
174 id = event->parent->id;
175
176 return id;
177}
178
179/*
180 * Get the perf_event_context for a task and lock it.
181 * This has to cope with the fact that until it is locked,
182 * the context could get moved to another task.
 *
 * On success the context is returned with ctx->lock held, the previous
 * interrupt state saved in *flags, and an extra reference taken.
 * Returns NULL, with no lock held, if the task has no context or the
 * context's refcount had already dropped to zero.
183 */
184static struct perf_event_context *
185perf_lock_task_context(struct task_struct *task, unsigned long *flags)
186{
187 struct perf_event_context *ctx;
188
189 rcu_read_lock();
190 retry:
191 ctx = rcu_dereference(task->perf_event_ctxp);
192 if (ctx) {
193 /*
194 * If this context is a clone of another, it might
195 * get swapped for another underneath us by
196 * perf_event_task_sched_out, though the
197 * rcu_read_lock() protects us from any context
198 * getting freed. Lock the context and check if it
199 * got swapped before we could get the lock, and retry
200 * if so. If we locked the right context, then it
201 * can't get swapped on us any more.
202 */
203 spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry;
207 }
208
/* The context is already on its way to being freed: report "no context". */
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
213 }
214 rcu_read_unlock();
215 return ctx;
216}
217
218/*
219 * Get the context for a task and increment its pin_count so it
220 * can't get swapped to another task. This also increments its
221 * reference count so that the context can't get freed.
222 */
223static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
224{
225 struct perf_event_context *ctx;
226 unsigned long flags;
227
228 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) {
230 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags);
232 }
233 return ctx;
234}
235
236static void perf_unpin_context(struct perf_event_context *ctx)
237{
238 unsigned long flags;
239
240 spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx);
244}
245
246/*
247 * Add a event from the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held.
249 */
250static void
251list_add_event(struct perf_event *event, struct perf_event_context *ctx)
252{
253 struct perf_event *group_leader = event->group_leader;
254
255 /*
256 * Depending on whether it is a standalone or sibling event,
257 * add it straight to the context's event list, or to the group
258 * leader's sibling list:
259 */
260 if (group_leader == event)
261 list_add_tail(&event->group_entry, &ctx->group_list);
262 else {
263 list_add_tail(&event->group_entry, &group_leader->sibling_list);
264 group_leader->nr_siblings++;
265 }
266
267 list_add_rcu(&event->event_entry, &ctx->event_list);
268 ctx->nr_events++;
269 if (event->attr.inherit_stat)
270 ctx->nr_stat++;
271}
272
273/*
274 * Remove a event from the lists for its context.
275 * Must be called with ctx->mutex and ctx->lock held.
276 */
277static void
278list_del_event(struct perf_event *event, struct perf_event_context *ctx)
279{
280 struct perf_event *sibling, *tmp;
281
282 if (list_empty(&event->group_entry))
283 return;
284 ctx->nr_events--;
285 if (event->attr.inherit_stat)
286 ctx->nr_stat--;
287
288 list_del_init(&event->group_entry);
289 list_del_rcu(&event->event_entry);
290
291 if (event->group_leader != event)
292 event->group_leader->nr_siblings--;
293
294 /*
295 * If this was a group event with sibling events then
296 * upgrade the siblings to singleton events by adding them
297 * to the context list directly:
298 */
299 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
300
301 list_move_tail(&sibling->group_entry, &ctx->group_list);
302 sibling->group_leader = sibling;
303 }
304}
305
/*
 * Stop a single event: mark it no longer active, record when it stopped,
 * call its pmu's disable hook and update the active-event accounting of
 * the cpu context and the event context.
 */
306static void
307event_sched_out(struct perf_event *event,
308 struct perf_cpu_context *cpuctx,
309 struct perf_event_context *ctx)
310{
/* Only events in the ACTIVE state need to be stopped. */
311 if (event->state != PERF_EVENT_STATE_ACTIVE)
312 return;
313
314 event->state = PERF_EVENT_STATE_INACTIVE;
/* A pending disable request turns the event OFF rather than merely INACTIVE. */
315 if (event->pending_disable) {
316 event->pending_disable = 0;
317 event->state = PERF_EVENT_STATE_OFF;
318 }
319 event->tstamp_stopped = ctx->time;
320 event->pmu->disable(event);
321 event->oncpu = -1;
322
/* Software events are not counted in active_oncpu. */
323 if (!is_software_event(event))
324 cpuctx->active_oncpu--;
325 ctx->nr_active--;
/* Clear the exclusive flag once the exclusive event, or the last counted event, is gone. */
326 if (event->attr.exclusive || !cpuctx->active_oncpu)
327 cpuctx->exclusive = 0;
328}
329
330static void
331group_sched_out(struct perf_event *group_event,
332 struct perf_cpu_context *cpuctx,
333 struct perf_event_context *ctx)
334{
335 struct perf_event *event;
336
337 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
338 return;
339
340 event_sched_out(group_event, cpuctx, ctx);
341
342 /*
343 * Schedule out siblings (if any):
344 */
345 list_for_each_entry(event, &group_event->sibling_list, group_entry)
346 event_sched_out(event, cpuctx, ctx);
347
348 if (group_event->attr.exclusive)
349 cpuctx->exclusive = 0;
350}
351
352/*
353 * Cross CPU call to remove a performance event
354 *
355 * We disable the event on the hardware level first. After that we
356 * remove it from the context list.
357 */
358static void __perf_event_remove_from_context(void *info)
359{
360 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
361 struct perf_event *event = info;
362 struct perf_event_context *ctx = event->ctx;
363
364 /*
365 * If this is a task context, we need to check whether it is
366 * the current task context of this cpu. If not it has been
367 * scheduled out before the smp call arrived.
368 */
369 if (ctx->task && cpuctx->task_ctx != ctx)
370 return;
371
372 spin_lock(&ctx->lock);
373 /*
374 * Protect the list operation against NMI by disabling the
375 * events on a global level.
376 */
377 perf_disable();
378
379 event_sched_out(event, cpuctx, ctx);
380
381 list_del_event(event, ctx);
382
383 if (!ctx->task) {
384 /*
385 * Allow more per task events with respect to the
386 * reservation:
387 */
388 cpuctx->max_pertask =
389 min(perf_max_events - ctx->nr_events,
390 perf_max_events - perf_reserved_percpu);
391 }
392
393 perf_enable();
394 spin_unlock(&ctx->lock);
395}
396
397
398/*
399 * Remove the event from a task's (or a CPU's) list of events.
400 *
401 * Must be called with ctx->mutex held.
402 *
403 * CPU events are removed with a smp call. For task events we only
404 * call when the task is on a CPU.
405 *
406 * If event->ctx is a cloned context, callers must make sure that
407 * every task struct that event->ctx->task could possibly point to
408 * remains valid. This is OK when called from perf_release since
409 * that only calls us on the top-level context, which can't be a clone.
410 * When called from perf_event_exit_task, it's OK because the
411 * context has been detached from its task.
412 */
413static void perf_event_remove_from_context(struct perf_event *event)
414{
415 struct perf_event_context *ctx = event->ctx;
416 struct task_struct *task = ctx->task;
417
418 if (!task) {
419 /*
420 * Per cpu events are removed via an smp call and
421 * the removal is always sucessful.
422 */
423 smp_call_function_single(event->cpu,
424 __perf_event_remove_from_context,
425 event, 1);
426 return;
427 }
428
429retry:
430 task_oncpu_function_call(task, __perf_event_remove_from_context,
431 event);
432
433 spin_lock_irq(&ctx->lock);
434 /*
435 * If the context is active we need to retry the smp call.
436 */
437 if (ctx->nr_active && !list_empty(&event->group_entry)) {
438 spin_unlock_irq(&ctx->lock);
439 goto retry;
440 }
441
442 /*
443 * The lock prevents that this context is scheduled in so we
444 * can remove the event safely, if the call above did not
445 * succeed.
446 */
447 if (!list_empty(&event->group_entry)) {
448 list_del_event(event, ctx);
449 }
450 spin_unlock_irq(&ctx->lock);
451}
452
453static inline u64 perf_clock(void)
454{
455 return cpu_clock(smp_processor_id());
456}
457
458/*
459 * Update the record of the current time in a context.
460 */
461static void update_context_time(struct perf_event_context *ctx)
462{
463 u64 now = perf_clock();
464
465 ctx->time += now - ctx->timestamp;
466 ctx->timestamp = now;
467}
468
469/*
470 * Update the total_time_enabled and total_time_running fields for a event.
471 */
472static void update_event_times(struct perf_event *event)
473{
474 struct perf_event_context *ctx = event->ctx;
475 u64 run_end;
476
477 if (event->state < PERF_EVENT_STATE_INACTIVE ||
478 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
479 return;
480
481 event->total_time_enabled = ctx->time - event->tstamp_enabled;
482
483 if (event->state == PERF_EVENT_STATE_INACTIVE)
484 run_end = event->tstamp_stopped;
485 else
486 run_end = ctx->time;
487
488 event->total_time_running = run_end - event->tstamp_running;
489}
490
491/*
492 * Update total_time_enabled and total_time_running for all events in a group.
493 */
494static void update_group_times(struct perf_event *leader)
495{
496 struct perf_event *event;
497
498 update_event_times(leader);
499 list_for_each_entry(event, &leader->sibling_list, group_entry)
500 update_event_times(event);
501}
502
503/*
504 * Cross CPU call to disable a performance event
505 */
506static void __perf_event_disable(void *info)
507{
508 struct perf_event *event = info;
509 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
510 struct perf_event_context *ctx = event->ctx;
511
512 /*
513 * If this is a per-task event, need to check whether this
514 * event's task is the current task on this cpu.
515 */
516 if (ctx->task && cpuctx->task_ctx != ctx)
517 return;
518
519 spin_lock(&ctx->lock);
520
521 /*
522 * If the event is on, turn it off.
523 * If it is in error state, leave it in error state.
524 */
525 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
526 update_context_time(ctx);
527 update_group_times(event);
528 if (event == event->group_leader)
529 group_sched_out(event, cpuctx, ctx);
530 else
531 event_sched_out(event, cpuctx, ctx);
532 event->state = PERF_EVENT_STATE_OFF;
533 }
534
535 spin_unlock(&ctx->lock);
536}
537
538/*
539 * Disable a event.
540 *
541 * If event->ctx is a cloned context, callers must make sure that
542 * every task struct that event->ctx->task could possibly point to
543 * remains valid. This condition is satisifed when called through
544 * perf_event_for_each_child or perf_event_for_each because they
545 * hold the top-level event's child_mutex, so any descendant that
546 * goes to exit will block in sync_child_event.
547 * When called from perf_pending_event it's OK because event->ctx
548 * is the current context on this CPU and preemption is disabled,
549 * hence we can't get into perf_event_task_sched_out for this context.
550 */
551static void perf_event_disable(struct perf_event *event)
552{
553 struct perf_event_context *ctx = event->ctx;
554 struct task_struct *task = ctx->task;
555
556 if (!task) {
557 /*
558 * Disable the event on the cpu that it's on
559 */
560 smp_call_function_single(event->cpu, __perf_event_disable,
561 event, 1);
562 return;
563 }
564
565 retry:
566 task_oncpu_function_call(task, __perf_event_disable, event);
567
568 spin_lock_irq(&ctx->lock);
569 /*
570 * If the event is still active, we need to retry the cross-call.
571 */
572 if (event->state == PERF_EVENT_STATE_ACTIVE) {
573 spin_unlock_irq(&ctx->lock);
574 goto retry;
575 }
576
577 /*
578 * Since we have the lock this context can't be scheduled
579 * in, so we can change the state safely.
580 */
581 if (event->state == PERF_EVENT_STATE_INACTIVE) {
582 update_group_times(event);
583 event->state = PERF_EVENT_STATE_OFF;
584 }
585
586 spin_unlock_irq(&ctx->lock);
587}
588
589static int
590event_sched_in(struct perf_event *event,
591 struct perf_cpu_context *cpuctx,
592 struct perf_event_context *ctx,
593 int cpu)
594{
595 if (event->state <= PERF_EVENT_STATE_OFF)
596 return 0;
597
598 event->state = PERF_EVENT_STATE_ACTIVE;
599 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
600 /*
601 * The new state must be visible before we turn it on in the hardware:
602 */
603 smp_wmb();
604
605 if (event->pmu->enable(event)) {
606 event->state = PERF_EVENT_STATE_INACTIVE;
607 event->oncpu = -1;
608 return -EAGAIN;
609 }
610
611 event->tstamp_running += ctx->time - event->tstamp_stopped;
612
613 if (!is_software_event(event))
614 cpuctx->active_oncpu++;
615 ctx->nr_active++;
616
617 if (event->attr.exclusive)
618 cpuctx->exclusive = 1;
619
620 return 0;
621}
622
623static int
624group_sched_in(struct perf_event *group_event,
625 struct perf_cpu_context *cpuctx,
626 struct perf_event_context *ctx,
627 int cpu)
628{
629 struct perf_event *event, *partial_group;
630 int ret;
631
632 if (group_event->state == PERF_EVENT_STATE_OFF)
633 return 0;
634
635 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
636 if (ret)
637 return ret < 0 ? ret : 0;
638
639 if (event_sched_in(group_event, cpuctx, ctx, cpu))
640 return -EAGAIN;
641
642 /*
643 * Schedule in siblings as one group (if any):
644 */
645 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
646 if (event_sched_in(event, cpuctx, ctx, cpu)) {
647 partial_group = event;
648 goto group_error;
649 }
650 }
651
652 return 0;
653
654group_error:
655 /*
656 * Groups can be scheduled in as one unit only, so undo any
657 * partial group before returning:
658 */
659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
660 if (event == partial_group)
661 break;
662 event_sched_out(event, cpuctx, ctx);
663 }
664 event_sched_out(group_event, cpuctx, ctx);
665
666 return -EAGAIN;
667}
668
669/*
670 * Return 1 for a group consisting entirely of software events,
671 * 0 if the group contains any hardware events.
672 */
673static int is_software_only_group(struct perf_event *leader)
674{
675 struct perf_event *event;
676
677 if (!is_software_event(leader))
678 return 0;
679
680 list_for_each_entry(event, &leader->sibling_list, group_entry)
681 if (!is_software_event(event))
682 return 0;
683
684 return 1;
685}
686
687/*
688 * Work out whether we can put this event group on the CPU now.
689 */
690static int group_can_go_on(struct perf_event *event,
691 struct perf_cpu_context *cpuctx,
692 int can_add_hw)
693{
694 /*
695 * Groups consisting entirely of software events can always go on.
696 */
697 if (is_software_only_group(event))
698 return 1;
699 /*
700 * If an exclusive group is already on, no other hardware
701 * events can go on.
702 */
703 if (cpuctx->exclusive)
704 return 0;
705 /*
706 * If this group is exclusive and there are already
707 * events on the CPU, it can't go on.
708 */
709 if (event->attr.exclusive && cpuctx->active_oncpu)
710 return 0;
711 /*
712 * Otherwise, try to add it if all previous groups were able
713 * to go on.
714 */
715 return can_add_hw;
716}
717
718static void add_event_to_ctx(struct perf_event *event,
719 struct perf_event_context *ctx)
720{
721 list_add_event(event, ctx);
722 event->tstamp_enabled = ctx->time;
723 event->tstamp_running = ctx->time;
724 event->tstamp_stopped = ctx->time;
725}
726
727/*
728 * Cross CPU call to install and enable a performance event
729 *
730 * Must be called with ctx->mutex held
731 */
732static void __perf_install_in_context(void *info)
733{
734 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
735 struct perf_event *event = info;
736 struct perf_event_context *ctx = event->ctx;
737 struct perf_event *leader = event->group_leader;
738 int cpu = smp_processor_id();
739 int err;
740
741 /*
742 * If this is a task context, we need to check whether it is
743 * the current task context of this cpu. If not it has been
744 * scheduled out before the smp call arrived.
745 * Or possibly this is the right context but it isn't
746 * on this cpu because it had no events.
747 */
748 if (ctx->task && cpuctx->task_ctx != ctx) {
749 if (cpuctx->task_ctx || ctx->task != current)
750 return;
751 cpuctx->task_ctx = ctx;
752 }
753
754 spin_lock(&ctx->lock);
755 ctx->is_active = 1;
756 update_context_time(ctx);
757
758 /*
759 * Protect the list operation against NMI by disabling the
760 * events on a global level. NOP for non NMI based events.
761 */
762 perf_disable();
763
764 add_event_to_ctx(event, ctx);
765
766 /*
767 * Don't put the event on if it is disabled or if
768 * it is in a group and the group isn't on.
769 */
770 if (event->state != PERF_EVENT_STATE_INACTIVE ||
771 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
772 goto unlock;
773
774 /*
775 * An exclusive event can't go on if there are already active
776 * hardware events, and no hardware event can go on if there
777 * is already an exclusive event on.
778 */
779 if (!group_can_go_on(event, cpuctx, 1))
780 err = -EEXIST;
781 else
782 err = event_sched_in(event, cpuctx, ctx, cpu);
783
784 if (err) {
785 /*
786 * This event couldn't go on. If it is in a group
787 * then we have to pull the whole group off.
788 * If the event group is pinned then put it in error state.
789 */
790 if (leader != event)
791 group_sched_out(leader, cpuctx, ctx);
792 if (leader->attr.pinned) {
793 update_group_times(leader);
794 leader->state = PERF_EVENT_STATE_ERROR;
795 }
796 }
797
798 if (!err && !ctx->task && cpuctx->max_pertask)
799 cpuctx->max_pertask--;
800
801 unlock:
802 perf_enable();
803
804 spin_unlock(&ctx->lock);
805}
806
807/*
808 * Attach a performance event to a context
809 *
810 * First we add the event to the list with the hardware enable bit
811 * in event->hw_config cleared.
812 *
813 * If the event is attached to a task which is on a CPU we use a smp
814 * call to enable it in the task context. The task might have been
815 * scheduled away, but we check this in the smp call again.
816 *
817 * Must be called with ctx->mutex held.
818 */
819static void
820perf_install_in_context(struct perf_event_context *ctx,
821 struct perf_event *event,
822 int cpu)
823{
824 struct task_struct *task = ctx->task;
825
826 if (!task) {
827 /*
828 * Per cpu events are installed via an smp call and
829 * the install is always sucessful.
830 */
831 smp_call_function_single(cpu, __perf_install_in_context,
832 event, 1);
833 return;
834 }
835
836retry:
837 task_oncpu_function_call(task, __perf_install_in_context,
838 event);
839
840 spin_lock_irq(&ctx->lock);
841 /*
842 * we need to retry the smp call.
843 */
844 if (ctx->is_active && list_empty(&event->group_entry)) {
845 spin_unlock_irq(&ctx->lock);
846 goto retry;
847 }
848
849 /*
850 * The lock prevents that this context is scheduled in so we
851 * can add the event safely, if it the call above did not
852 * succeed.
853 */
854 if (list_empty(&event->group_entry))
855 add_event_to_ctx(event, ctx);
856 spin_unlock_irq(&ctx->lock);
857}
858
859/*
860 * Put a event into inactive state and update time fields.
861 * Enabling the leader of a group effectively enables all
862 * the group members that aren't explicitly disabled, so we
863 * have to update their ->tstamp_enabled also.
864 * Note: this works for group members as well as group leaders
865 * since the non-leader members' sibling_lists will be empty.
866 */
867static void __perf_event_mark_enabled(struct perf_event *event,
868 struct perf_event_context *ctx)
869{
870 struct perf_event *sub;
871
872 event->state = PERF_EVENT_STATE_INACTIVE;
873 event->tstamp_enabled = ctx->time - event->total_time_enabled;
874 list_for_each_entry(sub, &event->sibling_list, group_entry)
875 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
876 sub->tstamp_enabled =
877 ctx->time - sub->total_time_enabled;
878}
879
880/*
881 * Cross CPU call to enable a performance event
882 */
883static void __perf_event_enable(void *info)
884{
885 struct perf_event *event = info;
886 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
887 struct perf_event_context *ctx = event->ctx;
888 struct perf_event *leader = event->group_leader;
889 int err;
890
891 /*
892 * If this is a per-task event, need to check whether this
893 * event's task is the current task on this cpu.
894 */
895 if (ctx->task && cpuctx->task_ctx != ctx) {
896 if (cpuctx->task_ctx || ctx->task != current)
897 return;
898 cpuctx->task_ctx = ctx;
899 }
900
901 spin_lock(&ctx->lock);
902 ctx->is_active = 1;
903 update_context_time(ctx);
904
905 if (event->state >= PERF_EVENT_STATE_INACTIVE)
906 goto unlock;
907 __perf_event_mark_enabled(event, ctx);
908
909 /*
910 * If the event is in a group and isn't the group leader,
911 * then don't put it on unless the group is on.
912 */
913 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
914 goto unlock;
915
916 if (!group_can_go_on(event, cpuctx, 1)) {
917 err = -EEXIST;
918 } else {
919 perf_disable();
920 if (event == leader)
921 err = group_sched_in(event, cpuctx, ctx,
922 smp_processor_id());
923 else
924 err = event_sched_in(event, cpuctx, ctx,
925 smp_processor_id());
926 perf_enable();
927 }
928
929 if (err) {
930 /*
931 * If this event can't go on and it's part of a
932 * group, then the whole group has to come off.
933 */
934 if (leader != event)
935 group_sched_out(leader, cpuctx, ctx);
936 if (leader->attr.pinned) {
937 update_group_times(leader);
938 leader->state = PERF_EVENT_STATE_ERROR;
939 }
940 }
941
942 unlock:
943 spin_unlock(&ctx->lock);
944}
945
946/*
947 * Enable a event.
948 *
949 * If event->ctx is a cloned context, callers must make sure that
950 * every task struct that event->ctx->task could possibly point to
951 * remains valid. This condition is satisfied when called through
952 * perf_event_for_each_child or perf_event_for_each as described
953 * for perf_event_disable.
954 */
955static void perf_event_enable(struct perf_event *event)
956{
957 struct perf_event_context *ctx = event->ctx;
958 struct task_struct *task = ctx->task;
959
960 if (!task) {
961 /*
962 * Enable the event on the cpu that it's on
963 */
964 smp_call_function_single(event->cpu, __perf_event_enable,
965 event, 1);
966 return;
967 }
968
969 spin_lock_irq(&ctx->lock);
970 if (event->state >= PERF_EVENT_STATE_INACTIVE)
971 goto out;
972
973 /*
974 * If the event is in error state, clear that first.
975 * That way, if we see the event in error state below, we
976 * know that it has gone back into error state, as distinct
977 * from the task having been scheduled away before the
978 * cross-call arrived.
979 */
980 if (event->state == PERF_EVENT_STATE_ERROR)
981 event->state = PERF_EVENT_STATE_OFF;
982
983 retry:
984 spin_unlock_irq(&ctx->lock);
985 task_oncpu_function_call(task, __perf_event_enable, event);
986
987 spin_lock_irq(&ctx->lock);
988
989 /*
990 * If the context is active and the event is still off,
991 * we need to retry the cross-call.
992 */
993 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
994 goto retry;
995
996 /*
997 * Since we have the lock this context can't be scheduled
998 * in, so we can change the state safely.
999 */
1000 if (event->state == PERF_EVENT_STATE_OFF)
1001 __perf_event_mark_enabled(event, ctx);
1002
1003 out:
1004 spin_unlock_irq(&ctx->lock);
1005}
1006
1007static int perf_event_refresh(struct perf_event *event, int refresh)
1008{
1009 /*
1010 * not supported on inherited events
1011 */
1012 if (event->attr.inherit)
1013 return -EINVAL;
1014
1015 atomic_add(refresh, &event->event_limit);
1016 perf_event_enable(event);
1017
1018 return 0;
1019}
1020
1021void __perf_event_sched_out(struct perf_event_context *ctx,
1022 struct perf_cpu_context *cpuctx)
1023{
1024 struct perf_event *event;
1025
1026 spin_lock(&ctx->lock);
1027 ctx->is_active = 0;
1028 if (likely(!ctx->nr_events))
1029 goto out;
1030 update_context_time(ctx);
1031
1032 perf_disable();
1033 if (ctx->nr_active) {
1034 list_for_each_entry(event, &ctx->group_list, group_entry) {
1035 if (event != event->group_leader)
1036 event_sched_out(event, cpuctx, ctx);
1037 else
1038 group_sched_out(event, cpuctx, ctx);
1039 }
1040 }
1041 perf_enable();
1042 out:
1043 spin_unlock(&ctx->lock);
1044}
1045
1046/*
1047 * Test whether two contexts are equivalent, i.e. whether they
1048 * have both been cloned from the same version of the same context
1049 * and they both have the same number of enabled events.
1050 * If the number of enabled events is the same, then the set
1051 * of enabled events should be the same, because these are both
1052 * inherited contexts, therefore we can't access individual events
1053 * in them directly with an fd; we can only enable/disable all
1054 * events via prctl, or enable/disable all events in a family
1055 * via ioctl, which will have the same effect on both contexts.
1056 */
1057static int context_equiv(struct perf_event_context *ctx1,
1058 struct perf_event_context *ctx2)
1059{
1060 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1061 && ctx1->parent_gen == ctx2->parent_gen
1062 && !ctx1->pin_count && !ctx2->pin_count;
1063}
1064
1065static void __perf_event_read(void *event);
1066
1067static void __perf_event_sync_stat(struct perf_event *event,
1068 struct perf_event *next_event)
1069{
1070 u64 value;
1071
1072 if (!event->attr.inherit_stat)
1073 return;
1074
1075 /*
1076 * Update the event value, we cannot use perf_event_read()
1077 * because we're in the middle of a context switch and have IRQs
1078 * disabled, which upsets smp_call_function_single(), however
1079 * we know the event must be on the current CPU, therefore we
1080 * don't need to use it.
1081 */
1082 switch (event->state) {
1083 case PERF_EVENT_STATE_ACTIVE:
1084 __perf_event_read(event);
1085 break;
1086
1087 case PERF_EVENT_STATE_INACTIVE:
1088 update_event_times(event);
1089 break;
1090
1091 default:
1092 break;
1093 }
1094
1095 /*
1096 * In order to keep per-task stats reliable we need to flip the event
1097 * values when we flip the contexts.
1098 */
1099 value = atomic64_read(&next_event->count);
1100 value = atomic64_xchg(&event->count, value);
1101 atomic64_set(&next_event->count, value);
1102
1103 swap(event->total_time_enabled, next_event->total_time_enabled);
1104 swap(event->total_time_running, next_event->total_time_running);
1105
1106 /*
1107 * Since we swizzled the values, update the user visible data too.
1108 */
1109 perf_event_update_userpage(event);
1110 perf_event_update_userpage(next_event);
1111}
1112
/*
 * list_next_entry - get the entry following 'pos' on its list.
 * @pos:    pointer to the current entry (any pointer expression).
 * @member: name of the list link field inside the entry type.
 *
 * 'pos' is parenthesized at every use so that an expression argument
 * (e.g. "base + i") is dereferenced as a whole rather than being
 * split by operator precedence.
 */
#define list_next_entry(pos, member)					\
	list_entry((pos)->member.next, typeof(*(pos)), member)
1115
1116static void perf_event_sync_stat(struct perf_event_context *ctx,
1117 struct perf_event_context *next_ctx)
1118{
1119 struct perf_event *event, *next_event;
1120
1121 if (!ctx->nr_stat)
1122 return;
1123
1124 event = list_first_entry(&ctx->event_list,
1125 struct perf_event, event_entry);
1126
1127 next_event = list_first_entry(&next_ctx->event_list,
1128 struct perf_event, event_entry);
1129
1130 while (&event->event_entry != &ctx->event_list &&
1131 &next_event->event_entry != &next_ctx->event_list) {
1132
1133 __perf_event_sync_stat(event, next_event);
1134
1135 event = list_next_entry(event, event_entry);
1136 next_event = list_next_entry(next_event, event_entry);
1137 }
1138}
1139
1140/*
1141 * Called from scheduler to remove the events of the current task,
1142 * with interrupts disabled.
1143 *
1144 * We stop each event and update the event value in event->count.
1145 *
1146 * This does not protect us against NMI, but disable()
1147 * sets the disabled bit in the control field of event _before_
1148 * accessing the event control register. If a NMI hits, then it will
1149 * not restart the event.
1150 */
1151void perf_event_task_sched_out(struct task_struct *task,
1152 struct task_struct *next, int cpu)
1153{
1154 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1155 struct perf_event_context *ctx = task->perf_event_ctxp;
1156 struct perf_event_context *next_ctx;
1157 struct perf_event_context *parent;
1158 struct pt_regs *regs;
1159 int do_switch = 1;
1160
1161 regs = task_pt_regs(task);
1162 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1163
1164 if (likely(!ctx || !cpuctx->task_ctx))
1165 return;
1166
1167 update_context_time(ctx);
1168
1169 rcu_read_lock();
1170 parent = rcu_dereference(ctx->parent_ctx);
1171 next_ctx = next->perf_event_ctxp;
1172 if (parent && next_ctx &&
1173 rcu_dereference(next_ctx->parent_ctx) == parent) {
1174 /*
1175 * Looks like the two contexts are clones, so we might be
1176 * able to optimize the context switch. We lock both
1177 * contexts and check that they are clones under the
1178 * lock (including re-checking that neither has been
1179 * uncloned in the meantime). It doesn't matter which
1180 * order we take the locks because no other cpu could
1181 * be trying to lock both of these tasks.
1182 */
1183 spin_lock(&ctx->lock);
1184 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1185 if (context_equiv(ctx, next_ctx)) {
1186 /*
1187 * XXX do we need a memory barrier of sorts
1188 * wrt to rcu_dereference() of perf_event_ctxp
1189 */
1190 task->perf_event_ctxp = next_ctx;
1191 next->perf_event_ctxp = ctx;
1192 ctx->task = next;
1193 next_ctx->task = task;
1194 do_switch = 0;
1195
1196 perf_event_sync_stat(ctx, next_ctx);
1197 }
1198 spin_unlock(&next_ctx->lock);
1199 spin_unlock(&ctx->lock);
1200 }
1201 rcu_read_unlock();
1202
1203 if (do_switch) {
1204 __perf_event_sched_out(ctx, cpuctx);
1205 cpuctx->task_ctx = NULL;
1206 }
1207}
1208
1209/*
1210 * Called with IRQs disabled
1211 */
1212static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1213{
1214 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1215
1216 if (!cpuctx->task_ctx)
1217 return;
1218
1219 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1220 return;
1221
1222 __perf_event_sched_out(ctx, cpuctx);
1223 cpuctx->task_ctx = NULL;
1224}
1225
1226/*
1227 * Called with IRQs disabled
1228 */
1229static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1230{
1231 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1232}
1233
1234static void
1235__perf_event_sched_in(struct perf_event_context *ctx,
1236 struct perf_cpu_context *cpuctx, int cpu)
1237{
1238 struct perf_event *event;
1239 int can_add_hw = 1;
1240
1241 spin_lock(&ctx->lock);
1242 ctx->is_active = 1;
1243 if (likely(!ctx->nr_events))
1244 goto out;
1245
1246 ctx->timestamp = perf_clock();
1247
1248 perf_disable();
1249
1250 /*
1251 * First go through the list and put on any pinned groups
1252 * in order to give them the best chance of going on.
1253 */
1254 list_for_each_entry(event, &ctx->group_list, group_entry) {
1255 if (event->state <= PERF_EVENT_STATE_OFF ||
1256 !event->attr.pinned)
1257 continue;
1258 if (event->cpu != -1 && event->cpu != cpu)
1259 continue;
1260
1261 if (event != event->group_leader)
1262 event_sched_in(event, cpuctx, ctx, cpu);
1263 else {
1264 if (group_can_go_on(event, cpuctx, 1))
1265 group_sched_in(event, cpuctx, ctx, cpu);
1266 }
1267
1268 /*
1269 * If this pinned group hasn't been scheduled,
1270 * put it in error state.
1271 */
1272 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1273 update_group_times(event);
1274 event->state = PERF_EVENT_STATE_ERROR;
1275 }
1276 }
1277
1278 list_for_each_entry(event, &ctx->group_list, group_entry) {
1279 /*
1280 * Ignore events in OFF or ERROR state, and
1281 * ignore pinned events since we did them already.
1282 */
1283 if (event->state <= PERF_EVENT_STATE_OFF ||
1284 event->attr.pinned)
1285 continue;
1286
1287 /*
1288 * Listen to the 'cpu' scheduling filter constraint
1289 * of events:
1290 */
1291 if (event->cpu != -1 && event->cpu != cpu)
1292 continue;
1293
1294 if (event != event->group_leader) {
1295 if (event_sched_in(event, cpuctx, ctx, cpu))
1296 can_add_hw = 0;
1297 } else {
1298 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1299 if (group_sched_in(event, cpuctx, ctx, cpu))
1300 can_add_hw = 0;
1301 }
1302 }
1303 }
1304 perf_enable();
1305 out:
1306 spin_unlock(&ctx->lock);
1307}
1308
1309/*
1310 * Called from scheduler to add the events of the current task
1311 * with interrupts disabled.
1312 *
1313 * We restore the event value and then enable it.
1314 *
1315 * This does not protect us against NMI, but enable()
1316 * sets the enabled bit in the control field of event _before_
1317 * accessing the event control register. If a NMI hits, then it will
1318 * keep the event running.
1319 */
1320void perf_event_task_sched_in(struct task_struct *task, int cpu)
1321{
1322 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1323 struct perf_event_context *ctx = task->perf_event_ctxp;
1324
1325 if (likely(!ctx))
1326 return;
1327 if (cpuctx->task_ctx == ctx)
1328 return;
1329 __perf_event_sched_in(ctx, cpuctx, cpu);
1330 cpuctx->task_ctx = ctx;
1331}
1332
1333static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1334{
1335 struct perf_event_context *ctx = &cpuctx->ctx;
1336
1337 __perf_event_sched_in(ctx, cpuctx, cpu);
1338}
1339
1340#define MAX_INTERRUPTS (~0ULL)
1341
1342static void perf_log_throttle(struct perf_event *event, int enable);
1343
1344static void perf_adjust_period(struct perf_event *event, u64 events)
1345{
1346 struct hw_perf_event *hwc = &event->hw;
1347 u64 period, sample_period;
1348 s64 delta;
1349
1350 events *= hwc->sample_period;
1351 period = div64_u64(events, event->attr.sample_freq);
1352
1353 delta = (s64)(period - hwc->sample_period);
1354 delta = (delta + 7) / 8; /* low pass filter */
1355
1356 sample_period = hwc->sample_period + delta;
1357
1358 if (!sample_period)
1359 sample_period = 1;
1360
1361 hwc->sample_period = sample_period;
1362}
1363
1364static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1365{
1366 struct perf_event *event;
1367 struct hw_perf_event *hwc;
1368 u64 interrupts, freq;
1369
1370 spin_lock(&ctx->lock);
1371 list_for_each_entry(event, &ctx->group_list, group_entry) {
1372 if (event->state != PERF_EVENT_STATE_ACTIVE)
1373 continue;
1374
1375 hwc = &event->hw;
1376
1377 interrupts = hwc->interrupts;
1378 hwc->interrupts = 0;
1379
1380 /*
1381 * unthrottle events on the tick
1382 */
1383 if (interrupts == MAX_INTERRUPTS) {
1384 perf_log_throttle(event, 1);
1385 event->pmu->unthrottle(event);
1386 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1387 }
1388
1389 if (!event->attr.freq || !event->attr.sample_freq)
1390 continue;
1391
1392 /*
1393 * if the specified freq < HZ then we need to skip ticks
1394 */
1395 if (event->attr.sample_freq < HZ) {
1396 freq = event->attr.sample_freq;
1397
1398 hwc->freq_count += freq;
1399 hwc->freq_interrupts += interrupts;
1400
1401 if (hwc->freq_count < HZ)
1402 continue;
1403
1404 interrupts = hwc->freq_interrupts;
1405 hwc->freq_interrupts = 0;
1406 hwc->freq_count -= HZ;
1407 } else
1408 freq = HZ;
1409
1410 perf_adjust_period(event, freq * interrupts);
1411
1412 /*
1413 * In order to avoid being stalled by an (accidental) huge
1414 * sample period, force reset the sample period if we didn't
1415 * get any events in this freq period.
1416 */
1417 if (!interrupts) {
1418 perf_disable();
1419 event->pmu->disable(event);
1420 atomic64_set(&hwc->period_left, 0);
1421 event->pmu->enable(event);
1422 perf_enable();
1423 }
1424 }
1425 spin_unlock(&ctx->lock);
1426}
1427
1428/*
1429 * Round-robin a context's events:
1430 */
1431static void rotate_ctx(struct perf_event_context *ctx)
1432{
1433 struct perf_event *event;
1434
1435 if (!ctx->nr_events)
1436 return;
1437
1438 spin_lock(&ctx->lock);
1439 /*
1440 * Rotate the first entry last (works just fine for group events too):
1441 */
1442 perf_disable();
1443 list_for_each_entry(event, &ctx->group_list, group_entry) {
1444 list_move_tail(&event->group_entry, &ctx->group_list);
1445 break;
1446 }
1447 perf_enable();
1448
1449 spin_unlock(&ctx->lock);
1450}
1451
1452void perf_event_task_tick(struct task_struct *curr, int cpu)
1453{
1454 struct perf_cpu_context *cpuctx;
1455 struct perf_event_context *ctx;
1456
1457 if (!atomic_read(&nr_events))
1458 return;
1459
1460 cpuctx = &per_cpu(perf_cpu_context, cpu);
1461 ctx = curr->perf_event_ctxp;
1462
1463 perf_ctx_adjust_freq(&cpuctx->ctx);
1464 if (ctx)
1465 perf_ctx_adjust_freq(ctx);
1466
1467 perf_event_cpu_sched_out(cpuctx);
1468 if (ctx)
1469 __perf_event_task_sched_out(ctx);
1470
1471 rotate_ctx(&cpuctx->ctx);
1472 if (ctx)
1473 rotate_ctx(ctx);
1474
1475 perf_event_cpu_sched_in(cpuctx, cpu);
1476 if (ctx)
1477 perf_event_task_sched_in(curr, cpu);
1478}
1479
1480/*
1481 * Enable all of a task's events that have been marked enable-on-exec.
1482 * This expects task == current.
1483 */
1484static void perf_event_enable_on_exec(struct task_struct *task)
1485{
1486 struct perf_event_context *ctx;
1487 struct perf_event *event;
1488 unsigned long flags;
1489 int enabled = 0;
1490
1491 local_irq_save(flags);
1492 ctx = task->perf_event_ctxp;
1493 if (!ctx || !ctx->nr_events)
1494 goto out;
1495
1496 __perf_event_task_sched_out(ctx);
1497
1498 spin_lock(&ctx->lock);
1499
1500 list_for_each_entry(event, &ctx->group_list, group_entry) {
1501 if (!event->attr.enable_on_exec)
1502 continue;
1503 event->attr.enable_on_exec = 0;
1504 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1505 continue;
1506 __perf_event_mark_enabled(event, ctx);
1507 enabled = 1;
1508 }
1509
1510 /*
1511 * Unclone this context if we enabled any event.
1512 */
1513 if (enabled)
1514 unclone_ctx(ctx);
1515
1516 spin_unlock(&ctx->lock);
1517
1518 perf_event_task_sched_in(task, smp_processor_id());
1519 out:
1520 local_irq_restore(flags);
1521}
1522
1523/*
1524 * Cross CPU call to read the hardware event
1525 */
1526static void __perf_event_read(void *info)
1527{
1528 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1529 struct perf_event *event = info;
1530 struct perf_event_context *ctx = event->ctx;
1531 unsigned long flags;
1532
1533 /*
1534 * If this is a task context, we need to check whether it is
1535 * the current task context of this cpu. If not it has been
1536 * scheduled out before the smp call arrived. In that case
1537 * event->count would have been updated to a recent sample
1538 * when the event was scheduled out.
1539 */
1540 if (ctx->task && cpuctx->task_ctx != ctx)
1541 return;
1542
1543 local_irq_save(flags);
1544 if (ctx->is_active)
1545 update_context_time(ctx);
1546 event->pmu->read(event);
1547 update_event_times(event);
1548 local_irq_restore(flags);
1549}
1550
1551static u64 perf_event_read(struct perf_event *event)
1552{
1553 /*
1554 * If event is enabled and currently active on a CPU, update the
1555 * value in the event structure:
1556 */
1557 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1558 smp_call_function_single(event->oncpu,
1559 __perf_event_read, event, 1);
1560 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1561 update_event_times(event);
1562 }
1563
1564 return atomic64_read(&event->count);
1565}
1566
1567/*
1568 * Initialize the perf_event context in a task_struct:
1569 */
1570static void
1571__perf_event_init_context(struct perf_event_context *ctx,
1572 struct task_struct *task)
1573{
1574 memset(ctx, 0, sizeof(*ctx));
1575 spin_lock_init(&ctx->lock);
1576 mutex_init(&ctx->mutex);
1577 INIT_LIST_HEAD(&ctx->group_list);
1578 INIT_LIST_HEAD(&ctx->event_list);
1579 atomic_set(&ctx->refcount, 1);
1580 ctx->task = task;
1581}
1582
1583static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1584{
1585 struct perf_event_context *ctx;
1586 struct perf_cpu_context *cpuctx;
1587 struct task_struct *task;
1588 unsigned long flags;
1589 int err;
1590
1591 /*
1592 * If cpu is not a wildcard then this is a percpu event:
1593 */
1594 if (cpu != -1) {
1595 /* Must be root to operate on a CPU event: */
1596 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1597 return ERR_PTR(-EACCES);
1598
1599 if (cpu < 0 || cpu > num_possible_cpus())
1600 return ERR_PTR(-EINVAL);
1601
1602 /*
1603 * We could be clever and allow to attach a event to an
1604 * offline CPU and activate it when the CPU comes up, but
1605 * that's for later.
1606 */
1607 if (!cpu_isset(cpu, cpu_online_map))
1608 return ERR_PTR(-ENODEV);
1609
1610 cpuctx = &per_cpu(perf_cpu_context, cpu);
1611 ctx = &cpuctx->ctx;
1612 get_ctx(ctx);
1613
1614 return ctx;
1615 }
1616
1617 rcu_read_lock();
1618 if (!pid)
1619 task = current;
1620 else
1621 task = find_task_by_vpid(pid);
1622 if (task)
1623 get_task_struct(task);
1624 rcu_read_unlock();
1625
1626 if (!task)
1627 return ERR_PTR(-ESRCH);
1628
1629 /*
1630 * Can't attach events to a dying task.
1631 */
1632 err = -ESRCH;
1633 if (task->flags & PF_EXITING)
1634 goto errout;
1635
1636 /* Reuse ptrace permission checks for now. */
1637 err = -EACCES;
1638 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1639 goto errout;
1640
1641 retry:
1642 ctx = perf_lock_task_context(task, &flags);
1643 if (ctx) {
1644 unclone_ctx(ctx);
1645 spin_unlock_irqrestore(&ctx->lock, flags);
1646 }
1647
1648 if (!ctx) {
1649 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1650 err = -ENOMEM;
1651 if (!ctx)
1652 goto errout;
1653 __perf_event_init_context(ctx, task);
1654 get_ctx(ctx);
1655 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1656 /*
1657 * We raced with some other task; use
1658 * the context they set.
1659 */
1660 kfree(ctx);
1661 goto retry;
1662 }
1663 get_task_struct(task);
1664 }
1665
1666 put_task_struct(task);
1667 return ctx;
1668
1669 errout:
1670 put_task_struct(task);
1671 return ERR_PTR(err);
1672}
1673
1674static void free_event_rcu(struct rcu_head *head)
1675{
1676 struct perf_event *event;
1677
1678 event = container_of(head, struct perf_event, rcu_head);
1679 if (event->ns)
1680 put_pid_ns(event->ns);
1681 kfree(event);
1682}
1683
1684static void perf_pending_sync(struct perf_event *event);
1685
1686static void free_event(struct perf_event *event)
1687{
1688 perf_pending_sync(event);
1689
1690 if (!event->parent) {
1691 atomic_dec(&nr_events);
1692 if (event->attr.mmap)
1693 atomic_dec(&nr_mmap_events);
1694 if (event->attr.comm)
1695 atomic_dec(&nr_comm_events);
1696 if (event->attr.task)
1697 atomic_dec(&nr_task_events);
1698 }
1699
1700 if (event->output) {
1701 fput(event->output->filp);
1702 event->output = NULL;
1703 }
1704
1705 if (event->destroy)
1706 event->destroy(event);
1707
1708 put_ctx(event->ctx);
1709 call_rcu(&event->rcu_head, free_event_rcu);
1710}
1711
1712/*
1713 * Called when the last reference to the file is gone.
1714 */
1715static int perf_release(struct inode *inode, struct file *file)
1716{
1717 struct perf_event *event = file->private_data;
1718 struct perf_event_context *ctx = event->ctx;
1719
1720 file->private_data = NULL;
1721
1722 WARN_ON_ONCE(ctx->parent_ctx);
1723 mutex_lock(&ctx->mutex);
1724 perf_event_remove_from_context(event);
1725 mutex_unlock(&ctx->mutex);
1726
1727 mutex_lock(&event->owner->perf_event_mutex);
1728 list_del_init(&event->owner_entry);
1729 mutex_unlock(&event->owner->perf_event_mutex);
1730 put_task_struct(event->owner);
1731
1732 free_event(event);
1733
1734 return 0;
1735}
1736
1737static int perf_event_read_size(struct perf_event *event)
1738{
1739 int entry = sizeof(u64); /* value */
1740 int size = 0;
1741 int nr = 1;
1742
1743 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1744 size += sizeof(u64);
1745
1746 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1747 size += sizeof(u64);
1748
1749 if (event->attr.read_format & PERF_FORMAT_ID)
1750 entry += sizeof(u64);
1751
1752 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1753 nr += event->group_leader->nr_siblings;
1754 size += sizeof(u64);
1755 }
1756
1757 size += entry * nr;
1758
1759 return size;
1760}
1761
1762static u64 perf_event_read_value(struct perf_event *event)
1763{
1764 struct perf_event *child;
1765 u64 total = 0;
1766
1767 total += perf_event_read(event);
1768 list_for_each_entry(child, &event->child_list, child_list)
1769 total += perf_event_read(child);
1770
1771 return total;
1772}
1773
1774static int perf_event_read_entry(struct perf_event *event,
1775 u64 read_format, char __user *buf)
1776{
1777 int n = 0, count = 0;
1778 u64 values[2];
1779
1780 values[n++] = perf_event_read_value(event);
1781 if (read_format & PERF_FORMAT_ID)
1782 values[n++] = primary_event_id(event);
1783
1784 count = n * sizeof(u64);
1785
1786 if (copy_to_user(buf, values, count))
1787 return -EFAULT;
1788
1789 return count;
1790}
1791
1792static int perf_event_read_group(struct perf_event *event,
1793 u64 read_format, char __user *buf)
1794{
1795 struct perf_event *leader = event->group_leader, *sub;
1796 int n = 0, size = 0, err = -EFAULT;
1797 u64 values[3];
1798
1799 values[n++] = 1 + leader->nr_siblings;
1800 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1801 values[n++] = leader->total_time_enabled +
1802 atomic64_read(&leader->child_total_time_enabled);
1803 }
1804 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1805 values[n++] = leader->total_time_running +
1806 atomic64_read(&leader->child_total_time_running);
1807 }
1808
1809 size = n * sizeof(u64);
1810
1811 if (copy_to_user(buf, values, size))
1812 return -EFAULT;
1813
1814 err = perf_event_read_entry(leader, read_format, buf + size);
1815 if (err < 0)
1816 return err;
1817
1818 size += err;
1819
1820 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1821 err = perf_event_read_entry(sub, read_format,
1822 buf + size);
1823 if (err < 0)
1824 return err;
1825
1826 size += err;
1827 }
1828
1829 return size;
1830}
1831
1832static int perf_event_read_one(struct perf_event *event,
1833 u64 read_format, char __user *buf)
1834{
1835 u64 values[4];
1836 int n = 0;
1837
1838 values[n++] = perf_event_read_value(event);
1839 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1840 values[n++] = event->total_time_enabled +
1841 atomic64_read(&event->child_total_time_enabled);
1842 }
1843 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1844 values[n++] = event->total_time_running +
1845 atomic64_read(&event->child_total_time_running);
1846 }
1847 if (read_format & PERF_FORMAT_ID)
1848 values[n++] = primary_event_id(event);
1849
1850 if (copy_to_user(buf, values, n * sizeof(u64)))
1851 return -EFAULT;
1852
1853 return n * sizeof(u64);
1854}
1855
1856/*
1857 * Read the performance event - simple non blocking version for now
1858 */
1859static ssize_t
1860perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861{
1862 u64 read_format = event->attr.read_format;
1863 int ret;
1864
1865 /*
1866 * Return end-of-file for a read on a event that is in
1867 * error state (i.e. because it was pinned but it couldn't be
1868 * scheduled on to the CPU at some point).
1869 */
1870 if (event->state == PERF_EVENT_STATE_ERROR)
1871 return 0;
1872
1873 if (count < perf_event_read_size(event))
1874 return -ENOSPC;
1875
1876 WARN_ON_ONCE(event->ctx->parent_ctx);
1877 mutex_lock(&event->child_mutex);
1878 if (read_format & PERF_FORMAT_GROUP)
1879 ret = perf_event_read_group(event, read_format, buf);
1880 else
1881 ret = perf_event_read_one(event, read_format, buf);
1882 mutex_unlock(&event->child_mutex);
1883
1884 return ret;
1885}
1886
1887static ssize_t
1888perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1889{
1890 struct perf_event *event = file->private_data;
1891
1892 return perf_read_hw(event, buf, count);
1893}
1894
1895static unsigned int perf_poll(struct file *file, poll_table *wait)
1896{
1897 struct perf_event *event = file->private_data;
1898 struct perf_mmap_data *data;
1899 unsigned int events = POLL_HUP;
1900
1901 rcu_read_lock();
1902 data = rcu_dereference(event->data);
1903 if (data)
1904 events = atomic_xchg(&data->poll, 0);
1905 rcu_read_unlock();
1906
1907 poll_wait(file, &event->waitq, wait);
1908
1909 return events;
1910}
1911
1912static void perf_event_reset(struct perf_event *event)
1913{
1914 (void)perf_event_read(event);
1915 atomic64_set(&event->count, 0);
1916 perf_event_update_userpage(event);
1917}
1918
1919/*
1920 * Holding the top-level event's child_mutex means that any
1921 * descendant process that has inherited this event will block
1922 * in sync_child_event if it goes to exit, thus satisfying the
1923 * task existence requirements of perf_event_enable/disable.
1924 */
1925static void perf_event_for_each_child(struct perf_event *event,
1926 void (*func)(struct perf_event *))
1927{
1928 struct perf_event *child;
1929
1930 WARN_ON_ONCE(event->ctx->parent_ctx);
1931 mutex_lock(&event->child_mutex);
1932 func(event);
1933 list_for_each_entry(child, &event->child_list, child_list)
1934 func(child);
1935 mutex_unlock(&event->child_mutex);
1936}
1937
1938static void perf_event_for_each(struct perf_event *event,
1939 void (*func)(struct perf_event *))
1940{
1941 struct perf_event_context *ctx = event->ctx;
1942 struct perf_event *sibling;
1943
1944 WARN_ON_ONCE(ctx->parent_ctx);
1945 mutex_lock(&ctx->mutex);
1946 event = event->group_leader;
1947
1948 perf_event_for_each_child(event, func);
1949 func(event);
1950 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1951 perf_event_for_each_child(event, func);
1952 mutex_unlock(&ctx->mutex);
1953}
1954
1955static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956{
1957 struct perf_event_context *ctx = event->ctx;
1958 unsigned long size;
1959 int ret = 0;
1960 u64 value;
1961
1962 if (!event->attr.sample_period)
1963 return -EINVAL;
1964
1965 size = copy_from_user(&value, arg, sizeof(value));
1966 if (size != sizeof(value))
1967 return -EFAULT;
1968
1969 if (!value)
1970 return -EINVAL;
1971
1972 spin_lock_irq(&ctx->lock);
1973 if (event->attr.freq) {
1974 if (value > sysctl_perf_event_sample_rate) {
1975 ret = -EINVAL;
1976 goto unlock;
1977 }
1978
1979 event->attr.sample_freq = value;
1980 } else {
1981 event->attr.sample_period = value;
1982 event->hw.sample_period = value;
1983 }
1984unlock:
1985 spin_unlock_irq(&ctx->lock);
1986
1987 return ret;
1988}
1989
1990int perf_event_set_output(struct perf_event *event, int output_fd);
1991
1992static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1993{
1994 struct perf_event *event = file->private_data;
1995 void (*func)(struct perf_event *);
1996 u32 flags = arg;
1997
1998 switch (cmd) {
1999 case PERF_EVENT_IOC_ENABLE:
2000 func = perf_event_enable;
2001 break;
2002 case PERF_EVENT_IOC_DISABLE:
2003 func = perf_event_disable;
2004 break;
2005 case PERF_EVENT_IOC_RESET:
2006 func = perf_event_reset;
2007 break;
2008
2009 case PERF_EVENT_IOC_REFRESH:
2010 return perf_event_refresh(event, arg);
2011
2012 case PERF_EVENT_IOC_PERIOD:
2013 return perf_event_period(event, (u64 __user *)arg);
2014
2015 case PERF_EVENT_IOC_SET_OUTPUT:
2016 return perf_event_set_output(event, arg);
2017
2018 default:
2019 return -ENOTTY;
2020 }
2021
2022 if (flags & PERF_IOC_FLAG_GROUP)
2023 perf_event_for_each(event, func);
2024 else
2025 perf_event_for_each_child(event, func);
2026
2027 return 0;
2028}
2029
2030int perf_event_task_enable(void)
2031{
2032 struct perf_event *event;
2033
2034 mutex_lock(&current->perf_event_mutex);
2035 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2036 perf_event_for_each_child(event, perf_event_enable);
2037 mutex_unlock(&current->perf_event_mutex);
2038
2039 return 0;
2040}
2041
2042int perf_event_task_disable(void)
2043{
2044 struct perf_event *event;
2045
2046 mutex_lock(&current->perf_event_mutex);
2047 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2048 perf_event_for_each_child(event, perf_event_disable);
2049 mutex_unlock(&current->perf_event_mutex);
2050
2051 return 0;
2052}
2053
2054#ifndef PERF_EVENT_INDEX_OFFSET
2055# define PERF_EVENT_INDEX_OFFSET 0
2056#endif
2057
2058static int perf_event_index(struct perf_event *event)
2059{
2060 if (event->state != PERF_EVENT_STATE_ACTIVE)
2061 return 0;
2062
2063 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2064}
2065
2066/*
2067 * Callers need to ensure there can be no nesting of this function, otherwise
2068 * the seqlock logic goes bad. We can not serialize this because the arch
2069 * code calls this from NMI context.
2070 */
2071void perf_event_update_userpage(struct perf_event *event)
2072{
2073 struct perf_event_mmap_page *userpg;
2074 struct perf_mmap_data *data;
2075
2076 rcu_read_lock();
2077 data = rcu_dereference(event->data);
2078 if (!data)
2079 goto unlock;
2080
2081 userpg = data->user_page;
2082
2083 /*
2084 * Disable preemption so as to not let the corresponding user-space
2085 * spin too long if we get preempted.
2086 */
2087 preempt_disable();
2088 ++userpg->lock;
2089 barrier();
2090 userpg->index = perf_event_index(event);
2091 userpg->offset = atomic64_read(&event->count);
2092 if (event->state == PERF_EVENT_STATE_ACTIVE)
2093 userpg->offset -= atomic64_read(&event->hw.prev_count);
2094
2095 userpg->time_enabled = event->total_time_enabled +
2096 atomic64_read(&event->child_total_time_enabled);
2097
2098 userpg->time_running = event->total_time_running +
2099 atomic64_read(&event->child_total_time_running);
2100
2101 barrier();
2102 ++userpg->lock;
2103 preempt_enable();
2104unlock:
2105 rcu_read_unlock();
2106}
2107
2108static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2109{
2110 struct perf_event *event = vma->vm_file->private_data;
2111 struct perf_mmap_data *data;
2112 int ret = VM_FAULT_SIGBUS;
2113
2114 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2115 if (vmf->pgoff == 0)
2116 ret = 0;
2117 return ret;
2118 }
2119
2120 rcu_read_lock();
2121 data = rcu_dereference(event->data);
2122 if (!data)
2123 goto unlock;
2124
2125 if (vmf->pgoff == 0) {
2126 vmf->page = virt_to_page(data->user_page);
2127 } else {
2128 int nr = vmf->pgoff - 1;
2129
2130 if ((unsigned)nr > data->nr_pages)
2131 goto unlock;
2132
2133 if (vmf->flags & FAULT_FLAG_WRITE)
2134 goto unlock;
2135
2136 vmf->page = virt_to_page(data->data_pages[nr]);
2137 }
2138
2139 get_page(vmf->page);
2140 vmf->page->mapping = vma->vm_file->f_mapping;
2141 vmf->page->index = vmf->pgoff;
2142
2143 ret = 0;
2144unlock:
2145 rcu_read_unlock();
2146
2147 return ret;
2148}
2149
2150static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2151{
2152 struct perf_mmap_data *data;
2153 unsigned long size;
2154 int i;
2155
2156 WARN_ON(atomic_read(&event->mmap_count));
2157
2158 size = sizeof(struct perf_mmap_data);
2159 size += nr_pages * sizeof(void *);
2160
2161 data = kzalloc(size, GFP_KERNEL);
2162 if (!data)
2163 goto fail;
2164
2165 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2166 if (!data->user_page)
2167 goto fail_user_page;
2168
2169 for (i = 0; i < nr_pages; i++) {
2170 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2171 if (!data->data_pages[i])
2172 goto fail_data_pages;
2173 }
2174
2175 data->nr_pages = nr_pages;
2176 atomic_set(&data->lock, -1);
2177
2178 if (event->attr.watermark) {
2179 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2180 event->attr.wakeup_watermark);
2181 }
2182 if (!data->watermark)
2183 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2184
2185 rcu_assign_pointer(event->data, data);
2186
2187 return 0;
2188
2189fail_data_pages:
2190 for (i--; i >= 0; i--)
2191 free_page((unsigned long)data->data_pages[i]);
2192
2193 free_page((unsigned long)data->user_page);
2194
2195fail_user_page:
2196 kfree(data);
2197
2198fail:
2199 return -ENOMEM;
2200}
2201
2202static void perf_mmap_free_page(unsigned long addr)
2203{
2204 struct page *page = virt_to_page((void *)addr);
2205
2206 page->mapping = NULL;
2207 __free_page(page);
2208}
2209
2210static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2211{
2212 struct perf_mmap_data *data;
2213 int i;
2214
2215 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2216
2217 perf_mmap_free_page((unsigned long)data->user_page);
2218 for (i = 0; i < data->nr_pages; i++)
2219 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2220
2221 kfree(data);
2222}
2223
2224static void perf_mmap_data_free(struct perf_event *event)
2225{
2226 struct perf_mmap_data *data = event->data;
2227
2228 WARN_ON(atomic_read(&event->mmap_count));
2229
2230 rcu_assign_pointer(event->data, NULL);
2231 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2232}
2233
2234static void perf_mmap_open(struct vm_area_struct *vma)
2235{
2236 struct perf_event *event = vma->vm_file->private_data;
2237
2238 atomic_inc(&event->mmap_count);
2239}
2240
2241static void perf_mmap_close(struct vm_area_struct *vma)
2242{
2243 struct perf_event *event = vma->vm_file->private_data;
2244
2245 WARN_ON_ONCE(event->ctx->parent_ctx);
2246 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2247 struct user_struct *user = current_user();
2248
2249 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
2250 vma->vm_mm->locked_vm -= event->data->nr_locked;
2251 perf_mmap_data_free(event);
2252 mutex_unlock(&event->mmap_mutex);
2253 }
2254}
2255
2256static const struct vm_operations_struct perf_mmap_vmops = {
2257 .open = perf_mmap_open,
2258 .close = perf_mmap_close,
2259 .fault = perf_mmap_fault,
2260 .page_mkwrite = perf_mmap_fault,
2261};
2262
/*
 * ->mmap() of the event file: validate the requested layout, charge the
 * pages against the locked-memory limits and allocate the buffer.
 *
 * The mapping must be shared, start at page offset 0 and consist of one
 * user page plus either zero or a power-of-two number of data pages.
 *
 * Returns 0 on success, -EINVAL for a bad layout, an event whose output
 * is redirected, or a size that differs from the existing buffer,
 * -EPERM when the locked-memory limit would be exceeded, or the error of
 * perf_mmap_data_alloc().
 */
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct perf_event *event = file->private_data;
	unsigned long user_locked, user_lock_limit;
	struct user_struct *user = current_user();
	unsigned long locked, lock_limit;
	unsigned long vma_size;
	unsigned long nr_pages;
	long user_extra, extra;
	int ret = 0;

	if (!(vma->vm_flags & VM_SHARED))
		return -EINVAL;

	vma_size = vma->vm_end - vma->vm_start;
	/* One page of the mapping is the user page, the rest is data. */
	nr_pages = (vma_size / PAGE_SIZE) - 1;

	/*
	 * If we have data pages ensure they're a power-of-two number, so we
	 * can do bitmasks instead of modulo.
	 */
	if (nr_pages != 0 && !is_power_of_2(nr_pages))
		return -EINVAL;

	/* Rejects sizes that are not a whole number of pages. */
	if (vma_size != PAGE_SIZE * (1 + nr_pages))
		return -EINVAL;

	if (vma->vm_pgoff != 0)
		return -EINVAL;

	WARN_ON_ONCE(event->ctx->parent_ctx);
	mutex_lock(&event->mmap_mutex);
	/* An event whose output goes to another event cannot be mapped. */
	if (event->output) {
		ret = -EINVAL;
		goto unlock;
	}

	/* Already mapped: take another reference, but the size must match. */
	if (atomic_inc_not_zero(&event->mmap_count)) {
		if (nr_pages != event->data->nr_pages)
			ret = -EINVAL;
		goto unlock;
	}

	user_extra = nr_pages + 1;
	/* NOTE(review): the shift presumably converts a KiB sysctl to pages - confirm. */
	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

	/*
	 * Increase the limit linearly with more CPUs:
	 */
	user_lock_limit *= num_online_cpus();

	user_locked = atomic_long_read(&user->locked_vm) + user_extra;

	/* Whatever exceeds the per-user limit is charged to the mm instead. */
	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;

	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
	lock_limit >>= PAGE_SHIFT;
	locked = vma->vm_mm->locked_vm + extra;

	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
		!capable(CAP_IPC_LOCK)) {
		ret = -EPERM;
		goto unlock;
	}

	WARN_ON(event->data);
	ret = perf_mmap_data_alloc(event, nr_pages);
	if (ret)
		goto unlock;

	/* First mapping: commit the reference and the accounting. */
	atomic_set(&event->mmap_count, 1);
	atomic_long_add(user_extra, &user->locked_vm);
	vma->vm_mm->locked_vm += extra;
	/* Remembered so that perf_mmap_close() can undo the mm charge. */
	event->data->nr_locked = extra;
	if (vma->vm_flags & VM_WRITE)
		event->data->writable = 1;

unlock:
	mutex_unlock(&event->mmap_mutex);

	/* Note: these are set on the error paths above as well. */
	vma->vm_flags |= VM_RESERVED;
	vma->vm_ops = &perf_mmap_vmops;

	return ret;
}
2350
2351static int perf_fasync(int fd, struct file *filp, int on)
2352{
2353 struct inode *inode = filp->f_path.dentry->d_inode;
2354 struct perf_event *event = filp->private_data;
2355 int retval;
2356
2357 mutex_lock(&inode->i_mutex);
2358 retval = fasync_helper(fd, filp, on, &event->fasync);
2359 mutex_unlock(&inode->i_mutex);
2360
2361 if (retval < 0)
2362 return retval;
2363
2364 return 0;
2365}
2366
2367static const struct file_operations perf_fops = {
2368 .release = perf_release,
2369 .read = perf_read,
2370 .poll = perf_poll,
2371 .unlocked_ioctl = perf_ioctl,
2372 .compat_ioctl = perf_ioctl,
2373 .mmap = perf_mmap,
2374 .fasync = perf_fasync,
2375};
2376
2377/*
2378 * Perf event wakeup
2379 *
2380 * If there's data, ensure we set the poll() state and publish everything
2381 * to user-space before waking everybody up.
2382 */
2383
2384void perf_event_wakeup(struct perf_event *event)
2385{
2386 wake_up_all(&event->waitq);
2387
2388 if (event->pending_kill) {
2389 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2390 event->pending_kill = 0;
2391 }
2392}
2393
2394/*
2395 * Pending wakeups
2396 *
2397 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2398 *
2399 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2400 * single linked list and use cmpxchg() to add entries lockless.
2401 */
2402
2403static void perf_pending_event(struct perf_pending_entry *entry)
2404{
2405 struct perf_event *event = container_of(entry,
2406 struct perf_event, pending);
2407
2408 if (event->pending_disable) {
2409 event->pending_disable = 0;
2410 __perf_event_disable(event);
2411 }
2412
2413 if (event->pending_wakeup) {
2414 event->pending_wakeup = 0;
2415 perf_event_wakeup(event);
2416 }
2417}
2418
2419#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2420
2421static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2422 PENDING_TAIL,
2423};
2424
2425static void perf_pending_queue(struct perf_pending_entry *entry,
2426 void (*func)(struct perf_pending_entry *))
2427{
2428 struct perf_pending_entry **head;
2429
2430 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2431 return;
2432
2433 entry->func = func;
2434
2435 head = &get_cpu_var(perf_pending_head);
2436
2437 do {
2438 entry->next = *head;
2439 } while (cmpxchg(head, entry->next, entry) != entry->next);
2440
2441 set_perf_event_pending();
2442
2443 put_cpu_var(perf_pending_head);
2444}
2445
2446static int __perf_pending_run(void)
2447{
2448 struct perf_pending_entry *list;
2449 int nr = 0;
2450
2451 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2452 while (list != PENDING_TAIL) {
2453 void (*func)(struct perf_pending_entry *);
2454 struct perf_pending_entry *entry = list;
2455
2456 list = list->next;
2457
2458 func = entry->func;
2459 entry->next = NULL;
2460 /*
2461 * Ensure we observe the unqueue before we issue the wakeup,
2462 * so that we won't be waiting forever.
2463 * -- see perf_not_pending().
2464 */
2465 smp_wmb();
2466
2467 func(entry);
2468 nr++;
2469 }
2470
2471 return nr;
2472}
2473
2474static inline int perf_not_pending(struct perf_event *event)
2475{
2476 /*
2477 * If we flush on whatever cpu we run, there is a chance we don't
2478 * need to wait.
2479 */
2480 get_cpu();
2481 __perf_pending_run();
2482 put_cpu();
2483
2484 /*
2485 * Ensure we see the proper queue state before going to sleep
2486 * so that we do not miss the wakeup. -- see perf_pending_handle()
2487 */
2488 smp_rmb();
2489 return event->pending.next == NULL;
2490}
2491
2492static void perf_pending_sync(struct perf_event *event)
2493{
2494 wait_event(event->waitq, perf_not_pending(event));
2495}
2496
/*
 * Run the pending work queued on the current cpu.
 */
void perf_event_do_pending(void)
{
	/* The number of processed entries is of no interest here. */
	(void)__perf_pending_run();
}
2501
2502/*
2503 * Callchain support -- arch specific
2504 */
2505
2506__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2507{
2508 return NULL;
2509}
2510
2511/*
2512 * Output
2513 */
2514static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2515 unsigned long offset, unsigned long head)
2516{
2517 unsigned long mask;
2518
2519 if (!data->writable)
2520 return true;
2521
2522 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2523
2524 offset = (offset - tail) & mask;
2525 head = (head - tail) & mask;
2526
2527 if ((int)(head - offset) < 0)
2528 return false;
2529
2530 return true;
2531}
2532
2533static void perf_output_wakeup(struct perf_output_handle *handle)
2534{
2535 atomic_set(&handle->data->poll, POLL_IN);
2536
2537 if (handle->nmi) {
2538 handle->event->pending_wakeup = 1;
2539 perf_pending_queue(&handle->event->pending,
2540 perf_pending_event);
2541 } else
2542 perf_event_wakeup(handle->event);
2543}
2544
2545/*
2546 * Curious locking construct.
2547 *
2548 * We need to ensure a later event_id doesn't publish a head when a former
2549 * event_id isn't done writing. However since we need to deal with NMIs we
2550 * cannot fully serialize things.
2551 *
2552 * What we do is serialize between CPUs so we only have to deal with NMI
2553 * nesting on a single CPU.
2554 *
2555 * We only publish the head (and generate a wakeup) when the outer-most
2556 * event_id completes.
2557 */
2558static void perf_output_lock(struct perf_output_handle *handle)
2559{
2560 struct perf_mmap_data *data = handle->data;
2561 int cpu;
2562
2563 handle->locked = 0;
2564
2565 local_irq_save(handle->flags);
2566 cpu = smp_processor_id();
2567
2568 if (in_nmi() && atomic_read(&data->lock) == cpu)
2569 return;
2570
2571 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2572 cpu_relax();
2573
2574 handle->locked = 1;
2575}
2576
/*
 * Counterpart of perf_output_lock(): record the current head as done and,
 * if this handle actually took the lock, publish it to the user page,
 * release the lock and issue a wakeup if one was requested.  Interrupts
 * are restored from handle->flags in every case.
 */
static void perf_output_unlock(struct perf_output_handle *handle)
{
	struct perf_mmap_data *data = handle->data;
	unsigned long head;
	int cpu;

	data->done_head = data->head;

	/* Handles that did not take the lock leave publishing to the owner. */
	if (!handle->locked)
		goto out;

again:
	/*
	 * The xchg implies a full barrier that ensures all writes are done
	 * before we publish the new head, matched by a rmb() in userspace when
	 * reading this position.
	 */
	while ((head = atomic_long_xchg(&data->done_head, 0)))
		data->user_page->data_head = head;

	/*
	 * NMI can happen here, which means we can miss a done_head update.
	 */

	/* Release the lock; the previous value must be our own cpu. */
	cpu = atomic_xchg(&data->lock, -1);
	WARN_ON_ONCE(cpu != smp_processor_id());

	/*
	 * Therefore we have to validate we did not indeed do so.
	 */
	if (unlikely(atomic_long_read(&data->done_head))) {
		/*
		 * Since we had it locked, we can lock it again.
		 */
		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
			cpu_relax();

		goto again;
	}

	/* Consume a wakeup request, if one was recorded. */
	if (atomic_xchg(&data->wakeup, 0))
		perf_output_wakeup(handle);
out:
	local_irq_restore(handle->flags);
}
2622
2623void perf_output_copy(struct perf_output_handle *handle,
2624 const void *buf, unsigned int len)
2625{
2626 unsigned int pages_mask;
2627 unsigned int offset;
2628 unsigned int size;
2629 void **pages;
2630
2631 offset = handle->offset;
2632 pages_mask = handle->data->nr_pages - 1;
2633 pages = handle->data->data_pages;
2634
2635 do {
2636 unsigned int page_offset;
2637 int nr;
2638
2639 nr = (offset >> PAGE_SHIFT) & pages_mask;
2640 page_offset = offset & (PAGE_SIZE - 1);
2641 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2642
2643 memcpy(pages[nr] + page_offset, buf, size);
2644
2645 len -= size;
2646 buf += size;
2647 offset += size;
2648 } while (len);
2649
2650 handle->offset = offset;
2651
2652 /*
2653 * Check we didn't copy past our reservation window, taking the
2654 * possible unsigned int wrap into account.
2655 */
2656 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2657}
2658
/*
 * Reserve @size bytes in the output buffer of @event and set up @handle
 * for writing them.
 *
 * On success (0) the RCU read lock and the output lock remain held until
 * the matching perf_output_end().  On failure -ENOSPC is returned with
 * nothing held; when a buffer exists, its lost counter is incremented.
 *
 * @nmi and @sample are stored in the handle for later use.
 */
int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size,
		      int nmi, int sample)
{
	struct perf_event *output_event;
	struct perf_mmap_data *data;
	unsigned long tail, offset, head;
	int have_lost;
	struct {
		struct perf_event_header header;
		u64 id;
		u64 lost;
	} lost_event;

	rcu_read_lock();
	/*
	 * For inherited events we send all the output towards the parent.
	 */
	if (event->parent)
		event = event->parent;

	/* Honour output redirection to another event. */
	output_event = rcu_dereference(event->output);
	if (output_event)
		event = output_event;

	data = rcu_dereference(event->data);
	if (!data)
		goto out;

	handle->data = data;
	handle->event = event;
	handle->nmi = nmi;
	handle->sample = sample;

	/* A buffer without data pages cannot take any output. */
	if (!data->nr_pages)
		goto fail;

	/* Make room for a PERF_RECORD_LOST record if events were dropped. */
	have_lost = atomic_read(&data->lost);
	if (have_lost)
		size += sizeof(lost_event);

	perf_output_lock(handle);

	do {
		/*
		 * Userspace could choose to issue a mb() before updating the
		 * tail pointer. So that all reads will be completed before the
		 * write is issued.
		 */
		tail = ACCESS_ONCE(data->user_page->data_tail);
		smp_rmb();
		offset = head = atomic_long_read(&data->head);
		head += size;
		if (unlikely(!perf_output_space(data, tail, offset, head)))
			goto fail;
		/* Retry if somebody else moved the head in the meantime. */
	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);

	handle->offset = offset;
	handle->head = head;

	/* Request a wakeup once more than the watermark is outstanding. */
	if (head - tail > data->watermark)
		atomic_set(&data->wakeup, 1);

	if (have_lost) {
		lost_event.header.type = PERF_RECORD_LOST;
		lost_event.header.misc = 0;
		lost_event.header.size = sizeof(lost_event);
		lost_event.id = event->id;
		lost_event.lost = atomic_xchg(&data->lost, 0);

		perf_output_put(handle, lost_event);
	}

	return 0;

fail:
	atomic_inc(&data->lost);
	perf_output_unlock(handle);
out:
	rcu_read_unlock();

	return -ENOSPC;
}
2742
2743void perf_output_end(struct perf_output_handle *handle)
2744{
2745 struct perf_event *event = handle->event;
2746 struct perf_mmap_data *data = handle->data;
2747
2748 int wakeup_events = event->attr.wakeup_events;
2749
2750 if (handle->sample && wakeup_events) {
2751 int events = atomic_inc_return(&data->events);
2752 if (events >= wakeup_events) {
2753 atomic_sub(wakeup_events, &data->events);
2754 atomic_set(&data->wakeup, 1);
2755 }
2756 }
2757
2758 perf_output_unlock(handle);
2759 rcu_read_unlock();
2760}
2761
2762static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2763{
2764 /*
2765 * only top level events have the pid namespace they were created in
2766 */
2767 if (event->parent)
2768 event = event->parent;
2769
2770 return task_tgid_nr_ns(p, event->ns);
2771}
2772
2773static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2774{
2775 /*
2776 * only top level events have the pid namespace they were created in
2777 */
2778 if (event->parent)
2779 event = event->parent;
2780
2781 return task_pid_nr_ns(p, event->ns);
2782}
2783
2784static void perf_output_read_one(struct perf_output_handle *handle,
2785 struct perf_event *event)
2786{
2787 u64 read_format = event->attr.read_format;
2788 u64 values[4];
2789 int n = 0;
2790
2791 values[n++] = atomic64_read(&event->count);
2792 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2793 values[n++] = event->total_time_enabled +
2794 atomic64_read(&event->child_total_time_enabled);
2795 }
2796 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2797 values[n++] = event->total_time_running +
2798 atomic64_read(&event->child_total_time_running);
2799 }
2800 if (read_format & PERF_FORMAT_ID)
2801 values[n++] = primary_event_id(event);
2802
2803 perf_output_copy(handle, values, n * sizeof(u64));
2804}
2805
2806/*
2807 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2808 */
2809static void perf_output_read_group(struct perf_output_handle *handle,
2810 struct perf_event *event)
2811{
2812 struct perf_event *leader = event->group_leader, *sub;
2813 u64 read_format = event->attr.read_format;
2814 u64 values[5];
2815 int n = 0;
2816
2817 values[n++] = 1 + leader->nr_siblings;
2818
2819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2820 values[n++] = leader->total_time_enabled;
2821
2822 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2823 values[n++] = leader->total_time_running;
2824
2825 if (leader != event)
2826 leader->pmu->read(leader);
2827
2828 values[n++] = atomic64_read(&leader->count);
2829 if (read_format & PERF_FORMAT_ID)
2830 values[n++] = primary_event_id(leader);
2831
2832 perf_output_copy(handle, values, n * sizeof(u64));
2833
2834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2835 n = 0;
2836
2837 if (sub != event)
2838 sub->pmu->read(sub);
2839
2840 values[n++] = atomic64_read(&sub->count);
2841 if (read_format & PERF_FORMAT_ID)
2842 values[n++] = primary_event_id(sub);
2843
2844 perf_output_copy(handle, values, n * sizeof(u64));
2845 }
2846}
2847
2848static void perf_output_read(struct perf_output_handle *handle,
2849 struct perf_event *event)
2850{
2851 if (event->attr.read_format & PERF_FORMAT_GROUP)
2852 perf_output_read_group(handle, event);
2853 else
2854 perf_output_read_one(handle, event);
2855}
2856
2857void perf_output_sample(struct perf_output_handle *handle,
2858 struct perf_event_header *header,
2859 struct perf_sample_data *data,
2860 struct perf_event *event)
2861{
2862 u64 sample_type = data->type;
2863
2864 perf_output_put(handle, *header);
2865
2866 if (sample_type & PERF_SAMPLE_IP)
2867 perf_output_put(handle, data->ip);
2868
2869 if (sample_type & PERF_SAMPLE_TID)
2870 perf_output_put(handle, data->tid_entry);
2871
2872 if (sample_type & PERF_SAMPLE_TIME)
2873 perf_output_put(handle, data->time);
2874
2875 if (sample_type & PERF_SAMPLE_ADDR)
2876 perf_output_put(handle, data->addr);
2877
2878 if (sample_type & PERF_SAMPLE_ID)
2879 perf_output_put(handle, data->id);
2880
2881 if (sample_type & PERF_SAMPLE_STREAM_ID)
2882 perf_output_put(handle, data->stream_id);
2883
2884 if (sample_type & PERF_SAMPLE_CPU)
2885 perf_output_put(handle, data->cpu_entry);
2886
2887 if (sample_type & PERF_SAMPLE_PERIOD)
2888 perf_output_put(handle, data->period);
2889
2890 if (sample_type & PERF_SAMPLE_READ)
2891 perf_output_read(handle, event);
2892
2893 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2894 if (data->callchain) {
2895 int size = 1;
2896
2897 if (data->callchain)
2898 size += data->callchain->nr;
2899
2900 size *= sizeof(u64);
2901
2902 perf_output_copy(handle, data->callchain, size);
2903 } else {
2904 u64 nr = 0;
2905 perf_output_put(handle, nr);
2906 }
2907 }
2908
2909 if (sample_type & PERF_SAMPLE_RAW) {
2910 if (data->raw) {
2911 perf_output_put(handle, data->raw->size);
2912 perf_output_copy(handle, data->raw->data,
2913 data->raw->size);
2914 } else {
2915 struct {
2916 u32 size;
2917 u32 data;
2918 } raw = {
2919 .size = sizeof(u32),
2920 .data = 0,
2921 };
2922 perf_output_put(handle, raw);
2923 }
2924 }
2925}
2926
2927void perf_prepare_sample(struct perf_event_header *header,
2928 struct perf_sample_data *data,
2929 struct perf_event *event,
2930 struct pt_regs *regs)
2931{
2932 u64 sample_type = event->attr.sample_type;
2933
2934 data->type = sample_type;
2935
2936 header->type = PERF_RECORD_SAMPLE;
2937 header->size = sizeof(*header);
2938
2939 header->misc = 0;
2940 header->misc |= perf_misc_flags(regs);
2941
2942 if (sample_type & PERF_SAMPLE_IP) {
2943 data->ip = perf_instruction_pointer(regs);
2944
2945 header->size += sizeof(data->ip);
2946 }
2947
2948 if (sample_type & PERF_SAMPLE_TID) {
2949 /* namespace issues */
2950 data->tid_entry.pid = perf_event_pid(event, current);
2951 data->tid_entry.tid = perf_event_tid(event, current);
2952
2953 header->size += sizeof(data->tid_entry);
2954 }
2955
2956 if (sample_type & PERF_SAMPLE_TIME) {
2957 data->time = perf_clock();
2958
2959 header->size += sizeof(data->time);
2960 }
2961
2962 if (sample_type & PERF_SAMPLE_ADDR)
2963 header->size += sizeof(data->addr);
2964
2965 if (sample_type & PERF_SAMPLE_ID) {
2966 data->id = primary_event_id(event);
2967
2968 header->size += sizeof(data->id);
2969 }
2970
2971 if (sample_type & PERF_SAMPLE_STREAM_ID) {
2972 data->stream_id = event->id;
2973
2974 header->size += sizeof(data->stream_id);
2975 }
2976
2977 if (sample_type & PERF_SAMPLE_CPU) {
2978 data->cpu_entry.cpu = raw_smp_processor_id();
2979 data->cpu_entry.reserved = 0;
2980
2981 header->size += sizeof(data->cpu_entry);
2982 }
2983
2984 if (sample_type & PERF_SAMPLE_PERIOD)
2985 header->size += sizeof(data->period);
2986
2987 if (sample_type & PERF_SAMPLE_READ)
2988 header->size += perf_event_read_size(event);
2989
2990 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2991 int size = 1;
2992
2993 data->callchain = perf_callchain(regs);
2994
2995 if (data->callchain)
2996 size += data->callchain->nr;
2997
2998 header->size += size * sizeof(u64);
2999 }
3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 int size = sizeof(u32);
3003
3004 if (data->raw)
3005 size += data->raw->size;
3006 else
3007 size += sizeof(u32);
3008
3009 WARN_ON_ONCE(size & (sizeof(u64)-1));
3010 header->size += size;
3011 }
3012}
3013
3014static void perf_event_output(struct perf_event *event, int nmi,
3015 struct perf_sample_data *data,
3016 struct pt_regs *regs)
3017{
3018 struct perf_output_handle handle;
3019 struct perf_event_header header;
3020
3021 perf_prepare_sample(&header, data, event, regs);
3022
3023 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3024 return;
3025
3026 perf_output_sample(&handle, &header, data, event);
3027
3028 perf_output_end(&handle);
3029}
3030
3031/*
3032 * read event_id
3033 */
3034
3035struct perf_read_event {
3036 struct perf_event_header header;
3037
3038 u32 pid;
3039 u32 tid;
3040};
3041
3042static void
3043perf_event_read_event(struct perf_event *event,
3044 struct task_struct *task)
3045{
3046 struct perf_output_handle handle;
3047 struct perf_read_event read_event = {
3048 .header = {
3049 .type = PERF_RECORD_READ,
3050 .misc = 0,
3051 .size = sizeof(read_event) + perf_event_read_size(event),
3052 },
3053 .pid = perf_event_pid(event, task),
3054 .tid = perf_event_tid(event, task),
3055 };
3056 int ret;
3057
3058 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3059 if (ret)
3060 return;
3061
3062 perf_output_put(&handle, read_event);
3063 perf_output_read(&handle, event);
3064
3065 perf_output_end(&handle);
3066}
3067
3068/*
3069 * task tracking -- fork/exit
3070 *
3071 * enabled by: attr.comm | attr.mmap | attr.task
3072 */
3073
3074struct perf_task_event {
3075 struct task_struct *task;
3076 struct perf_event_context *task_ctx;
3077
3078 struct {
3079 struct perf_event_header header;
3080
3081 u32 pid;
3082 u32 ppid;
3083 u32 tid;
3084 u32 ptid;
3085 u64 time;
3086 } event_id;
3087};
3088
3089static void perf_event_task_output(struct perf_event *event,
3090 struct perf_task_event *task_event)
3091{
3092 struct perf_output_handle handle;
3093 int size;
3094 struct task_struct *task = task_event->task;
3095 int ret;
3096
3097 size = task_event->event_id.header.size;
3098 ret = perf_output_begin(&handle, event, size, 0, 0);
3099
3100 if (ret)
3101 return;
3102
3103 task_event->event_id.pid = perf_event_pid(event, task);
3104 task_event->event_id.ppid = perf_event_pid(event, current);
3105
3106 task_event->event_id.tid = perf_event_tid(event, task);
3107 task_event->event_id.ptid = perf_event_tid(event, current);
3108
3109 task_event->event_id.time = perf_clock();
3110
3111 perf_output_put(&handle, task_event->event_id);
3112
3113 perf_output_end(&handle);
3114}
3115
3116static int perf_event_task_match(struct perf_event *event)
3117{
3118 if (event->attr.comm || event->attr.mmap || event->attr.task)
3119 return 1;
3120
3121 return 0;
3122}
3123
3124static void perf_event_task_ctx(struct perf_event_context *ctx,
3125 struct perf_task_event *task_event)
3126{
3127 struct perf_event *event;
3128
3129 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3130 return;
3131
3132 rcu_read_lock();
3133 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3134 if (perf_event_task_match(event))
3135 perf_event_task_output(event, task_event);
3136 }
3137 rcu_read_unlock();
3138}
3139
3140static void perf_event_task_event(struct perf_task_event *task_event)
3141{
3142 struct perf_cpu_context *cpuctx;
3143 struct perf_event_context *ctx = task_event->task_ctx;
3144
3145 cpuctx = &get_cpu_var(perf_cpu_context);
3146 perf_event_task_ctx(&cpuctx->ctx, task_event);
3147 put_cpu_var(perf_cpu_context);
3148
3149 rcu_read_lock();
3150 if (!ctx)
3151 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3152 if (ctx)
3153 perf_event_task_ctx(ctx, task_event);
3154 rcu_read_unlock();
3155}
3156
3157static void perf_event_task(struct task_struct *task,
3158 struct perf_event_context *task_ctx,
3159 int new)
3160{
3161 struct perf_task_event task_event;
3162
3163 if (!atomic_read(&nr_comm_events) &&
3164 !atomic_read(&nr_mmap_events) &&
3165 !atomic_read(&nr_task_events))
3166 return;
3167
3168 task_event = (struct perf_task_event){
3169 .task = task,
3170 .task_ctx = task_ctx,
3171 .event_id = {
3172 .header = {
3173 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3174 .misc = 0,
3175 .size = sizeof(task_event.event_id),
3176 },
3177 /* .pid */
3178 /* .ppid */
3179 /* .tid */
3180 /* .ptid */
3181 },
3182 };
3183
3184 perf_event_task_event(&task_event);
3185}
3186
3187void perf_event_fork(struct task_struct *task)
3188{
3189 perf_event_task(task, NULL, 1);
3190}
3191
3192/*
3193 * comm tracking
3194 */
3195
3196struct perf_comm_event {
3197 struct task_struct *task;
3198 char *comm;
3199 int comm_size;
3200
3201 struct {
3202 struct perf_event_header header;
3203
3204 u32 pid;
3205 u32 tid;
3206 } event_id;
3207};
3208
3209static void perf_event_comm_output(struct perf_event *event,
3210 struct perf_comm_event *comm_event)
3211{
3212 struct perf_output_handle handle;
3213 int size = comm_event->event_id.header.size;
3214 int ret = perf_output_begin(&handle, event, size, 0, 0);
3215
3216 if (ret)
3217 return;
3218
3219 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3220 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3221
3222 perf_output_put(&handle, comm_event->event_id);
3223 perf_output_copy(&handle, comm_event->comm,
3224 comm_event->comm_size);
3225 perf_output_end(&handle);
3226}
3227
3228static int perf_event_comm_match(struct perf_event *event)
3229{
3230 if (event->attr.comm)
3231 return 1;
3232
3233 return 0;
3234}
3235
3236static void perf_event_comm_ctx(struct perf_event_context *ctx,
3237 struct perf_comm_event *comm_event)
3238{
3239 struct perf_event *event;
3240
3241 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3242 return;
3243
3244 rcu_read_lock();
3245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3246 if (perf_event_comm_match(event))
3247 perf_event_comm_output(event, comm_event);
3248 }
3249 rcu_read_unlock();
3250}
3251
3252static void perf_event_comm_event(struct perf_comm_event *comm_event)
3253{
3254 struct perf_cpu_context *cpuctx;
3255 struct perf_event_context *ctx;
3256 unsigned int size;
3257 char comm[TASK_COMM_LEN];
3258
3259 memset(comm, 0, sizeof(comm));
3260 strncpy(comm, comm_event->task->comm, sizeof(comm));
3261 size = ALIGN(strlen(comm)+1, sizeof(u64));
3262
3263 comm_event->comm = comm;
3264 comm_event->comm_size = size;
3265
3266 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3267
3268 cpuctx = &get_cpu_var(perf_cpu_context);
3269 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3270 put_cpu_var(perf_cpu_context);
3271
3272 rcu_read_lock();
3273 /*
3274 * doesn't really matter which of the child contexts the
3275 * events ends up in.
3276 */
3277 ctx = rcu_dereference(current->perf_event_ctxp);
3278 if (ctx)
3279 perf_event_comm_ctx(ctx, comm_event);
3280 rcu_read_unlock();
3281}
3282
3283void perf_event_comm(struct task_struct *task)
3284{
3285 struct perf_comm_event comm_event;
3286
3287 if (task->perf_event_ctxp)
3288 perf_event_enable_on_exec(task);
3289
3290 if (!atomic_read(&nr_comm_events))
3291 return;
3292
3293 comm_event = (struct perf_comm_event){
3294 .task = task,
3295 /* .comm */
3296 /* .comm_size */
3297 .event_id = {
3298 .header = {
3299 .type = PERF_RECORD_COMM,
3300 .misc = 0,
3301 /* .size */
3302 },
3303 /* .pid */
3304 /* .tid */
3305 },
3306 };
3307
3308 perf_event_comm_event(&comm_event);
3309}
3310
3311/*
3312 * mmap tracking
3313 */
3314
3315struct perf_mmap_event {
3316 struct vm_area_struct *vma;
3317
3318 const char *file_name;
3319 int file_size;
3320
3321 struct {
3322 struct perf_event_header header;
3323
3324 u32 pid;
3325 u32 tid;
3326 u64 start;
3327 u64 len;
3328 u64 pgoff;
3329 } event_id;
3330};
3331
3332static void perf_event_mmap_output(struct perf_event *event,
3333 struct perf_mmap_event *mmap_event)
3334{
3335 struct perf_output_handle handle;
3336 int size = mmap_event->event_id.header.size;
3337 int ret = perf_output_begin(&handle, event, size, 0, 0);
3338
3339 if (ret)
3340 return;
3341
3342 mmap_event->event_id.pid = perf_event_pid(event, current);
3343 mmap_event->event_id.tid = perf_event_tid(event, current);
3344
3345 perf_output_put(&handle, mmap_event->event_id);
3346 perf_output_copy(&handle, mmap_event->file_name,
3347 mmap_event->file_size);
3348 perf_output_end(&handle);
3349}
3350
3351static int perf_event_mmap_match(struct perf_event *event,
3352 struct perf_mmap_event *mmap_event)
3353{
3354 if (event->attr.mmap)
3355 return 1;
3356
3357 return 0;
3358}
3359
3360static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3361 struct perf_mmap_event *mmap_event)
3362{
3363 struct perf_event *event;
3364
3365 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3366 return;
3367
3368 rcu_read_lock();
3369 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3370 if (perf_event_mmap_match(event, mmap_event))
3371 perf_event_mmap_output(event, mmap_event);
3372 }
3373 rcu_read_unlock();
3374}
3375
3376static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3377{
3378 struct perf_cpu_context *cpuctx;
3379 struct perf_event_context *ctx;
3380 struct vm_area_struct *vma = mmap_event->vma;
3381 struct file *file = vma->vm_file;
3382 unsigned int size;
3383 char tmp[16];
3384 char *buf = NULL;
3385 const char *name;
3386
3387 memset(tmp, 0, sizeof(tmp));
3388
3389 if (file) {
3390 /*
3391 * d_path works from the end of the buffer backwards, so we
3392 * need to add enough zero bytes after the string to handle
3393 * the 64bit alignment we do later.
3394 */
3395 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3396 if (!buf) {
3397 name = strncpy(tmp, "//enomem", sizeof(tmp));
3398 goto got_name;
3399 }
3400 name = d_path(&file->f_path, buf, PATH_MAX);
3401 if (IS_ERR(name)) {
3402 name = strncpy(tmp, "//toolong", sizeof(tmp));
3403 goto got_name;
3404 }
3405 } else {
3406 if (arch_vma_name(mmap_event->vma)) {
3407 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3408 sizeof(tmp));
3409 goto got_name;
3410 }
3411
3412 if (!vma->vm_mm) {
3413 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3414 goto got_name;
3415 }
3416
3417 name = strncpy(tmp, "//anon", sizeof(tmp));
3418 goto got_name;
3419 }
3420
3421got_name:
3422 size = ALIGN(strlen(name)+1, sizeof(u64));
3423
3424 mmap_event->file_name = name;
3425 mmap_event->file_size = size;
3426
3427 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3428
3429 cpuctx = &get_cpu_var(perf_cpu_context);
3430 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3431 put_cpu_var(perf_cpu_context);
3432
3433 rcu_read_lock();
3434 /*
3435 * doesn't really matter which of the child contexts the
3436 * events ends up in.
3437 */
3438 ctx = rcu_dereference(current->perf_event_ctxp);
3439 if (ctx)
3440 perf_event_mmap_ctx(ctx, mmap_event);
3441 rcu_read_unlock();
3442
3443 kfree(buf);
3444}
3445
3446void __perf_event_mmap(struct vm_area_struct *vma)
3447{
3448 struct perf_mmap_event mmap_event;
3449
3450 if (!atomic_read(&nr_mmap_events))
3451 return;
3452
3453 mmap_event = (struct perf_mmap_event){
3454 .vma = vma,
3455 /* .file_name */
3456 /* .file_size */
3457 .event_id = {
3458 .header = {
3459 .type = PERF_RECORD_MMAP,
3460 .misc = 0,
3461 /* .size */
3462 },
3463 /* .pid */
3464 /* .tid */
3465 .start = vma->vm_start,
3466 .len = vma->vm_end - vma->vm_start,
3467 .pgoff = vma->vm_pgoff,
3468 },
3469 };
3470
3471 perf_event_mmap_event(&mmap_event);
3472}
3473
3474/*
3475 * IRQ throttle logging
3476 */
3477
3478static void perf_log_throttle(struct perf_event *event, int enable)
3479{
3480 struct perf_output_handle handle;
3481 int ret;
3482
3483 struct {
3484 struct perf_event_header header;
3485 u64 time;
3486 u64 id;
3487 u64 stream_id;
3488 } throttle_event = {
3489 .header = {
3490 .type = PERF_RECORD_THROTTLE,
3491 .misc = 0,
3492 .size = sizeof(throttle_event),
3493 },
3494 .time = perf_clock(),
3495 .id = primary_event_id(event),
3496 .stream_id = event->id,
3497 };
3498
3499 if (enable)
3500 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3501
3502 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3503 if (ret)
3504 return;
3505
3506 perf_output_put(&handle, throttle_event);
3507 perf_output_end(&handle);
3508}
3509
3510/*
3511 * Generic event overflow handling, sampling.
3512 */
3513
3514static int __perf_event_overflow(struct perf_event *event, int nmi,
3515 int throttle, struct perf_sample_data *data,
3516 struct pt_regs *regs)
3517{
3518 int events = atomic_read(&event->event_limit);
3519 struct hw_perf_event *hwc = &event->hw;
3520 int ret = 0;
3521
3522 throttle = (throttle && event->pmu->unthrottle != NULL);
3523
3524 if (!throttle) {
3525 hwc->interrupts++;
3526 } else {
3527 if (hwc->interrupts != MAX_INTERRUPTS) {
3528 hwc->interrupts++;
3529 if (HZ * hwc->interrupts >
3530 (u64)sysctl_perf_event_sample_rate) {
3531 hwc->interrupts = MAX_INTERRUPTS;
3532 perf_log_throttle(event, 0);
3533 ret = 1;
3534 }
3535 } else {
3536 /*
3537 * Keep re-disabling events even though on the previous
3538 * pass we disabled it - just in case we raced with a
3539 * sched-in and the event got enabled again:
3540 */
3541 ret = 1;
3542 }
3543 }
3544
3545 if (event->attr.freq) {
3546 u64 now = perf_clock();
3547 s64 delta = now - hwc->freq_stamp;
3548
3549 hwc->freq_stamp = now;
3550
3551 if (delta > 0 && delta < TICK_NSEC)
3552 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3553 }
3554
3555 /*
3556 * XXX event_limit might not quite work as expected on inherited
3557 * events
3558 */
3559
3560 event->pending_kill = POLL_IN;
3561 if (events && atomic_dec_and_test(&event->event_limit)) {
3562 ret = 1;
3563 event->pending_kill = POLL_HUP;
3564 if (nmi) {
3565 event->pending_disable = 1;
3566 perf_pending_queue(&event->pending,
3567 perf_pending_event);
3568 } else
3569 perf_event_disable(event);
3570 }
3571
3572 perf_event_output(event, nmi, data, regs);
3573 return ret;
3574}
3575
/*
 * Overflow entry point with throttling requested; see
 * __perf_event_overflow() for the return value.
 */
int perf_event_overflow(struct perf_event *event, int nmi,
			struct perf_sample_data *data,
			struct pt_regs *regs)
{
	const int throttle = 1;

	return __perf_event_overflow(event, nmi, throttle, data, regs);
}
3582
3583/*
3584 * Generic software event infrastructure
3585 */
3586
3587/*
3588 * We directly increment event->count and keep a second value in
3589 * event->hw.period_left to count intervals. This period event
3590 * is kept in the range [-sample_period, 0] so that we can use the
3591 * sign as trigger.
3592 */
3593
3594static u64 perf_swevent_set_period(struct perf_event *event)
3595{
3596 struct hw_perf_event *hwc = &event->hw;
3597 u64 period = hwc->last_period;
3598 u64 nr, offset;
3599 s64 old, val;
3600
3601 hwc->last_period = hwc->sample_period;
3602
3603again:
3604 old = val = atomic64_read(&hwc->period_left);
3605 if (val < 0)
3606 return 0;
3607
3608 nr = div64_u64(period + val, period);
3609 offset = nr * period;
3610 val -= offset;
3611 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3612 goto again;
3613
3614 return nr;
3615}
3616
3617static void perf_swevent_overflow(struct perf_event *event,
3618 int nmi, struct perf_sample_data *data,
3619 struct pt_regs *regs)
3620{
3621 struct hw_perf_event *hwc = &event->hw;
3622 int throttle = 0;
3623 u64 overflow;
3624
3625 data->period = event->hw.last_period;
3626 overflow = perf_swevent_set_period(event);
3627
3628 if (hwc->interrupts == MAX_INTERRUPTS)
3629 return;
3630
3631 for (; overflow; overflow--) {
3632 if (__perf_event_overflow(event, nmi, throttle,
3633 data, regs)) {
3634 /*
3635 * We inhibit the overflow from happening when
3636 * hwc->interrupts == MAX_INTERRUPTS.
3637 */
3638 break;
3639 }
3640 throttle = 1;
3641 }
3642}
3643
/*
 * ->unthrottle method for generic software events.  Intentionally
 * empty: hwc->interrupts has already been reset by the time this runs.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
	/* nothing to do */
}
3650
3651static void perf_swevent_add(struct perf_event *event, u64 nr,
3652 int nmi, struct perf_sample_data *data,
3653 struct pt_regs *regs)
3654{
3655 struct hw_perf_event *hwc = &event->hw;
3656
3657 atomic64_add(nr, &event->count);
3658
3659 if (!hwc->sample_period)
3660 return;
3661
3662 if (!regs)
3663 return;
3664
3665 if (!atomic64_add_negative(nr, &hwc->period_left))
3666 perf_swevent_overflow(event, nmi, data, regs);
3667}
3668
3669static int perf_swevent_is_counting(struct perf_event *event)
3670{
3671 /*
3672 * The event is active, we're good!
3673 */
3674 if (event->state == PERF_EVENT_STATE_ACTIVE)
3675 return 1;
3676
3677 /*
3678 * The event is off/error, not counting.
3679 */
3680 if (event->state != PERF_EVENT_STATE_INACTIVE)
3681 return 0;
3682
3683 /*
3684 * The event is inactive, if the context is active
3685 * we're part of a group that didn't make it on the 'pmu',
3686 * not counting.
3687 */
3688 if (event->ctx->is_active)
3689 return 0;
3690
3691 /*
3692 * We're inactive and the context is too, this means the
3693 * task is scheduled out, we're counting events that happen
3694 * to us, like migration events.
3695 */
3696 return 1;
3697}
3698
3699static int perf_swevent_match(struct perf_event *event,
3700 enum perf_type_id type,
3701 u32 event_id, struct pt_regs *regs)
3702{
3703 if (!perf_swevent_is_counting(event))
3704 return 0;
3705
3706 if (event->attr.type != type)
3707 return 0;
3708 if (event->attr.config != event_id)
3709 return 0;
3710
3711 if (regs) {
3712 if (event->attr.exclude_user && user_mode(regs))
3713 return 0;
3714
3715 if (event->attr.exclude_kernel && !user_mode(regs))
3716 return 0;
3717 }
3718
3719 return 1;
3720}
3721
3722static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3723 enum perf_type_id type,
3724 u32 event_id, u64 nr, int nmi,
3725 struct perf_sample_data *data,
3726 struct pt_regs *regs)
3727{
3728 struct perf_event *event;
3729
3730 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3731 return;
3732
3733 rcu_read_lock();
3734 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3735 if (perf_swevent_match(event, type, event_id, regs))
3736 perf_swevent_add(event, nr, nmi, data, regs);
3737 }
3738 rcu_read_unlock();
3739}
3740
3741static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3742{
3743 if (in_nmi())
3744 return &cpuctx->recursion[3];
3745
3746 if (in_irq())
3747 return &cpuctx->recursion[2];
3748
3749 if (in_softirq())
3750 return &cpuctx->recursion[1];
3751
3752 return &cpuctx->recursion[0];
3753}
3754
3755static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3756 u64 nr, int nmi,
3757 struct perf_sample_data *data,
3758 struct pt_regs *regs)
3759{
3760 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3761 int *recursion = perf_swevent_recursion_context(cpuctx);
3762 struct perf_event_context *ctx;
3763
3764 if (*recursion)
3765 goto out;
3766
3767 (*recursion)++;
3768 barrier();
3769
3770 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3771 nr, nmi, data, regs);
3772 rcu_read_lock();
3773 /*
3774 * doesn't really matter which of the child contexts the
3775 * events ends up in.
3776 */
3777 ctx = rcu_dereference(current->perf_event_ctxp);
3778 if (ctx)
3779 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3780 rcu_read_unlock();
3781
3782 barrier();
3783 (*recursion)--;
3784
3785out:
3786 put_cpu_var(perf_cpu_context);
3787}
3788
3789void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3790 struct pt_regs *regs, u64 addr)
3791{
3792 struct perf_sample_data data = {
3793 .addr = addr,
3794 };
3795
3796 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3797 &data, regs);
3798}
3799
/*
 * ->read method for generic software events; intentionally empty.
 */
static void perf_swevent_read(struct perf_event *event)
{
	/* nothing to do */
}
3803
3804static int perf_swevent_enable(struct perf_event *event)
3805{
3806 struct hw_perf_event *hwc = &event->hw;
3807
3808 if (hwc->sample_period) {
3809 hwc->last_period = hwc->sample_period;
3810 perf_swevent_set_period(event);
3811 }
3812 return 0;
3813}
3814
/*
 * ->disable method for generic software events; intentionally empty.
 */
static void perf_swevent_disable(struct perf_event *event)
{
	/* nothing to do */
}
3818
3819static const struct pmu perf_ops_generic = {
3820 .enable = perf_swevent_enable,
3821 .disable = perf_swevent_disable,
3822 .read = perf_swevent_read,
3823 .unthrottle = perf_swevent_unthrottle,
3824};
3825
3826/*
3827 * hrtimer based swevent callback
3828 */
3829
3830static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3831{
3832 enum hrtimer_restart ret = HRTIMER_RESTART;
3833 struct perf_sample_data data;
3834 struct pt_regs *regs;
3835 struct perf_event *event;
3836 u64 period;
3837
3838 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3839 event->pmu->read(event);
3840
3841 data.addr = 0;
3842 regs = get_irq_regs();
3843 /*
3844 * In case we exclude kernel IPs or are somehow not in interrupt
3845 * context, provide the next best thing, the user IP.
3846 */
3847 if ((event->attr.exclude_kernel || !regs) &&
3848 !event->attr.exclude_user)
3849 regs = task_pt_regs(current);
3850
3851 if (regs) {
3852 if (perf_event_overflow(event, 0, &data, regs))
3853 ret = HRTIMER_NORESTART;
3854 }
3855
3856 period = max_t(u64, 10000, event->hw.sample_period);
3857 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3858
3859 return ret;
3860}
3861
3862/*
3863 * Software event: cpu wall time clock
3864 */
3865
3866static void cpu_clock_perf_event_update(struct perf_event *event)
3867{
3868 int cpu = raw_smp_processor_id();
3869 s64 prev;
3870 u64 now;
3871
3872 now = cpu_clock(cpu);
3873 prev = atomic64_read(&event->hw.prev_count);
3874 atomic64_set(&event->hw.prev_count, now);
3875 atomic64_add(now - prev, &event->count);
3876}
3877
3878static int cpu_clock_perf_event_enable(struct perf_event *event)
3879{
3880 struct hw_perf_event *hwc = &event->hw;
3881 int cpu = raw_smp_processor_id();
3882
3883 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3884 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3885 hwc->hrtimer.function = perf_swevent_hrtimer;
3886 if (hwc->sample_period) {
3887 u64 period = max_t(u64, 10000, hwc->sample_period);
3888 __hrtimer_start_range_ns(&hwc->hrtimer,
3889 ns_to_ktime(period), 0,
3890 HRTIMER_MODE_REL, 0);
3891 }
3892
3893 return 0;
3894}
3895
3896static void cpu_clock_perf_event_disable(struct perf_event *event)
3897{
3898 if (event->hw.sample_period)
3899 hrtimer_cancel(&event->hw.hrtimer);
3900 cpu_clock_perf_event_update(event);
3901}
3902
/*
 * ->read method of the cpu clock event: bring the count up to date.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
	/* reading is just an update to "now" */
	cpu_clock_perf_event_update(event);
}
3907
3908static const struct pmu perf_ops_cpu_clock = {
3909 .enable = cpu_clock_perf_event_enable,
3910 .disable = cpu_clock_perf_event_disable,
3911 .read = cpu_clock_perf_event_read,
3912};
3913
3914/*
3915 * Software event: task time clock
3916 */
3917
3918static void task_clock_perf_event_update(struct perf_event *event, u64 now)
3919{
3920 u64 prev;
3921 s64 delta;
3922
3923 prev = atomic64_xchg(&event->hw.prev_count, now);
3924 delta = now - prev;
3925 atomic64_add(delta, &event->count);
3926}
3927
3928static int task_clock_perf_event_enable(struct perf_event *event)
3929{
3930 struct hw_perf_event *hwc = &event->hw;
3931 u64 now;
3932
3933 now = event->ctx->time;
3934
3935 atomic64_set(&hwc->prev_count, now);
3936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3937 hwc->hrtimer.function = perf_swevent_hrtimer;
3938 if (hwc->sample_period) {
3939 u64 period = max_t(u64, 10000, hwc->sample_period);
3940 __hrtimer_start_range_ns(&hwc->hrtimer,
3941 ns_to_ktime(period), 0,
3942 HRTIMER_MODE_REL, 0);
3943 }
3944
3945 return 0;
3946}
3947
3948static void task_clock_perf_event_disable(struct perf_event *event)
3949{
3950 if (event->hw.sample_period)
3951 hrtimer_cancel(&event->hw.hrtimer);
3952 task_clock_perf_event_update(event, event->ctx->time);
3953
3954}
3955
3956static void task_clock_perf_event_read(struct perf_event *event)
3957{
3958 u64 time;
3959
3960 if (!in_nmi()) {
3961 update_context_time(event->ctx);
3962 time = event->ctx->time;
3963 } else {
3964 u64 now = perf_clock();
3965 u64 delta = now - event->ctx->timestamp;
3966 time = event->ctx->time + delta;
3967 }
3968
3969 task_clock_perf_event_update(event, time);
3970}
3971
3972static const struct pmu perf_ops_task_clock = {
3973 .enable = task_clock_perf_event_enable,
3974 .disable = task_clock_perf_event_disable,
3975 .read = task_clock_perf_event_read,
3976};
3977
3978#ifdef CONFIG_EVENT_PROFILE
3979void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3980 int entry_size)
3981{
3982 struct perf_raw_record raw = {
3983 .size = entry_size,
3984 .data = record,
3985 };
3986
3987 struct perf_sample_data data = {
3988 .addr = addr,
3989 .raw = &raw,
3990 };
3991
3992 struct pt_regs *regs = get_irq_regs();
3993
3994 if (!regs)
3995 regs = task_pt_regs(current);
3996
3997 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3998 &data, regs);
3999}
4000EXPORT_SYMBOL_GPL(perf_tp_event);
4001
4002extern int ftrace_profile_enable(int);
4003extern void ftrace_profile_disable(int);
4004
4005static void tp_perf_event_destroy(struct perf_event *event)
4006{
4007 ftrace_profile_disable(event->attr.config);
4008}
4009
4010static const struct pmu *tp_perf_event_init(struct perf_event *event)
4011{
4012 /*
4013 * Raw tracepoint data is a severe data leak, only allow root to
4014 * have these.
4015 */
4016 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4017 perf_paranoid_tracepoint_raw() &&
4018 !capable(CAP_SYS_ADMIN))
4019 return ERR_PTR(-EPERM);
4020
4021 if (ftrace_profile_enable(event->attr.config))
4022 return NULL;
4023
4024 event->destroy = tp_perf_event_destroy;
4025
4026 return &perf_ops_generic;
4027}
4028#else
4029static const struct pmu *tp_perf_event_init(struct perf_event *event)
4030{
4031 return NULL;
4032}
4033#endif
4034
4035atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4036
4037static void sw_perf_event_destroy(struct perf_event *event)
4038{
4039 u64 event_id = event->attr.config;
4040
4041 WARN_ON(event->parent);
4042
4043 atomic_dec(&perf_swevent_enabled[event_id]);
4044}
4045
4046static const struct pmu *sw_perf_event_init(struct perf_event *event)
4047{
4048 const struct pmu *pmu = NULL;
4049 u64 event_id = event->attr.config;
4050
4051 /*
4052 * Software events (currently) can't in general distinguish
4053 * between user, kernel and hypervisor events.
4054 * However, context switches and cpu migrations are considered
4055 * to be kernel events, and page faults are never hypervisor
4056 * events.
4057 */
4058 switch (event_id) {
4059 case PERF_COUNT_SW_CPU_CLOCK:
4060 pmu = &perf_ops_cpu_clock;
4061
4062 break;
4063 case PERF_COUNT_SW_TASK_CLOCK:
4064 /*
4065 * If the user instantiates this as a per-cpu event,
4066 * use the cpu_clock event instead.
4067 */
4068 if (event->ctx->task)
4069 pmu = &perf_ops_task_clock;
4070 else
4071 pmu = &perf_ops_cpu_clock;
4072
4073 break;
4074 case PERF_COUNT_SW_PAGE_FAULTS:
4075 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4076 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4077 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4078 case PERF_COUNT_SW_CPU_MIGRATIONS:
4079 if (!event->parent) {
4080 atomic_inc(&perf_swevent_enabled[event_id]);
4081 event->destroy = sw_perf_event_destroy;
4082 }
4083 pmu = &perf_ops_generic;
4084 break;
4085 }
4086
4087 return pmu;
4088}
4089
4090/*
4091 * Allocate and initialize a event structure
4092 */
4093static struct perf_event *
4094perf_event_alloc(struct perf_event_attr *attr,
4095 int cpu,
4096 struct perf_event_context *ctx,
4097 struct perf_event *group_leader,
4098 struct perf_event *parent_event,
4099 gfp_t gfpflags)
4100{
4101 const struct pmu *pmu;
4102 struct perf_event *event;
4103 struct hw_perf_event *hwc;
4104 long err;
4105
4106 event = kzalloc(sizeof(*event), gfpflags);
4107 if (!event)
4108 return ERR_PTR(-ENOMEM);
4109
4110 /*
4111 * Single events are their own group leaders, with an
4112 * empty sibling list:
4113 */
4114 if (!group_leader)
4115 group_leader = event;
4116
4117 mutex_init(&event->child_mutex);
4118 INIT_LIST_HEAD(&event->child_list);
4119
4120 INIT_LIST_HEAD(&event->group_entry);
4121 INIT_LIST_HEAD(&event->event_entry);
4122 INIT_LIST_HEAD(&event->sibling_list);
4123 init_waitqueue_head(&event->waitq);
4124
4125 mutex_init(&event->mmap_mutex);
4126
4127 event->cpu = cpu;
4128 event->attr = *attr;
4129 event->group_leader = group_leader;
4130 event->pmu = NULL;
4131 event->ctx = ctx;
4132 event->oncpu = -1;
4133
4134 event->parent = parent_event;
4135
4136 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4137 event->id = atomic64_inc_return(&perf_event_id);
4138
4139 event->state = PERF_EVENT_STATE_INACTIVE;
4140
4141 if (attr->disabled)
4142 event->state = PERF_EVENT_STATE_OFF;
4143
4144 pmu = NULL;
4145
4146 hwc = &event->hw;
4147 hwc->sample_period = attr->sample_period;
4148 if (attr->freq && attr->sample_freq)
4149 hwc->sample_period = 1;
4150 hwc->last_period = hwc->sample_period;
4151
4152 atomic64_set(&hwc->period_left, hwc->sample_period);
4153
4154 /*
4155 * we currently do not support PERF_FORMAT_GROUP on inherited events
4156 */
4157 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4158 goto done;
4159
4160 switch (attr->type) {
4161 case PERF_TYPE_RAW:
4162 case PERF_TYPE_HARDWARE:
4163 case PERF_TYPE_HW_CACHE:
4164 pmu = hw_perf_event_init(event);
4165 break;
4166
4167 case PERF_TYPE_SOFTWARE:
4168 pmu = sw_perf_event_init(event);
4169 break;
4170
4171 case PERF_TYPE_TRACEPOINT:
4172 pmu = tp_perf_event_init(event);
4173 break;
4174
4175 default:
4176 break;
4177 }
4178done:
4179 err = 0;
4180 if (!pmu)
4181 err = -EINVAL;
4182 else if (IS_ERR(pmu))
4183 err = PTR_ERR(pmu);
4184
4185 if (err) {
4186 if (event->ns)
4187 put_pid_ns(event->ns);
4188 kfree(event);
4189 return ERR_PTR(err);
4190 }
4191
4192 event->pmu = pmu;
4193
4194 if (!event->parent) {
4195 atomic_inc(&nr_events);
4196 if (event->attr.mmap)
4197 atomic_inc(&nr_mmap_events);
4198 if (event->attr.comm)
4199 atomic_inc(&nr_comm_events);
4200 if (event->attr.task)
4201 atomic_inc(&nr_task_events);
4202 }
4203
4204 return event;
4205}
4206
4207static int perf_copy_attr(struct perf_event_attr __user *uattr,
4208 struct perf_event_attr *attr)
4209{
4210 u32 size;
4211 int ret;
4212
4213 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4214 return -EFAULT;
4215
4216 /*
4217 * zero the full structure, so that a short copy will be nice.
4218 */
4219 memset(attr, 0, sizeof(*attr));
4220
4221 ret = get_user(size, &uattr->size);
4222 if (ret)
4223 return ret;
4224
4225 if (size > PAGE_SIZE) /* silly large */
4226 goto err_size;
4227
4228 if (!size) /* abi compat */
4229 size = PERF_ATTR_SIZE_VER0;
4230
4231 if (size < PERF_ATTR_SIZE_VER0)
4232 goto err_size;
4233
4234 /*
4235 * If we're handed a bigger struct than we know of,
4236 * ensure all the unknown bits are 0 - i.e. new
4237 * user-space does not rely on any kernel feature
4238 * extensions we dont know about yet.
4239 */
4240 if (size > sizeof(*attr)) {
4241 unsigned char __user *addr;
4242 unsigned char __user *end;
4243 unsigned char val;
4244
4245 addr = (void __user *)uattr + sizeof(*attr);
4246 end = (void __user *)uattr + size;
4247
4248 for (; addr < end; addr++) {
4249 ret = get_user(val, addr);
4250 if (ret)
4251 return ret;
4252 if (val)
4253 goto err_size;
4254 }
4255 size = sizeof(*attr);
4256 }
4257
4258 ret = copy_from_user(attr, uattr, size);
4259 if (ret)
4260 return -EFAULT;
4261
4262 /*
4263 * If the type exists, the corresponding creation will verify
4264 * the attr->config.
4265 */
4266 if (attr->type >= PERF_TYPE_MAX)
4267 return -EINVAL;
4268
4269 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4270 return -EINVAL;
4271
4272 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4273 return -EINVAL;
4274
4275 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4276 return -EINVAL;
4277
4278out:
4279 return ret;
4280
4281err_size:
4282 put_user(sizeof(*attr), &uattr->size);
4283 ret = -E2BIG;
4284 goto out;
4285}
4286
4287int perf_event_set_output(struct perf_event *event, int output_fd)
4288{
4289 struct perf_event *output_event = NULL;
4290 struct file *output_file = NULL;
4291 struct perf_event *old_output;
4292 int fput_needed = 0;
4293 int ret = -EINVAL;
4294
4295 if (!output_fd)
4296 goto set;
4297
4298 output_file = fget_light(output_fd, &fput_needed);
4299 if (!output_file)
4300 return -EBADF;
4301
4302 if (output_file->f_op != &perf_fops)
4303 goto out;
4304
4305 output_event = output_file->private_data;
4306
4307 /* Don't chain output fds */
4308 if (output_event->output)
4309 goto out;
4310
4311 /* Don't set an output fd when we already have an output channel */
4312 if (event->data)
4313 goto out;
4314
4315 atomic_long_inc(&output_file->f_count);
4316
4317set:
4318 mutex_lock(&event->mmap_mutex);
4319 old_output = event->output;
4320 rcu_assign_pointer(event->output, output_event);
4321 mutex_unlock(&event->mmap_mutex);
4322
4323 if (old_output) {
4324 /*
4325 * we need to make sure no existing perf_output_*()
4326 * is still referencing this event.
4327 */
4328 synchronize_rcu();
4329 fput(old_output->filp);
4330 }
4331
4332 ret = 0;
4333out:
4334 fput_light(output_file, fput_needed);
4335 return ret;
4336}
4337
4338/**
4339 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4340 *
4341 * @attr_uptr: event_id type attributes for monitoring/sampling
4342 * @pid: target pid
4343 * @cpu: target cpu
4344 * @group_fd: group leader event fd
 * @flags: only PERF_FLAG_FD_NO_GROUP and PERF_FLAG_FD_OUTPUT are accepted,
 *         any other bit is rejected with -EINVAL
 *
 * Returns the file descriptor obtained from anon_inode_getfd() on
 * success, or a negative errno.
4345 */
4346SYSCALL_DEFINE5(perf_event_open,
4347 struct perf_event_attr __user *, attr_uptr,
4348 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4349{
4350 struct perf_event *event, *group_leader;
4351 struct perf_event_attr attr;
4352 struct perf_event_context *ctx;
4353 struct file *event_file = NULL;
4354 struct file *group_file = NULL;
4355 int fput_needed = 0;
4356 int fput_needed2 = 0;
4357 int err;
4358
4359 /* for future expandability... */
4360 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4361 return -EINVAL;
4362
4363 err = perf_copy_attr(attr_uptr, &attr);
4364 if (err)
4365 return err;
4366
/* Events that do not exclude the kernel may need CAP_SYS_ADMIN. */
4367 if (!attr.exclude_kernel) {
4368 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4369 return -EACCES;
4370 }
4371
/* A requested frequency must not exceed the sysctl limit. */
4372 if (attr.freq) {
4373 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4374 return -EINVAL;
4375 }
4376
4377 /*
4378 * Get the target context (task or percpu):
4379 */
4380 ctx = find_get_context(pid, cpu);
4381 if (IS_ERR(ctx))
4382 return PTR_ERR(ctx);
4383
4384 /*
4385 * Look up the group leader (we will attach this event to it):
4386 */
4387 group_leader = NULL;
4388 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
/* Every failure inside this block reports -EINVAL. */
4389 err = -EINVAL;
4390 group_file = fget_light(group_fd, &fput_needed);
4391 if (!group_file)
4392 goto err_put_context;
4393 if (group_file->f_op != &perf_fops)
4394 goto err_put_context;
4395
4396 group_leader = group_file->private_data;
4397 /*
4398 * Do not allow a recursive hierarchy (this new sibling
4399 * becoming part of another group-sibling):
4400 */
4401 if (group_leader->group_leader != group_leader)
4402 goto err_put_context;
4403 /*
4404 * Do not allow to attach to a group in a different
4405 * task or CPU context:
4406 */
4407 if (group_leader->ctx != ctx)
4408 goto err_put_context;
4409 /*
4410 * Only a group leader can be exclusive or pinned
4411 */
4412 if (attr.exclusive || attr.pinned)
4413 goto err_put_context;
4414 }
4415
4416 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4417 NULL, GFP_KERNEL);
4418 err = PTR_ERR(event);
4419 if (IS_ERR(event))
4420 goto err_put_context;
4421
/* From here on err carries the new fd (or a negative errno). */
4422 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4423 if (err < 0)
4424 goto err_free_put_context;
4425
4426 event_file = fget_light(err, &fput_needed2);
/*
 * NOTE(review): err still holds the non-negative fd on this path, so the
 * err < 0 cleanups below are skipped and the fd is returned although the
 * event was never installed in the context - confirm this is intended.
 */
4427 if (!event_file)
4428 goto err_free_put_context;
4429
4430 if (flags & PERF_FLAG_FD_OUTPUT) {
4431 err = perf_event_set_output(event, group_fd);
4432 if (err)
4433 goto err_fput_free_put_context;
4434 }
4435
4436 event->filp = event_file;
4437 WARN_ON_ONCE(ctx->parent_ctx);
4438 mutex_lock(&ctx->mutex);
4439 perf_install_in_context(ctx, event, cpu);
4440 ++ctx->generation;
4441 mutex_unlock(&ctx->mutex);
4442
/* Remember the creating task and link the event into its list. */
4443 event->owner = current;
4444 get_task_struct(current);
4445 mutex_lock(&current->perf_event_mutex);
4446 list_add_tail(&event->owner_entry, &current->perf_event_list);
4447 mutex_unlock(&current->perf_event_mutex);
4448
/*
 * The success path falls through the labels below as well; the err < 0
 * checks keep the error-only cleanups from running in that case.
 */
4449err_fput_free_put_context:
4450 fput_light(event_file, fput_needed2);
4451
4452err_free_put_context:
/*
 * NOTE(review): when this is reached after perf_event_set_output() failed,
 * the fd created by anon_inode_getfd() has not been closed, so the event is
 * freed while that file presumably still refers to it - verify.
 * NOTE(review): kfree() also does not undo what perf_event_alloc() set up
 * (pid namespace reference, event->destroy hook, nr_*_events counters).
 */
4453 if (err < 0)
4454 kfree(event);
4455
4456err_put_context:
4457 if (err < 0)
4458 put_ctx(ctx);
4459
4460 fput_light(group_file, fput_needed);
4461
4462 return err;
4463}
4464
4465/*
4466 * inherit a event from parent task to child task:
4467 */
4468static struct perf_event *
4469inherit_event(struct perf_event *parent_event,
4470 struct task_struct *parent,
4471 struct perf_event_context *parent_ctx,
4472 struct task_struct *child,
4473 struct perf_event *group_leader,
4474 struct perf_event_context *child_ctx)
4475{
4476 struct perf_event *child_event;
4477
4478 /*
4479 * Instead of creating recursive hierarchies of events,
4480 * we link inherited events back to the original parent,
4481 * which has a filp for sure, which we use as the reference
4482 * count:
4483 */
4484 if (parent_event->parent)
4485 parent_event = parent_event->parent;
4486
4487 child_event = perf_event_alloc(&parent_event->attr,
4488 parent_event->cpu, child_ctx,
4489 group_leader, parent_event,
4490 GFP_KERNEL);
4491 if (IS_ERR(child_event))
4492 return child_event;
4493 get_ctx(child_ctx);
4494
4495 /*
4496 * Make the child state follow the state of the parent event,
4497 * not its attr.disabled bit. We hold the parent's mutex,
4498 * so we won't race with perf_event_{en, dis}able_family.
4499 */
4500 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4501 child_event->state = PERF_EVENT_STATE_INACTIVE;
4502 else
4503 child_event->state = PERF_EVENT_STATE_OFF;
4504
4505 if (parent_event->attr.freq)
4506 child_event->hw.sample_period = parent_event->hw.sample_period;
4507
4508 /*
4509 * Link it up in the child's context:
4510 */
4511 add_event_to_ctx(child_event, child_ctx);
4512
4513 /*
4514 * Get a reference to the parent filp - we will fput it
4515 * when the child event exits. This is safe to do because
4516 * we are in the parent and we know that the filp still
4517 * exists and has a nonzero count:
4518 */
4519 atomic_long_inc(&parent_event->filp->f_count);
4520
4521 /*
4522 * Link this into the parent event's child list
4523 */
4524 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4525 mutex_lock(&parent_event->child_mutex);
4526 list_add_tail(&child_event->child_list, &parent_event->child_list);
4527 mutex_unlock(&parent_event->child_mutex);
4528
4529 return child_event;
4530}
4531
4532static int inherit_group(struct perf_event *parent_event,
4533 struct task_struct *parent,
4534 struct perf_event_context *parent_ctx,
4535 struct task_struct *child,
4536 struct perf_event_context *child_ctx)
4537{
4538 struct perf_event *leader;
4539 struct perf_event *sub;
4540 struct perf_event *child_ctr;
4541
4542 leader = inherit_event(parent_event, parent, parent_ctx,
4543 child, NULL, child_ctx);
4544 if (IS_ERR(leader))
4545 return PTR_ERR(leader);
4546 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4547 child_ctr = inherit_event(sub, parent, parent_ctx,
4548 child, leader, child_ctx);
4549 if (IS_ERR(child_ctr))
4550 return PTR_ERR(child_ctr);
4551 }
4552 return 0;
4553}
4554
4555static void sync_child_event(struct perf_event *child_event,
4556 struct task_struct *child)
4557{
4558 struct perf_event *parent_event = child_event->parent;
4559 u64 child_val;
4560
4561 if (child_event->attr.inherit_stat)
4562 perf_event_read_event(child_event, child);
4563
4564 child_val = atomic64_read(&child_event->count);
4565
4566 /*
4567 * Add back the child's count to the parent's count:
4568 */
4569 atomic64_add(child_val, &parent_event->count);
4570 atomic64_add(child_event->total_time_enabled,
4571 &parent_event->child_total_time_enabled);
4572 atomic64_add(child_event->total_time_running,
4573 &parent_event->child_total_time_running);
4574
4575 /*
4576 * Remove this event from the parent's list
4577 */
4578 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4579 mutex_lock(&parent_event->child_mutex);
4580 list_del_init(&child_event->child_list);
4581 mutex_unlock(&parent_event->child_mutex);
4582
4583 /*
4584 * Release the parent event, if this was the last
4585 * reference to it.
4586 */
4587 fput(parent_event->filp);
4588}
4589
4590static void
4591__perf_event_exit_task(struct perf_event *child_event,
4592 struct perf_event_context *child_ctx,
4593 struct task_struct *child)
4594{
4595 struct perf_event *parent_event;
4596
4597 update_event_times(child_event);
4598 perf_event_remove_from_context(child_event);
4599
4600 parent_event = child_event->parent;
4601 /*
4602 * It can happen that parent exits first, and has events
4603 * that are still around due to the child reference. These
4604 * events need to be zapped - but otherwise linger.
4605 */
4606 if (parent_event) {
4607 sync_child_event(child_event, child);
4608 free_event(child_event);
4609 }
4610}
4611
4612/*
4613 * When a child task exits, feed back event values to parent events.
4614 */
4615void perf_event_exit_task(struct task_struct *child)
4616{
4617 struct perf_event *child_event, *tmp;
4618 struct perf_event_context *child_ctx;
4619 unsigned long flags;
4620
4621 if (likely(!child->perf_event_ctxp)) {
4622 perf_event_task(child, NULL, 0);
4623 return;
4624 }
4625
4626 local_irq_save(flags);
4627 /*
4628 * We can't reschedule here because interrupts are disabled,
4629 * and either child is current or it is a task that can't be
4630 * scheduled, so we are now safe from rescheduling changing
4631 * our context.
4632 */
4633 child_ctx = child->perf_event_ctxp;
4634 __perf_event_task_sched_out(child_ctx);
4635
4636 /*
4637 * Take the context lock here so that if find_get_context is
4638 * reading child->perf_event_ctxp, we wait until it has
4639 * incremented the context's refcount before we do put_ctx below.
4640 */
4641 spin_lock(&child_ctx->lock);
4642 child->perf_event_ctxp = NULL;
4643 /*
4644 * If this context is a clone; unclone it so it can't get
4645 * swapped to another process while we're removing all
4646 * the events from it.
4647 */
4648 unclone_ctx(child_ctx);
4649 spin_unlock_irqrestore(&child_ctx->lock, flags);
4650
4651 /*
4652 * Report the task dead after unscheduling the events so that we
4653 * won't get any samples after PERF_RECORD_EXIT. We can however still
4654 * get a few PERF_RECORD_READ events.
4655 */
4656 perf_event_task(child, child_ctx, 0);
4657
4658 /*
4659 * We can recurse on the same lock type through:
4660 *
4661 * __perf_event_exit_task()
4662 * sync_child_event()
4663 * fput(parent_event->filp)
4664 * perf_release()
4665 * mutex_lock(&ctx->mutex)
4666 *
4667 * But since its the parent context it won't be the same instance.
4668 */
4669 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4670
4671again:
4672 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4673 group_entry)
4674 __perf_event_exit_task(child_event, child_ctx, child);
4675
4676 /*
4677 * If the last event was a group event, it will have appended all
4678 * its siblings to the list, but we obtained 'tmp' before that which
4679 * will still point to the list head terminating the iteration.
4680 */
4681 if (!list_empty(&child_ctx->group_list))
4682 goto again;
4683
4684 mutex_unlock(&child_ctx->mutex);
4685
4686 put_ctx(child_ctx);
4687}
4688
4689/*
4690 * free an unexposed, unused context as created by inheritance by
4691 * init_task below, used by fork() in case of fail.
4692 */
4693void perf_event_free_task(struct task_struct *task)
4694{
4695 struct perf_event_context *ctx = task->perf_event_ctxp;
4696 struct perf_event *event, *tmp;
4697
4698 if (!ctx)
4699 return;
4700
4701 mutex_lock(&ctx->mutex);
4702again:
4703 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4704 struct perf_event *parent = event->parent;
4705
4706 if (WARN_ON_ONCE(!parent))
4707 continue;
4708
4709 mutex_lock(&parent->child_mutex);
4710 list_del_init(&event->child_list);
4711 mutex_unlock(&parent->child_mutex);
4712
4713 fput(parent->filp);
4714
4715 list_del_event(event, ctx);
4716 free_event(event);
4717 }
4718
4719 if (!list_empty(&ctx->group_list))
4720 goto again;
4721
4722 mutex_unlock(&ctx->mutex);
4723
4724 put_ctx(ctx);
4725}
4726
4727/*
4728 * Initialize the perf_event context in task_struct
4729 */
4730int perf_event_init_task(struct task_struct *child)
4731{
4732 struct perf_event_context *child_ctx, *parent_ctx;
4733 struct perf_event_context *cloned_ctx;
4734 struct perf_event *event;
4735 struct task_struct *parent = current;
4736 int inherited_all = 1;
4737 int ret = 0;
4738
4739 child->perf_event_ctxp = NULL;
4740
4741 mutex_init(&child->perf_event_mutex);
4742 INIT_LIST_HEAD(&child->perf_event_list);
4743
4744 if (likely(!parent->perf_event_ctxp))
4745 return 0;
4746
4747 /*
4748 * This is executed from the parent task context, so inherit
4749 * events that have been marked for cloning.
4750 * First allocate and initialize a context for the child.
4751 */
4752
4753 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4754 if (!child_ctx)
4755 return -ENOMEM;
4756
4757 __perf_event_init_context(child_ctx, child);
4758 child->perf_event_ctxp = child_ctx;
4759 get_task_struct(child);
4760
4761 /*
4762 * If the parent's context is a clone, pin it so it won't get
4763 * swapped under us.
4764 */
4765 parent_ctx = perf_pin_task_context(parent);
4766
4767 /*
4768 * No need to check if parent_ctx != NULL here; since we saw
4769 * it non-NULL earlier, the only reason for it to become NULL
4770 * is if we exit, and since we're currently in the middle of
4771 * a fork we can't be exiting at the same time.
4772 */
4773
4774 /*
4775 * Lock the parent list. No need to lock the child - not PID
4776 * hashed yet and not running, so nobody can access it.
4777 */
4778 mutex_lock(&parent_ctx->mutex);
4779
4780 /*
4781 * We dont have to disable NMIs - we are only looking at
4782 * the list, not manipulating it:
4783 */
4784 list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
4785 if (event != event->group_leader)
4786 continue;
4787
4788 if (!event->attr.inherit) {
4789 inherited_all = 0;
4790 continue;
4791 }
4792
4793 ret = inherit_group(event, parent, parent_ctx,
4794 child, child_ctx);
4795 if (ret) {
4796 inherited_all = 0;
4797 break;
4798 }
4799 }
4800
4801 if (inherited_all) {
4802 /*
4803 * Mark the child context as a clone of the parent
4804 * context, or of whatever the parent is a clone of.
4805 * Note that if the parent is a clone, it could get
4806 * uncloned at any point, but that doesn't matter
4807 * because the list of events and the generation
4808 * count can't have changed since we took the mutex.
4809 */
4810 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4811 if (cloned_ctx) {
4812 child_ctx->parent_ctx = cloned_ctx;
4813 child_ctx->parent_gen = parent_ctx->parent_gen;
4814 } else {
4815 child_ctx->parent_ctx = parent_ctx;
4816 child_ctx->parent_gen = parent_ctx->generation;
4817 }
4818 get_ctx(child_ctx->parent_ctx);
4819 }
4820
4821 mutex_unlock(&parent_ctx->mutex);
4822
4823 perf_unpin_context(parent_ctx);
4824
4825 return ret;
4826}
4827
4828static void __cpuinit perf_event_init_cpu(int cpu)
4829{
4830 struct perf_cpu_context *cpuctx;
4831
4832 cpuctx = &per_cpu(perf_cpu_context, cpu);
4833 __perf_event_init_context(&cpuctx->ctx, NULL);
4834
4835 spin_lock(&perf_resource_lock);
4836 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4837 spin_unlock(&perf_resource_lock);
4838
4839 hw_perf_event_setup(cpu);
4840}
4841
4842#ifdef CONFIG_HOTPLUG_CPU
4843static void __perf_event_exit_cpu(void *info)
4844{
4845 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4846 struct perf_event_context *ctx = &cpuctx->ctx;
4847 struct perf_event *event, *tmp;
4848
4849 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
4850 __perf_event_remove_from_context(event);
4851}
4852static void perf_event_exit_cpu(int cpu)
4853{
4854 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4855 struct perf_event_context *ctx = &cpuctx->ctx;
4856
4857 mutex_lock(&ctx->mutex);
4858 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
4859 mutex_unlock(&ctx->mutex);
4860}
4861#else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_event_exit_cpu(int cpu)
{
}
4863#endif
4864
4865static int __cpuinit
4866perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4867{
4868 unsigned int cpu = (long)hcpu;
4869
4870 switch (action) {
4871
4872 case CPU_UP_PREPARE:
4873 case CPU_UP_PREPARE_FROZEN:
4874 perf_event_init_cpu(cpu);
4875 break;
4876
4877 case CPU_ONLINE:
4878 case CPU_ONLINE_FROZEN:
4879 hw_perf_event_setup_online(cpu);
4880 break;
4881
4882 case CPU_DOWN_PREPARE:
4883 case CPU_DOWN_PREPARE_FROZEN:
4884 perf_event_exit_cpu(cpu);
4885 break;
4886
4887 default:
4888 break;
4889 }
4890
4891 return NOTIFY_OK;
4892}
4893
4894/*
4895 * This has to have a higher priority than migration_notifier in sched.c.
4896 */
4897static struct notifier_block __cpuinitdata perf_cpu_nb = {
4898 .notifier_call = perf_cpu_notify,
4899 .priority = 20,
4900};
4901
4902void __init perf_event_init(void)
4903{
4904 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4905 (void *)(long)smp_processor_id());
4906 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4907 (void *)(long)smp_processor_id());
4908 register_cpu_notifier(&perf_cpu_nb);
4909}
4910
4911static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4912{
4913 return sprintf(buf, "%d\n", perf_reserved_percpu);
4914}
4915
4916static ssize_t
4917perf_set_reserve_percpu(struct sysdev_class *class,
4918 const char *buf,
4919 size_t count)
4920{
4921 struct perf_cpu_context *cpuctx;
4922 unsigned long val;
4923 int err, cpu, mpt;
4924
4925 err = strict_strtoul(buf, 10, &val);
4926 if (err)
4927 return err;
4928 if (val > perf_max_events)
4929 return -EINVAL;
4930
4931 spin_lock(&perf_resource_lock);
4932 perf_reserved_percpu = val;
4933 for_each_online_cpu(cpu) {
4934 cpuctx = &per_cpu(perf_cpu_context, cpu);
4935 spin_lock_irq(&cpuctx->ctx.lock);
4936 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
4937 perf_max_events - perf_reserved_percpu);
4938 cpuctx->max_pertask = mpt;
4939 spin_unlock_irq(&cpuctx->ctx.lock);
4940 }
4941 spin_unlock(&perf_resource_lock);
4942
4943 return count;
4944}
4945
4946static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4947{
4948 return sprintf(buf, "%d\n", perf_overcommit);
4949}
4950
4951static ssize_t
4952perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4953{
4954 unsigned long val;
4955 int err;
4956
4957 err = strict_strtoul(buf, 10, &val);
4958 if (err)
4959 return err;
4960 if (val > 1)
4961 return -EINVAL;
4962
4963 spin_lock(&perf_resource_lock);
4964 perf_overcommit = val;
4965 spin_unlock(&perf_resource_lock);
4966
4967 return count;
4968}
4969
4970static SYSDEV_CLASS_ATTR(
4971 reserve_percpu,
4972 0644,
4973 perf_show_reserve_percpu,
4974 perf_set_reserve_percpu
4975 );
4976
4977static SYSDEV_CLASS_ATTR(
4978 overcommit,
4979 0644,
4980 perf_show_overcommit,
4981 perf_set_overcommit
4982 );
4983
4984static struct attribute *perfclass_attrs[] = {
4985 &attr_reserve_percpu.attr,
4986 &attr_overcommit.attr,
4987 NULL
4988};
4989
4990static struct attribute_group perfclass_attr_group = {
4991 .attrs = perfclass_attrs,
4992 .name = "perf_events",
4993};
4994
4995static int __init perf_event_sysfs_init(void)
4996{
4997 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4998 &perfclass_attr_group);
4999}
5000device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
40#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
42static struct hlist_head *pid_hash; 42static struct hlist_head *pid_hash;
43static int pidhash_shift; 43static unsigned int pidhash_shift = 4;
44struct pid init_struct_pid = INIT_STRUCT_PID; 44struct pid init_struct_pid = INIT_STRUCT_PID;
45 45
46int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
499void __init pidhash_init(void) 499void __init pidhash_init(void)
500{ 500{
501 int i, pidhash_size; 501 int i, pidhash_size;
502 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
503 502
504 pidhash_shift = max(4, fls(megabytes * 4)); 503 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
505 pidhash_shift = min(12, pidhash_shift); 504 HASH_EARLY | HASH_SMALL,
505 &pidhash_shift, NULL, 4096);
506 pidhash_size = 1 << pidhash_shift; 506 pidhash_size = 1 << pidhash_shift;
507 507
508 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
509 pidhash_size, pidhash_shift,
510 pidhash_size * sizeof(struct hlist_head));
511
512 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
513 if (!pid_hash)
514 panic("Could not alloc pidhash!\n");
515 for (i = 0; i < pidhash_size; i++) 508 for (i = 0; i < pidhash_size; i++)
516 INIT_HLIST_HEAD(&pid_hash[i]); 509 INIT_HLIST_HEAD(&pid_hash[i]);
517} 510}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 821722ae58a7..86b3796b0436 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old
118{ 118{
119 if (!(flags & CLONE_NEWPID)) 119 if (!(flags & CLONE_NEWPID))
120 return get_pid_ns(old_ns); 120 return get_pid_ns(old_ns);
121 if (flags & CLONE_THREAD) 121 if (flags & (CLONE_THREAD|CLONE_PARENT))
122 return ERR_PTR(-EINVAL); 122 return ERR_PTR(-EINVAL);
123 return create_pid_namespace(old_ns); 123 return create_pid_namespace(old_ns);
124} 124}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e33a21cb9407..5c9dc228747b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,17 +8,18 @@
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
11 12
12/* 13/*
13 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
14 */ 15 */
15void update_rlimit_cpu(unsigned long rlim_new) 16void update_rlimit_cpu(unsigned long rlim_new)
16{ 17{
17 cputime_t cputime; 18 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
18 20
19 cputime = secs_to_cputime(rlim_new); 21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
542 now); 543 now);
543} 544}
544 545
546static inline int expires_gt(cputime_t expires, cputime_t new_exp)
547{
548 return cputime_eq(expires, cputime_zero) ||
549 cputime_gt(expires, new_exp);
550}
551
552static inline int expires_le(cputime_t expires, cputime_t new_exp)
553{
554 return !cputime_eq(expires, cputime_zero) &&
555 cputime_le(expires, new_exp);
556}
545/* 557/*
546 * Insert the timer on the appropriate list before any timers that 558 * Insert the timer on the appropriate list before any timers that
547 * expire later. This must be called with the tasklist_lock held 559 * expire later. This must be called with the tasklist_lock held
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
586 */ 598 */
587 599
588 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 600 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
601 union cpu_time_count *exp = &nt->expires;
602
589 switch (CPUCLOCK_WHICH(timer->it_clock)) { 603 switch (CPUCLOCK_WHICH(timer->it_clock)) {
590 default: 604 default:
591 BUG(); 605 BUG();
592 case CPUCLOCK_PROF: 606 case CPUCLOCK_PROF:
593 if (cputime_eq(p->cputime_expires.prof_exp, 607 if (expires_gt(p->cputime_expires.prof_exp,
594 cputime_zero) || 608 exp->cpu))
595 cputime_gt(p->cputime_expires.prof_exp, 609 p->cputime_expires.prof_exp = exp->cpu;
596 nt->expires.cpu))
597 p->cputime_expires.prof_exp =
598 nt->expires.cpu;
599 break; 610 break;
600 case CPUCLOCK_VIRT: 611 case CPUCLOCK_VIRT:
601 if (cputime_eq(p->cputime_expires.virt_exp, 612 if (expires_gt(p->cputime_expires.virt_exp,
602 cputime_zero) || 613 exp->cpu))
603 cputime_gt(p->cputime_expires.virt_exp, 614 p->cputime_expires.virt_exp = exp->cpu;
604 nt->expires.cpu))
605 p->cputime_expires.virt_exp =
606 nt->expires.cpu;
607 break; 615 break;
608 case CPUCLOCK_SCHED: 616 case CPUCLOCK_SCHED:
609 if (p->cputime_expires.sched_exp == 0 || 617 if (p->cputime_expires.sched_exp == 0 ||
610 p->cputime_expires.sched_exp > 618 p->cputime_expires.sched_exp > exp->sched)
611 nt->expires.sched)
612 p->cputime_expires.sched_exp = 619 p->cputime_expires.sched_exp =
613 nt->expires.sched; 620 exp->sched;
614 break; 621 break;
615 } 622 }
616 } else { 623 } else {
624 struct signal_struct *const sig = p->signal;
625 union cpu_time_count *exp = &timer->it.cpu.expires;
626
617 /* 627 /*
618 * For a process timer, set the cached expiration time. 628 * For a process timer, set the cached expiration time.
619 */ 629 */
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
621 default: 631 default:
622 BUG(); 632 BUG();
623 case CPUCLOCK_VIRT: 633 case CPUCLOCK_VIRT:
624 if (!cputime_eq(p->signal->it_virt_expires, 634 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
625 cputime_zero) && 635 exp->cpu))
626 cputime_lt(p->signal->it_virt_expires,
627 timer->it.cpu.expires.cpu))
628 break; 636 break;
629 p->signal->cputime_expires.virt_exp = 637 sig->cputime_expires.virt_exp = exp->cpu;
630 timer->it.cpu.expires.cpu;
631 break; 638 break;
632 case CPUCLOCK_PROF: 639 case CPUCLOCK_PROF:
633 if (!cputime_eq(p->signal->it_prof_expires, 640 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
634 cputime_zero) && 641 exp->cpu))
635 cputime_lt(p->signal->it_prof_expires,
636 timer->it.cpu.expires.cpu))
637 break; 642 break;
638 i = p->signal->rlim[RLIMIT_CPU].rlim_cur; 643 i = sig->rlim[RLIMIT_CPU].rlim_cur;
639 if (i != RLIM_INFINITY && 644 if (i != RLIM_INFINITY &&
640 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 645 i <= cputime_to_secs(exp->cpu))
641 break; 646 break;
642 p->signal->cputime_expires.prof_exp = 647 sig->cputime_expires.prof_exp = exp->cpu;
643 timer->it.cpu.expires.cpu;
644 break; 648 break;
645 case CPUCLOCK_SCHED: 649 case CPUCLOCK_SCHED:
646 p->signal->cputime_expires.sched_exp = 650 sig->cputime_expires.sched_exp = exp->sched;
647 timer->it.cpu.expires.sched;
648 break; 651 break;
649 } 652 }
650 } 653 }
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1072} 1075}
1073 1076
1077static u32 onecputick;
1078
1079static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1080 cputime_t *expires, cputime_t cur_time, int signo)
1081{
1082 if (cputime_eq(it->expires, cputime_zero))
1083 return;
1084
1085 if (cputime_ge(cur_time, it->expires)) {
1086 if (!cputime_eq(it->incr, cputime_zero)) {
1087 it->expires = cputime_add(it->expires, it->incr);
1088 it->error += it->incr_error;
1089 if (it->error >= onecputick) {
1090 it->expires = cputime_sub(it->expires,
1091 cputime_one_jiffy);
1092 it->error -= onecputick;
1093 }
1094 } else {
1095 it->expires = cputime_zero;
1096 }
1097
1098 trace_itimer_expire(signo == SIGPROF ?
1099 ITIMER_PROF : ITIMER_VIRTUAL,
1100 tsk->signal->leader_pid, cur_time);
1101 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1102 }
1103
1104 if (!cputime_eq(it->expires, cputime_zero) &&
1105 (cputime_eq(*expires, cputime_zero) ||
1106 cputime_lt(it->expires, *expires))) {
1107 *expires = it->expires;
1108 }
1109}
1110
1074/* 1111/*
1075 * Check for any per-thread CPU timers that have fired and move them 1112 * Check for any per-thread CPU timers that have fired and move them
1076 * off the tsk->*_timers list onto the firing list. Per-thread timers 1113 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
1090 * Don't sample the current process CPU clocks if there are no timers. 1127 * Don't sample the current process CPU clocks if there are no timers.
1091 */ 1128 */
1092 if (list_empty(&timers[CPUCLOCK_PROF]) && 1129 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1093 cputime_eq(sig->it_prof_expires, cputime_zero) && 1130 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1094 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1131 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1095 list_empty(&timers[CPUCLOCK_VIRT]) && 1132 list_empty(&timers[CPUCLOCK_VIRT]) &&
1096 cputime_eq(sig->it_virt_expires, cputime_zero) && 1133 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1097 list_empty(&timers[CPUCLOCK_SCHED])) { 1134 list_empty(&timers[CPUCLOCK_SCHED])) {
1098 stop_process_timers(tsk); 1135 stop_process_timers(tsk);
1099 return; 1136 return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
1153 /* 1190 /*
1154 * Check for the special case process timers. 1191 * Check for the special case process timers.
1155 */ 1192 */
1156 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1193 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1157 if (cputime_ge(ptime, sig->it_prof_expires)) { 1194 SIGPROF);
1158 /* ITIMER_PROF fires and reloads. */ 1195 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1159 sig->it_prof_expires = sig->it_prof_incr; 1196 SIGVTALRM);
1160 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1197
1161 sig->it_prof_expires = cputime_add(
1162 sig->it_prof_expires, ptime);
1163 }
1164 __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
1165 }
1166 if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
1167 (cputime_eq(prof_expires, cputime_zero) ||
1168 cputime_lt(sig->it_prof_expires, prof_expires))) {
1169 prof_expires = sig->it_prof_expires;
1170 }
1171 }
1172 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1173 if (cputime_ge(utime, sig->it_virt_expires)) {
1174 /* ITIMER_VIRTUAL fires and reloads. */
1175 sig->it_virt_expires = sig->it_virt_incr;
1176 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1177 sig->it_virt_expires = cputime_add(
1178 sig->it_virt_expires, utime);
1179 }
1180 __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
1181 }
1182 if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
1183 (cputime_eq(virt_expires, cputime_zero) ||
1184 cputime_lt(sig->it_virt_expires, virt_expires))) {
1185 virt_expires = sig->it_virt_expires;
1186 }
1187 }
1188 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1198 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1189 unsigned long psecs = cputime_to_secs(ptime); 1199 unsigned long psecs = cputime_to_secs(ptime);
1190 cputime_t x; 1200 cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 if (!cputime_eq(*oldval, cputime_zero)) { 1467 if (!cputime_eq(*oldval, cputime_zero)) {
1458 if (cputime_le(*oldval, now.cpu)) { 1468 if (cputime_le(*oldval, now.cpu)) {
1459 /* Just about to fire. */ 1469 /* Just about to fire. */
1460 *oldval = jiffies_to_cputime(1); 1470 *oldval = cputime_one_jiffy;
1461 } else { 1471 } else {
1462 *oldval = cputime_sub(*oldval, now.cpu); 1472 *oldval = cputime_sub(*oldval, now.cpu);
1463 } 1473 }
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
1703 .nsleep = thread_cpu_nsleep, 1713 .nsleep = thread_cpu_nsleep,
1704 .nsleep_restart = thread_cpu_nsleep_restart, 1714 .nsleep_restart = thread_cpu_nsleep_restart,
1705 }; 1715 };
1716 struct timespec ts;
1706 1717
1707 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1718 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1708 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1719 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1709 1720
1721 cputime_to_timespec(cputime_one_jiffy, &ts);
1722 onecputick = ts.tv_nsec;
1723 WARN_ON(ts.tv_sec != 0);
1724
1710 return 0; 1725 return 0;
1711} 1726}
1712__initcall(init_posix_cpu_timers); 1727__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
242 return 0; 242 return 0;
243} 243}
244 244
245
246static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
247{
248 *tp = current_kernel_time();
249 return 0;
250}
251
252static int posix_get_monotonic_coarse(clockid_t which_clock,
253 struct timespec *tp)
254{
255 *tp = get_monotonic_coarse();
256 return 0;
257}
258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0;
263}
245/* 264/*
246 * Initialize everything, well, just everything in Posix clocks/timers ;) 265 * Initialize everything, well, just everything in Posix clocks/timers ;)
247 */ 266 */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
262 .timer_create = no_timer_create, 281 .timer_create = no_timer_create,
263 .nsleep = no_nsleep, 282 .nsleep = no_nsleep,
264 }; 283 };
284 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 };
291 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime,
295 .timer_create = no_timer_create,
296 .nsleep = no_nsleep,
297 };
265 298
266 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 299 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
267 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
268 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
269 304
270 posix_timers_cache = kmem_cache_create("posix_timers_cache", 305 posix_timers_cache = kmem_cache_create("posix_timers_cache",
271 sizeof (struct k_itimer), 0, SLAB_PANIC, 306 sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04b3a83d686f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
9#undef DEBUG 9#undef DEBUG
10 10
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/oom.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..36cb168e4330 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
619 BUG_ON(!region); 619 BUG_ON(!region);
620 } else 620 } else
621 /* This allocation cannot fail */ 621 /* This allocation cannot fail */
622 region = alloc_bootmem_low(sizeof(struct nosave_region)); 622 region = alloc_bootmem(sizeof(struct nosave_region));
623 region->start_pfn = start_pfn; 623 region->start_pfn = start_pfn;
624 region->end_pfn = end_pfn; 624 region->end_pfn = end_pfn;
625 list_add_tail(&region->list, &nosave_regions); 625 list_add_tail(&region->list, &nosave_regions);
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid attempting to free too much memory too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
1219 * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 8ba052c86d48..b101cdc4df3f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -13,7 +13,6 @@
13 13
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h>
17#include <linux/delay.h> 16#include <linux/delay.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
19#include <linux/genhd.h> 18#include <linux/genhd.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..f38b07f78a4e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
40 * for_each_console() allows you to iterate on each console
41 */
42#define for_each_console(con) \
43 for (con = console_drivers; con != NULL; con = con->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -198,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup);
198#ifdef CONFIG_BOOT_PRINTK_DELAY 206#ifdef CONFIG_BOOT_PRINTK_DELAY
199 207
200static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 208static unsigned int boot_delay; /* msecs delay after each printk during bootup */
201static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ 209static unsigned long long loops_per_msec; /* based on boot_delay */
202 210
203static int __init boot_delay_setup(char *str) 211static int __init boot_delay_setup(char *str)
204{ 212{
205 unsigned long lpj; 213 unsigned long lpj;
206 unsigned long long loops_per_msec;
207 214
208 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ 215 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
209 loops_per_msec = (unsigned long long)lpj / 1000 * HZ; 216 loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -212,10 +219,9 @@ static int __init boot_delay_setup(char *str)
212 if (boot_delay > 10 * 1000) 219 if (boot_delay > 10 * 1000)
213 boot_delay = 0; 220 boot_delay = 0;
214 221
215 printk_delay_msec = loops_per_msec; 222 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
216 printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 223 "HZ: %d, loops_per_msec: %llu\n",
217 "HZ: %d, printk_delay_msec: %llu\n", 224 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
218 boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
219 return 1; 225 return 1;
220} 226}
221__setup("boot_delay=", boot_delay_setup); 227__setup("boot_delay=", boot_delay_setup);
@@ -228,7 +234,7 @@ static void boot_delay_msec(void)
228 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 234 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
229 return; 235 return;
230 236
231 k = (unsigned long long)printk_delay_msec * boot_delay; 237 k = (unsigned long long)loops_per_msec * boot_delay;
232 238
233 timeout = jiffies + msecs_to_jiffies(boot_delay); 239 timeout = jiffies + msecs_to_jiffies(boot_delay);
234 while (k) { 240 while (k) {
@@ -372,10 +378,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 378 logged_chars = 0;
373 break; 379 break;
374 case 6: /* Disable logging to console */ 380 case 6: /* Disable logging to console */
381 if (saved_console_loglevel == -1)
382 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 383 console_loglevel = minimum_console_loglevel;
376 break; 384 break;
377 case 7: /* Enable logging to console */ 385 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 386 if (saved_console_loglevel != -1) {
387 console_loglevel = saved_console_loglevel;
388 saved_console_loglevel = -1;
389 }
379 break; 390 break;
380 case 8: /* Set level of messages printed to console */ 391 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 392 error = -EINVAL;
@@ -384,6 +395,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 395 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 396 len = minimum_console_loglevel;
386 console_loglevel = len; 397 console_loglevel = len;
398 /* Implicitly re-enable logging to console */
399 saved_console_loglevel = -1;
387 error = 0; 400 error = 0;
388 break; 401 break;
389 case 9: /* Number of chars in the log buffer */ 402 case 9: /* Number of chars in the log buffer */
@@ -412,7 +425,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 425{
413 struct console *con; 426 struct console *con;
414 427
415 for (con = console_drivers; con; con = con->next) { 428 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 429 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 430 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 431 (con->flags & CON_ANYTIME)))
@@ -544,7 +557,7 @@ static int have_callable_console(void)
544{ 557{
545 struct console *con; 558 struct console *con;
546 559
547 for (con = console_drivers; con; con = con->next) 560 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 561 if (con->flags & CON_ANYTIME)
549 return 1; 562 return 1;
550 563
@@ -640,6 +653,20 @@ static int recursion_bug;
640static int new_text_line = 1; 653static int new_text_line = 1;
641static char printk_buf[1024]; 654static char printk_buf[1024];
642 655
656int printk_delay_msec __read_mostly;
657
658static inline void printk_delay(void)
659{
660 if (unlikely(printk_delay_msec)) {
661 int m = printk_delay_msec;
662
663 while (m--) {
664 mdelay(1);
665 touch_nmi_watchdog();
666 }
667 }
668}
669
643asmlinkage int vprintk(const char *fmt, va_list args) 670asmlinkage int vprintk(const char *fmt, va_list args)
644{ 671{
645 int printed_len = 0; 672 int printed_len = 0;
@@ -649,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
649 char *p; 676 char *p;
650 677
651 boot_delay_msec(); 678 boot_delay_msec();
679 printk_delay();
652 680
653 preempt_disable(); 681 preempt_disable();
654 /* This stops the holder of console_sem just where we want him */ 682 /* This stops the holder of console_sem just where we want him */
@@ -1060,12 +1088,6 @@ void __sched console_conditional_schedule(void)
1060} 1088}
1061EXPORT_SYMBOL(console_conditional_schedule); 1089EXPORT_SYMBOL(console_conditional_schedule);
1062 1090
1063void console_print(const char *s)
1064{
1065 printk(KERN_EMERG "%s", s);
1066}
1067EXPORT_SYMBOL(console_print);
1068
1069void console_unblank(void) 1091void console_unblank(void)
1070{ 1092{
1071 struct console *c; 1093 struct console *c;
@@ -1082,7 +1104,7 @@ void console_unblank(void)
1082 1104
1083 console_locked = 1; 1105 console_locked = 1;
1084 console_may_schedule = 0; 1106 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1107 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1108 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1109 c->unblank();
1088 release_console_sem(); 1110 release_console_sem();
@@ -1097,7 +1119,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1119 struct tty_driver *driver = NULL;
1098 1120
1099 acquire_console_sem(); 1121 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1122 for_each_console(c) {
1101 if (!c->device) 1123 if (!c->device)
1102 continue; 1124 continue;
1103 driver = c->device(c, index); 1125 driver = c->device(c, index);
@@ -1134,25 +1156,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1156 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1157 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1158 * console driver was initialized.
1159 *
1160 * This can happen pretty early during the boot process (because of
1161 * early_printk) - sometimes before setup_arch() completes - be careful
1162 * of what kernel features are used - they may not be initialised yet.
1163 *
1164 * There are two types of consoles - bootconsoles (early_printk) and
1165 * "real" consoles (everything which is not a bootconsole) which are
1166 * handled differently.
1167 * - Any number of bootconsoles can be registered at any time.
1168 * - As soon as a "real" console is registered, all bootconsoles
1169 * will be unregistered automatically.
1170 * - Once a "real" console is registered, any attempt to register a
1171 * bootconsoles will be rejected
1137 */ 1172 */
1138void register_console(struct console *console) 1173void register_console(struct console *newcon)
1139{ 1174{
1140 int i; 1175 int i;
1141 unsigned long flags; 1176 unsigned long flags;
1142 struct console *bootconsole = NULL; 1177 struct console *bcon = NULL;
1143 1178
1144 if (console_drivers) { 1179 /*
1145 if (console->flags & CON_BOOT) 1180 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1181 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1182 */
1148 bootconsole = console_drivers; 1183 if (console_drivers && newcon->flags & CON_BOOT) {
1184 /* refuse the new bootconsole if any real (non-boot) console is registered */
1185 for_each_console(bcon) {
1186 if (!(bcon->flags & CON_BOOT)) {
1187 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1188 newcon->name, newcon->index);
1189 return;
1190 }
1191 }
1149 } 1192 }
1150 1193
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1194 if (console_drivers && console_drivers->flags & CON_BOOT)
1195 bcon = console_drivers;
1196
1197 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1198 preferred_console = selected_console;
1153 1199
1154 if (console->early_setup) 1200 if (newcon->early_setup)
1155 console->early_setup(); 1201 newcon->early_setup();
1156 1202
1157 /* 1203 /*
1158 * See if we want to use this console driver. If we 1204 * See if we want to use this console driver. If we
@@ -1160,13 +1206,13 @@ void register_console(struct console *console)
1160 * that registers here. 1206 * that registers here.
1161 */ 1207 */
1162 if (preferred_console < 0) { 1208 if (preferred_console < 0) {
1163 if (console->index < 0) 1209 if (newcon->index < 0)
1164 console->index = 0; 1210 newcon->index = 0;
1165 if (console->setup == NULL || 1211 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1212 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1213 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1214 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1215 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1216 preferred_console = 0;
1171 } 1217 }
1172 } 1218 }
@@ -1178,64 +1224,62 @@ void register_console(struct console *console)
1178 */ 1224 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1225 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1226 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1227 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1228 continue;
1183 if (console->index >= 0 && 1229 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1230 newcon->index != console_cmdline[i].index)
1185 continue; 1231 continue;
1186 if (console->index < 0) 1232 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1233 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1234#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1235 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1236 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1237 braille_register_console(newcon,
1192 console_cmdline[i].index, 1238 console_cmdline[i].index,
1193 console_cmdline[i].options, 1239 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1240 console_cmdline[i].brl_options);
1195 return; 1241 return;
1196 } 1242 }
1197#endif 1243#endif
1198 if (console->setup && 1244 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1245 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1246 break;
1201 console->flags |= CON_ENABLED; 1247 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1248 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1249 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1250 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1251 preferred_console = selected_console;
1206 } 1252 }
1207 break; 1253 break;
1208 } 1254 }
1209 1255
1210 if (!(console->flags & CON_ENABLED)) 1256 if (!(newcon->flags & CON_ENABLED))
1211 return; 1257 return;
1212 1258
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1259 /*
1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1260 * If we have a bootconsole, and are switching to a real console,
1215 bootconsole->name, bootconsole->index, 1261 * don't print everything out again, since when the boot console, and
1216 console->name, console->index); 1262 * the real console are the same physical device, it's annoying to
1217 unregister_console(bootconsole); 1263 * see the beginning boot messages twice
1218 console->flags &= ~CON_PRINTBUFFER; 1264 */
1219 } else { 1265 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1266 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1267
1224 /* 1268 /*
1225 * Put this console in the list - keep the 1269 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1270 * preferred driver at the head of the list.
1227 */ 1271 */
1228 acquire_console_sem(); 1272 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1273 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1274 newcon->next = console_drivers;
1231 console_drivers = console; 1275 console_drivers = newcon;
1232 if (console->next) 1276 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1277 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1278 } else {
1235 console->next = console_drivers->next; 1279 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1280 console_drivers->next = newcon;
1237 } 1281 }
1238 if (console->flags & CON_PRINTBUFFER) { 1282 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1283 /*
1240 * release_console_sem() will print out the buffered messages 1284 * release_console_sem() will print out the buffered messages
1241 * for us. 1285 * for us.
@@ -1245,6 +1289,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1289 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1290 }
1247 release_console_sem(); 1291 release_console_sem();
1292
1293 /*
1294 * By unregistering the bootconsoles after we enable the real console
1295 * we get the "console xxx enabled" message on all the consoles -
1296 * boot consoles, real consoles, etc - this is to ensure that end
1297 * users know there might be something in the kernel's log buffer that
1298 * went to the bootconsole (that they do not see on the real console)
1299 */
1300 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1301 /* we need to iterate through twice, to make sure we print
1302 * everything out, before we unregister the console(s)
1303 */
1304 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1305 newcon->name, newcon->index);
1306 for_each_console(bcon)
1307 if (bcon->flags & CON_BOOT)
1308 unregister_console(bcon);
1309 } else {
1310 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1311 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1312 newcon->name, newcon->index);
1313 }
1248} 1314}
1249EXPORT_SYMBOL(register_console); 1315EXPORT_SYMBOL(register_console);
1250 1316
@@ -1287,11 +1353,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1353
1288static int __init disable_boot_consoles(void) 1354static int __init disable_boot_consoles(void)
1289{ 1355{
1290 if (console_drivers != NULL) { 1356 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1357
1358 for_each_console(con) {
1359 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1360 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1361 con->name, con->index);
1294 return unregister_console(console_drivers); 1362 unregister_console(con);
1295 } 1363 }
1296 } 1364 }
1297 return 0; 1365 return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
442 442
443#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
444#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
445#include <asm/uaccess.h> 446#include <asm/uaccess.h>
446 447
447static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
448 int count, int *eof, void *data)
449{ 449{
450 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
451 if (count - len < 2) 451 seq_putc(m, '\n');
452 return -EINVAL; 452 return 0;
453 len += sprintf(page + len, "\n");
454 return len;
455} 453}
456 454
457static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
458 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
459{ 462{
460 struct cpumask *mask = data;
461 unsigned long full_count = count, err;
462 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
463 465
464 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
465 return -ENOMEM; 467 return -ENOMEM;
466 468
467 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
468 if (!err) { 470 if (!err) {
469 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
470 err = full_count; 472 err = count;
471 } 473 }
472 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
473 return err; 475 return err;
474} 476}
475 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
476void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
477{ 487{
478 struct proc_dir_entry *entry;
479
480 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
481 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
482 if (!entry)
483 return;
484 entry->data = prof_cpu_mask;
485 entry->read_proc = prof_cpu_mask_read_proc;
486 entry->write_proc = prof_cpu_mask_write_proc;
487} 490}
488 491
489/* 492/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082c320e4dbf..23bd09cd042e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
152 if (!dumpable && !capable(CAP_SYS_PTRACE)) 152 if (!dumpable && !capable(CAP_SYS_PTRACE))
153 return -EPERM; 153 return -EPERM;
154 154
155 return security_ptrace_may_access(task, mode); 155 return security_ptrace_access_check(task, mode);
156} 156}
157 157
158bool ptrace_may_access(struct task_struct *task, unsigned int mode) 158bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh)
266 * or self-reaping. Do notification now if it would have happened earlier. 266 * or self-reaping. Do notification now if it would have happened earlier.
267 * If it should reap itself, return true. 267 * If it should reap itself, return true.
268 * 268 *
269 * If it's our own child, there is no notification to do. 269 * If it's our own child, there is no notification to do. But if our normal
270 * But if our normal children self-reap, then this child 270 * children self-reap, then this child was prevented by ptrace and we must
271 * was prevented by ptrace and we must reap it now. 271 * reap it now; in that case we must also wake up sub-threads sleeping in
272 * do_wait().
272 */ 273 */
273static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) 274static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
274{ 275{
@@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
278 if (!task_detached(p) && thread_group_empty(p)) { 279 if (!task_detached(p) && thread_group_empty(p)) {
279 if (!same_thread_group(p->real_parent, tracer)) 280 if (!same_thread_group(p->real_parent, tracer))
280 do_notify_parent(p, p->exit_signal); 281 do_notify_parent(p, p->exit_signal);
281 else if (ignoring_children(tracer->sighand)) 282 else if (ignoring_children(tracer->sighand)) {
283 __wake_up_parent(p, tracer);
282 p->exit_signal = -1; 284 p->exit_signal = -1;
285 }
283 } 286 }
284 if (task_detached(p)) { 287 if (task_detached(p)) {
285 /* Mark it as in the process of being reaped. */ 288 /* Mark it as in the process of being reaped. */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in irqs disabled
119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * local variable "batch" and emits codes like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * that batch# = rdp->batch, see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
290
291/**
292 * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
304 * rcu_read_unlock(), if in interrupt context or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcasted to
383 * all cpus, they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcasted, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
411 * barrier. Otherwise it can cause tickless idle CPUs to be
412 * included in rcp->cpumask, which will extend graceperiods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
440 * switch). If so and if it already hasn't done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
486 * locking requirements, the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
570 * move these entries to donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
609 * executed by the interrupted code must be seen before any RCU
610 * grace-period manipulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
620 * executed by the interrupted code must be seen after any RCU
621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for them has been neither started nor scheduled to start
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes rcu mechanism. Assumed to be called early.
790 * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..37ac45483082 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * 22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers: 25 * Papers:
@@ -27,7 +27,7 @@
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 * 28 *
29 * For detailed explanation of Read-Copy Update mechanism see - 29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html 30 * http://lse.sourceforge.net/locking/rcupdate.html
31 * 31 *
32 */ 32 */
33#include <linux/types.h> 33#include <linux/types.h>
@@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head)
74 complete(&rcu->completion); 74 complete(&rcu->completion);
75} 75}
76 76
77#ifdef CONFIG_TREE_PREEMPT_RCU
78
77/** 79/**
78 * synchronize_rcu - wait until a grace period has elapsed. 80 * synchronize_rcu - wait until a grace period has elapsed.
79 * 81 *
@@ -87,7 +89,7 @@ void synchronize_rcu(void)
87{ 89{
88 struct rcu_synchronize rcu; 90 struct rcu_synchronize rcu;
89 91
90 if (rcu_blocking_is_gp()) 92 if (!rcu_scheduler_active)
91 return; 93 return;
92 94
93 init_completion(&rcu.completion); 95 init_completion(&rcu.completion);
@@ -98,6 +100,70 @@ void synchronize_rcu(void)
98} 100}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 101EXPORT_SYMBOL_GPL(synchronize_rcu);
100 102
103#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
104
105/**
106 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
107 *
108 * Control will return to the caller some time after a full rcu-sched
109 * grace period has elapsed, in other words after all currently executing
110 * rcu-sched read-side critical sections have completed. These read-side
111 * critical sections are delimited by rcu_read_lock_sched() and
112 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
113 * local_irq_disable(), and so on may be used in place of
114 * rcu_read_lock_sched().
115 *
116 * This means that all preempt_disable code sequences, including NMI and
117 * hardware-interrupt handlers, in progress on entry will have completed
118 * before this primitive returns. However, this does not guarantee that
119 * softirq handlers will have completed, since in some kernels, these
120 * handlers can run in process context, and can block.
121 *
122 * This primitive provides the guarantees made by the (now removed)
123 * synchronize_kernel() API. In contrast, synchronize_rcu() only
124 * guarantees that rcu_read_lock() sections will have completed.
125 * In "classic RCU", these two guarantees happen to be one and
126 * the same, but can differ in realtime RCU implementations.
127 */
128void synchronize_sched(void)
129{
130 struct rcu_synchronize rcu;
131
132 if (rcu_blocking_is_gp())
133 return;
134
135 init_completion(&rcu.completion);
136 /* Will wake me after RCU finished. */
137 call_rcu_sched(&rcu.head, wakeme_after_rcu);
138 /* Wait for it. */
139 wait_for_completion(&rcu.completion);
140}
141EXPORT_SYMBOL_GPL(synchronize_sched);
142
143/**
144 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
145 *
146 * Control will return to the caller some time after a full rcu_bh grace
147 * period has elapsed, in other words after all currently executing rcu_bh
148 * read-side critical sections have completed. RCU read-side critical
149 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
150 * and may be nested.
151 */
152void synchronize_rcu_bh(void)
153{
154 struct rcu_synchronize rcu;
155
156 if (rcu_blocking_is_gp())
157 return;
158
159 init_completion(&rcu.completion);
160 /* Will wake me after RCU finished. */
161 call_rcu_bh(&rcu.head, wakeme_after_rcu);
162 /* Wait for it. */
163 wait_for_completion(&rcu.completion);
164}
165EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
166
101static void rcu_barrier_callback(struct rcu_head *notused) 167static void rcu_barrier_callback(struct rcu_head *notused)
102{ 168{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 169 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +195,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 195static inline void wait_migrated_callbacks(void)
130{ 196{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 197 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
198 smp_mb(); /* In case we didn't sleep. */
132} 199}
133 200
134/* 201/*
@@ -192,9 +259,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 259 wake_up(&rcu_migrate_wq);
193} 260}
194 261
262extern int rcu_cpu_notify(struct notifier_block *self,
263 unsigned long action, void *hcpu);
264
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 265static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 266 unsigned long action, void *hcpu)
197{ 267{
268 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 269 if (action == CPU_DYING) {
199 /* 270 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 271 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +280,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 280 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 281 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 282 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 283 } else if (action == CPU_DOWN_PREPARE) {
284 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 285 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 286 wait_migrated_callbacks();
215 } 287 }
@@ -219,8 +291,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 291
220void __init rcu_init(void) 292void __init rcu_init(void)
221{ 293{
294 int i;
295
222 __rcu_init(); 296 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 297 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
298
299 /*
300 * We don't need protection against CPU-hotplug here because
301 * this is called early in boot, before either interrupts
302 * or the scheduler are operational.
303 */
304 for_each_online_cpu(i)
305 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 306}
225 307
226void rcu_scheduler_starting(void) 308void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
/*
 * Per-CPU RCU state: the callback lists in their various stages plus the
 * pair of read-side counters manipulated by __rcu_read_lock() and
 * __rcu_read_unlock().  Each list is a head pointer paired with a tail
 * pointer (pointer to the last ->next slot, or to the head when empty).
 */
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount; /* Non-empty waitlist[] entries, recomputed by __rcu_advance_callbacks(). */
77 struct rcu_head *nextlist; /* Callbacks not yet in any wait stage. */
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES]; /* One list per stage; aged by one stage per observed flip. */
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2]; /* Read-side counters, indexed by the low bit of rcu_ctrlblk.completed. */
84 struct rcu_head *nextschedlist; /* NOTE(review): presumably the rcu_sched counterparts of nextlist/waitlist; their users are outside this view. */
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace; /* Passed to RCU_TRACE() by the RCU_TRACE_*() helpers. */
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
/*
 * rcu_try_flip() advances through these states in declaration order,
 * one step at a time: idle -> waitack -> waitzero -> waitmb -> idle.
 */
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.sched_sleep.
132 */
133
/* Values stored in rcu_ctrlblk.sched_sleep; the initial value is rcu_sched_not_sleeping. */
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
/*
 * Global grace-period control block: the counter-flip state machine
 * (fliplock, completed, rcu_try_flip_state) and the rcu_sched sleep
 * bookkeeping (schedlock, sched_sleep, sched_wq).  The low bit of
 * ->completed selects which rcu_flipctr[] element new readers increment.
 */
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
/*
 * Per-CPU dynticks and rcu_sched quiescent-state tracking.  The *_snap
 * fields are snapshots that the grace-period code later compares against
 * the live counters.
 */
150struct rcu_dyntick_sched {
151 int dynticks; /* Even while in dynticks-idle mode, odd otherwise. */
152 int dynticks_snap; /* Copy taken by dyntick_save_progress_counter(). */
153 int sched_qs; /* Incremented by rcu_qsctr_inc(). */
154 int sched_qs_snap; /* Copy taken by save_qsctr_sched(). */
155 int sched_dynticks_snap; /* Copy taken by dyntick_save_progress_counter_sched(). */
156};
157
/* Starts with ->dynticks odd, i.e. not in dynticks-idle mode. */
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
/* Per-CPU callback lists and read-side counters; see RCU_DATA_ME()/RCU_DATA_CPU(). */
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
/* The single global control block: state machine idle, rcu_sched not sleeping. */
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
/* Names listed in the same order as enum rcu_try_flip_states. */
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
/* The set of CPUs the rcu_try_flip_*() loops iterate over; starts empty. */
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
 *
 * rcu_try_flip_idle() sets every tracked CPU's flag to rcu_flipped;
 * __rcu_advance_callbacks() puts the running CPU's flag back to
 * rcu_flip_seen, and rcu_try_flip_waitack() polls for that.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
 *
 * rcu_try_flip_waitzero() sets every tracked CPU's flag to rcu_mb_needed;
 * rcu_check_mb() executes the barrier and puts the flag back to
 * rcu_mb_done, and rcu_try_flip_waitmb() polls for that.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
/*
 * RCU_DATA_ME: find the current CPU's rcu_data structure.
 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
 */
#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))

/*
 * Tracing helpers.  Each one expands to a single RCU_TRACE() invocation
 * WITHOUT a trailing semicolon: the caller's own ';' terminates the
 * statement, so the helpers remain usable in an unbraced if/else and do
 * not leave a stray empty statement behind.
 *
 * RCU_TRACE_CPU: the rcu_data is not cached in a local variable, but
 *                the CPU number is.
 * RCU_TRACE_ME:  the rcu_data is not cached in a local variable; use
 *                the current CPU's.
 * RCU_TRACE_RDP: the rcu_data is pointed to by a local variable.
 */
#define RCU_TRACE_CPU(f, cpu)	RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace))
#define RCU_TRACE_ME(f)		RCU_TRACE(f, &(RCU_DATA_ME()->trace))
#define RCU_TRACE_RDP(f, rdp)	RCU_TRACE(f, &((rdp)->trace))

#define RCU_SCHED_BATCH_TIME	(HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
298 * - If we get scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
303 * we pick the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
334 * Now that we have preventing any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * It is now safe to decrement this task's nesting count.
403 * NMIs that occur after this statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
473
474#ifdef CONFIG_NO_HZ
/*
 * Per-CPU nesting count, non-zero while handlers that interrupted a
 * dynticks-idle CPU are running; maintained by rcu_irq_enter() and
 * rcu_irq_exit().
 */
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI which is the most important
506 * part.
507 *
508 * The only thing is that we would bring back the counter
509 * to a postion that it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513 * On return from the IRQ, the counter may have the zero
514 * bit be 0 and the counter the same as the return from
515 * the NMI/SMI. If the state machine was so unlucky to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
546
547/**
548 * rcu_irq_exit - Called from exiting Hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to put let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563 * because a NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583 * are seen by other CPUs to preceed the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
/* NMI entry shares the hard-irq entry accounting. */
void rcu_nmi_enter(void)
{
	rcu_irq_enter();
}
599
/* NMI exit shares the hard-irq exit accounting. */
void rcu_nmi_exit(void)
{
	rcu_irq_exit();
}
604
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
/*
 * Without CONFIG_NO_HZ there is no dynticks state to consult: the
 * snapshot helpers expand to nothing and the *_needed() checks always
 * report that the CPU must be waited for.
 */
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return 0 once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969 * this CPU has to have exited all prior preept-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
/*
 * Splice the list headed by srclist (whose tail pointer is srctail) onto
 * the location that dsttail points at.  If the source list was non-empty,
 * dsttail takes over the source's tail and the source is reset to empty
 * (head NULL, tail pointing back at the head).
 *
 * The dstlist argument is not referenced by the expansion, and the other
 * arguments are expanded more than once, so pass only side-effect-free
 * lvalues.
 */
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
1029
/*
 * Detach the dead CPU @cpu from RCU: collect every callback still queued
 * on it, fold its rcu_flipctr[] counts into the CPU running this
 * function, clear it from rcu_cpu_online_map, and finally requeue the
 * collected callbacks on the current CPU's "next" lists.
 */
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042 * Otherwise rcu_barrier() will fail
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
 /* Wait stages are drained from the highest index down to 0 (presumably oldest first -- confirm). */
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
 /* The rcu_sched callbacks are gathered on their own private list. */
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
 /* NOTE(review): waitlistcount is cleared after rdp->lock is dropped -- confirm nothing else can touch the dead CPU's rdp here. */
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
 /* Add the dead CPU's two flip counters to this CPU's, then zero the dead CPU's copies. */
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
 /* Only move a tail pointer when the spliced list is non-empty; otherwise it would point at a local variable. */
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
/* Built without CONFIG_HOTPLUG_CPU: offlining a CPU needs no RCU work. */
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
/*
 * @arg is not used.  The loop below runs until kthread_should_stop()
 * reports true, after which the function returns 0.
 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The current CPU might have already gone
1300 * offline (between the for_each_online_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
 /* Promote the "next" list to "wait"; finding any callback here cancels sleeping for this pass and the next. */
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
/* Notifier block whose callback is rcu_cpu_notify(). */
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's mask in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
/*
 * Module exit: remove the three debugfs files, then the directory they
 * were created in, and finally free the shared formatting buffer.
 */
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..233768f21f97 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -50,7 +50,7 @@
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
110}; 110};
111 111
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version;
115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116static DEFINE_SPINLOCK(rcu_torture_lock); 116static DEFINE_SPINLOCK(rcu_torture_lock);
117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
124static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
125static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask; 129static cpumask_var_t shuffle_tmp_mask;
130 130
131static int stutter_pause_test = 0; 131static int stutter_pause_test;
132 132
133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134#define RCUTORTURE_RUNNABLE_INIT 1 134#define RCUTORTURE_RUNNABLE_INIT 1
@@ -257,17 +257,18 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270
271static struct rcu_torture_ops *cur_ops;
271 272
272/* 273/*
273 * Definitions for rcu torture testing. 274 * Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
281 282
282static void rcu_read_delay(struct rcu_random_state *rrsp) 283static void rcu_read_delay(struct rcu_random_state *rrsp)
283{ 284{
284 long delay; 285 const unsigned long shortdelay_us = 200;
285 const long longdelay = 200; 286 const unsigned long longdelay_ms = 50;
286 287
287 /* We want there to be long-running readers, but not all the time. */ 288 /* We want a short delay sometimes to make a reader delay the grace
289 * period, and we want a long delay occasionally to trigger
290 * force_quiescent_state. */
288 291
289 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); 292 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
290 if (!delay) 293 mdelay(longdelay_ms);
291 udelay(longdelay); 294 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
295 udelay(shortdelay_us);
292} 296}
293 297
294static void rcu_torture_read_unlock(int idx) __releases(RCU) 298static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -320,7 +324,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 324 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 325 rcu_torture_free(rp);
322 } else 326 } else
323 cur_ops->deferredfree(rp); 327 cur_ops->deferred_free(rp);
324} 328}
325 329
326static void rcu_torture_deferred_free(struct rcu_torture *p) 330static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +333,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 333}
330 334
331static struct rcu_torture_ops rcu_ops = { 335static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 336 .init = NULL,
333 .cleanup = NULL, 337 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 338 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 339 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 340 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 341 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 342 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 343 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 344 .cb_barrier = rcu_barrier,
341 .stats = NULL, 345 .stats = NULL,
342 .irqcapable = 1, 346 .irq_capable = 1,
343 .name = "rcu" 347 .name = "rcu"
344}; 348};
345 349
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 350static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +374,18 @@ static void rcu_sync_torture_init(void)
370} 374}
371 375
372static struct rcu_torture_ops rcu_sync_ops = { 376static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 377 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 378 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 379 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 380 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 381 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 382 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 383 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 384 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 385 .cb_barrier = NULL,
382 .stats = NULL, 386 .stats = NULL,
383 .irqcapable = 1, 387 .irq_capable = 1,
384 .name = "rcu_sync" 388 .name = "rcu_sync"
385}; 389};
386 390
387/* 391/*
@@ -432,33 +436,33 @@ static void rcu_bh_torture_synchronize(void)
432} 436}
433 437
434static struct rcu_torture_ops rcu_bh_ops = { 438static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 439 .init = NULL,
436 .cleanup = NULL, 440 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 441 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 442 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 443 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 444 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 445 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 446 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 447 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 448 .stats = NULL,
445 .irqcapable = 1, 449 .irq_capable = 1,
446 .name = "rcu_bh" 450 .name = "rcu_bh"
447}; 451};
448 452
449static struct rcu_torture_ops rcu_bh_sync_ops = { 453static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 455 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 456 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 457 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 458 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 459 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 460 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 461 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 462 .cb_barrier = NULL,
459 .stats = NULL, 463 .stats = NULL,
460 .irqcapable = 1, 464 .irq_capable = 1,
461 .name = "rcu_bh_sync" 465 .name = "rcu_bh_sync"
462}; 466};
463 467
464/* 468/*
@@ -530,17 +534,17 @@ static int srcu_torture_stats(char *page)
530} 534}
531 535
532static struct rcu_torture_ops srcu_ops = { 536static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 537 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 538 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 539 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 540 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 541 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 542 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 543 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 544 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 545 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 546 .stats = srcu_torture_stats,
543 .name = "srcu" 547 .name = "srcu"
544}; 548};
545 549
546/* 550/*
@@ -574,32 +578,49 @@ static void sched_torture_synchronize(void)
574} 578}
575 579
576static struct rcu_torture_ops sched_ops = { 580static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 581 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 582 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 583 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 585 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 586 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 587 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 588 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 589 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 590 .stats = NULL,
587 .irqcapable = 1, 591 .irq_capable = 1,
588 .name = "sched" 592 .name = "sched"
589}; 593};
590 594
591static struct rcu_torture_ops sched_ops_sync = { 595static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 596 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 597 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 598 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 600 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 601 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 602 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 603 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 604 .cb_barrier = NULL,
601 .stats = NULL, 605 .stats = NULL,
602 .name = "sched_sync" 606 .name = "sched_sync"
607};
608
609extern int rcu_expedited_torture_stats(char *page);
610
611static struct rcu_torture_ops sched_expedited_ops = {
612 .init = rcu_sync_torture_init,
613 .cleanup = NULL,
614 .readlock = sched_torture_read_lock,
615 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
616 .readunlock = sched_torture_read_unlock,
617 .completed = sched_torture_completed,
618 .deferred_free = rcu_sync_torture_deferred_free,
619 .sync = synchronize_sched_expedited,
620 .cb_barrier = NULL,
621 .stats = rcu_expedited_torture_stats,
622 .irq_capable = 1,
623 .name = "sched_expedited"
603}; 624};
604 625
605/* 626/*
@@ -621,7 +642,8 @@ rcu_torture_writer(void *arg)
621 642
622 do { 643 do {
623 schedule_timeout_uninterruptible(1); 644 schedule_timeout_uninterruptible(1);
624 if ((rp = rcu_torture_alloc()) == NULL) 645 rp = rcu_torture_alloc();
646 if (rp == NULL)
625 continue; 647 continue;
626 rp->rtort_pipe_count = 0; 648 rp->rtort_pipe_count = 0;
627 udelay(rcu_random(&rand) & 0x3ff); 649 udelay(rcu_random(&rand) & 0x3ff);
@@ -635,7 +657,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 657 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 658 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 659 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 660 cur_ops->deferred_free(old_rp);
639 } 661 }
640 rcu_torture_current_version++; 662 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 663 oldbatch = cur_ops->completed();
@@ -700,7 +722,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 722 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 723 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 724 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 725 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 726 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 727 spin_unlock(&rand_lock);
706 preempt_disable(); 728 preempt_disable();
@@ -738,11 +760,11 @@ rcu_torture_reader(void *arg)
738 760
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 761 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 762 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 763 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 764 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 765
744 do { 766 do {
745 if (irqreader && cur_ops->irqcapable) { 767 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 768 if (!timer_pending(&t))
747 mod_timer(&t, 1); 769 mod_timer(&t, 1);
748 } 770 }
@@ -757,7 +779,7 @@ rcu_torture_reader(void *arg)
757 } 779 }
758 if (p->rtort_mbtest == 0) 780 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 781 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 782 cur_ops->read_delay(&rand);
761 preempt_disable(); 783 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 784 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 785 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +800,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 800 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 801 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 802 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 803 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 804 del_timer_sync(&t);
783 while (!kthread_should_stop()) 805 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 806 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1100,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1100 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1101 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1102 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1103 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1104 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1105
1083 mutex_lock(&fullstop_mutex); 1106 mutex_lock(&fullstop_mutex);
@@ -1092,7 +1115,7 @@ rcu_torture_init(void)
1092 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1115 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1093 torture_type); 1116 torture_type);
1094 mutex_unlock(&fullstop_mutex); 1117 mutex_unlock(&fullstop_mutex);
1095 return (-EINVAL); 1118 return -EINVAL;
1096 } 1119 }
1097 if (cur_ops->init) 1120 if (cur_ops->init)
1098 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1121 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1143,7 +1166,7 @@ rcu_torture_init(void)
1143 goto unwind; 1166 goto unwind;
1144 } 1167 }
1145 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1168 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1146 GFP_KERNEL); 1169 GFP_KERNEL);
1147 if (fakewriter_tasks == NULL) { 1170 if (fakewriter_tasks == NULL) {
1148 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1171 VERBOSE_PRINTK_ERRSTRING("out of memory");
1149 firsterr = -ENOMEM; 1172 firsterr = -ENOMEM;
@@ -1152,7 +1175,7 @@ rcu_torture_init(void)
1152 for (i = 0; i < nfakewriters; i++) { 1175 for (i = 0; i < nfakewriters; i++) {
1153 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1176 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1154 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1177 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1155 "rcu_torture_fakewriter"); 1178 "rcu_torture_fakewriter");
1156 if (IS_ERR(fakewriter_tasks[i])) { 1179 if (IS_ERR(fakewriter_tasks[i])) {
1157 firsterr = PTR_ERR(fakewriter_tasks[i]); 1180 firsterr = PTR_ERR(fakewriter_tasks[i]);
1158 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1181 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..52b06f6e158c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,55 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 struct rcu_data *rdp;
90 rdp->passed_quiesc = 1; 111
112 rdp = &per_cpu(rcu_sched_data, cpu);
91 rdp->passed_quiesc_completed = rdp->completed; 113 rdp->passed_quiesc_completed = rdp->completed;
114 barrier();
115 rdp->passed_quiesc = 1;
116 rcu_preempt_note_context_switch(cpu);
92} 117}
93 118
94void rcu_bh_qsctr_inc(int cpu) 119void rcu_bh_qs(int cpu)
95{ 120{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 121 struct rcu_data *rdp;
97 rdp->passed_quiesc = 1; 122
123 rdp = &per_cpu(rcu_bh_data, cpu);
98 rdp->passed_quiesc_completed = rdp->completed; 124 rdp->passed_quiesc_completed = rdp->completed;
125 barrier();
126 rdp->passed_quiesc = 1;
99} 127}
100 128
101#ifdef CONFIG_NO_HZ 129#ifdef CONFIG_NO_HZ
@@ -110,15 +138,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 138static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 139
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 140static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
141static int rcu_pending(int cpu);
113 142
114/* 143/*
115 * Return the number of RCU batches processed thus far for debug & stats. 144 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 145 */
117long rcu_batches_completed(void) 146long rcu_batches_completed_sched(void)
118{ 147{
119 return rcu_state.completed; 148 return rcu_sched_state.completed;
120} 149}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 150EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 151
123/* 152/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 153 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +210,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 210 return 1;
182 } 211 }
183 212
213 /* If preemptable RCU, no point in sending reschedule IPI. */
214 if (rdp->preemptable)
215 return 0;
216
184 /* The CPU is online, so send it a reschedule IPI. */ 217 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 218 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 219 smp_send_reschedule(rdp->cpu);
@@ -193,7 +226,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 226#endif /* #ifdef CONFIG_SMP */
194 227
195#ifdef CONFIG_NO_HZ 228#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 229
198/** 230/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 231 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +245,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 245 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 246 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 247 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 248 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 249 local_irq_restore(flags);
218} 250}
219 251
@@ -232,7 +264,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 264 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 265 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 266 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 267 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 268 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 269 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 270}
@@ -251,7 +283,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 283 if (rdtp->dynticks & 0x1)
252 return; 284 return;
253 rdtp->dynticks_nmi++; 285 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 286 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 287 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 288}
257 289
@@ -270,7 +302,7 @@ void rcu_nmi_exit(void)
270 return; 302 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 303 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 304 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 305 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 306}
275 307
276/** 308/**
@@ -286,7 +318,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 318 if (rdtp->dynticks_nesting++)
287 return; 319 return;
288 rdtp->dynticks++; 320 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 321 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 322 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 323}
292 324
@@ -305,10 +337,10 @@ void rcu_irq_exit(void)
305 return; 337 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 338 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 339 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 340 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 341
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 342 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 343 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 344 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 345 set_need_resched();
314} 346}
@@ -461,6 +493,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 493
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 494 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 495 for (; rnp_cur < rnp_end; rnp_cur++) {
496 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 497 if (rnp_cur->qsmask == 0)
465 continue; 498 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 499 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +502,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 502 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 503 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 504 smp_processor_id(), (long)(jiffies - rsp->gp_start));
505 trigger_all_cpu_backtrace();
506
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 507 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 508}
474 509
@@ -479,12 +514,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 514
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 516 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 517 trigger_all_cpu_backtrace();
518
483 spin_lock_irqsave(&rnp->lock, flags); 519 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 520 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 521 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 522 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 523 spin_unlock_irqrestore(&rnp->lock, flags);
524
488 set_need_resched(); /* kick ourselves to get things going. */ 525 set_need_resched(); /* kick ourselves to get things going. */
489} 526}
490 527
@@ -564,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
564{ 601{
565 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 602 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
566 struct rcu_node *rnp = rcu_get_root(rsp); 603 struct rcu_node *rnp = rcu_get_root(rsp);
567 struct rcu_node *rnp_cur;
568 struct rcu_node *rnp_end;
569 604
570 if (!cpu_needs_another_gp(rsp, rdp)) { 605 if (!cpu_needs_another_gp(rsp, rdp)) {
571 spin_unlock_irqrestore(&rnp->lock, flags); 606 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -574,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
574 609
575 /* Advance to a new grace period and initialize state. */ 610 /* Advance to a new grace period and initialize state. */
576 rsp->gpnum++; 611 rsp->gpnum++;
612 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
577 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 613 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
578 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 614 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
579 record_gp_stall_check_time(rsp); 615 record_gp_stall_check_time(rsp);
@@ -590,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
590 626
591 /* Special-case the common single-level case. */ 627 /* Special-case the common single-level case. */
592 if (NUM_RCU_NODES == 1) { 628 if (NUM_RCU_NODES == 1) {
629 rcu_preempt_check_blocked_tasks(rnp);
593 rnp->qsmask = rnp->qsmaskinit; 630 rnp->qsmask = rnp->qsmaskinit;
631 rnp->gpnum = rsp->gpnum;
594 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 632 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
595 spin_unlock_irqrestore(&rnp->lock, flags); 633 spin_unlock_irqrestore(&rnp->lock, flags);
596 return; 634 return;
@@ -603,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
603 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 641 spin_lock(&rsp->onofflock); /* irqs already disabled. */
604 642
605 /* 643 /*
606 * Set the quiescent-state-needed bits in all the non-leaf RCU 644 * Set the quiescent-state-needed bits in all the rcu_node
607 * nodes for all currently online CPUs. This operation relies 645 * structures for all currently online CPUs in breadth-first
608 * on the layout of the hierarchy within the rsp->node[] array. 646 * order, starting from the root rcu_node structure. This
609 * Note that other CPUs will access only the leaves of the 647 * operation relies on the layout of the hierarchy within the
610 * hierarchy, which still indicate that no grace period is in 648 * rsp->node[] array. Note that other CPUs will access only
611 * progress. In addition, we have excluded CPU-hotplug operations. 649 * the leaves of the hierarchy, which still indicate that no
612 * 650 * grace period is in progress, at least until the corresponding
613 * We therefore do not need to hold any locks. Any required 651 * leaf node has been initialized. In addition, we have excluded
614 * memory barriers will be supplied by the locks guarding the 652 * CPU-hotplug operations.
615 * leaf rcu_nodes in the hierarchy.
616 */
617
618 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
619 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621
622 /*
623 * Now set up the leaf nodes. Here we must be careful. First,
624 * we need to hold the lock in order to exclude other CPUs, which
625 * might be contending for the leaf nodes' locks. Second, as
626 * soon as we initialize a given leaf node, its CPUs might run
627 * up the rest of the hierarchy. We must therefore acquire locks
628 * for each node that we touch during this stage. (But we still
629 * are excluding CPU-hotplug operations.)
630 * 653 *
631 * Note that the grace period cannot complete until we finish 654 * Note that the grace period cannot complete until we finish
632 * the initialization process, as there will be at least one 655 * the initialization process, as there will be at least one
633 * qsmask bit set in the root node until that time, namely the 656 * qsmask bit set in the root node until that time, namely the
634 * one corresponding to this CPU. 657 * one corresponding to this CPU, due to the fact that we have
658 * irqs disabled.
635 */ 659 */
636 rnp_end = &rsp->node[NUM_RCU_NODES]; 660 for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
637 rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 661 spin_lock(&rnp->lock); /* irqs already disabled. */
638 for (; rnp_cur < rnp_end; rnp_cur++) { 662 rcu_preempt_check_blocked_tasks(rnp);
639 spin_lock(&rnp_cur->lock); /* irqs already disabled. */ 663 rnp->qsmask = rnp->qsmaskinit;
640 rnp_cur->qsmask = rnp_cur->qsmaskinit; 664 rnp->gpnum = rsp->gpnum;
641 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ 665 spin_unlock(&rnp->lock); /* irqs already disabled. */
642 } 666 }
643 667
644 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 668 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
@@ -674,6 +698,20 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 698}
675 699
676/* 700/*
701 * Clean up after the prior grace period and let rcu_start_gp() start up
702 * the next grace period if one is needed. Note that the caller must
703 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
704 */
705static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
706 __releases(rnp->lock)
707{
708 WARN_ON_ONCE(rsp->completed == rsp->gpnum);
709 rsp->completed = rsp->gpnum;
710 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
711 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
712}
713
714/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 715 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 716 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 717 * group must be represented by the same leaf rcu_node structure.
@@ -685,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
685 unsigned long flags) 723 unsigned long flags)
686 __releases(rnp->lock) 724 __releases(rnp->lock)
687{ 725{
726 struct rcu_node *rnp_c;
727
688 /* Walk up the rcu_node hierarchy. */ 728 /* Walk up the rcu_node hierarchy. */
689 for (;;) { 729 for (;;) {
690 if (!(rnp->qsmask & mask)) { 730 if (!(rnp->qsmask & mask)) {
@@ -694,7 +734,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 734 return;
695 } 735 }
696 rnp->qsmask &= ~mask; 736 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 737 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 738
699 /* Other bits still set at this level, so done. */ 739 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 740 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -708,28 +748,26 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
708 break; 748 break;
709 } 749 }
710 spin_unlock_irqrestore(&rnp->lock, flags); 750 spin_unlock_irqrestore(&rnp->lock, flags);
751 rnp_c = rnp;
711 rnp = rnp->parent; 752 rnp = rnp->parent;
712 spin_lock_irqsave(&rnp->lock, flags); 753 spin_lock_irqsave(&rnp->lock, flags);
754 WARN_ON_ONCE(rnp_c->qsmask);
713 } 755 }
714 756
715 /* 757 /*
716 * Get here if we are the last CPU to pass through a quiescent 758 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 759 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 760 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 761 */
722 rsp->completed = rsp->gpnum; 762 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 763}
726 764
727/* 765/*
728 * Record a quiescent state for the specified CPU, which must either be 766 * Record a quiescent state for the specified CPU, which must either be
729 * the current CPU or an offline CPU. The lastcomp argument is used to 767 * the current CPU. The lastcomp argument is used to make sure we are
730 * make sure we are still in the grace period of interest. We don't want 768 * still in the grace period of interest. We don't want to end the current
731 * to end the current grace period based on quiescent states detected in 769 * grace period based on quiescent states detected in an earlier grace
732 * an earlier grace period! 770 * period!
733 */ 771 */
734static void 772static void
735cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 773cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -764,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
764 * This GP can't end until cpu checks in, so all of our 802 * This GP can't end until cpu checks in, so all of our
765 * callbacks can be processed during the next GP. 803 * callbacks can be processed during the next GP.
766 */ 804 */
767 rdp = rsp->rda[smp_processor_id()];
768 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 805 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
769 806
770 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 807 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -822,30 +859,28 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
822 spin_lock_irqsave(&rsp->onofflock, flags); 859 spin_lock_irqsave(&rsp->onofflock, flags);
823 860
824 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 861 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
825 rnp = rdp->mynode; 862 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
826 mask = rdp->grpmask; /* rnp->grplo is constant. */ 863 mask = rdp->grpmask; /* rnp->grplo is constant. */
827 do { 864 do {
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 865 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 866 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 867 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 868 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 869 break;
833 } 870 }
871 rcu_preempt_offline_tasks(rsp, rnp, rdp);
834 mask = rnp->grpmask; 872 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 873 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 874 rnp = rnp->parent;
837 } while (rnp != NULL); 875 } while (rnp != NULL);
838 lastcomp = rsp->completed; 876 lastcomp = rsp->completed;
839 877
840 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 878 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
841 879
842 /* Being offline is a quiescent state, so go record it. */
843 cpu_quiet(cpu, rsp, rdp, lastcomp);
844
845 /* 880 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 881 * Move callbacks from the outgoing CPU to the running CPU.
847 * Note that the outgoing CPU is now quiescent, so it is now 882 * Note that the outgoing CPU is now quiescent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 883 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 884 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 885 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 886 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +911,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 911 */
877static void rcu_offline_cpu(int cpu) 912static void rcu_offline_cpu(int cpu)
878{ 913{
879 __rcu_offline_cpu(cpu, &rcu_state); 914 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 915 __rcu_offline_cpu(cpu, &rcu_bh_state);
916 rcu_preempt_offline_cpu(cpu);
881} 917}
882 918
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 919#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +999,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 999 */
964void rcu_check_callbacks(int cpu, int user) 1000void rcu_check_callbacks(int cpu, int user)
965{ 1001{
1002 if (!rcu_pending(cpu))
1003 return; /* if nothing for RCU to do. */
966 if (user || 1004 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1005 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1006 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1009,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1009 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1010 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1011 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1012 * a quiescent state, so note it.
975 * 1013 *
976 * No memory barrier is required here because both 1014 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1015 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1016 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1017 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1018 */
982 1019
983 rcu_qsctr_inc(cpu); 1020 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1021 rcu_bh_qs(cpu);
985 1022
986 } else if (!in_softirq()) { 1023 } else if (!in_softirq()) {
987 1024
@@ -989,11 +1026,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1026 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1027 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1028 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1029 * critical section, so note it.
993 */ 1030 */
994 1031
995 rcu_bh_qsctr_inc(cpu); 1032 rcu_bh_qs(cpu);
996 } 1033 }
1034 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1035 raise_softirq(RCU_SOFTIRQ);
998} 1036}
999 1037
@@ -1132,6 +1170,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1170{
1133 unsigned long flags; 1171 unsigned long flags;
1134 1172
1173 WARN_ON_ONCE(rdp->beenonline == 0);
1174
1135 /* 1175 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1176 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1177 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1210,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1210 */
1171 smp_mb(); /* See above block comment. */ 1211 smp_mb(); /* See above block comment. */
1172 1212
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1213 __rcu_process_callbacks(&rcu_sched_state,
1214 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1215 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1216 rcu_preempt_process_callbacks();
1175 1217
1176 /* 1218 /*
1177 * Memory references from any later RCU read-side critical sections 1219 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1269,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1269}
1228 1270
1229/* 1271/*
1230 * Queue an RCU callback for invocation after a grace period. 1272 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1273 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1274void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1275{
1234 __call_rcu(head, func, &rcu_state); 1276 __call_rcu(head, func, &rcu_sched_state);
1235} 1277}
1236EXPORT_SYMBOL_GPL(call_rcu); 1278EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1279
1238/* 1280/*
1239 * Queue an RCU callback for invocation after a quicker grace period. 1281 * Queue an RCU callback for invocation after a quicker grace period.
@@ -1305,10 +1347,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1347 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1348 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1349 */
1308int rcu_pending(int cpu) 1350static int rcu_pending(int cpu)
1309{ 1351{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1352 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1353 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1354 rcu_preempt_pending(cpu);
1312} 1355}
1313 1356
1314/* 1357/*
@@ -1320,27 +1363,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1363int rcu_needs_cpu(int cpu)
1321{ 1364{
1322 /* RCU callbacks either ready or pending? */ 1365 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1366 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1367 per_cpu(rcu_bh_data, cpu).nxtlist ||
1368 rcu_preempt_needs_cpu(cpu);
1325} 1369}
1326 1370
1327/* 1371/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1372 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1373 */
1339static void __cpuinit 1374static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1375rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1376{
1342 unsigned long flags; 1377 unsigned long flags;
1343 int i; 1378 int i;
1379 struct rcu_data *rdp = rsp->rda[cpu];
1380 struct rcu_node *rnp = rcu_get_root(rsp);
1381
1382 /* Set up local state, ensuring consistent view of global state. */
1383 spin_lock_irqsave(&rnp->lock, flags);
1384 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1385 rdp->nxtlist = NULL;
1386 for (i = 0; i < RCU_NEXT_SIZE; i++)
1387 rdp->nxttail[i] = &rdp->nxtlist;
1388 rdp->qlen = 0;
1389#ifdef CONFIG_NO_HZ
1390 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1391#endif /* #ifdef CONFIG_NO_HZ */
1392 rdp->cpu = cpu;
1393 spin_unlock_irqrestore(&rnp->lock, flags);
1394}
1395
1396/*
1397 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1398 * offline event can be happening at a given time. Note also that we
1399 * can accept some slop in the rsp->completed access due to the fact
1400 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1401 */
1402static void __cpuinit
1403rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1404{
1405 unsigned long flags;
1344 long lastcomp; 1406 long lastcomp;
1345 unsigned long mask; 1407 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1408 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1416,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1416 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1417 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1418 rdp->beenonline = 1; /* We have now been online. */
1419 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1420 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1421 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1422 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1423
1370 /* 1424 /*
@@ -1387,34 +1441,21 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1387 rnp = rnp->parent; 1441 rnp = rnp->parent;
1388 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1442 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1389 1443
1390 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1444 spin_unlock_irqrestore(&rsp->onofflock, flags);
1391
1392 /*
1393 * A new grace period might start here. If so, we will be part of
1394 * it, and its gpnum will be greater than ours, so we will
1395 * participate. It is also possible for the gpnum to have been
1396 * incremented before this function was called, and the bitmasks
1397 * to not be filled out until now, in which case we will also
1398 * participate due to our gpnum being behind.
1399 */
1400
1401 /* Since it is coming online, the CPU is in a quiescent state. */
1402 cpu_quiet(cpu, rsp, rdp, lastcomp);
1403 local_irq_restore(flags);
1404} 1445}
1405 1446
1406static void __cpuinit rcu_online_cpu(int cpu) 1447static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1448{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1449 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1450 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1451 rcu_preempt_init_percpu_data(cpu);
1411} 1452}
1412 1453
1413/* 1454/*
1414 * Handle CPU online/offline notifcation events. 1455 * Handle CPU online/offline notification events.
1415 */ 1456 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1457int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1458 unsigned long action, void *hcpu)
1418{ 1459{
1419 long cpu = (long)hcpu; 1460 long cpu = (long)hcpu;
1420 1461
@@ -1486,6 +1527,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1527 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1528 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1529 spin_lock_init(&rnp->lock);
1530 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1531 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1532 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1533 rnp->grplo = j * cpustride;
@@ -1503,16 +1545,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1545 j / rsp->levelspread[i - 1];
1504 } 1546 }
1505 rnp->level = i; 1547 rnp->level = i;
1548 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1549 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1550 }
1507 } 1551 }
1508} 1552}
1509 1553
1510/* 1554/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1555 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1556 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1557 * structure.
1513 */ 1558 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1559#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1560do { \
1561 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1562 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1563 j = 0; \
1518 for_each_possible_cpu(i) { \ 1564 for_each_possible_cpu(i) { \
@@ -1520,32 +1566,43 @@ do { \
1520 j++; \ 1566 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1567 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1568 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1569 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1570 } \
1524} while (0) 1571} while (0)
1525 1572
1526static struct notifier_block __cpuinitdata rcu_nb = { 1573#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1574
1528}; 1575void __init __rcu_init_preempt(void)
1576{
1577 int i; /* All used by RCU_INIT_FLAVOR(). */
1578 int j;
1579 struct rcu_node *rnp;
1580
1581 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1582}
1583
1584#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1585
1586void __init __rcu_init_preempt(void)
1587{
1588}
1589
1590#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1591
1530void __init __rcu_init(void) 1592void __init __rcu_init(void)
1531{ 1593{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1594 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1595 int j;
1534 struct rcu_node *rnp; 1596 struct rcu_node *rnp;
1535 1597
1536 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 1598 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1599#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1600 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1601#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1602 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1603 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1604 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1605 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549} 1606}
1550 1607
1551module_param(blimit, int, 0); 1608module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..8e8287a983c2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
174
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..1cee04f627eb
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,566 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed;
71 barrier();
72 rdp->passed_quiesc = 1;
73}
74
75/*
76 * We have entered the scheduler, and the current task might soon be
77 * context-switched away from. If this task is in an RCU read-side
78 * critical section, we will no longer be able to rely on the CPU to
79 * record that fact, so we enqueue the task on the appropriate entry
80 * of the blocked_tasks[] array. The task will dequeue itself when
81 * it exits the outermost enclosing RCU read-side critical section.
82 * Therefore, the current grace period cannot be permitted to complete
83 * until the blocked_tasks[] entry indexed by the low-order bit of
84 * rnp->gpnum empties.
85 *
86 * Caller must disable preemption.
87 */
88static void rcu_preempt_note_context_switch(int cpu)
89{
90 struct task_struct *t = current;
91 unsigned long flags;
92 int phase;
93 struct rcu_data *rdp;
94 struct rcu_node *rnp;
95
96 if (t->rcu_read_lock_nesting &&
97 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
98
99 /* Possibly blocking in an RCU read-side critical section. */
100 rdp = rcu_preempt_state.rda[cpu];
101 rnp = rdp->mynode;
102 spin_lock_irqsave(&rnp->lock, flags);
103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
104 t->rcu_blocked_node = rnp;
105
106 /*
107 * If this CPU has already checked in, then this task
108 * will hold up the next grace period rather than the
109 * current grace period. Queue the task accordingly.
110 * If the task is queued for the current grace period
111 * (i.e., this CPU has not yet passed through a quiescent
112 * state for the current grace period), then as long
113 * as that task remains queued, the current grace period
114 * cannot end.
115 *
116 * But first, note that the current CPU must still be
117 * on line!
118 */
119 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
120 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
121 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
123 spin_unlock_irqrestore(&rnp->lock, flags);
124 }
125
126 /*
127 * Either we were not in an RCU read-side critical section to
128 * begin with, or we have now recorded that critical section
129 * globally. Either way, we can now note a quiescent state
130 * for this CPU. Again, if we were in an RCU read-side critical
131 * section, and if that critical section was blocking the current
132 * grace period, then the fact that the task has been enqueued
133 * means that we continue to block the current grace period.
134 */
135 rcu_preempt_qs(cpu);
136 local_irq_save(flags);
137 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
138 local_irq_restore(flags);
139}
140
141/*
142 * Tree-preemptable RCU implementation for rcu_read_lock().
143 * Just increment ->rcu_read_lock_nesting, shared state will be updated
144 * if we block.
145 */
146void __rcu_read_lock(void)
147{
148 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
149 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
150}
151EXPORT_SYMBOL_GPL(__rcu_read_lock);
152
153static void rcu_read_unlock_special(struct task_struct *t)
154{
155 int empty;
156 unsigned long flags;
157 unsigned long mask;
158 struct rcu_node *rnp;
159 int special;
160
161 /* NMI handlers cannot block and cannot safely manipulate state. */
162 if (in_nmi())
163 return;
164
165 local_irq_save(flags);
166
167 /*
168 * If RCU core is waiting for this CPU to exit critical section,
169 * let it know that we have done so.
170 */
171 special = t->rcu_read_unlock_special;
172 if (special & RCU_READ_UNLOCK_NEED_QS) {
173 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
174 rcu_preempt_qs(smp_processor_id());
175 }
176
177 /* Hardware IRQ handlers cannot block. */
178 if (in_irq()) {
179 local_irq_restore(flags);
180 return;
181 }
182
183 /* Clean up if blocked during RCU read-side critical section. */
184 if (special & RCU_READ_UNLOCK_BLOCKED) {
185 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
186
187 /*
188 * Remove this task from the list it blocked on. The
189 * task can migrate while we acquire the lock, but at
190 * most one time. So at most two passes through loop.
191 */
192 for (;;) {
193 rnp = t->rcu_blocked_node;
194 spin_lock(&rnp->lock); /* irqs already disabled. */
195 if (rnp == t->rcu_blocked_node)
196 break;
197 spin_unlock(&rnp->lock); /* irqs remain disabled. */
198 }
199 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
200 list_del_init(&t->rcu_node_entry);
201 t->rcu_blocked_node = NULL;
202
203 /*
204 * If this was the last task on the current list, and if
205 * we aren't waiting on any CPUs, report the quiescent state.
206 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
207 * drop rnp->lock and restore irq.
208 */
209 if (!empty && rnp->qsmask == 0 &&
210 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
211 struct rcu_node *rnp_p;
212
213 if (rnp->parent == NULL) {
214 /* Only one rcu_node in the tree. */
215 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
216 return;
217 }
218 /* Report up the rest of the hierarchy. */
219 mask = rnp->grpmask;
220 spin_unlock_irqrestore(&rnp->lock, flags);
221 rnp_p = rnp->parent;
222 spin_lock_irqsave(&rnp_p->lock, flags);
223 WARN_ON_ONCE(rnp->qsmask);
224 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
225 return;
226 }
227 spin_unlock(&rnp->lock);
228 }
229 local_irq_restore(flags);
230}
231
232/*
233 * Tree-preemptable RCU implementation for rcu_read_unlock().
234 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
235 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
236 * invoke rcu_read_unlock_special() to clean up after a context switch
237 * in an RCU read-side critical section and other special cases.
238 */
239void __rcu_read_unlock(void)
240{
241 struct task_struct *t = current;
242
243 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
244 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
245 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
246 rcu_read_unlock_special(t);
247}
248EXPORT_SYMBOL_GPL(__rcu_read_unlock);
249
250#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
251
252/*
253 * Scan the current list of tasks blocked within RCU read-side critical
254 * sections, printing out the tid of each.
255 */
256static void rcu_print_task_stall(struct rcu_node *rnp)
257{
258 unsigned long flags;
259 struct list_head *lp;
260 int phase = rnp->gpnum & 0x1;
261 struct task_struct *t;
262
263 if (!list_empty(&rnp->blocked_tasks[phase])) {
264 spin_lock_irqsave(&rnp->lock, flags);
265 phase = rnp->gpnum & 0x1; /* re-read under lock. */
266 lp = &rnp->blocked_tasks[phase];
267 list_for_each_entry(t, lp, rcu_node_entry)
268 printk(" P%d", t->pid);
269 spin_unlock_irqrestore(&rnp->lock, flags);
270 }
271}
272
273#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
274
275/*
276 * Check that the list of blocked tasks for the newly completed grace
277 * period is in fact empty. It is a serious bug to complete a grace
278 * period that still has RCU readers blocked! This function must be
279 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
280 * must be held by the caller.
281 */
282static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
283{
284 WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
285 WARN_ON_ONCE(rnp->qsmask);
286}
287
288/*
289 * Check for preempted RCU readers for the specified rcu_node structure.
290 * If the caller needs a reliable answer, it must hold the rcu_node's
291 * ->lock.
292 */
293static int rcu_preempted_readers(struct rcu_node *rnp)
294{
295 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
296}
297
298#ifdef CONFIG_HOTPLUG_CPU
299
300/*
301 * Handle tasklist migration for case in which all CPUs covered by the
302 * specified rcu_node have gone offline. Move them up to the root
303 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock.
306 *
307 * The caller must hold rnp->lock with irqs disabled.
308 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp,
311 struct rcu_data *rdp)
312{
313 int i;
314 struct list_head *lp;
315 struct list_head *lp_root;
316 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp;
318
319 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */
322 }
323 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1])));
326
327 /*
328 * Move tasks up to root rcu_node. Rely on the fact that the
329 * root rcu_node can be at most one ahead of the rest of the
330 * rcu_nodes in terms of ->gpnum value. This fact allows us to
331 * move the blocked_tasks[] array directly, element by element.
332 */
333 for (i = 0; i < 2; i++) {
334 lp = &rnp->blocked_tasks[i];
335 lp_root = &rnp_root->blocked_tasks[i];
336 while (!list_empty(lp)) {
337 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
338 spin_lock(&rnp_root->lock); /* irqs already disabled */
339 list_del(&tp->rcu_node_entry);
340 tp->rcu_blocked_node = rnp_root;
341 list_add(&tp->rcu_node_entry, lp_root);
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 }
344 }
345}
346
347/*
348 * Do CPU-offline processing for preemptable RCU.
349 */
350static void rcu_preempt_offline_cpu(int cpu)
351{
352 __rcu_offline_cpu(cpu, &rcu_preempt_state);
353}
354
355#endif /* #ifdef CONFIG_HOTPLUG_CPU */
356
357/*
358 * Check for a quiescent state from the current CPU. When a task blocks,
359 * the task is recorded in the corresponding CPU's rcu_node structure,
360 * which is checked elsewhere.
361 *
362 * Caller must disable hard irqs.
363 */
364static void rcu_preempt_check_callbacks(int cpu)
365{
366 struct task_struct *t = current;
367
368 if (t->rcu_read_lock_nesting == 0) {
369 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
370 rcu_preempt_qs(cpu);
371 return;
372 }
373 if (per_cpu(rcu_preempt_data, cpu).qs_pending)
374 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
375}
376
377/*
378 * Process callbacks for preemptable RCU.
379 */
380static void rcu_preempt_process_callbacks(void)
381{
382 __rcu_process_callbacks(&rcu_preempt_state,
383 &__get_cpu_var(rcu_preempt_data));
384}
385
386/*
387 * Queue a preemptable-RCU callback for invocation after a grace period.
388 */
389void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
390{
391 __call_rcu(head, func, &rcu_preempt_state);
392}
393EXPORT_SYMBOL_GPL(call_rcu);
394
395/*
396 * Check to see if there is any immediate preemptable-RCU-related work
397 * to be done.
398 */
399static int rcu_preempt_pending(int cpu)
400{
401 return __rcu_pending(&rcu_preempt_state,
402 &per_cpu(rcu_preempt_data, cpu));
403}
404
405/*
406 * Does preemptable RCU need the CPU to stay out of dynticks mode?
407 */
408static int rcu_preempt_needs_cpu(int cpu)
409{
410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
411}
412
413/*
414 * Initialize preemptable RCU's per-CPU data.
415 */
416static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
417{
418 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
419}
420
421/*
422 * Check for a task exiting while in a preemptable-RCU read-side
423 * critical section, clean up if so. No need to issue warnings,
424 * as debug_check_no_locks_held() already does this if lockdep
425 * is enabled.
426 */
427void exit_rcu(void)
428{
429 struct task_struct *t = current;
430
431 if (t->rcu_read_lock_nesting == 0)
432 return;
433 t->rcu_read_lock_nesting = 1;
434 rcu_read_unlock();
435}
436
437#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
438
439/*
440 * Tell them what RCU they are running.
441 */
442static inline void rcu_bootup_announce(void)
443{
444 printk(KERN_INFO "Hierarchical RCU implementation.\n");
445}
446
447/*
448 * Return the number of RCU batches processed thus far for debug & stats.
449 */
450long rcu_batches_completed(void)
451{
452 return rcu_batches_completed_sched();
453}
454EXPORT_SYMBOL_GPL(rcu_batches_completed);
455
456/*
457 * Because preemptable RCU does not exist, we never have to check for
458 * CPUs being in quiescent states.
459 */
460static void rcu_preempt_note_context_switch(int cpu)
461{
462}
463
464#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
465
466/*
467 * Because preemptable RCU does not exist, we never have to check for
468 * tasks blocked within RCU read-side critical sections.
469 */
470static void rcu_print_task_stall(struct rcu_node *rnp)
471{
472}
473
474#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
475
476/*
477 * Because there is no preemptable RCU, there can be no readers blocked,
478 * so there is no need to check for blocked tasks. So check only for
479 * bogus qsmask values.
480 */
481static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
482{
483 WARN_ON_ONCE(rnp->qsmask);
484}
485
486/*
487 * Because preemptable RCU does not exist, there are never any preempted
488 * RCU readers.
489 */
490static int rcu_preempted_readers(struct rcu_node *rnp)
491{
492 return 0;
493}
494
495#ifdef CONFIG_HOTPLUG_CPU
496
497/*
498 * Because preemptable RCU does not exist, it never needs to migrate
499 * tasks that were blocked within RCU read-side critical sections.
500 */
501static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
502 struct rcu_node *rnp,
503 struct rcu_data *rdp)
504{
505}
506
507/*
508 * Because preemptable RCU does not exist, it never needs CPU-offline
509 * processing.
510 */
511static void rcu_preempt_offline_cpu(int cpu)
512{
513}
514
515#endif /* #ifdef CONFIG_HOTPLUG_CPU */
516
517/*
518 * Because preemptable RCU does not exist, it never has any callbacks
519 * to check.
520 */
521void rcu_preempt_check_callbacks(int cpu)
522{
523}
524
525/*
526 * Because preemptable RCU does not exist, it never has any callbacks
527 * to process.
528 */
529void rcu_preempt_process_callbacks(void)
530{
531}
532
533/*
534 * In classic RCU, call_rcu() is just call_rcu_sched().
535 */
536void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
537{
538 call_rcu_sched(head, func);
539}
540EXPORT_SYMBOL_GPL(call_rcu);
541
542/*
543 * Because preemptable RCU does not exist, it never has any work to do.
544 */
545static int rcu_preempt_pending(int cpu)
546{
547 return 0;
548}
549
550/*
551 * Because preemptable RCU does not exist, it never needs any CPU.
552 */
553static int rcu_preempt_needs_cpu(int cpu)
554{
555 return 0;
556}
557
558/*
559 * Because preemptable RCU does not exist, there is no per-CPU
560 * data to initialize.
561 */
562static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
563{
564}
565
566#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..179e6ad80dc0 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 21 *
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 * 24 *
25 */ 25 */
26#include <linux/types.h> 26#include <linux/types.h>
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -88,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
88 return single_open(file, show_rcudata, NULL); 93 return single_open(file, show_rcudata, NULL);
89} 94}
90 95
91static struct file_operations rcudata_fops = { 96static const struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE, 97 .owner = THIS_MODULE,
93 .open = rcudata_open, 98 .open = rcudata_open,
94 .read = seq_read, 99 .read = seq_read,
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -136,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
136 return single_open(file, show_rcudata_csv, NULL); 145 return single_open(file, show_rcudata_csv, NULL);
137} 146}
138 147
139static struct file_operations rcudata_csv_fops = { 148static const struct file_operations rcudata_csv_fops = {
140 .owner = THIS_MODULE, 149 .owner = THIS_MODULE,
141 .open = rcudata_csv_open, 150 .open = rcudata_csv_open,
142 .read = seq_read, 151 .read = seq_read,
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -183,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
183 return single_open(file, show_rcuhier, NULL); 196 return single_open(file, show_rcuhier, NULL);
184} 197}
185 198
186static struct file_operations rcuhier_fops = { 199static const struct file_operations rcuhier_fops = {
187 .owner = THIS_MODULE, 200 .owner = THIS_MODULE,
188 .open = rcuhier_open, 201 .open = rcuhier_open,
189 .read = seq_read, 202 .read = seq_read,
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -205,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
205 return single_open(file, show_rcugp, NULL); 222 return single_open(file, show_rcugp, NULL);
206} 223}
207 224
208static struct file_operations rcugp_fops = { 225static const struct file_operations rcugp_fops = {
209 .owner = THIS_MODULE, 226 .owner = THIS_MODULE,
210 .open = rcugp_open, 227 .open = rcugp_open,
211 .read = seq_read, 228 .read = seq_read,
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -255,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
255 return single_open(file, show_rcu_pending, NULL); 276 return single_open(file, show_rcu_pending, NULL);
256} 277}
257 278
258static struct file_operations rcu_pending_fops = { 279static const struct file_operations rcu_pending_fops = {
259 .owner = THIS_MODULE, 280 .owner = THIS_MODULE,
260 .open = rcu_pending_open, 281 .open = rcu_pending_open,
261 .read = seq_read, 282 .read = seq_read,
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
diff --git a/kernel/relay.c b/kernel/relay.c
index bc188549788f..760c26209a3c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
60/* 60/*
61 * vm_ops for relay file mappings. 61 * vm_ops for relay file mappings.
62 */ 62 */
63static struct vm_operations_struct relay_file_mmap_ops = { 63static const struct vm_operations_struct relay_file_mmap_ops = {
64 .fault = relay_buf_fault, 64 .fault = relay_buf_fault,
65 .close = relay_file_mmap_close, 65 .close = relay_file_mmap_close,
66}; 66};
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index e1338f074314..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
19{ 19{
20 spin_lock_init(&counter->lock); 20 spin_lock_init(&counter->lock);
21 counter->limit = RESOURCE_MAX; 21 counter->limit = RESOURCE_MAX;
22 counter->soft_limit = RESOURCE_MAX;
22 counter->parent = parent; 23 counter->parent = parent;
23} 24}
24 25
@@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member)
101 return &counter->limit; 102 return &counter->limit;
102 case RES_FAILCNT: 103 case RES_FAILCNT:
103 return &counter->failcnt; 104 return &counter->failcnt;
105 case RES_SOFT_LIMIT:
106 return &counter->soft_limit;
104 }; 107 };
105 108
106 BUG(); 109 BUG();
diff --git a/kernel/resource.c b/kernel/resource.c
index 78b087221c15..fb11a58b9594 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
223 223
224EXPORT_SYMBOL(release_resource); 224EXPORT_SYMBOL(release_resource);
225 225
226#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) 226#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
227/* 227/*
228 * Finds the lowest memory resource that exists within [res->start, res->end) 228 * Finds the lowest memory resource that exists within [res->start, res->end)
229 * the caller must specify res->start, res->end, res->flags. 229 * the caller must specify res->start, res->end, res->flags and "name".
230 * If found, returns 0, res is overwritten, if not found, returns -1. 230 * If found, returns 0, res is overwritten, if not found, returns -1.
231 */ 231 */
232static int find_next_system_ram(struct resource *res) 232static int find_next_system_ram(struct resource *res, char *name)
233{ 233{
234 resource_size_t start, end; 234 resource_size_t start, end;
235 struct resource *p; 235 struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
245 /* system ram is just marked as IORESOURCE_MEM */ 245 /* system ram is just marked as IORESOURCE_MEM */
246 if (p->flags != res->flags) 246 if (p->flags != res->flags)
247 continue; 247 continue;
248 if (name && strcmp(p->name, name))
249 continue;
248 if (p->start > end) { 250 if (p->start > end) {
249 p = NULL; 251 p = NULL;
250 break; 252 break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
262 res->end = p->end; 264 res->end = p->end;
263 return 0; 265 return 0;
264} 266}
265int 267
266walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, 268/*
267 int (*func)(unsigned long, unsigned long, void *)) 269 * This function calls callback against all memory range of "System RAM"
270 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
271 * Now, this function is only for "System RAM".
272 */
273int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *))
268{ 275{
269 struct resource res; 276 struct resource res;
270 unsigned long pfn, len; 277 unsigned long pfn, len;
271 u64 orig_end; 278 u64 orig_end;
272 int ret = -1; 279 int ret = -1;
280
273 res.start = (u64) start_pfn << PAGE_SHIFT; 281 res.start = (u64) start_pfn << PAGE_SHIFT;
274 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 282 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
275 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 283 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
276 orig_end = res.end; 284 orig_end = res.end;
277 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { 285 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) {
278 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 287 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
279 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
280 ret = (*func)(pfn, len, arg); 289 ret = (*func)(pfn, len, arg);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..1535f3884b88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 376
402#else 377#else
403 378
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
413{ 381{
@@ -537,14 +505,6 @@ struct root_domain {
537#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
538 struct cpupri cpupri; 506 struct cpupri cpupri;
539#endif 507#endif
540#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
541 /*
542 * Preferred wake up cpu nominated by sched_mc balance that will be
543 * used when most cpus are idle in the system indicating overall very
544 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
545 */
546 unsigned int sched_mc_preferred_wakeup_cpu;
547#endif
548}; 508};
549 509
550/* 510/*
@@ -616,6 +576,7 @@ struct rq {
616 576
617 unsigned char idle_at_tick; 577 unsigned char idle_at_tick;
618 /* For active balancing */ 578 /* For active balancing */
579 int post_schedule;
619 int active_balance; 580 int active_balance;
620 int push_cpu; 581 int push_cpu;
621 /* cpu of this runqueue: */ 582 /* cpu of this runqueue: */
@@ -626,6 +587,9 @@ struct rq {
626 587
627 struct task_struct *migration_thread; 588 struct task_struct *migration_thread;
628 struct list_head migration_queue; 589 struct list_head migration_queue;
590
591 u64 rt_avg;
592 u64 age_stamp;
629#endif 593#endif
630 594
631 /* calc_load related fields */ 595 /* calc_load related fields */
@@ -665,9 +629,10 @@ struct rq {
665 629
666static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
667 631
668static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
669{ 634{
670 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
671} 636}
672 637
673static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 658#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 659#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 660#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
661#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 662
697inline void update_rq_clock(struct rq *rq) 663inline void update_rq_clock(struct rq *rq)
698{ 664{
@@ -715,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
715 * This interface allows printk to be called with the runqueue lock 681 * This interface allows printk to be called with the runqueue lock
716 * held and know whether or not it is OK to wake up the klogd. 682 * held and know whether or not it is OK to wake up the klogd.
717 */ 683 */
718int runqueue_is_locked(void) 684int runqueue_is_locked(int cpu)
719{ 685{
720 int cpu = get_cpu(); 686 return spin_is_locked(&cpu_rq(cpu)->lock);
721 struct rq *rq = cpu_rq(cpu);
722 int ret;
723
724 ret = spin_is_locked(&rq->lock);
725 put_cpu();
726 return ret;
727} 687}
728 688
729/* 689/*
@@ -820,7 +780,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
820 return single_open(filp, sched_feat_show, NULL); 780 return single_open(filp, sched_feat_show, NULL);
821} 781}
822 782
823static struct file_operations sched_feat_fops = { 783static const struct file_operations sched_feat_fops = {
824 .open = sched_feat_open, 784 .open = sched_feat_open,
825 .write = sched_feat_write, 785 .write = sched_feat_write,
826 .read = seq_read, 786 .read = seq_read,
@@ -861,6 +821,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 821unsigned int sysctl_sched_shares_thresh = 4;
862 822
863/* 823/*
824 * period over which we average the RT time consumption, measured
825 * in ms.
826 *
827 * default: 1s
828 */
829const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
830
831/*
864 * period over which we measure -rt task cpu usage in us. 832 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 833 * default: 1s
866 */ 834 */
@@ -1278,12 +1246,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1246}
1279#endif /* CONFIG_NO_HZ */ 1247#endif /* CONFIG_NO_HZ */
1280 1248
1249static u64 sched_avg_period(void)
1250{
1251 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1252}
1253
1254static void sched_avg_update(struct rq *rq)
1255{
1256 s64 period = sched_avg_period();
1257
1258 while ((s64)(rq->clock - rq->age_stamp) > period) {
1259 rq->age_stamp += period;
1260 rq->rt_avg /= 2;
1261 }
1262}
1263
1264static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1265{
1266 rq->rt_avg += rt_delta;
1267 sched_avg_update(rq);
1268}
1269
1281#else /* !CONFIG_SMP */ 1270#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1271static void resched_task(struct task_struct *p)
1283{ 1272{
1284 assert_spin_locked(&task_rq(p)->lock); 1273 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1274 set_tsk_need_resched(p);
1286} 1275}
1276
1277static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1278{
1279}
1287#endif /* CONFIG_SMP */ 1280#endif /* CONFIG_SMP */
1288 1281
1289#if BITS_PER_LONG == 32 1282#if BITS_PER_LONG == 32
@@ -1494,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data)
1494#endif 1487#endif
1495 1488
1496#ifdef CONFIG_SMP 1489#ifdef CONFIG_SMP
1497static unsigned long source_load(int cpu, int type); 1490/* Used instead of source_load when we know the type == 0 */
1498static unsigned long target_load(int cpu, int type); 1491static unsigned long weighted_cpuload(const int cpu)
1492{
1493 return cpu_rq(cpu)->load.weight;
1494}
1495
1496/*
1497 * Return a low guess at the load of a migration-source cpu weighted
1498 * according to the scheduling class and "nice" value.
1499 *
1500 * We want to under-estimate the load of migration sources, to
1501 * balance conservatively.
1502 */
1503static unsigned long source_load(int cpu, int type)
1504{
1505 struct rq *rq = cpu_rq(cpu);
1506 unsigned long total = weighted_cpuload(cpu);
1507
1508 if (type == 0 || !sched_feat(LB_BIAS))
1509 return total;
1510
1511 return min(rq->cpu_load[type-1], total);
1512}
1513
1514/*
1515 * Return a high guess at the load of a migration-target cpu weighted
1516 * according to the scheduling class and "nice" value.
1517 */
1518static unsigned long target_load(int cpu, int type)
1519{
1520 struct rq *rq = cpu_rq(cpu);
1521 unsigned long total = weighted_cpuload(cpu);
1522
1523 if (type == 0 || !sched_feat(LB_BIAS))
1524 return total;
1525
1526 return max(rq->cpu_load[type-1], total);
1527}
1528
1529static struct sched_group *group_of(int cpu)
1530{
1531 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1532
1533 if (!sd)
1534 return NULL;
1535
1536 return sd->groups;
1537}
1538
1539static unsigned long power_of(int cpu)
1540{
1541 struct sched_group *group = group_of(cpu);
1542
1543 if (!group)
1544 return SCHED_LOAD_SCALE;
1545
1546 return group->cpu_power;
1547}
1548
1499static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1549static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1500 1550
1501static unsigned long cpu_avg_load_per_task(int cpu) 1551static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1513,28 +1563,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1563
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1564#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1565
1566struct update_shares_data {
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1572static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1573
1518/* 1574/*
1519 * Calculate and set the cpu's group shares. 1575 * Calculate and set the cpu's group shares.
1520 */ 1576 */
1521static void 1577static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1578 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1579 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd)
1524{ 1581{
1525 unsigned long shares; 1582 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1583 int boost = 0;
1527
1528 if (!tg->se[cpu])
1529 return;
1530 1584
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1585 rq_weight = usd->rq_weight[cpu];
1586 if (!rq_weight) {
1587 boost = 1;
1588 rq_weight = NICE_0_LOAD;
1589 }
1532 1590
1533 /* 1591 /*
1534 * \Sum shares * rq_weight 1592 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1593 * shares_i = -----------------------------
1536 * \Sum rq_weight 1594 * \Sum_j rq_weight_j
1537 *
1538 */ 1595 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1596 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1597 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1602,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1602 unsigned long flags;
1546 1603
1547 spin_lock_irqsave(&rq->lock, flags); 1604 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1605 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1606 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1607 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1608 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1609 }
@@ -1559,22 +1616,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1616 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1617static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1618{
1562 unsigned long weight, rq_weight = 0; 1619 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1620 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1621 struct sched_domain *sd = data;
1622 unsigned long flags;
1565 int i; 1623 int i;
1566 1624
1625 if (!tg->se[0])
1626 return 0;
1627
1628 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data);
1630
1567 for_each_cpu(i, sched_domain_span(sd)) { 1631 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight;
1634
1568 /* 1635 /*
1569 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1638 * run here it will not get delayed by group starvation.
1572 */ 1639 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1640 if (!weight)
1575 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1576 1642
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1643 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1580 } 1645 }
@@ -1586,7 +1651,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1651 shares = tg->shares;
1587 1652
1588 for_each_cpu(i, sched_domain_span(sd)) 1653 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1655
1656 local_irq_restore(flags);
1590 1657
1591 return 0; 1658 return 0;
1592} 1659}
@@ -1616,8 +1683,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1683
1617static void update_shares(struct sched_domain *sd) 1684static void update_shares(struct sched_domain *sd)
1618{ 1685{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1686 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1687 u64 now;
1688
1689 if (root_task_group_empty())
1690 return;
1691
1692 now = cpu_clock(raw_smp_processor_id());
1693 elapsed = now - sd->last_update;
1621 1694
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1695 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1696 sd->last_update = now;
@@ -1627,6 +1700,9 @@ static void update_shares(struct sched_domain *sd)
1627 1700
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1701static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1702{
1703 if (root_task_group_empty())
1704 return;
1705
1630 spin_unlock(&rq->lock); 1706 spin_unlock(&rq->lock);
1631 update_shares(sd); 1707 update_shares(sd);
1632 spin_lock(&rq->lock); 1708 spin_lock(&rq->lock);
@@ -1634,6 +1710,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1710
1635static void update_h_load(long cpu) 1711static void update_h_load(long cpu)
1636{ 1712{
1713 if (root_task_group_empty())
1714 return;
1715
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1716 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1717}
1639 1718
@@ -1651,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1651 1730
1652#ifdef CONFIG_PREEMPT 1731#ifdef CONFIG_PREEMPT
1653 1732
1733static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1734
1654/* 1735/*
1655 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1736 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1656 * way at the expense of forcing extra atomic operations in all 1737 * way at the expense of forcing extra atomic operations in all
@@ -1915,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1915} 1996}
1916 1997
1917#ifdef CONFIG_SMP 1998#ifdef CONFIG_SMP
1918
1919/* Used instead of source_load when we know the type == 0 */
1920static unsigned long weighted_cpuload(const int cpu)
1921{
1922 return cpu_rq(cpu)->load.weight;
1923}
1924
1925/* 1999/*
1926 * Is this task likely cache-hot: 2000 * Is this task likely cache-hot:
1927 */ 2001 */
@@ -1979,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1979 if (task_hot(p, old_rq->clock, NULL)) 2053 if (task_hot(p, old_rq->clock, NULL))
1980 schedstat_inc(p, se.nr_forced2_migrations); 2054 schedstat_inc(p, se.nr_forced2_migrations);
1981#endif 2055#endif
1982 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1983 1, 1, NULL, 0); 2057 1, 1, NULL, 0);
1984 } 2058 }
1985 p->se.vruntime -= old_cfsrq->min_vruntime - 2059 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2195,186 +2269,6 @@ void kick_process(struct task_struct *p)
2195 preempt_enable(); 2269 preempt_enable();
2196} 2270}
2197EXPORT_SYMBOL_GPL(kick_process); 2271EXPORT_SYMBOL_GPL(kick_process);
2198
2199/*
2200 * Return a low guess at the load of a migration-source cpu weighted
2201 * according to the scheduling class and "nice" value.
2202 *
2203 * We want to under-estimate the load of migration sources, to
2204 * balance conservatively.
2205 */
2206static unsigned long source_load(int cpu, int type)
2207{
2208 struct rq *rq = cpu_rq(cpu);
2209 unsigned long total = weighted_cpuload(cpu);
2210
2211 if (type == 0 || !sched_feat(LB_BIAS))
2212 return total;
2213
2214 return min(rq->cpu_load[type-1], total);
2215}
2216
2217/*
2218 * Return a high guess at the load of a migration-target cpu weighted
2219 * according to the scheduling class and "nice" value.
2220 */
2221static unsigned long target_load(int cpu, int type)
2222{
2223 struct rq *rq = cpu_rq(cpu);
2224 unsigned long total = weighted_cpuload(cpu);
2225
2226 if (type == 0 || !sched_feat(LB_BIAS))
2227 return total;
2228
2229 return max(rq->cpu_load[type-1], total);
2230}
2231
2232/*
2233 * find_idlest_group finds and returns the least busy CPU group within the
2234 * domain.
2235 */
2236static struct sched_group *
2237find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2238{
2239 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2240 unsigned long min_load = ULONG_MAX, this_load = 0;
2241 int load_idx = sd->forkexec_idx;
2242 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2243
2244 do {
2245 unsigned long load, avg_load;
2246 int local_group;
2247 int i;
2248
2249 /* Skip over this group if it has no CPUs allowed */
2250 if (!cpumask_intersects(sched_group_cpus(group),
2251 &p->cpus_allowed))
2252 continue;
2253
2254 local_group = cpumask_test_cpu(this_cpu,
2255 sched_group_cpus(group));
2256
2257 /* Tally up the load of all CPUs in the group */
2258 avg_load = 0;
2259
2260 for_each_cpu(i, sched_group_cpus(group)) {
2261 /* Bias balancing toward cpus of our domain */
2262 if (local_group)
2263 load = source_load(i, load_idx);
2264 else
2265 load = target_load(i, load_idx);
2266
2267 avg_load += load;
2268 }
2269
2270 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group,
2272 avg_load * SCHED_LOAD_SCALE);
2273
2274 if (local_group) {
2275 this_load = avg_load;
2276 this = group;
2277 } else if (avg_load < min_load) {
2278 min_load = avg_load;
2279 idlest = group;
2280 }
2281 } while (group = group->next, group != sd->groups);
2282
2283 if (!idlest || 100*this_load < imbalance*min_load)
2284 return NULL;
2285 return idlest;
2286}
2287
2288/*
2289 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2290 */
2291static int
2292find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2293{
2294 unsigned long load, min_load = ULONG_MAX;
2295 int idlest = -1;
2296 int i;
2297
2298 /* Traverse only the allowed CPUs */
2299 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2300 load = weighted_cpuload(i);
2301
2302 if (load < min_load || (load == min_load && i == this_cpu)) {
2303 min_load = load;
2304 idlest = i;
2305 }
2306 }
2307
2308 return idlest;
2309}
2310
2311/*
2312 * sched_balance_self: balance the current task (running on cpu) in domains
2313 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2314 * SD_BALANCE_EXEC.
2315 *
2316 * Balance, ie. select the least loaded group.
2317 *
2318 * Returns the target CPU number, or the same CPU if no balancing is needed.
2319 *
2320 * preempt must be disabled.
2321 */
2322static int sched_balance_self(int cpu, int flag)
2323{
2324 struct task_struct *t = current;
2325 struct sched_domain *tmp, *sd = NULL;
2326
2327 for_each_domain(cpu, tmp) {
2328 /*
2329 * If power savings logic is enabled for a domain, stop there.
2330 */
2331 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2332 break;
2333 if (tmp->flags & flag)
2334 sd = tmp;
2335 }
2336
2337 if (sd)
2338 update_shares(sd);
2339
2340 while (sd) {
2341 struct sched_group *group;
2342 int new_cpu, weight;
2343
2344 if (!(sd->flags & flag)) {
2345 sd = sd->child;
2346 continue;
2347 }
2348
2349 group = find_idlest_group(sd, t, cpu);
2350 if (!group) {
2351 sd = sd->child;
2352 continue;
2353 }
2354
2355 new_cpu = find_idlest_cpu(group, t, cpu);
2356 if (new_cpu == -1 || new_cpu == cpu) {
2357 /* Now try balancing at a lower domain level of cpu */
2358 sd = sd->child;
2359 continue;
2360 }
2361
2362 /* Now try balancing at a lower domain level of new_cpu */
2363 cpu = new_cpu;
2364 weight = cpumask_weight(sched_domain_span(sd));
2365 sd = NULL;
2366 for_each_domain(cpu, tmp) {
2367 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2368 break;
2369 if (tmp->flags & flag)
2370 sd = tmp;
2371 }
2372 /* while loop will break here if sd == NULL */
2373 }
2374
2375 return cpu;
2376}
2377
2378#endif /* CONFIG_SMP */ 2272#endif /* CONFIG_SMP */
2379 2273
2380/** 2274/**
@@ -2412,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p,
2412 * 2306 *
2413 * returns failure only if the task is already active. 2307 * returns failure only if the task is already active.
2414 */ 2308 */
2415static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2309static int try_to_wake_up(struct task_struct *p, unsigned int state,
2310 int wake_flags)
2416{ 2311{
2417 int cpu, orig_cpu, this_cpu, success = 0; 2312 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags; 2313 unsigned long flags;
2419 long old_state;
2420 struct rq *rq; 2314 struct rq *rq;
2421 2315
2422 if (!sched_feat(SYNC_WAKEUPS)) 2316 if (!sched_feat(SYNC_WAKEUPS))
2423 sync = 0; 2317 wake_flags &= ~WF_SYNC;
2424 2318
2425#ifdef CONFIG_SMP 2319 this_cpu = get_cpu();
2426 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2427 struct sched_domain *sd;
2428
2429 this_cpu = raw_smp_processor_id();
2430 cpu = task_cpu(p);
2431
2432 for_each_domain(this_cpu, sd) {
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 update_shares(sd);
2435 break;
2436 }
2437 }
2438 }
2439#endif
2440 2320
2441 smp_wmb(); 2321 smp_wmb();
2442 rq = task_rq_lock(p, &flags); 2322 rq = task_rq_lock(p, &flags);
2443 update_rq_clock(rq); 2323 update_rq_clock(rq);
2444 old_state = p->state; 2324 if (!(p->state & state))
2445 if (!(old_state & state))
2446 goto out; 2325 goto out;
2447 2326
2448 if (p->se.on_rq) 2327 if (p->se.on_rq)
@@ -2450,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2450 2329
2451 cpu = task_cpu(p); 2330 cpu = task_cpu(p);
2452 orig_cpu = cpu; 2331 orig_cpu = cpu;
2453 this_cpu = smp_processor_id();
2454 2332
2455#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2456 if (unlikely(task_running(rq, p))) 2334 if (unlikely(task_running(rq, p)))
2457 goto out_activate; 2335 goto out_activate;
2458 2336
2459 cpu = p->sched_class->select_task_rq(p, sync); 2337 /*
2460 if (cpu != orig_cpu) { 2338 * In order to handle concurrent wakeups and release the rq->lock
2339 * we put the task in TASK_WAKING state.
2340 *
2341 * First fix up the nr_uninterruptible count:
2342 */
2343 if (task_contributes_to_load(p))
2344 rq->nr_uninterruptible--;
2345 p->state = TASK_WAKING;
2346 task_rq_unlock(rq, &flags);
2347
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu)
2461 set_task_cpu(p, cpu); 2350 set_task_cpu(p, cpu);
2462 task_rq_unlock(rq, &flags);
2463 /* might preempt at this point */
2464 rq = task_rq_lock(p, &flags);
2465 old_state = p->state;
2466 if (!(old_state & state))
2467 goto out;
2468 if (p->se.on_rq)
2469 goto out_running;
2470 2351
2471 this_cpu = smp_processor_id(); 2352 rq = task_rq_lock(p, &flags);
2472 cpu = task_cpu(p); 2353 WARN_ON(p->state != TASK_WAKING);
2473 } 2354 cpu = task_cpu(p);
2474 2355
2475#ifdef CONFIG_SCHEDSTATS 2356#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count); 2357 schedstat_inc(rq, ttwu_count);
@@ -2490,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2490out_activate: 2371out_activate:
2491#endif /* CONFIG_SMP */ 2372#endif /* CONFIG_SMP */
2492 schedstat_inc(p, se.nr_wakeups); 2373 schedstat_inc(p, se.nr_wakeups);
2493 if (sync) 2374 if (wake_flags & WF_SYNC)
2494 schedstat_inc(p, se.nr_wakeups_sync); 2375 schedstat_inc(p, se.nr_wakeups_sync);
2495 if (orig_cpu != cpu) 2376 if (orig_cpu != cpu)
2496 schedstat_inc(p, se.nr_wakeups_migrate); 2377 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2519,7 +2400,7 @@ out_activate:
2519 2400
2520out_running: 2401out_running:
2521 trace_sched_wakeup(rq, p, success); 2402 trace_sched_wakeup(rq, p, success);
2522 check_preempt_curr(rq, p, sync); 2403 check_preempt_curr(rq, p, wake_flags);
2523 2404
2524 p->state = TASK_RUNNING; 2405 p->state = TASK_RUNNING;
2525#ifdef CONFIG_SMP 2406#ifdef CONFIG_SMP
@@ -2528,6 +2409,7 @@ out_running:
2528#endif 2409#endif
2529out: 2410out:
2530 task_rq_unlock(rq, &flags); 2411 task_rq_unlock(rq, &flags);
2412 put_cpu();
2531 2413
2532 return success; 2414 return success;
2533} 2415}
@@ -2570,6 +2452,7 @@ static void __sched_fork(struct task_struct *p)
2570 p->se.avg_overlap = 0; 2452 p->se.avg_overlap = 0;
2571 p->se.start_runtime = 0; 2453 p->se.start_runtime = 0;
2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2454 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2455 p->se.avg_running = 0;
2573 2456
2574#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2575 p->se.wait_start = 0; 2458 p->se.wait_start = 0;
@@ -2631,18 +2514,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
2631 2514
2632 __sched_fork(p); 2515 __sched_fork(p);
2633 2516
2634#ifdef CONFIG_SMP
2635 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2636#endif
2637 set_task_cpu(p, cpu);
2638
2639 /* 2517 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2518 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2519 */
2642 p->prio = current->normal_prio; 2520 p->prio = current->normal_prio;
2521
2522 /*
2523 * Revert to default priority/policy on fork if requested.
2524 */
2525 if (unlikely(p->sched_reset_on_fork)) {
2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2527 p->policy = SCHED_NORMAL;
2528
2529 if (p->normal_prio < DEFAULT_PRIO)
2530 p->prio = DEFAULT_PRIO;
2531
2532 if (PRIO_TO_NICE(p->static_prio) < 0) {
2533 p->static_prio = NICE_TO_PRIO(0);
2534 set_load_weight(p);
2535 }
2536
2537 /*
2538 * We don't need the reset flag anymore after the fork. It has
2539 * fulfilled its duty:
2540 */
2541 p->sched_reset_on_fork = 0;
2542 }
2543
2643 if (!rt_prio(p->prio)) 2544 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2545 p->sched_class = &fair_sched_class;
2645 2546
2547#ifdef CONFIG_SMP
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2549#endif
2550 set_task_cpu(p, cpu);
2551
2646#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2552#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2647 if (likely(sched_info_on())) 2553 if (likely(sched_info_on()))
2648 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2554 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2688,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2688 inc_nr_running(rq); 2594 inc_nr_running(rq);
2689 } 2595 }
2690 trace_sched_wakeup_new(rq, p, 1); 2596 trace_sched_wakeup_new(rq, p, 1);
2691 check_preempt_curr(rq, p, 0); 2597 check_preempt_curr(rq, p, WF_FORK);
2692#ifdef CONFIG_SMP 2598#ifdef CONFIG_SMP
2693 if (p->sched_class->task_wake_up) 2599 if (p->sched_class->task_wake_up)
2694 p->sched_class->task_wake_up(rq, p); 2600 p->sched_class->task_wake_up(rq, p);
@@ -2796,12 +2702,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2702{
2797 struct mm_struct *mm = rq->prev_mm; 2703 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2704 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2705
2806 rq->prev_mm = NULL; 2706 rq->prev_mm = NULL;
2807 2707
@@ -2818,12 +2718,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2818 */ 2718 */
2819 prev_state = prev->state; 2719 prev_state = prev->state;
2820 finish_arch_switch(prev); 2720 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2721 perf_event_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2722 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2723
2828 fire_sched_in_preempt_notifiers(current); 2724 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2725 if (mm)
@@ -2838,6 +2734,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2734 }
2839} 2735}
2840 2736
2737#ifdef CONFIG_SMP
2738
2739/* assumes rq->lock is held */
2740static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2741{
2742 if (prev->sched_class->pre_schedule)
2743 prev->sched_class->pre_schedule(rq, prev);
2744}
2745
2746/* rq->lock is NOT held, but preemption is disabled */
2747static inline void post_schedule(struct rq *rq)
2748{
2749 if (rq->post_schedule) {
2750 unsigned long flags;
2751
2752 spin_lock_irqsave(&rq->lock, flags);
2753 if (rq->curr->sched_class->post_schedule)
2754 rq->curr->sched_class->post_schedule(rq);
2755 spin_unlock_irqrestore(&rq->lock, flags);
2756
2757 rq->post_schedule = 0;
2758 }
2759}
2760
2761#else
2762
2763static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2764{
2765}
2766
2767static inline void post_schedule(struct rq *rq)
2768{
2769}
2770
2771#endif
2772
2841/** 2773/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2774 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2775 * @prev: the thread we just switched away from.
@@ -2848,6 +2780,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2780 struct rq *rq = this_rq();
2849 2781
2850 finish_task_switch(rq, prev); 2782 finish_task_switch(rq, prev);
2783
2784 /*
2785 * FIXME: do we need to worry about rq being invalidated by the
2786 * task_switch?
2787 */
2788 post_schedule(rq);
2789
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2790#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2791 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2792 preempt_enable();
@@ -2965,6 +2904,19 @@ unsigned long nr_iowait(void)
2965 return sum; 2904 return sum;
2966} 2905}
2967 2906
2907unsigned long nr_iowait_cpu(void)
2908{
2909 struct rq *this = this_rq();
2910 return atomic_read(&this->nr_iowait);
2911}
2912
2913unsigned long this_cpu_load(void)
2914{
2915 struct rq *this = this_rq();
2916 return this->cpu_load[0];
2917}
2918
2919
2968/* Variables and functions for calc_load */ 2920/* Variables and functions for calc_load */
2969static atomic_long_t calc_load_tasks; 2921static atomic_long_t calc_load_tasks;
2970static unsigned long calc_load_update; 2922static unsigned long calc_load_update;
@@ -3164,7 +3116,7 @@ out:
3164void sched_exec(void) 3116void sched_exec(void)
3165{ 3117{
3166 int new_cpu, this_cpu = get_cpu(); 3118 int new_cpu, this_cpu = get_cpu();
3167 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3119 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3168 put_cpu(); 3120 put_cpu();
3169 if (new_cpu != this_cpu) 3121 if (new_cpu != this_cpu)
3170 sched_migrate_task(current, new_cpu); 3122 sched_migrate_task(current, new_cpu);
@@ -3379,9 +3331,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3331{
3380 const struct sched_class *class; 3332 const struct sched_class *class;
3381 3333
3382 for (class = sched_class_highest; class; class = class->next) 3334 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3335 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3336 return 1;
3337 }
3385 3338
3386 return 0; 3339 return 0;
3387} 3340}
@@ -3544,7 +3497,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3497 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3498 * from other group and save more power
3546 */ 3499 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3500 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3501 return;
3549 3502
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3503 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3583,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3583 *imbalance = sds->min_load_per_task; 3536 *imbalance = sds->min_load_per_task;
3584 sds->busiest = sds->group_min; 3537 sds->busiest = sds->group_min;
3585 3538
3586 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3587 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3588 group_first_cpu(sds->group_leader);
3589 }
3590
3591 return 1; 3539 return 1;
3592 3540
3593} 3541}
@@ -3612,6 +3560,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3560#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3561
3614 3562
3563unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3564{
3565 return SCHED_LOAD_SCALE;
3566}
3567
3568unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3569{
3570 return default_scale_freq_power(sd, cpu);
3571}
3572
3573unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3574{
3575 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3576 unsigned long smt_gain = sd->smt_gain;
3577
3578 smt_gain /= weight;
3579
3580 return smt_gain;
3581}
3582
3583unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3584{
3585 return default_scale_smt_power(sd, cpu);
3586}
3587
3588unsigned long scale_rt_power(int cpu)
3589{
3590 struct rq *rq = cpu_rq(cpu);
3591 u64 total, available;
3592
3593 sched_avg_update(rq);
3594
3595 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3596 available = total - rq->rt_avg;
3597
3598 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3599 total = SCHED_LOAD_SCALE;
3600
3601 total >>= SCHED_LOAD_SHIFT;
3602
3603 return div_u64(available, total);
3604}
3605
3606static void update_cpu_power(struct sched_domain *sd, int cpu)
3607{
3608 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3609 unsigned long power = SCHED_LOAD_SCALE;
3610 struct sched_group *sdg = sd->groups;
3611
3612 if (sched_feat(ARCH_POWER))
3613 power *= arch_scale_freq_power(sd, cpu);
3614 else
3615 power *= default_scale_freq_power(sd, cpu);
3616
3617 power >>= SCHED_LOAD_SHIFT;
3618
3619 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3620 if (sched_feat(ARCH_POWER))
3621 power *= arch_scale_smt_power(sd, cpu);
3622 else
3623 power *= default_scale_smt_power(sd, cpu);
3624
3625 power >>= SCHED_LOAD_SHIFT;
3626 }
3627
3628 power *= scale_rt_power(cpu);
3629 power >>= SCHED_LOAD_SHIFT;
3630
3631 if (!power)
3632 power = 1;
3633
3634 sdg->cpu_power = power;
3635}
3636
3637static void update_group_power(struct sched_domain *sd, int cpu)
3638{
3639 struct sched_domain *child = sd->child;
3640 struct sched_group *group, *sdg = sd->groups;
3641 unsigned long power;
3642
3643 if (!child) {
3644 update_cpu_power(sd, cpu);
3645 return;
3646 }
3647
3648 power = 0;
3649
3650 group = child->groups;
3651 do {
3652 power += group->cpu_power;
3653 group = group->next;
3654 } while (group != child->groups);
3655
3656 sdg->cpu_power = power;
3657}
3658
3615/** 3659/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3660 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3617 * @group: sched_group whose statistics are to be updated. 3661 * @group: sched_group whose statistics are to be updated.
@@ -3624,7 +3668,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3668 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3669 * @sgs: variable to hold the statistics for this group.
3626 */ 3670 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3671static inline void update_sg_lb_stats(struct sched_domain *sd,
3672 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3673 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3674 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3675 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3680,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3680 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3681 unsigned long avg_load_per_task;
3637 3682
3638 if (local_group) 3683 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3684 balance_cpu = group_first_cpu(group);
3685 if (balance_cpu == this_cpu)
3686 update_group_power(sd, this_cpu);
3687 }
3640 3688
3641 /* Tally up the load of all CPUs in the group */ 3689 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3690 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3733,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3733 }
3686 3734
3687 /* Adjust by relative CPU power of the group */ 3735 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3737
3691 3738
3692 /* 3739 /*
@@ -3698,14 +3745,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3745 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3746 * the hierarchy?
3700 */ 3747 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3749 group->cpu_power;
3703 3750
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3752 sgs->group_imb = 1;
3706 3753
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3754 sgs->group_capacity =
3708 3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3756}
3710 3757
3711/** 3758/**
@@ -3723,9 +3770,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3770 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3771 struct sd_lb_stats *sds)
3725{ 3772{
3773 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3774 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3775 struct sg_lb_stats sgs;
3728 int load_idx; 3776 int load_idx, prefer_sibling = 0;
3777
3778 if (child && child->flags & SD_PREFER_SIBLING)
3779 prefer_sibling = 1;
3729 3780
3730 init_sd_power_savings_stats(sd, sds, idle); 3781 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3782 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3787,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3787 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3788 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3791 local_group, cpus, balance, &sgs);
3741 3792
3742 if (local_group && balance && !(*balance)) 3793 if (local_group && balance && !(*balance))
3743 return; 3794 return;
3744 3795
3745 sds->total_load += sgs.group_load; 3796 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3797 sds->total_pwr += group->cpu_power;
3798
3799 /*
3800 * In case the child domain prefers tasks go to siblings
3801 * first, lower the group capacity to one so that we'll try
3802 * and move all the excess tasks away.
3803 */
3804 if (prefer_sibling)
3805 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3806
3748 if (local_group) { 3807 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3808 sds->this_load = sgs.avg_load;
@@ -3763,7 +3822,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3822 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3823 group = group->next;
3765 } while (group != sd->groups); 3824 } while (group != sd->groups);
3766
3767} 3825}
3768 3826
3769/** 3827/**
@@ -3801,28 +3859,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3859 * moving them.
3802 */ 3860 */
3803 3861
3804 pwr_now += sds->busiest->__cpu_power * 3862 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3863 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3864 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3865 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3866 pwr_now /= SCHED_LOAD_SCALE;
3809 3867
3810 /* Amount of load we'd subtract */ 3868 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3870 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3871 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3872 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 3873 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 3874
3817 /* Amount of load we'd add */ 3875 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 3876 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 3878 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 3879 sds->this->cpu_power;
3822 else 3880 else
3823 tmp = sg_div_cpu_power(sds->this, 3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3882 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 3883 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 3884 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 3885 pwr_move /= SCHED_LOAD_SCALE;
3828 3886
@@ -3857,8 +3915,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 3915 sds->max_load - sds->busiest_load_per_task);
3858 3916
3859 /* How much load to actually move to equalise the imbalance */ 3917 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3918 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 3920 / SCHED_LOAD_SCALE;
3863 3921
3864 /* 3922 /*
@@ -3988,15 +4046,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4046 int i;
3989 4047
3990 for_each_cpu(i, sched_group_cpus(group)) { 4048 for_each_cpu(i, sched_group_cpus(group)) {
4049 unsigned long power = power_of(i);
4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4051 unsigned long wl;
3992 4052
3993 if (!cpumask_test_cpu(i, cpus)) 4053 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4054 continue;
3995 4055
3996 rq = cpu_rq(i); 4056 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4058 wl /= power;
3998 4059
3999 if (rq->nr_running == 1 && wl > imbalance) 4060 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4061 continue;
4001 4062
4002 if (wl > max_load) { 4063 if (wl > max_load) {
@@ -5031,17 +5092,16 @@ void account_idle_time(cputime_t cputime)
5031 */ 5092 */
5032void account_process_tick(struct task_struct *p, int user_tick) 5093void account_process_tick(struct task_struct *p, int user_tick)
5033{ 5094{
5034 cputime_t one_jiffy = jiffies_to_cputime(1); 5095 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5035 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5036 struct rq *rq = this_rq(); 5096 struct rq *rq = this_rq();
5037 5097
5038 if (user_tick) 5098 if (user_tick)
5039 account_user_time(p, one_jiffy, one_jiffy_scaled); 5099 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5040 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5100 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5041 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 5101 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5042 one_jiffy_scaled); 5102 one_jiffy_scaled);
5043 else 5103 else
5044 account_idle_time(one_jiffy); 5104 account_idle_time(cputime_one_jiffy);
5045} 5105}
5046 5106
5047/* 5107/*
@@ -5145,7 +5205,7 @@ void scheduler_tick(void)
5145 curr->sched_class->task_tick(rq, curr, 0); 5205 curr->sched_class->task_tick(rq, curr, 0);
5146 spin_unlock(&rq->lock); 5206 spin_unlock(&rq->lock);
5147 5207
5148 perf_counter_task_tick(curr, cpu); 5208 perf_event_task_tick(curr, cpu);
5149 5209
5150#ifdef CONFIG_SMP 5210#ifdef CONFIG_SMP
5151 rq->idle_at_tick = idle_cpu(cpu); 5211 rq->idle_at_tick = idle_cpu(cpu);
@@ -5257,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev)
5257#endif 5317#endif
5258} 5318}
5259 5319
5260static void put_prev_task(struct rq *rq, struct task_struct *prev) 5320static void put_prev_task(struct rq *rq, struct task_struct *p)
5261{ 5321{
5262 if (prev->state == TASK_RUNNING) { 5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5263 u64 runtime = prev->se.sum_exec_runtime;
5264 5323
5265 runtime -= prev->se.prev_sum_exec_runtime; 5324 update_avg(&p->se.avg_running, runtime);
5266 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5267 5325
5326 if (p->state == TASK_RUNNING) {
5268 /* 5327 /*
5269 * In order to avoid avg_overlap growing stale when we are 5328 * In order to avoid avg_overlap growing stale when we are
5270 * indeed overlapping and hence not getting put to sleep, grow 5329 * indeed overlapping and hence not getting put to sleep, grow
@@ -5274,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5274 * correlates to the amount of cache footprint a task can 5333 * correlates to the amount of cache footprint a task can
5275 * build up. 5334 * build up.
5276 */ 5335 */
5277 update_avg(&prev->se.avg_overlap, runtime); 5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5337 update_avg(&p->se.avg_overlap, runtime);
5338 } else {
5339 update_avg(&p->se.avg_running, 0);
5278 } 5340 }
5279 prev->sched_class->put_prev_task(rq, prev); 5341 p->sched_class->put_prev_task(rq, p);
5280} 5342}
5281 5343
5282/* 5344/*
@@ -5325,7 +5387,7 @@ need_resched:
5325 preempt_disable(); 5387 preempt_disable();
5326 cpu = smp_processor_id(); 5388 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5389 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5390 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5391 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5392 switch_count = &prev->nivcsw;
5331 5393
@@ -5349,10 +5411,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5411 switch_count = &prev->nvcsw;
5350 } 5412 }
5351 5413
5352#ifdef CONFIG_SMP 5414 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5415
5357 if (unlikely(!rq->nr_running)) 5416 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5417 idle_balance(cpu, rq);
@@ -5362,7 +5421,7 @@ need_resched_nonpreemptible:
5362 5421
5363 if (likely(prev != next)) { 5422 if (likely(prev != next)) {
5364 sched_info_switch(prev, next); 5423 sched_info_switch(prev, next);
5365 perf_counter_task_sched_out(prev, next, cpu); 5424 perf_event_task_sched_out(prev, next, cpu);
5366 5425
5367 rq->nr_switches++; 5426 rq->nr_switches++;
5368 rq->curr = next; 5427 rq->curr = next;
@@ -5378,6 +5437,8 @@ need_resched_nonpreemptible:
5378 } else 5437 } else
5379 spin_unlock_irq(&rq->lock); 5438 spin_unlock_irq(&rq->lock);
5380 5439
5440 post_schedule(rq);
5441
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5442 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5443 goto need_resched_nonpreemptible;
5383 5444
@@ -5509,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5509 5570
5510#endif /* CONFIG_PREEMPT */ 5571#endif /* CONFIG_PREEMPT */
5511 5572
5512int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5573int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5513 void *key) 5574 void *key)
5514{ 5575{
5515 return try_to_wake_up(curr->private, mode, sync); 5576 return try_to_wake_up(curr->private, mode, wake_flags);
5516} 5577}
5517EXPORT_SYMBOL(default_wake_function); 5578EXPORT_SYMBOL(default_wake_function);
5518 5579
@@ -5526,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function);
5526 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5587 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5527 */ 5588 */
5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5589static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5529 int nr_exclusive, int sync, void *key) 5590 int nr_exclusive, int wake_flags, void *key)
5530{ 5591{
5531 wait_queue_t *curr, *next; 5592 wait_queue_t *curr, *next;
5532 5593
5533 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5534 unsigned flags = curr->flags; 5595 unsigned flags = curr->flags;
5535 5596
5536 if (curr->func(curr, mode, sync, key) && 5597 if (curr->func(curr, mode, wake_flags, key) &&
5537 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5538 break; 5599 break;
5539 } 5600 }
@@ -5594,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5594 int nr_exclusive, void *key) 5655 int nr_exclusive, void *key)
5595{ 5656{
5596 unsigned long flags; 5657 unsigned long flags;
5597 int sync = 1; 5658 int wake_flags = WF_SYNC;
5598 5659
5599 if (unlikely(!q)) 5660 if (unlikely(!q))
5600 return; 5661 return;
5601 5662
5602 if (unlikely(!nr_exclusive)) 5663 if (unlikely(!nr_exclusive))
5603 sync = 0; 5664 wake_flags = 0;
5604 5665
5605 spin_lock_irqsave(&q->lock, flags); 5666 spin_lock_irqsave(&q->lock, flags);
5606 __wake_up_common(q, mode, nr_exclusive, sync, key); 5667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5607 spin_unlock_irqrestore(&q->lock, flags); 5668 spin_unlock_irqrestore(&q->lock, flags);
5608} 5669}
5609EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6123,17 +6184,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6184 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6185 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6186 struct rq *rq;
6187 int reset_on_fork;
6126 6188
6127 /* may grab non-irq protected spin_locks */ 6189 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6190 BUG_ON(in_interrupt());
6129recheck: 6191recheck:
6130 /* double check policy once rq lock held */ 6192 /* double check policy once rq lock held */
6131 if (policy < 0) 6193 if (policy < 0) {
6194 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6195 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6196 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6197 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6198 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6199
6200 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6201 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6202 policy != SCHED_IDLE)
6203 return -EINVAL;
6204 }
6205
6137 /* 6206 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6207 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6208 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6246,10 @@ recheck:
6177 /* can't change other user's priorities */ 6246 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6247 if (!check_same_owner(p))
6179 return -EPERM; 6248 return -EPERM;
6249
6250 /* Normal users shall not reset the sched_reset_on_fork flag */
6251 if (p->sched_reset_on_fork && !reset_on_fork)
6252 return -EPERM;
6180 } 6253 }
6181 6254
6182 if (user) { 6255 if (user) {
@@ -6220,6 +6293,8 @@ recheck:
6220 if (running) 6293 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6294 p->sched_class->put_prev_task(rq, p);
6222 6295
6296 p->sched_reset_on_fork = reset_on_fork;
6297
6223 oldprio = p->prio; 6298 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6299 __setscheduler(rq, p, policy, param->sched_priority);
6225 6300
@@ -6336,14 +6411,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6411 if (p) {
6337 retval = security_task_getscheduler(p); 6412 retval = security_task_getscheduler(p);
6338 if (!retval) 6413 if (!retval)
6339 retval = p->policy; 6414 retval = p->policy
6415 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6416 }
6341 read_unlock(&tasklist_lock); 6417 read_unlock(&tasklist_lock);
6342 return retval; 6418 return retval;
6343} 6419}
6344 6420
6345/** 6421/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6422 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6423 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6424 * @param: structure containing the RT priority.
6349 */ 6425 */
@@ -6571,19 +6647,9 @@ static inline int should_resched(void)
6571 6647
6572static void __cond_resched(void) 6648static void __cond_resched(void)
6573{ 6649{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6650 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6651 schedule();
6576#endif 6652 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6653}
6588 6654
6589int __sched _cond_resched(void) 6655int __sched _cond_resched(void)
@@ -6597,18 +6663,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6663EXPORT_SYMBOL(_cond_resched);
6598 6664
6599/* 6665/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6666 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6667 * call schedule, and on return reacquire the lock.
6602 * 6668 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6669 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6670 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6671 * spin_unlock(), once by hand).
6606 */ 6672 */
6607int cond_resched_lock(spinlock_t *lock) 6673int __cond_resched_lock(spinlock_t *lock)
6608{ 6674{
6609 int resched = should_resched(); 6675 int resched = should_resched();
6610 int ret = 0; 6676 int ret = 0;
6611 6677
6678 lockdep_assert_held(lock);
6679
6612 if (spin_needbreak(lock) || resched) { 6680 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6681 spin_unlock(lock);
6614 if (resched) 6682 if (resched)
@@ -6620,9 +6688,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6688 }
6621 return ret; 6689 return ret;
6622} 6690}
6623EXPORT_SYMBOL(cond_resched_lock); 6691EXPORT_SYMBOL(__cond_resched_lock);
6624 6692
6625int __sched cond_resched_softirq(void) 6693int __sched __cond_resched_softirq(void)
6626{ 6694{
6627 BUG_ON(!in_softirq()); 6695 BUG_ON(!in_softirq());
6628 6696
@@ -6634,7 +6702,7 @@ int __sched cond_resched_softirq(void)
6634 } 6702 }
6635 return 0; 6703 return 0;
6636} 6704}
6637EXPORT_SYMBOL(cond_resched_softirq); 6705EXPORT_SYMBOL(__cond_resched_softirq);
6638 6706
6639/** 6707/**
6640 * yield - yield the current processor to other threads. 6708 * yield - yield the current processor to other threads.
@@ -6658,11 +6726,13 @@ EXPORT_SYMBOL(yield);
6658 */ 6726 */
6659void __sched io_schedule(void) 6727void __sched io_schedule(void)
6660{ 6728{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6729 struct rq *rq = raw_rq();
6662 6730
6663 delayacct_blkio_start(); 6731 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6732 atomic_inc(&rq->nr_iowait);
6733 current->in_iowait = 1;
6665 schedule(); 6734 schedule();
6735 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6736 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6737 delayacct_blkio_end();
6668} 6738}
@@ -6670,12 +6740,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6740
6671long __sched io_schedule_timeout(long timeout) 6741long __sched io_schedule_timeout(long timeout)
6672{ 6742{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6743 struct rq *rq = raw_rq();
6674 long ret; 6744 long ret;
6675 6745
6676 delayacct_blkio_start(); 6746 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6747 atomic_inc(&rq->nr_iowait);
6748 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6749 ret = schedule_timeout(timeout);
6750 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6751 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6752 delayacct_blkio_end();
6681 return ret; 6753 return ret;
@@ -6759,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6759 if (retval) 6831 if (retval)
6760 goto out_unlock; 6832 goto out_unlock;
6761 6833
6762 /* 6834 time_slice = p->sched_class->get_rr_interval(p);
6763 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6764 * tasks that are on an otherwise idle runqueue:
6765 */
6766 time_slice = 0;
6767 if (p->policy == SCHED_RR) {
6768 time_slice = DEF_TIMESLICE;
6769 } else if (p->policy != SCHED_FIFO) {
6770 struct sched_entity *se = &p->se;
6771 unsigned long flags;
6772 struct rq *rq;
6773 6835
6774 rq = task_rq_lock(p, &flags);
6775 if (rq->cfs.load.weight)
6776 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6777 task_rq_unlock(rq, &flags);
6778 }
6779 read_unlock(&tasklist_lock); 6836 read_unlock(&tasklist_lock);
6780 jiffies_to_timespec(time_slice, &t); 6837 jiffies_to_timespec(time_slice, &t);
6781 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6838 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -6992,8 +7049,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7049
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7050 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7051 /* Need help from migration thread: drop lock and wait. */
7052 struct task_struct *mt = rq->migration_thread;
7053
7054 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7055 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7056 wake_up_process(rq->migration_thread);
7057 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7058 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7059 tlb_migrate_finish(p->mm);
6999 return 0; 7060 return 0;
@@ -7051,6 +7112,11 @@ fail:
7051 return ret; 7112 return ret;
7052} 7113}
7053 7114
7115#define RCU_MIGRATION_IDLE 0
7116#define RCU_MIGRATION_NEED_QS 1
7117#define RCU_MIGRATION_GOT_QS 2
7118#define RCU_MIGRATION_MUST_SYNC 3
7119
7054/* 7120/*
7055 * migration_thread - this is a highprio system thread that performs 7121 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7122 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7124,7 @@ fail:
7058 */ 7124 */
7059static int migration_thread(void *data) 7125static int migration_thread(void *data)
7060{ 7126{
7127 int badcpu;
7061 int cpu = (long)data; 7128 int cpu = (long)data;
7062 struct rq *rq; 7129 struct rq *rq;
7063 7130
@@ -7092,8 +7159,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7159 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7160 list_del_init(head->next);
7094 7161
7095 spin_unlock(&rq->lock); 7162 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7163 spin_unlock(&rq->lock);
7164 __migrate_task(req->task, cpu, req->dest_cpu);
7165 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7166 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7167 spin_unlock(&rq->lock);
7168 } else {
7169 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7170 spin_unlock(&rq->lock);
7171 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7172 }
7097 local_irq_enable(); 7173 local_irq_enable();
7098 7174
7099 complete(&req->done); 7175 complete(&req->done);
@@ -7607,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7607/* 7683/*
7608 * Register at high priority so that task migration (migrate_all_tasks) 7684 * Register at high priority so that task migration (migrate_all_tasks)
7609 * happens before everything else. This has to be lower priority than 7685 * happens before everything else. This has to be lower priority than
7610 * the notifier in the perf_counter subsystem, though. 7686 * the notifier in the perf_event subsystem, though.
7611 */ 7687 */
7612static struct notifier_block __cpuinitdata migration_notifier = { 7688static struct notifier_block __cpuinitdata migration_notifier = {
7613 .notifier_call = migration_call, 7689 .notifier_call = migration_call,
@@ -7625,7 +7701,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7701 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7702 register_cpu_notifier(&migration_notifier);
7627 7703
7628 return err; 7704 return 0;
7629} 7705}
7630early_initcall(migration_init); 7706early_initcall(migration_init);
7631#endif 7707#endif
@@ -7672,7 +7748,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7748 break;
7673 } 7749 }
7674 7750
7675 if (!group->__cpu_power) { 7751 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7752 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7753 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7754 "set\n");
@@ -7696,9 +7772,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7772 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7773
7698 printk(KERN_CONT " %s", str); 7774 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7775 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7776 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7777 group->cpu_power);
7702 } 7778 }
7703 7779
7704 group = group->next; 7780 group = group->next;
@@ -7763,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd)
7763 } 7839 }
7764 7840
7765 /* Following flags don't use groups */ 7841 /* Following flags don't use groups */
7766 if (sd->flags & (SD_WAKE_IDLE | 7842 if (sd->flags & (SD_WAKE_AFFINE))
7767 SD_WAKE_AFFINE |
7768 SD_WAKE_BALANCE))
7769 return 0; 7843 return 0;
7770 7844
7771 return 1; 7845 return 1;
@@ -7782,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7782 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7856 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7783 return 0; 7857 return 0;
7784 7858
7785 /* Does parent contain flags not in child? */
7786 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7787 if (cflags & SD_WAKE_AFFINE)
7788 pflags &= ~SD_WAKE_BALANCE;
7789 /* Flags needing groups don't count if only 1 group in parent */ 7859 /* Flags needing groups don't count if only 1 group in parent */
7790 if (parent->groups == parent->groups->next) { 7860 if (parent->groups == parent->groups->next) {
7791 pflags &= ~(SD_LOAD_BALANCE | 7861 pflags &= ~(SD_LOAD_BALANCE |
@@ -7841,7 +7911,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 7911 rq->rd = rd;
7842 7912
7843 cpumask_set_cpu(rq->cpu, rd->span); 7913 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7914 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 7915 set_rq_online(rq);
7846 7916
7847 spin_unlock_irqrestore(&rq->lock, flags); 7917 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7983,7 +8053,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8053 continue;
7984 8054
7985 cpumask_clear(sched_group_cpus(sg)); 8055 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8056 sg->cpu_power = 0;
7987 8057
7988 for_each_cpu(j, span) { 8058 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8059 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8161,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8161 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8162};
8093 8163
8164struct s_data {
8165#ifdef CONFIG_NUMA
8166 int sd_allnodes;
8167 cpumask_var_t domainspan;
8168 cpumask_var_t covered;
8169 cpumask_var_t notcovered;
8170#endif
8171 cpumask_var_t nodemask;
8172 cpumask_var_t this_sibling_map;
8173 cpumask_var_t this_core_map;
8174 cpumask_var_t send_covered;
8175 cpumask_var_t tmpmask;
8176 struct sched_group **sched_group_nodes;
8177 struct root_domain *rd;
8178};
8179
8180enum s_alloc {
8181 sa_sched_groups = 0,
8182 sa_rootdomain,
8183 sa_tmpmask,
8184 sa_send_covered,
8185 sa_this_core_map,
8186 sa_this_sibling_map,
8187 sa_nodemask,
8188 sa_sched_group_nodes,
8189#ifdef CONFIG_NUMA
8190 sa_notcovered,
8191 sa_covered,
8192 sa_domainspan,
8193#endif
8194 sa_none,
8195};
8196
8094/* 8197/*
8095 * SMT sched-domains: 8198 * SMT sched-domains:
8096 */ 8199 */
@@ -8208,11 +8311,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8311 continue;
8209 } 8312 }
8210 8313
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8314 sg->cpu_power += sd->groups->cpu_power;
8212 } 8315 }
8213 sg = sg->next; 8316 sg = sg->next;
8214 } while (sg != group_head); 8317 } while (sg != group_head);
8215} 8318}
8319
8320static int build_numa_sched_groups(struct s_data *d,
8321 const struct cpumask *cpu_map, int num)
8322{
8323 struct sched_domain *sd;
8324 struct sched_group *sg, *prev;
8325 int n, j;
8326
8327 cpumask_clear(d->covered);
8328 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8329 if (cpumask_empty(d->nodemask)) {
8330 d->sched_group_nodes[num] = NULL;
8331 goto out;
8332 }
8333
8334 sched_domain_node_span(num, d->domainspan);
8335 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8336
8337 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8338 GFP_KERNEL, num);
8339 if (!sg) {
8340 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8341 num);
8342 return -ENOMEM;
8343 }
8344 d->sched_group_nodes[num] = sg;
8345
8346 for_each_cpu(j, d->nodemask) {
8347 sd = &per_cpu(node_domains, j).sd;
8348 sd->groups = sg;
8349 }
8350
8351 sg->cpu_power = 0;
8352 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8353 sg->next = sg;
8354 cpumask_or(d->covered, d->covered, d->nodemask);
8355
8356 prev = sg;
8357 for (j = 0; j < nr_node_ids; j++) {
8358 n = (num + j) % nr_node_ids;
8359 cpumask_complement(d->notcovered, d->covered);
8360 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8361 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8362 if (cpumask_empty(d->tmpmask))
8363 break;
8364 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8365 if (cpumask_empty(d->tmpmask))
8366 continue;
8367 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8368 GFP_KERNEL, num);
8369 if (!sg) {
8370 printk(KERN_WARNING
8371 "Can not alloc domain group for node %d\n", j);
8372 return -ENOMEM;
8373 }
8374 sg->cpu_power = 0;
8375 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8376 sg->next = prev->next;
8377 cpumask_or(d->covered, d->covered, d->tmpmask);
8378 prev->next = sg;
8379 prev = sg;
8380 }
8381out:
8382 return 0;
8383}
8216#endif /* CONFIG_NUMA */ 8384#endif /* CONFIG_NUMA */
8217 8385
8218#ifdef CONFIG_NUMA 8386#ifdef CONFIG_NUMA
@@ -8266,15 +8434,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8434 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pickup more load compared to the group having 8435 * having more cpu_power will pickup more load compared to the group having
8268 * less cpu_power. 8436 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8437 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8438static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8439{
8276 struct sched_domain *child; 8440 struct sched_domain *child;
8277 struct sched_group *group; 8441 struct sched_group *group;
8442 long power;
8443 int weight;
8278 8444
8279 WARN_ON(!sd || !sd->groups); 8445 WARN_ON(!sd || !sd->groups);
8280 8446
@@ -8283,28 +8449,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8449
8284 child = sd->child; 8450 child = sd->child;
8285 8451
8286 sd->groups->__cpu_power = 0; 8452 sd->groups->cpu_power = 0;
8287 8453
8288 /* 8454 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8455 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8456 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8457 /*
8292 * can handle only one task, when there are other idle groups in the 8458 * SMT siblings share the power of a single core.
8293 * same sched domain. 8459 * Usually multiple threads get a better yield out of
8294 */ 8460 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8461 * reflect that in sd->smt_gain.
8296 (child->flags & 8462 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8463 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8464 power *= sd->smt_gain;
8465 power /= weight;
8466 power >>= SCHED_LOAD_SHIFT;
8467 }
8468 sd->groups->cpu_power += power;
8299 return; 8469 return;
8300 } 8470 }
8301 8471
8302 /* 8472 /*
8303 * add cpu_power of each child group to this groups cpu_power 8473 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8474 */
8305 group = child->groups; 8475 group = child->groups;
8306 do { 8476 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8477 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8478 group = group->next;
8309 } while (group != child->groups); 8479 } while (group != child->groups);
8310} 8480}
@@ -8371,287 +8541,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8371 request = attr->relax_domain_level; 8541 request = attr->relax_domain_level;
8372 if (request < sd->level) { 8542 if (request < sd->level) {
8373 /* turn off idle balance on this domain */ 8543 /* turn off idle balance on this domain */
8374 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8544 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8375 } else { 8545 } else {
8376 /* turn on idle balance on this domain */ 8546 /* turn on idle balance on this domain */
8377 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8547 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8548 }
8549}
8550
8551static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8552 const struct cpumask *cpu_map)
8553{
8554 switch (what) {
8555 case sa_sched_groups:
8556 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8557 d->sched_group_nodes = NULL;
8558 case sa_rootdomain:
8559 free_rootdomain(d->rd); /* fall through */
8560 case sa_tmpmask:
8561 free_cpumask_var(d->tmpmask); /* fall through */
8562 case sa_send_covered:
8563 free_cpumask_var(d->send_covered); /* fall through */
8564 case sa_this_core_map:
8565 free_cpumask_var(d->this_core_map); /* fall through */
8566 case sa_this_sibling_map:
8567 free_cpumask_var(d->this_sibling_map); /* fall through */
8568 case sa_nodemask:
8569 free_cpumask_var(d->nodemask); /* fall through */
8570 case sa_sched_group_nodes:
8571#ifdef CONFIG_NUMA
8572 kfree(d->sched_group_nodes); /* fall through */
8573 case sa_notcovered:
8574 free_cpumask_var(d->notcovered); /* fall through */
8575 case sa_covered:
8576 free_cpumask_var(d->covered); /* fall through */
8577 case sa_domainspan:
8578 free_cpumask_var(d->domainspan); /* fall through */
8579#endif
8580 case sa_none:
8581 break;
8378 } 8582 }
8379} 8583}
8380 8584
8381/* 8585static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8586 const struct cpumask *cpu_map)
8383 * to the individual cpus
8384 */
8385static int __build_sched_domains(const struct cpumask *cpu_map,
8386 struct sched_domain_attr *attr)
8387{ 8587{
8388 int i, err = -ENOMEM;
8389 struct root_domain *rd;
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8391 tmpmask;
8392#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered;
8394 struct sched_group **sched_group_nodes = NULL;
8395 int sd_allnodes = 0;
8396
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
8398 goto out;
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
8400 goto free_domainspan;
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
8402 goto free_covered;
8403#endif
8404
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415
8416#ifdef CONFIG_NUMA 8588#ifdef CONFIG_NUMA
8417 /* 8589 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8418 * Allocate the per-node list of sched groups 8590 return sa_none;
8419 */ 8591 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8592 return sa_domainspan;
8421 GFP_KERNEL); 8593 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8422 if (!sched_group_nodes) { 8594 return sa_covered;
8595 /* Allocate the per-node list of sched groups */
8596 d->sched_group_nodes = kcalloc(nr_node_ids,
8597 sizeof(struct sched_group *), GFP_KERNEL);
8598 if (!d->sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8599 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8600 return sa_notcovered;
8425 } 8601 }
8426#endif 8602 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8603#endif
8428 rd = alloc_rootdomain(); 8604 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8605 return sa_sched_group_nodes;
8606 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8607 return sa_nodemask;
8608 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8609 return sa_this_sibling_map;
8610 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8611 return sa_this_core_map;
8612 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8613 return sa_send_covered;
8614 d->rd = alloc_rootdomain();
8615 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8616 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8617 return sa_tmpmask;
8432 } 8618 }
8619 return sa_rootdomain;
8620}
8433 8621
8622static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8623 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8624{
8625 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8626#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8627 struct sched_domain *parent;
8436#endif
8437 8628
8438 /* 8629 d->sd_allnodes = 0;
8439 * Set up domains for cpus specified by the cpu_map. 8630 if (cpumask_weight(cpu_map) >
8440 */ 8631 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8441 for_each_cpu(i, cpu_map) { 8632 sd = &per_cpu(allnodes_domains, i).sd;
8442 struct sched_domain *sd = NULL, *p; 8633 SD_INIT(sd, ALLNODES);
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458
8459 sd = &per_cpu(node_domains, i).sd;
8460 SD_INIT(sd, NODE);
8461 set_domain_attribute(sd, attr); 8634 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8635 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8636 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8637 d->sd_allnodes = 1;
8465 p->child = sd; 8638 }
8466 cpumask_and(sched_domain_span(sd), 8639 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8640
8641 sd = &per_cpu(node_domains, i).sd;
8642 SD_INIT(sd, NODE);
8643 set_domain_attribute(sd, attr);
8644 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8645 sd->parent = parent;
8646 if (parent)
8647 parent->child = sd;
8648 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8649#endif
8650 return sd;
8651}
8469 8652
8470 p = sd; 8653static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8654 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8655 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8656{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8657 struct sched_domain *sd;
8475 sd->parent = p; 8658 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8659 SD_INIT(sd, CPU);
8477 p->child = sd; 8660 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8661 cpumask_copy(sched_domain_span(sd), d->nodemask);
8662 sd->parent = parent;
8663 if (parent)
8664 parent->child = sd;
8665 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8666 return sd;
8667}
8479 8668
8669static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8670 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8671 struct sched_domain *parent, int i)
8672{
8673 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8674#ifdef CONFIG_SCHED_MC
8481 p = sd; 8675 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8676 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8677 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8678 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8679 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8680 parent->child = sd;
8487 sd->parent = p; 8681 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8682#endif
8683 return sd;
8684}
8491 8685
8686static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8687 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8688 struct sched_domain *parent, int i)
8689{
8690 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8691#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8692 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8693 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8694 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8695 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8696 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8697 parent->child = sd;
8499 sd->parent = p; 8698 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8699#endif
8503 } 8700 return sd;
8701}
8504 8702
8703static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8704 const struct cpumask *cpu_map, int cpu)
8705{
8706 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8707#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8708 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8709 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8710 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8711 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8712 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8713 &cpu_to_cpu_group,
8512 8714 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8715 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8716#endif
8518
8519#ifdef CONFIG_SCHED_MC 8717#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8718 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8719 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8720 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8721 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8722 &cpu_to_core_group,
8525 8723 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8724 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8725#endif
8531 8726 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8727 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8728 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8729 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8730 &cpu_to_phys_group,
8536 continue; 8731 d->send_covered, d->tmpmask);
8537 8732 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8733#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8734 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8735 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8736 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8737 break;
8548 send_covered, tmpmask); 8738#endif
8739 default:
8740 break;
8549 } 8741 }
8742}
8550 8743
8551 for (i = 0; i < nr_node_ids; i++) { 8744/*
8552 /* Set up node groups */ 8745 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8746 * to the individual cpus
8554 int j; 8747 */
8555 8748static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8749 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8750{
8558 if (cpumask_empty(nodemask)) { 8751 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8752 struct s_data d;
8560 continue; 8753 struct sched_domain *sd;
8561 } 8754 int i;
8755#ifdef CONFIG_NUMA
8756 d.sd_allnodes = 0;
8757#endif
8562 8758
8563 sched_domain_node_span(i, domainspan); 8759 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8760 if (alloc_state != sa_rootdomain)
8761 goto error;
8762 alloc_state = sa_sched_groups;
8565 8763
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8764 /*
8567 GFP_KERNEL, i); 8765 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8766 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8767 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8768 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8769 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8770
8577 sd = &per_cpu(node_domains, j).sd; 8771 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8772 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8773 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8774 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8775 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8776
8586 for (j = 0; j < nr_node_ids; j++) { 8777 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8778 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8779 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8780 }
8588 8781
8589 cpumask_complement(notcovered, covered); 8782 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8783 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8784 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8785
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8786#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8787 /* Set up node groups */
8597 continue; 8788 if (d.sd_allnodes)
8789 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8790
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8791 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8792 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8793 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8794#endif
8616 8795
8617 /* Calculate CPU power for physical packages and nodes */ 8796 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8797#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8798 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8799 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8800 init_sched_groups_power(i, sd);
8623 } 8801 }
8624#endif 8802#endif
8625#ifdef CONFIG_SCHED_MC 8803#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8804 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8805 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8806 init_sched_groups_power(i, sd);
8630 } 8807 }
8631#endif 8808#endif
8632 8809
8633 for_each_cpu(i, cpu_map) { 8810 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8811 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8812 init_sched_groups_power(i, sd);
8637 } 8813 }
8638 8814
8639#ifdef CONFIG_NUMA 8815#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8816 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8817 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8818
8643 if (sd_allnodes) { 8819 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8820 struct sched_group *sg;
8645 8821
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8822 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8823 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8824 init_numa_sched_groups_power(sg);
8649 } 8825 }
8650#endif 8826#endif
8651 8827
8652 /* Attach the domains */ 8828 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8829 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8830#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8831 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8832#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8834,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 8834#else
8660 sd = &per_cpu(phys_domains, i).sd; 8835 sd = &per_cpu(phys_domains, i).sd;
8661#endif 8836#endif
8662 cpu_attach_domain(sd, rd, i); 8837 cpu_attach_domain(sd, d.rd, i);
8663 } 8838 }
8664 8839
8665 err = 0; 8840 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 8841 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 8842 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 8843
8694#ifdef CONFIG_NUMA
8695error: 8844error:
8696 free_sched_groups(cpu_map, tmpmask); 8845 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 8846 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 8847}
8701 8848
8702static int build_sched_domains(const struct cpumask *cpu_map) 8849static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9015,6 +9162,7 @@ void __init sched_init_smp(void)
9015 cpumask_var_t non_isolated_cpus; 9162 cpumask_var_t non_isolated_cpus;
9016 9163
9017 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9164 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9165 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9018 9166
9019#if defined(CONFIG_NUMA) 9167#if defined(CONFIG_NUMA)
9020 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9168 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9046,7 +9194,6 @@ void __init sched_init_smp(void)
9046 sched_init_granularity(); 9194 sched_init_granularity();
9047 free_cpumask_var(non_isolated_cpus); 9195 free_cpumask_var(non_isolated_cpus);
9048 9196
9049 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9050 init_sched_rt_class(); 9197 init_sched_rt_class();
9051} 9198}
9052#else 9199#else
@@ -9304,11 +9451,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9451 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9452 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9453 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9454 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9455 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9456 */
9310 init_tg_cfs_entry(&init_task_group, 9457 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9458 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9459 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9460 root_task_group.se[i]);
9314 9461
@@ -9334,6 +9481,7 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9481#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9482 rq->sd = NULL;
9336 rq->rd = NULL; 9483 rq->rd = NULL;
9484 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9485 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9486 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9487 rq->push_cpu = 0;
@@ -9392,19 +9540,26 @@ void __init sched_init(void)
9392 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9393#endif /* SMP */ 9541#endif /* SMP */
9394 9542
9395 perf_counter_init(); 9543 perf_event_init();
9396 9544
9397 scheduler_running = 1; 9545 scheduler_running = 1;
9398} 9546}
9399 9547
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9548#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9549static inline int preempt_count_equals(int preempt_offset)
9550{
9551 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9552
9553 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9554}
9555
9556void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9557{
9403#ifdef in_atomic 9558#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9559 static unsigned long prev_jiffy; /* ratelimiting */
9405 9560
9406 if ((!in_atomic() && !irqs_disabled()) || 9561 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9562 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9563 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9564 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9565 return;
@@ -10157,7 +10312,7 @@ static int sched_rt_global_constraints(void)
10157#endif /* CONFIG_RT_GROUP_SCHED */ 10312#endif /* CONFIG_RT_GROUP_SCHED */
10158 10313
10159int sched_rt_handler(struct ctl_table *table, int write, 10314int sched_rt_handler(struct ctl_table *table, int write,
10160 struct file *filp, void __user *buffer, size_t *lenp, 10315 void __user *buffer, size_t *lenp,
10161 loff_t *ppos) 10316 loff_t *ppos)
10162{ 10317{
10163 int ret; 10318 int ret;
@@ -10168,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10168 old_period = sysctl_sched_rt_period; 10323 old_period = sysctl_sched_rt_period;
10169 old_runtime = sysctl_sched_rt_runtime; 10324 old_runtime = sysctl_sched_rt_runtime;
10170 10325
10171 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10326 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10172 10327
10173 if (!ret && write) { 10328 if (!ret && write) {
10174 ret = sched_rt_global_constraints(); 10329 ret = sched_rt_global_constraints();
@@ -10222,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10222} 10377}
10223 10378
10224static int 10379static int
10225cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10380cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10226 struct task_struct *tsk)
10227{ 10381{
10228#ifdef CONFIG_RT_GROUP_SCHED 10382#ifdef CONFIG_RT_GROUP_SCHED
10229 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10383 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10233,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10233 if (tsk->sched_class != &fair_sched_class) 10387 if (tsk->sched_class != &fair_sched_class)
10234 return -EINVAL; 10388 return -EINVAL;
10235#endif 10389#endif
10390 return 0;
10391}
10236 10392
10393static int
10394cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10395 struct task_struct *tsk, bool threadgroup)
10396{
10397 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10398 if (retval)
10399 return retval;
10400 if (threadgroup) {
10401 struct task_struct *c;
10402 rcu_read_lock();
10403 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10404 retval = cpu_cgroup_can_attach_task(cgrp, c);
10405 if (retval) {
10406 rcu_read_unlock();
10407 return retval;
10408 }
10409 }
10410 rcu_read_unlock();
10411 }
10237 return 0; 10412 return 0;
10238} 10413}
10239 10414
10240static void 10415static void
10241cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10416cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10242 struct cgroup *old_cont, struct task_struct *tsk) 10417 struct cgroup *old_cont, struct task_struct *tsk,
10418 bool threadgroup)
10243{ 10419{
10244 sched_move_task(tsk); 10420 sched_move_task(tsk);
10421 if (threadgroup) {
10422 struct task_struct *c;
10423 rcu_read_lock();
10424 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10425 sched_move_task(c);
10426 }
10427 rcu_read_unlock();
10428 }
10245} 10429}
10246 10430
10247#ifdef CONFIG_FAIR_GROUP_SCHED 10431#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -10581,3 +10765,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10765 .subsys_id = cpuacct_subsys_id,
10582}; 10766};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10767#endif /* CONFIG_CGROUP_CPUACCT */
10768
10769#ifndef CONFIG_SMP
10770
10771int rcu_expedited_torture_stats(char *page)
10772{
10773 return 0;
10774}
10775EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10776
10777void synchronize_sched_expedited(void)
10778{
10779}
10780EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10781
10782#else /* #ifndef CONFIG_SMP */
10783
10784static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10785static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10786
10787#define RCU_EXPEDITED_STATE_POST -2
10788#define RCU_EXPEDITED_STATE_IDLE -1
10789
10790static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10791
10792int rcu_expedited_torture_stats(char *page)
10793{
10794 int cnt = 0;
10795 int cpu;
10796
10797 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10798 for_each_online_cpu(cpu) {
10799 cnt += sprintf(&page[cnt], " %d:%d",
10800 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10801 }
10802 cnt += sprintf(&page[cnt], "\n");
10803 return cnt;
10804}
10805EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10806
10807static long synchronize_sched_expedited_count;
10808
10809/*
10810 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10811 * approach to force grace period to end quickly. This consumes
10812 * significant time on all CPUs, and is thus not recommended for
10813 * any sort of common-case code.
10814 *
10815 * Note that it is illegal to call this function while holding any
10816 * lock that is acquired by a CPU-hotplug notifier. Failing to
10817 * observe this restriction will result in deadlock.
10818 */
10819void synchronize_sched_expedited(void)
10820{
10821 int cpu;
10822 unsigned long flags;
10823 bool need_full_sync = 0;
10824 struct rq *rq;
10825 struct migration_req *req;
10826 long snap;
10827 int trycount = 0;
10828
10829 smp_mb(); /* ensure prior mod happens before capturing snap. */
10830 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10831 get_online_cpus();
10832 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10833 put_online_cpus();
10834 if (trycount++ < 10)
10835 udelay(trycount * num_online_cpus());
10836 else {
10837 synchronize_sched();
10838 return;
10839 }
10840 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10841 smp_mb(); /* ensure test happens before caller kfree */
10842 return;
10843 }
10844 get_online_cpus();
10845 }
10846 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10847 for_each_online_cpu(cpu) {
10848 rq = cpu_rq(cpu);
10849 req = &per_cpu(rcu_migration_req, cpu);
10850 init_completion(&req->done);
10851 req->task = NULL;
10852 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10853 spin_lock_irqsave(&rq->lock, flags);
10854 list_add(&req->list, &rq->migration_queue);
10855 spin_unlock_irqrestore(&rq->lock, flags);
10856 wake_up_process(rq->migration_thread);
10857 }
10858 for_each_online_cpu(cpu) {
10859 rcu_expedited_state = cpu;
10860 req = &per_cpu(rcu_migration_req, cpu);
10861 rq = cpu_rq(cpu);
10862 wait_for_completion(&req->done);
10863 spin_lock_irqsave(&rq->lock, flags);
10864 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10865 need_full_sync = 1;
10866 req->dest_cpu = RCU_MIGRATION_IDLE;
10867 spin_unlock_irqrestore(&rq->lock, flags);
10868 }
10869 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10870 mutex_unlock(&rcu_sched_expedited_mutex);
10871 put_online_cpus();
10872 if (need_full_sync)
10873 synchronize_sched();
10874}
10875EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10876
10877#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both locks
149 * taken to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg64(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
127 127
128 /* 128 /*
129 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
130 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
131 */ 134 */
132 if (likely(oldpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
134
135 spin_lock_irqsave(&vec->lock, flags);
136
137 vec->count--;
138 if (!vec->count)
139 clear_bit(oldpri, cp->pri_active);
140 cpumask_clear_cpu(cpu, vec->mask);
141
142 spin_unlock_irqrestore(&vec->lock, flags);
143 }
144
145 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
146 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
147 137
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 144
155 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
156 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
157 159
158 *currpri = newpri; 160 *currpri = newpri;
159} 161}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
@@ -409,6 +410,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 410 PN(se.wait_max);
410 PN(se.wait_sum); 411 PN(se.wait_sum);
411 P(se.wait_count); 412 P(se.wait_count);
413 PN(se.iowait_sum);
414 P(se.iowait_count);
412 P(sched_info.bkl_count); 415 P(sched_info.bkl_count);
413 P(se.nr_migrations); 416 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 417 P(se.nr_migrations_cold);
@@ -479,6 +482,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 482 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 483 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 484 p->se.wait_count = 0;
485 p->se.iowait_sum = 0;
486 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 487 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 488 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 489 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..4e777b47eeda 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -376,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
376 384
377#ifdef CONFIG_SCHED_DEBUG 385#ifdef CONFIG_SCHED_DEBUG
378int sched_nr_latency_handler(struct ctl_table *table, int write, 386int sched_nr_latency_handler(struct ctl_table *table, int write,
379 struct file *filp, void __user *buffer, size_t *lenp, 387 void __user *buffer, size_t *lenp,
380 loff_t *ppos) 388 loff_t *ppos)
381{ 389{
382 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
383 391
384 if (ret || !write) 392 if (ret || !write)
385 return ret; 393 return ret;
@@ -505,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
505 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
506 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
507 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
508 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
509 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
510 } 519 }
@@ -537,6 +546,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
537 schedstat_set(se->wait_count, se->wait_count + 1); 546 schedstat_set(se->wait_count, se->wait_count + 1);
538 schedstat_set(se->wait_sum, se->wait_sum + 547 schedstat_set(se->wait_sum, se->wait_sum +
539 rq_of(cfs_rq)->clock - se->wait_start); 548 rq_of(cfs_rq)->clock - se->wait_start);
549#ifdef CONFIG_SCHEDSTATS
550 if (entity_is_task(se)) {
551 trace_sched_stat_wait(task_of(se),
552 rq_of(cfs_rq)->clock - se->wait_start);
553 }
554#endif
540 schedstat_set(se->wait_start, 0); 555 schedstat_set(se->wait_start, 0);
541} 556}
542 557
@@ -628,8 +643,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
628 se->sleep_start = 0; 643 se->sleep_start = 0;
629 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
630 645
631 if (tsk) 646 if (tsk) {
632 account_scheduler_latency(tsk, delta >> 10, 1); 647 account_scheduler_latency(tsk, delta >> 10, 1);
648 trace_sched_stat_sleep(tsk, delta);
649 }
633 } 650 }
634 if (se->block_start) { 651 if (se->block_start) {
635 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 652 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -644,6 +661,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
644 se->sum_sleep_runtime += delta; 661 se->sum_sleep_runtime += delta;
645 662
646 if (tsk) { 663 if (tsk) {
664 if (tsk->in_iowait) {
665 se->iowait_sum += delta;
666 se->iowait_count++;
667 trace_sched_stat_iowait(tsk, delta);
668 }
669
647 /* 670 /*
648 * Blocking time is in units of nanosecs, so shift by 671 * Blocking time is in units of nanosecs, so shift by
649 * 20 to get a milliseconds-range estimation of the 672 * 20 to get a milliseconds-range estimation of the
@@ -687,29 +710,33 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
687 if (initial && sched_feat(START_DEBIT)) 710 if (initial && sched_feat(START_DEBIT))
688 vruntime += sched_vslice(cfs_rq, se); 711 vruntime += sched_vslice(cfs_rq, se);
689 712
690 if (!initial) { 713 /* sleeps up to a single latency don't count. */
691 /* sleeps upto a single latency don't count. */ 714 if (!initial && sched_feat(FAIR_SLEEPERS)) {
692 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 unsigned long thresh = sysctl_sched_latency;
693 unsigned long thresh = sysctl_sched_latency;
694 716
695 /* 717 /*
696 * Convert the sleeper threshold into virtual time. 718 * Convert the sleeper threshold into virtual time.
697 * SCHED_IDLE is a special sub-class. We care about 719 * SCHED_IDLE is a special sub-class. We care about
698 * fairness only relative to other SCHED_IDLE tasks, 720 * fairness only relative to other SCHED_IDLE tasks,
699 * all of which have the same weight. 721 * all of which have the same weight.
700 */ 722 */
701 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
702 (!entity_is_task(se) || 724 task_of(se)->policy != SCHED_IDLE))
703 task_of(se)->policy != SCHED_IDLE)) 725 thresh = calc_delta_fair(thresh, se);
704 thresh = calc_delta_fair(thresh, se);
705 726
706 vruntime -= thresh; 727 /*
707 } 728 * Halve their sleep time's effect, to allow
729 * for a gentler effect of sleepers:
730 */
731 if (sched_feat(GENTLE_FAIR_SLEEPERS))
732 thresh >>= 1;
708 733
709 /* ensure we never gain time by being placed backwards. */ 734 vruntime -= thresh;
710 vruntime = max_vruntime(se->vruntime, vruntime);
711 } 735 }
712 736
737 /* ensure we never gain time by being placed backwards. */
738 vruntime = max_vruntime(se->vruntime, vruntime);
739
713 se->vruntime = vruntime; 740 se->vruntime = vruntime;
714} 741}
715 742
@@ -735,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
735 762
736static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 763static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
737{ 764{
738 if (cfs_rq->last == se) 765 if (!se || cfs_rq->last == se)
739 cfs_rq->last = NULL; 766 cfs_rq->last = NULL;
740 767
741 if (cfs_rq->next == se) 768 if (!se || cfs_rq->next == se)
742 cfs_rq->next = NULL; 769 cfs_rq->next = NULL;
743} 770}
744 771
@@ -1040,79 +1067,6 @@ static void yield_task_fair(struct rq *rq)
1040 se->vruntime = rightmost->vruntime + 1; 1067 se->vruntime = rightmost->vruntime + 1;
1041} 1068}
1042 1069
1043/*
1044 * wake_idle() will wake a task on an idle cpu if task->cpu is
1045 * not idle and an idle cpu is available. The span of cpus to
1046 * search starts with cpus closest then further out as needed,
1047 * so we always favor a closer, idle cpu.
1048 * Domains may include CPUs that are not usable for migration,
1049 * hence we need to mask them out (cpu_active_mask)
1050 *
1051 * Returns the CPU we should wake onto.
1052 */
1053#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1054static int wake_idle(int cpu, struct task_struct *p)
1055{
1056 struct sched_domain *sd;
1057 int i;
1058 unsigned int chosen_wakeup_cpu;
1059 int this_cpu;
1060
1061 /*
1062 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1063 * are idle and this is not a kernel thread and this task's affinity
1064 * allows it to be moved to preferred cpu, then just move!
1065 */
1066
1067 this_cpu = smp_processor_id();
1068 chosen_wakeup_cpu =
1069 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1070
1071 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1072 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1073 p->mm && !(p->flags & PF_KTHREAD) &&
1074 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1075 return chosen_wakeup_cpu;
1076
1077 /*
1078 * If it is idle, then it is the best cpu to run this task.
1079 *
1080 * This cpu is also the best, if it has more than one task already.
1081 * Siblings must be also busy(in most cases) as they didn't already
1082 * pickup the extra load from this cpu and hence we need not check
1083 * sibling runqueue info. This will avoid the checks and cache miss
1084 * penalities associated with that.
1085 */
1086 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1087 return cpu;
1088
1089 for_each_domain(cpu, sd) {
1090 if ((sd->flags & SD_WAKE_IDLE)
1091 || ((sd->flags & SD_WAKE_IDLE_FAR)
1092 && !task_hot(p, task_rq(p)->clock, sd))) {
1093 for_each_cpu_and(i, sched_domain_span(sd),
1094 &p->cpus_allowed) {
1095 if (cpu_active(i) && idle_cpu(i)) {
1096 if (i != task_cpu(p)) {
1097 schedstat_inc(p,
1098 se.nr_wakeups_idle);
1099 }
1100 return i;
1101 }
1102 }
1103 } else {
1104 break;
1105 }
1106 }
1107 return cpu;
1108}
1109#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1110static inline int wake_idle(int cpu, struct task_struct *p)
1111{
1112 return cpu;
1113}
1114#endif
1115
1116#ifdef CONFIG_SMP 1070#ifdef CONFIG_SMP
1117 1071
1118#ifdef CONFIG_FAIR_GROUP_SCHED 1072#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1199,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1199 1153
1200#endif 1154#endif
1201 1155
1202static int 1156static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1203wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1204 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1205 int idx, unsigned long load, unsigned long this_load,
1206 unsigned int imbalance)
1207{ 1157{
1208 struct task_struct *curr = this_rq->curr; 1158 struct task_struct *curr = current;
1209 struct task_group *tg; 1159 unsigned long this_load, load;
1210 unsigned long tl = this_load; 1160 int idx, this_cpu, prev_cpu;
1211 unsigned long tl_per_task; 1161 unsigned long tl_per_task;
1162 unsigned int imbalance;
1163 struct task_group *tg;
1212 unsigned long weight; 1164 unsigned long weight;
1213 int balanced; 1165 int balanced;
1214 1166
1215 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1167 idx = sd->wake_idx;
1216 return 0; 1168 this_cpu = smp_processor_id();
1169 prev_cpu = task_cpu(p);
1170 load = source_load(prev_cpu, idx);
1171 this_load = target_load(this_cpu, idx);
1217 1172
1218 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1173 if (sync) {
1219 p->se.avg_overlap > sysctl_sched_migration_cost)) 1174 if (sched_feat(SYNC_LESS) &&
1220 sync = 0; 1175 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1177 sync = 0;
1178 } else {
1179 if (sched_feat(SYNC_MORE) &&
1180 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1181 p->se.avg_overlap < sysctl_sched_migration_cost))
1182 sync = 1;
1183 }
1221 1184
1222 /* 1185 /*
1223 * If sync wakeup then subtract the (maximum possible) 1186 * If sync wakeup then subtract the (maximum possible)
@@ -1228,14 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1228 tg = task_group(current); 1191 tg = task_group(current);
1229 weight = current->se.load.weight; 1192 weight = current->se.load.weight;
1230 1193
1231 tl += effective_load(tg, this_cpu, -weight, -weight); 1194 this_load += effective_load(tg, this_cpu, -weight, -weight);
1232 load += effective_load(tg, prev_cpu, 0, -weight); 1195 load += effective_load(tg, prev_cpu, 0, -weight);
1233 } 1196 }
1234 1197
1235 tg = task_group(p); 1198 tg = task_group(p);
1236 weight = p->se.load.weight; 1199 weight = p->se.load.weight;
1237 1200
1238 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1201 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1202
1203 /*
1204 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1205 * due to the sync cause above having dropped this_load to 0, we'll
1206 * always have an imbalance, but there's really nothing you can do
1207 * about that, so that's good too.
1208 *
1209 * Otherwise check if either cpus are near enough in load to allow this
1210 * task to be woken on this_cpu.
1211 */
1212 balanced = !this_load ||
1213 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1239 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1214 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1240 1215
1241 /* 1216 /*
@@ -1249,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1249 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1224 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1250 tl_per_task = cpu_avg_load_per_task(this_cpu); 1225 tl_per_task = cpu_avg_load_per_task(this_cpu);
1251 1226
1252 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1227 if (balanced ||
1253 tl_per_task)) { 1228 (this_load <= load &&
1229 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1254 /* 1230 /*
1255 * This domain has SD_WAKE_AFFINE and 1231 * This domain has SD_WAKE_AFFINE and
1256 * p is cache cold in this domain, and 1232 * p is cache cold in this domain, and
1257 * there is no bad imbalance. 1233 * there is no bad imbalance.
1258 */ 1234 */
1259 schedstat_inc(this_sd, ttwu_move_affine); 1235 schedstat_inc(sd, ttwu_move_affine);
1260 schedstat_inc(p, se.nr_wakeups_affine); 1236 schedstat_inc(p, se.nr_wakeups_affine);
1261 1237
1262 return 1; 1238 return 1;
@@ -1264,67 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1264 return 0; 1240 return 0;
1265} 1241}
1266 1242
1267static int select_task_rq_fair(struct task_struct *p, int sync) 1243/*
1244 * find_idlest_group finds and returns the least busy CPU group within the
1245 * domain.
1246 */
1247static struct sched_group *
1248find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1249 int this_cpu, int load_idx)
1268{ 1250{
1269 struct sched_domain *sd, *this_sd = NULL; 1251 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1270 int prev_cpu, this_cpu, new_cpu; 1252 unsigned long min_load = ULONG_MAX, this_load = 0;
1271 unsigned long load, this_load; 1253 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1272 struct rq *this_rq;
1273 unsigned int imbalance;
1274 int idx;
1275 1254
1276 prev_cpu = task_cpu(p); 1255 do {
1277 this_cpu = smp_processor_id(); 1256 unsigned long load, avg_load;
1278 this_rq = cpu_rq(this_cpu); 1257 int local_group;
1279 new_cpu = prev_cpu; 1258 int i;
1280 1259
1281 if (prev_cpu == this_cpu) 1260 /* Skip over this group if it has no CPUs allowed */
1282 goto out; 1261 if (!cpumask_intersects(sched_group_cpus(group),
1283 /* 1262 &p->cpus_allowed))
1284 * 'this_sd' is the first domain that both 1263 continue;
1285 * this_cpu and prev_cpu are present in: 1264
1286 */ 1265 local_group = cpumask_test_cpu(this_cpu,
1287 for_each_domain(this_cpu, sd) { 1266 sched_group_cpus(group));
1288 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1267
1289 this_sd = sd; 1268 /* Tally up the load of all CPUs in the group */
1290 break; 1269 avg_load = 0;
1270
1271 for_each_cpu(i, sched_group_cpus(group)) {
1272 /* Bias balancing toward cpus of our domain */
1273 if (local_group)
1274 load = source_load(i, load_idx);
1275 else
1276 load = target_load(i, load_idx);
1277
1278 avg_load += load;
1279 }
1280
1281 /* Adjust by relative CPU power of the group */
1282 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1283
1284 if (local_group) {
1285 this_load = avg_load;
1286 this = group;
1287 } else if (avg_load < min_load) {
1288 min_load = avg_load;
1289 idlest = group;
1290 }
1291 } while (group = group->next, group != sd->groups);
1292
1293 if (!idlest || 100*this_load < imbalance*min_load)
1294 return NULL;
1295 return idlest;
1296}
1297
1298/*
1299 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1300 */
1301static int
1302find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1303{
1304 unsigned long load, min_load = ULONG_MAX;
1305 int idlest = -1;
1306 int i;
1307
1308 /* Traverse only the allowed CPUs */
1309 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1310 load = weighted_cpuload(i);
1311
1312 if (load < min_load || (load == min_load && i == this_cpu)) {
1313 min_load = load;
1314 idlest = i;
1291 } 1315 }
1292 } 1316 }
1293 1317
1294 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1318 return idlest;
1295 goto out; 1319}
1296 1320
1297 /* 1321/*
1298 * Check for affine wakeup and passive balancing possibilities. 1322 * sched_balance_self: balance the current task (running on cpu) in domains
1299 */ 1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1300 if (!this_sd) 1324 * SD_BALANCE_EXEC.
1325 *
1326 * Balance, ie. select the least loaded group.
1327 *
1328 * Returns the target CPU number, or the same CPU if no balancing is needed.
1329 *
1330 * preempt must be disabled.
1331 */
1332static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1333{
1334 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1335 int cpu = smp_processor_id();
1336 int prev_cpu = task_cpu(p);
1337 int new_cpu = cpu;
1338 int want_affine = 0;
1339 int want_sd = 1;
1340 int sync = wake_flags & WF_SYNC;
1341
1342 if (sd_flag & SD_BALANCE_WAKE) {
1343 if (sched_feat(AFFINE_WAKEUPS) &&
1344 cpumask_test_cpu(cpu, &p->cpus_allowed))
1345 want_affine = 1;
1346 new_cpu = prev_cpu;
1347 }
1348
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) {
1351 /*
1352 * If power savings logic is enabled for a domain, see if we
1353 * are not overloaded, if so, don't balance wider.
1354 */
1355 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1356 unsigned long power = 0;
1357 unsigned long nr_running = 0;
1358 unsigned long capacity;
1359 int i;
1360
1361 for_each_cpu(i, sched_domain_span(tmp)) {
1362 power += power_of(i);
1363 nr_running += cpu_rq(i)->cfs.nr_running;
1364 }
1365
1366 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1367
1368 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1369 nr_running /= 2;
1370
1371 if (nr_running < capacity)
1372 want_sd = 0;
1373 }
1374
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1377
1378 affine_sd = tmp;
1379 want_affine = 0;
1380 }
1381
1382 if (!want_sd && !want_affine)
1383 break;
1384
1385 if (!(tmp->flags & sd_flag))
1386 continue;
1387
1388 if (want_sd)
1389 sd = tmp;
1390 }
1391
1392 if (sched_feat(LB_SHARES_UPDATE)) {
1393 /*
1394 * Pick the largest domain to update shares over
1395 */
1396 tmp = sd;
1397 if (affine_sd && (!tmp ||
1398 cpumask_weight(sched_domain_span(affine_sd)) >
1399 cpumask_weight(sched_domain_span(sd))))
1400 tmp = affine_sd;
1401
1402 if (tmp)
1403 update_shares(tmp);
1404 }
1405
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1407 new_cpu = cpu;
1301 goto out; 1408 goto out;
1409 }
1410
1411 while (sd) {
1412 int load_idx = sd->forkexec_idx;
1413 struct sched_group *group;
1414 int weight;
1302 1415
1303 idx = this_sd->wake_idx; 1416 if (!(sd->flags & sd_flag)) {
1417 sd = sd->child;
1418 continue;
1419 }
1304 1420
1305 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1421 if (sd_flag & SD_BALANCE_WAKE)
1422 load_idx = sd->wake_idx;
1306 1423
1307 load = source_load(prev_cpu, idx); 1424 group = find_idlest_group(sd, p, cpu, load_idx);
1308 this_load = target_load(this_cpu, idx); 1425 if (!group) {
1426 sd = sd->child;
1427 continue;
1428 }
1309 1429
1310 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1430 new_cpu = find_idlest_cpu(group, p, cpu);
1311 load, this_load, imbalance)) 1431 if (new_cpu == -1 || new_cpu == cpu) {
1312 return this_cpu; 1432 /* Now try balancing at a lower domain level of cpu */
1433 sd = sd->child;
1434 continue;
1435 }
1313 1436
1314 /* 1437 /* Now try balancing at a lower domain level of new_cpu */
1315 * Start passive balancing when half the imbalance_pct 1438 cpu = new_cpu;
1316 * limit is reached. 1439 weight = cpumask_weight(sched_domain_span(sd));
1317 */ 1440 sd = NULL;
1318 if (this_sd->flags & SD_WAKE_BALANCE) { 1441 for_each_domain(cpu, tmp) {
1319 if (imbalance*this_load <= 100*load) { 1442 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1320 schedstat_inc(this_sd, ttwu_move_balance); 1443 break;
1321 schedstat_inc(p, se.nr_wakeups_passive); 1444 if (tmp->flags & sd_flag)
1322 return this_cpu; 1445 sd = tmp;
1323 } 1446 }
1447 /* while loop will break here if sd == NULL */
1324 } 1448 }
1325 1449
1326out: 1450out:
1327 return wake_idle(new_cpu, p); 1451 rcu_read_unlock();
1452 return new_cpu;
1328} 1453}
1329#endif /* CONFIG_SMP */ 1454#endif /* CONFIG_SMP */
1330 1455
@@ -1437,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
1437/* 1562/*
1438 * Preempt the current task with a newly woken task if needed: 1563 * Preempt the current task with a newly woken task if needed:
1439 */ 1564 */
1440static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1565static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1441{ 1566{
1442 struct task_struct *curr = rq->curr; 1567 struct task_struct *curr = rq->curr;
1443 struct sched_entity *se = &curr->se, *pse = &p->se; 1568 struct sched_entity *se = &curr->se, *pse = &p->se;
1444 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC;
1445 1571
1446 update_curr(cfs_rq); 1572 update_curr(cfs_rq);
1447 1573
@@ -1467,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1467 */ 1593 */
1468 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1469 set_last_buddy(se); 1595 set_last_buddy(se);
1470 set_next_buddy(pse); 1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse);
1471 1598
1472 /* 1599 /*
1473 * We can come here with TIF_NEED_RESCHED already set from new task 1600 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1489,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1489 return; 1616 return;
1490 } 1617 }
1491 1618
1492 if (!sched_feat(WAKEUP_PREEMPT)) 1619 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1493 return; 1620 (sched_feat(WAKEUP_OVERLAP) &&
1494 1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1495 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1496 (se->avg_overlap < sysctl_sched_migration_cost &&
1497 pse->avg_overlap < sysctl_sched_migration_cost))) {
1498 resched_task(curr); 1623 resched_task(curr);
1499 return; 1624 return;
1500 } 1625 }
1501 1626
1627 if (sched_feat(WAKEUP_RUNNING)) {
1628 if (pse->avg_running < se->avg_running) {
1629 set_next_buddy(pse);
1630 resched_task(curr);
1631 return;
1632 }
1633 }
1634
1635 if (!sched_feat(WAKEUP_PREEMPT))
1636 return;
1637
1502 find_matching_se(&se, &pse); 1638 find_matching_se(&se, &pse);
1503 1639
1504 BUG_ON(!pse); 1640 BUG_ON(!pse);
@@ -1521,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1521 /* 1657 /*
1522 * If se was a buddy, clear it so that it will have to earn 1658 * If se was a buddy, clear it so that it will have to earn
1523 * the favour again. 1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was elegible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1524 */ 1665 */
1525 __clear_buddies(cfs_rq, se); 1666 __clear_buddies(cfs_rq, NULL);
1526 set_next_entity(cfs_rq, se); 1667 set_next_entity(cfs_rq, se);
1527 cfs_rq = group_cfs_rq(se); 1668 cfs_rq = group_cfs_rq(se);
1528 } while (cfs_rq); 1669 } while (cfs_rq);
@@ -1721,6 +1862,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1721 sched_info_queued(p); 1862 sched_info_queued(p);
1722 1863
1723 update_curr(cfs_rq); 1864 update_curr(cfs_rq);
1865 if (curr)
1866 se->vruntime = curr->vruntime;
1724 place_entity(cfs_rq, se, 1); 1867 place_entity(cfs_rq, se, 1);
1725 1868
1726 /* 'curr' will be NULL if the child belongs to a different group */ 1869 /* 'curr' will be NULL if the child belongs to a different group */
@@ -1796,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
1796} 1939}
1797#endif 1940#endif
1798 1941
1942unsigned int get_rr_interval_fair(struct task_struct *task)
1943{
1944 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0;
1948
1949 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue:
1952 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957
1958 return rr_interval;
1959}
1960
1799/* 1961/*
1800 * All the scheduling class methods: 1962 * All the scheduling class methods:
1801 */ 1963 */
@@ -1824,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
1824 .prio_changed = prio_changed_fair, 1986 .prio_changed = prio_changed_fair,
1825 .switched_to = switched_to_fair, 1987 .switched_to = switched_to_fair,
1826 1988
1989 .get_rr_interval = get_rr_interval_fair,
1990
1827#ifdef CONFIG_FAIR_GROUP_SCHED 1991#ifdef CONFIG_FAIR_GROUP_SCHED
1828 .moved_group = moved_group_fair, 1992 .moved_group = moved_group_fair,
1829#endif 1993#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since its likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot, decreases the likelyness of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 11 return task_cpu(p); /* IDLE tasks as never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task)
101{
102 return 0;
103}
104
100/* 105/*
101 * Simple, special scheduling class for the per-CPU idle tasks: 106 * Simple, special scheduling class for the per-CPU idle tasks:
102 */ 107 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
122 .set_curr_task = set_curr_task_idle, 127 .set_curr_task = set_curr_task_idle,
123 .task_tick = task_tick_idle, 128 .task_tick = task_tick_idle,
124 129
130 .get_rr_interval = get_rr_interval_idle,
131
125 .prio_changed = prio_changed_idle, 132 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle, 133 .switched_to = switched_to_idle,
127 134
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..a4d790cddb19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
16{ 19{
17 return rt_rq->rq; 20 return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
26 29
27#define rt_entity_is_task(rt_se) (1) 30#define rt_entity_is_task(rt_se) (1)
28 31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
30{ 38{
31 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
128 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
129} 137}
130 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
131#else 144#else
132 145
133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
602 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
603 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
604 617
618 sched_rt_avg_update(rq, delta_exec);
619
605 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
606 return; 621 return;
607 622
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
874 889
875 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
876 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
877
878 inc_cpu_load(rq, p->se.load.weight);
879} 892}
880 893
881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
886 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
887 900
888 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
889
890 dec_cpu_load(rq, p->se.load.weight);
891} 902}
892 903
893/* 904/*
@@ -927,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
927#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
928static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
929 940
930static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
931{ 942{
932 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
933 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
934 /* 948 /*
935 * If the current task is an RT task, then 949 * If the current task is an RT task, then
936 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -988,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
988/* 1002/*
989 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
990 */ 1004 */
991static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
992{ 1006{
993 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
994 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1064,6 +1078,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1064 if (p) 1078 if (p)
1065 dequeue_pushable_task(rq, p); 1079 dequeue_pushable_task(rq, p);
1066 1080
1081#ifdef CONFIG_SMP
1082 /*
1083 * We detect this state here so that we can avoid taking the RQ
1084 * lock again later if there is no need to push
1085 */
1086 rq->post_schedule = has_pushable_tasks(rq);
1087#endif
1088
1067 return p; 1089 return p;
1068} 1090}
1069 1091
@@ -1162,13 +1184,6 @@ static int find_lowest_rq(struct task_struct *task)
1162 return -1; /* No targets found */ 1184 return -1; /* No targets found */
1163 1185
1164 /* 1186 /*
1165 * Only consider CPUs that are usable for migration.
1166 * I guess we might want to change cpupri_find() to ignore those
1167 * in the first place.
1168 */
1169 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171 /*
1172 * At this point we have built a mask of cpus representing the 1187 * At this point we have built a mask of cpus representing the
1173 * lowest priority tasks in the system. Now we want to elect 1188 * lowest priority tasks in the system. Now we want to elect
1174 * the best one based on our affinity and topology. 1189 * the best one based on our affinity and topology.
@@ -1262,11 +1277,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1262 return lowest_rq; 1277 return lowest_rq;
1263} 1278}
1264 1279
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267 return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq) 1280static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{ 1281{
1272 struct task_struct *p; 1282 struct task_struct *p;
@@ -1466,23 +1476,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1466 pull_rt_task(rq); 1476 pull_rt_task(rq);
1467} 1477}
1468 1478
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474 return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq) 1479static void post_schedule_rt(struct rq *rq)
1478{ 1480{
1479 /*
1480 * This is only called if needs_post_schedule_rt() indicates that
1481 * we need to push tasks away
1482 */
1483 spin_lock_irq(&rq->lock);
1484 push_rt_tasks(rq); 1481 push_rt_tasks(rq);
1485 spin_unlock_irq(&rq->lock);
1486} 1482}
1487 1483
1488/* 1484/*
@@ -1738,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
1738 dequeue_pushable_task(rq, p); 1734 dequeue_pushable_task(rq, p);
1739} 1735}
1740 1736
1737unsigned int get_rr_interval_rt(struct task_struct *task)
1738{
1739 /*
1740 * Time slice is 0 for SCHED_FIFO tasks
1741 */
1742 if (task->policy == SCHED_RR)
1743 return DEF_TIMESLICE;
1744 else
1745 return 0;
1746}
1747
1741static const struct sched_class rt_sched_class = { 1748static const struct sched_class rt_sched_class = {
1742 .next = &fair_sched_class, 1749 .next = &fair_sched_class,
1743 .enqueue_task = enqueue_task_rt, 1750 .enqueue_task = enqueue_task_rt,
@@ -1758,7 +1765,6 @@ static const struct sched_class rt_sched_class = {
1758 .rq_online = rq_online_rt, 1765 .rq_online = rq_online_rt,
1759 .rq_offline = rq_offline_rt, 1766 .rq_offline = rq_offline_rt,
1760 .pre_schedule = pre_schedule_rt, 1767 .pre_schedule = pre_schedule_rt,
1761 .needs_post_schedule = needs_post_schedule_rt,
1762 .post_schedule = post_schedule_rt, 1768 .post_schedule = post_schedule_rt,
1763 .task_wake_up = task_wake_up_rt, 1769 .task_wake_up = task_wake_up_rt,
1764 .switched_from = switched_from_rt, 1770 .switched_from = switched_from_rt,
@@ -1767,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
1767 .set_curr_task = set_curr_task_rt, 1773 .set_curr_task = set_curr_task_rt,
1768 .task_tick = task_tick_rt, 1774 .task_tick = task_tick_rt,
1769 1775
1776 .get_rr_interval = get_rr_interval_rt,
1777
1770 .prio_changed = prio_changed_rt, 1778 .prio_changed = prio_changed_rt,
1771 .switched_to = switched_to_rt, 1779 .switched_to = switched_to_rt,
1772}; 1780};
diff --git a/kernel/signal.c b/kernel/signal.c
index 64c5deeaca5d..6705320784fd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
705 705
706 if (why) { 706 if (why) {
707 /* 707 /*
708 * The first thread which returns from finish_stop() 708 * The first thread which returns from do_signal_stop()
709 * will take ->siglock, notice SIGNAL_CLD_MASK, and 709 * will take ->siglock, notice SIGNAL_CLD_MASK, and
710 * notify its parent. See get_signal_to_deliver(). 710 * notify its parent. See get_signal_to_deliver().
711 */ 711 */
@@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
971 return send_signal(sig, info, t, 0); 971 return send_signal(sig, info, t, 0);
972} 972}
973 973
974int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
975 bool group)
976{
977 unsigned long flags;
978 int ret = -ESRCH;
979
980 if (lock_task_sighand(p, &flags)) {
981 ret = send_signal(sig, info, p, group);
982 unlock_task_sighand(p, &flags);
983 }
984
985 return ret;
986}
987
974/* 988/*
975 * Force a signal that the process can't ignore: if necessary 989 * Force a signal that the process can't ignore: if necessary
976 * we unblock the signal and change any SIG_IGN to SIG_DFL. 990 * we unblock the signal and change any SIG_IGN to SIG_DFL.
@@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p)
1036 } 1050 }
1037} 1051}
1038 1052
1039int __fatal_signal_pending(struct task_struct *tsk)
1040{
1041 return sigismember(&tsk->pending.signal, SIGKILL);
1042}
1043EXPORT_SYMBOL(__fatal_signal_pending);
1044
1045struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1053struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
1046{ 1054{
1047 struct sighand_struct *sighand; 1055 struct sighand_struct *sighand;
@@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1068 */ 1076 */
1069int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1077int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1070{ 1078{
1071 unsigned long flags; 1079 int ret = check_kill_permission(sig, info, p);
1072 int ret;
1073 1080
1074 ret = check_kill_permission(sig, info, p); 1081 if (!ret && sig)
1075 1082 ret = do_send_sig_info(sig, info, p, true);
1076 if (!ret && sig) {
1077 ret = -ESRCH;
1078 if (lock_task_sighand(p, &flags)) {
1079 ret = __group_send_sig_info(sig, info, p);
1080 unlock_task_sighand(p, &flags);
1081 }
1082 }
1083 1083
1084 return ret; 1084 return ret;
1085} 1085}
@@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1224 * These are for backward compatibility with the rest of the kernel source. 1224 * These are for backward compatibility with the rest of the kernel source.
1225 */ 1225 */
1226 1226
1227/*
1228 * The caller must ensure the task can't exit.
1229 */
1230int 1227int
1231send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1228send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1232{ 1229{
1233 int ret;
1234 unsigned long flags;
1235
1236 /* 1230 /*
1237 * Make sure legacy kernel users don't send in bad values 1231 * Make sure legacy kernel users don't send in bad values
1238 * (normal paths check this in check_kill_permission). 1232 * (normal paths check this in check_kill_permission).
@@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1240 if (!valid_signal(sig)) 1234 if (!valid_signal(sig))
1241 return -EINVAL; 1235 return -EINVAL;
1242 1236
1243 spin_lock_irqsave(&p->sighand->siglock, flags); 1237 return do_send_sig_info(sig, info, p, false);
1244 ret = specific_send_sig_info(sig, info, p);
1245 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1246 return ret;
1247} 1238}
1248 1239
1249#define __si_special(priv) \ 1240#define __si_special(priv) \
@@ -1383,15 +1374,6 @@ ret:
1383} 1374}
1384 1375
1385/* 1376/*
1386 * Wake up any threads in the parent blocked in wait* syscalls.
1387 */
1388static inline void __wake_up_parent(struct task_struct *p,
1389 struct task_struct *parent)
1390{
1391 wake_up_interruptible_sync(&parent->signal->wait_chldexit);
1392}
1393
1394/*
1395 * Let a parent know about the death of a child. 1377 * Let a parent know about the death of a child.
1396 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1378 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1397 * 1379 *
@@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code)
1673 spin_unlock_irq(&current->sighand->siglock); 1655 spin_unlock_irq(&current->sighand->siglock);
1674} 1656}
1675 1657
1676static void
1677finish_stop(int stop_count)
1678{
1679 /*
1680 * If there are no other threads in the group, or if there is
1681 * a group stop in progress and we are the last to stop,
1682 * report to the parent. When ptraced, every thread reports itself.
1683 */
1684 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1685 read_lock(&tasklist_lock);
1686 do_notify_parent_cldstop(current, CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689
1690 do {
1691 schedule();
1692 } while (try_to_freeze());
1693 /*
1694 * Now we don't run again until continued.
1695 */
1696 current->exit_code = 0;
1697}
1698
1699/* 1658/*
1700 * This performs the stopping for SIGSTOP and other stop signals. 1659 * This performs the stopping for SIGSTOP and other stop signals.
1701 * We have to stop all threads in the thread group. 1660 * We have to stop all threads in the thread group.
@@ -1705,15 +1664,9 @@ finish_stop(int stop_count)
1705static int do_signal_stop(int signr) 1664static int do_signal_stop(int signr)
1706{ 1665{
1707 struct signal_struct *sig = current->signal; 1666 struct signal_struct *sig = current->signal;
1708 int stop_count; 1667 int notify;
1709 1668
1710 if (sig->group_stop_count > 0) { 1669 if (!sig->group_stop_count) {
1711 /*
1712 * There is a group stop in progress. We don't need to
1713 * start another one.
1714 */
1715 stop_count = --sig->group_stop_count;
1716 } else {
1717 struct task_struct *t; 1670 struct task_struct *t;
1718 1671
1719 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || 1672 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
@@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr)
1725 */ 1678 */
1726 sig->group_exit_code = signr; 1679 sig->group_exit_code = signr;
1727 1680
1728 stop_count = 0; 1681 sig->group_stop_count = 1;
1729 for (t = next_thread(current); t != current; t = next_thread(t)) 1682 for (t = next_thread(current); t != current; t = next_thread(t))
1730 /* 1683 /*
1731 * Setting state to TASK_STOPPED for a group 1684 * Setting state to TASK_STOPPED for a group
@@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr)
1734 */ 1687 */
1735 if (!(t->flags & PF_EXITING) && 1688 if (!(t->flags & PF_EXITING) &&
1736 !task_is_stopped_or_traced(t)) { 1689 !task_is_stopped_or_traced(t)) {
1737 stop_count++; 1690 sig->group_stop_count++;
1738 signal_wake_up(t, 0); 1691 signal_wake_up(t, 0);
1739 } 1692 }
1740 sig->group_stop_count = stop_count;
1741 } 1693 }
1694 /*
1695 * If there are no other threads in the group, or if there is
1696 * a group stop in progress and we are the last to stop, report
1697 * to the parent. When ptraced, every thread reports itself.
1698 */
1699 notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
1700 notify = tracehook_notify_jctl(notify, CLD_STOPPED);
1701 /*
1702 * tracehook_notify_jctl() can drop and reacquire siglock, so
1703 * we keep ->group_stop_count != 0 before the call. If SIGCONT
1704 * or SIGKILL comes in between ->group_stop_count == 0.
1705 */
1706 if (sig->group_stop_count) {
1707 if (!--sig->group_stop_count)
1708 sig->flags = SIGNAL_STOP_STOPPED;
1709 current->exit_code = sig->group_exit_code;
1710 __set_current_state(TASK_STOPPED);
1711 }
1712 spin_unlock_irq(&current->sighand->siglock);
1742 1713
1743 if (stop_count == 0) 1714 if (notify) {
1744 sig->flags = SIGNAL_STOP_STOPPED; 1715 read_lock(&tasklist_lock);
1745 current->exit_code = sig->group_exit_code; 1716 do_notify_parent_cldstop(current, notify);
1746 __set_current_state(TASK_STOPPED); 1717 read_unlock(&tasklist_lock);
1718 }
1719
1720 /* Now we don't run again until woken by SIGCONT or SIGKILL */
1721 do {
1722 schedule();
1723 } while (try_to_freeze());
1724
1725 tracehook_finish_jctl();
1726 current->exit_code = 0;
1747 1727
1748 spin_unlock_irq(&current->sighand->siglock);
1749 finish_stop(stop_count);
1750 return 1; 1728 return 1;
1751} 1729}
1752 1730
@@ -1815,14 +1793,15 @@ relock:
1815 int why = (signal->flags & SIGNAL_STOP_CONTINUED) 1793 int why = (signal->flags & SIGNAL_STOP_CONTINUED)
1816 ? CLD_CONTINUED : CLD_STOPPED; 1794 ? CLD_CONTINUED : CLD_STOPPED;
1817 signal->flags &= ~SIGNAL_CLD_MASK; 1795 signal->flags &= ~SIGNAL_CLD_MASK;
1818 spin_unlock_irq(&sighand->siglock);
1819 1796
1820 if (unlikely(!tracehook_notify_jctl(1, why))) 1797 why = tracehook_notify_jctl(why, CLD_CONTINUED);
1821 goto relock; 1798 spin_unlock_irq(&sighand->siglock);
1822 1799
1823 read_lock(&tasklist_lock); 1800 if (why) {
1824 do_notify_parent_cldstop(current->group_leader, why); 1801 read_lock(&tasklist_lock);
1825 read_unlock(&tasklist_lock); 1802 do_notify_parent_cldstop(current->group_leader, why);
1803 read_unlock(&tasklist_lock);
1804 }
1826 goto relock; 1805 goto relock;
1827 } 1806 }
1828 1807
@@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk)
1987 if (unlikely(tsk->signal->group_stop_count) && 1966 if (unlikely(tsk->signal->group_stop_count) &&
1988 !--tsk->signal->group_stop_count) { 1967 !--tsk->signal->group_stop_count) {
1989 tsk->signal->flags = SIGNAL_STOP_STOPPED; 1968 tsk->signal->flags = SIGNAL_STOP_STOPPED;
1990 group_stop = 1; 1969 group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
1991 } 1970 }
1992out: 1971out:
1993 spin_unlock_irq(&tsk->sighand->siglock); 1972 spin_unlock_irq(&tsk->sighand->siglock);
1994 1973
1995 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { 1974 if (unlikely(group_stop)) {
1996 read_lock(&tasklist_lock); 1975 read_lock(&tasklist_lock);
1997 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1976 do_notify_parent_cldstop(tsk, group_stop);
1998 read_unlock(&tasklist_lock); 1977 read_unlock(&tasklist_lock);
1999 } 1978 }
2000} 1979}
@@ -2290,7 +2269,6 @@ static int
2290do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) 2269do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2291{ 2270{
2292 struct task_struct *p; 2271 struct task_struct *p;
2293 unsigned long flags;
2294 int error = -ESRCH; 2272 int error = -ESRCH;
2295 2273
2296 rcu_read_lock(); 2274 rcu_read_lock();
@@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2300 /* 2278 /*
2301 * The null signal is a permissions and process existence 2279 * The null signal is a permissions and process existence
2302 * probe. No signal is actually delivered. 2280 * probe. No signal is actually delivered.
2303 *
2304 * If lock_task_sighand() fails we pretend the task dies
2305 * after receiving the signal. The window is tiny, and the
2306 * signal is private anyway.
2307 */ 2281 */
2308 if (!error && sig && lock_task_sighand(p, &flags)) { 2282 if (!error && sig) {
2309 error = specific_send_sig_info(sig, info, p); 2283 error = do_send_sig_info(sig, info, p, false);
2310 unlock_task_sighand(p, &flags); 2284 /*
2285 * If lock_task_sighand() failed we pretend the task
2286 * dies after receiving the signal. The window is tiny,
2287 * and the signal is private anyway.
2288 */
2289 if (unlikely(error == -ESRCH))
2290 error = 0;
2311 } 2291 }
2312 } 2292 }
2313 rcu_read_unlock(); 2293 rcu_read_unlock();
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 09d7519557d3..0d31135efbf4 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long);
26static void slow_work_oom_timeout(unsigned long); 26static void slow_work_oom_timeout(unsigned long);
27 27
28#ifdef CONFIG_SYSCTL 28#ifdef CONFIG_SYSCTL
29static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, 29static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
31 31
32static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, 32static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 33 void __user *, size_t *, loff_t *);
34#endif 34#endif
35 35
@@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data)
493 * Handle adjustment of the minimum number of threads 493 * Handle adjustment of the minimum number of threads
494 */ 494 */
495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 495static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 struct file *filp, void __user *buffer, 496 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 497 size_t *lenp, loff_t *ppos)
498{ 498{
499 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 499 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 500 int n;
501 501
502 if (ret == 0) { 502 if (ret == 0) {
@@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
521 * Handle adjustment of the maximum number of threads 521 * Handle adjustment of the maximum number of threads
522 */ 522 */
523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 523static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 struct file *filp, void __user *buffer, 524 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 525 size_t *lenp, loff_t *ppos)
526{ 526{
527 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 527 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 528 int n;
529 529
530 if (ret == 0) { 530 if (ret == 0) {
diff --git a/kernel/smp.c b/kernel/smp.c
index 94188b8ecc33..c9d1c7835c2f 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,8 +29,7 @@ enum {
29 29
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data csd; 31 struct call_single_data csd;
32 spinlock_t lock; 32 atomic_t refs;
33 unsigned int refs;
34 cpumask_var_t cpumask; 33 cpumask_var_t cpumask;
35}; 34};
36 35
@@ -39,9 +38,7 @@ struct call_single_queue {
39 spinlock_t lock; 38 spinlock_t lock;
40}; 39};
41 40
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45 42
46static int 43static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 174 int cpu = get_cpu();
178 175
179 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online.
178 */
179 WARN_ON_ONCE(!cpu_online(cpu));
180
181 /*
180 * Ensure entry is visible on call_function_queue after we have 182 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 183 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 184 * If we don't have this, then we may miss an entry on the list
@@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 193 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
192 int refs; 194 int refs;
193 195
194 spin_lock(&data->lock); 196 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
197 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
201 198
202 data->csd.func(data->csd.info); 199 data->csd.func(data->csd.info);
203 200
204 spin_lock(&data->lock); 201 refs = atomic_dec_return(&data->refs);
205 WARN_ON(data->refs == 0); 202 WARN_ON(refs < 0);
206 refs = --data->refs;
207 if (!refs) { 203 if (!refs) {
208 spin_lock(&call_function.lock); 204 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock); 206 spin_unlock(&call_function.lock);
211 } 207 }
212 spin_unlock(&data->lock);
213 208
214 if (refs) 209 if (refs)
215 continue; 210 continue;
@@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 225 unsigned int data_flags;
231 LIST_HEAD(list); 226 LIST_HEAD(list);
232 227
228 /*
229 * Shouldn't receive this interrupt on a cpu that is not yet online.
230 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232
233 spin_lock(&q->lock); 233 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 234 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 235 spin_unlock(&q->lock);
@@ -285,8 +285,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 285 */
286 this_cpu = get_cpu(); 286 this_cpu = get_cpu();
287 287
288 /* Can deadlock when called with interrupts disabled */ 288 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 289 * Can deadlock when called with interrupts disabled.
290 * We allow cpu's that are not yet online though, as no one else can
291 * send smp call function interrupt to this cpu and as such deadlocks
292 * can't happen.
293 */
294 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
295 && !oops_in_progress);
290 296
291 if (cpu == this_cpu) { 297 if (cpu == this_cpu) {
292 local_irq_save(flags); 298 local_irq_save(flags);
@@ -329,19 +335,18 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 335{
330 csd_lock(data); 336 csd_lock(data);
331 337
332 /* Can deadlock when called with interrupts disabled */ 338 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 339 * Can deadlock when called with interrupts disabled.
340 * We allow cpu's that are not yet online though, as no one else can
341 * send smp call function interrupt to this cpu and as such deadlocks
342 * can't happen.
343 */
344 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
345 && !oops_in_progress);
334 346
335 generic_exec_single(cpu, data, wait); 347 generic_exec_single(cpu, data, wait);
336} 348}
337 349
338/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
339
340#ifndef arch_send_call_function_ipi_mask
341# define arch_send_call_function_ipi_mask(maskp) \
342 arch_send_call_function_ipi(*(maskp))
343#endif
344
345/** 350/**
346 * smp_call_function_many(): Run a function on a set of other CPUs. 351 * smp_call_function_many(): Run a function on a set of other CPUs.
347 * @mask: The set of cpus to run on (only runs on online subset). 352 * @mask: The set of cpus to run on (only runs on online subset).
@@ -365,8 +370,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 370 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 371 int cpu, next_cpu, this_cpu = smp_processor_id();
367 372
368 /* Can deadlock when called with interrupts disabled */ 373 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 374 * Can deadlock when called with interrupts disabled.
375 * We allow cpu's that are not yet online though, as no one else can
376 * send smp call function interrupt to this cpu and as such deadlocks
377 * can't happen.
378 */
379 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
380 && !oops_in_progress);
370 381
371 /* So, what's a CPU they want? Ignoring this one. */ 382 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 383 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -391,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask,
391 data = &__get_cpu_var(cfd_data); 402 data = &__get_cpu_var(cfd_data);
392 csd_lock(&data->csd); 403 csd_lock(&data->csd);
393 404
394 spin_lock_irqsave(&data->lock, flags);
395 data->csd.func = func; 405 data->csd.func = func;
396 data->csd.info = info; 406 data->csd.info = info;
397 cpumask_and(data->cpumask, mask, cpu_online_mask); 407 cpumask_and(data->cpumask, mask, cpu_online_mask);
398 cpumask_clear_cpu(this_cpu, data->cpumask); 408 cpumask_clear_cpu(this_cpu, data->cpumask);
399 data->refs = cpumask_weight(data->cpumask); 409 atomic_set(&data->refs, cpumask_weight(data->cpumask));
400 410
401 spin_lock(&call_function.lock); 411 spin_lock_irqsave(&call_function.lock, flags);
402 /* 412 /*
403 * Place entry at the _HEAD_ of the list, so that any cpu still 413 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt() 414 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries: 415 * will not miss any other list entries:
406 */ 416 */
407 list_add_rcu(&data->csd.list, &call_function.queue); 417 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock); 418 spin_unlock_irqrestore(&call_function.lock, flags);
409
410 spin_unlock_irqrestore(&data->lock, flags);
411 419
412 /* 420 /*
413 * Make the list addition visible before sending the ipi. 421 * Make the list addition visible before sending the ipi.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a0485..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
722 cond_resched(); 722 cond_resched();
723 preempt_disable(); 723 preempt_disable();
724 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
725 } 725 }
726 preempt_enable(); 726 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 88796c330838..81324d12eb35 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void)
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 90EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 91
92int proc_dosoftlockup_thresh(struct ctl_table *table, int write, 92int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
93 struct file *filp, void __user *buffer, 93 void __user *buffer,
94 size_t *lenp, loff_t *ppos) 94 size_t *lenp, loff_t *ppos)
95{ 95{
96 touch_all_softlockup_watchdogs(); 96 touch_all_softlockup_watchdogs();
97 return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 97 return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
98} 98}
99 99
100/* 100/*
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..255475d163e0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/resource.h> 18#include <linux/resource.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338 unsigned long flags; 1338 unsigned long flags;
1339 cputime_t utime, stime; 1339 cputime_t utime, stime;
1340 struct task_cputime cputime; 1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0;
1341 1342
1342 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1343 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1346 utime = task_utime(current); 1347 utime = task_utime(current);
1347 stime = task_stime(current); 1348 stime = task_stime(current);
1348 accumulate_thread_rusage(p, r); 1349 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss;
1349 goto out; 1351 goto out;
1350 } 1352 }
1351 1353
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1363 r->ru_majflt = p->signal->cmaj_flt; 1365 r->ru_majflt = p->signal->cmaj_flt;
1364 r->ru_inblock = p->signal->cinblock; 1366 r->ru_inblock = p->signal->cinblock;
1365 r->ru_oublock = p->signal->coublock; 1367 r->ru_oublock = p->signal->coublock;
1368 maxrss = p->signal->cmaxrss;
1366 1369
1367 if (who == RUSAGE_CHILDREN) 1370 if (who == RUSAGE_CHILDREN)
1368 break; 1371 break;
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1377 r->ru_majflt += p->signal->maj_flt; 1380 r->ru_majflt += p->signal->maj_flt;
1378 r->ru_inblock += p->signal->inblock; 1381 r->ru_inblock += p->signal->inblock;
1379 r->ru_oublock += p->signal->oublock; 1382 r->ru_oublock += p->signal->oublock;
1383 if (maxrss < p->signal->maxrss)
1384 maxrss = p->signal->maxrss;
1380 t = p; 1385 t = p;
1381 do { 1386 do {
1382 accumulate_thread_rusage(t, r); 1387 accumulate_thread_rusage(t, r);
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1392out: 1397out:
1393 cputime_to_timeval(utime, &r->ru_utime); 1398 cputime_to_timeval(utime, &r->ru_utime);
1394 cputime_to_timeval(stime, &r->ru_stime); 1399 cputime_to_timeval(stime, &r->ru_stime);
1400
1401 if (who != RUSAGE_CHILDREN) {
1402 struct mm_struct *mm = get_task_mm(p);
1403 if (mm) {
1404 setmax_mm_hiwater_rss(&maxrss, mm);
1405 mmput(mm);
1406 }
1407 }
1408 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1395} 1409}
1396 1410
1397int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1411int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1511 case PR_SET_TSC: 1525 case PR_SET_TSC:
1512 error = SET_TSC_CTL(arg2); 1526 error = SET_TSC_CTL(arg2);
1513 break; 1527 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE: 1528 case PR_TASK_PERF_EVENTS_DISABLE:
1515 error = perf_counter_task_disable(); 1529 error = perf_event_task_disable();
1516 break; 1530 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE: 1531 case PR_TASK_PERF_EVENTS_ENABLE:
1518 error = perf_counter_task_enable(); 1532 error = perf_event_task_enable();
1519 break; 1533 break;
1520 case PR_GET_TIMERSLACK: 1534 case PR_GET_TIMERSLACK:
1521 error = current->timer_slack_ns; 1535 error = current->timer_slack_ns;
@@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1528 current->timer_slack_ns = arg2; 1542 current->timer_slack_ns = arg2;
1529 error = 0; 1543 error = 0;
1530 break; 1544 break;
1545 case PR_MCE_KILL:
1546 if (arg4 | arg5)
1547 return -EINVAL;
1548 switch (arg2) {
1549 case 0:
1550 if (arg3 != 0)
1551 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS;
1553 break;
1554 case 1:
1555 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0)
1557 current->flags |= PF_MCE_EARLY;
1558 else
1559 current->flags &= ~PF_MCE_EARLY;
1560 break;
1561 default:
1562 return -EINVAL;
1563 }
1564 error = 0;
1565 break;
1566
1531 default: 1567 default:
1532 error = -EINVAL; 1568 error = -EINVAL;
1533 break; 1569 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..e06d0b8d1951 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg);
49cond_syscall(compat_sys_sendmsg); 49cond_syscall(compat_sys_sendmsg);
50cond_syscall(sys_recvmsg); 50cond_syscall(sys_recvmsg);
51cond_syscall(compat_sys_recvmsg); 51cond_syscall(compat_sys_recvmsg);
52cond_syscall(compat_sys_recvfrom);
52cond_syscall(sys_socketcall); 53cond_syscall(sys_socketcall);
53cond_syscall(sys_futex); 54cond_syscall(sys_futex);
54cond_syscall(compat_sys_futex); 55cond_syscall(compat_sys_futex);
@@ -177,4 +178,4 @@ cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 178cond_syscall(sys_eventfd2);
178 179
179/* performance counters: */ 180/* performance counters: */
180cond_syscall(sys_perf_counter_open); 181cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd0..0d949c517412 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -26,7 +26,6 @@
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/security.h> 27#include <linux/security.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/utsname.h>
30#include <linux/kmemcheck.h> 29#include <linux/kmemcheck.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/fs.h> 31#include <linux/fs.h>
@@ -49,9 +48,8 @@
49#include <linux/acpi.h> 48#include <linux/acpi.h>
50#include <linux/reboot.h> 49#include <linux/reboot.h>
51#include <linux/ftrace.h> 50#include <linux/ftrace.h>
52#include <linux/security.h>
53#include <linux/slow-work.h> 51#include <linux/slow-work.h>
54#include <linux/perf_counter.h> 52#include <linux/perf_event.h>
55 53
56#include <asm/uaccess.h> 54#include <asm/uaccess.h>
57#include <asm/processor.h> 55#include <asm/processor.h>
@@ -78,6 +76,7 @@ extern int max_threads;
78extern int core_uses_pid; 76extern int core_uses_pid;
79extern int suid_dumpable; 77extern int suid_dumpable;
80extern char core_pattern[]; 78extern char core_pattern[];
79extern unsigned int core_pipe_limit;
81extern int pid_max; 80extern int pid_max;
82extern int min_free_kbytes; 81extern int min_free_kbytes;
83extern int pid_max_min, pid_max_max; 82extern int pid_max_min, pid_max_max;
@@ -92,6 +91,9 @@ extern int sysctl_nr_trim_pages;
92#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
93extern int rcutorture_runnable; 92extern int rcutorture_runnable;
94#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled;
96#endif
95 97
96/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
97#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -104,6 +106,9 @@ static int __maybe_unused one = 1;
104static int __maybe_unused two = 2; 106static int __maybe_unused two = 2;
105static unsigned long one_ul = 1; 107static unsigned long one_ul = 1;
106static int one_hundred = 100; 108static int one_hundred = 100;
109#ifdef CONFIG_PRINTK
110static int ten_thousand = 10000;
111#endif
107 112
108/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ 113/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
109static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; 114static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -158,9 +163,9 @@ extern int max_lock_depth;
158#endif 163#endif
159 164
160#ifdef CONFIG_PROC_SYSCTL 165#ifdef CONFIG_PROC_SYSCTL
161static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 166static int proc_do_cad_pid(struct ctl_table *table, int write,
162 void __user *buffer, size_t *lenp, loff_t *ppos); 167 void __user *buffer, size_t *lenp, loff_t *ppos);
163static int proc_taint(struct ctl_table *table, int write, struct file *filp, 168static int proc_taint(struct ctl_table *table, int write,
164 void __user *buffer, size_t *lenp, loff_t *ppos); 169 void __user *buffer, size_t *lenp, loff_t *ppos);
165#endif 170#endif
166 171
@@ -246,6 +251,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
246#endif 251#endif
247 252
248static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int),
259 .mode = 0644,
260 .proc_handler = &proc_dointvec,
261 },
249#ifdef CONFIG_SCHED_DEBUG 262#ifdef CONFIG_SCHED_DEBUG
250 { 263 {
251 .ctl_name = CTL_UNNUMBERED, 264 .ctl_name = CTL_UNNUMBERED,
@@ -300,14 +313,6 @@ static struct ctl_table kern_table[] = {
300 }, 313 },
301 { 314 {
302 .ctl_name = CTL_UNNUMBERED, 315 .ctl_name = CTL_UNNUMBERED,
303 .procname = "sched_child_runs_first",
304 .data = &sysctl_sched_child_runs_first,
305 .maxlen = sizeof(unsigned int),
306 .mode = 0644,
307 .proc_handler = &proc_dointvec,
308 },
309 {
310 .ctl_name = CTL_UNNUMBERED,
311 .procname = "sched_features", 316 .procname = "sched_features",
312 .data = &sysctl_sched_features, 317 .data = &sysctl_sched_features,
313 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
@@ -332,6 +337,14 @@ static struct ctl_table kern_table[] = {
332 }, 337 },
333 { 338 {
334 .ctl_name = CTL_UNNUMBERED, 339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int),
343 .mode = 0644,
344 .proc_handler = &proc_dointvec,
345 },
346 {
347 .ctl_name = CTL_UNNUMBERED,
335 .procname = "timer_migration", 348 .procname = "timer_migration",
336 .data = &sysctl_timer_migration, 349 .data = &sysctl_timer_migration,
337 .maxlen = sizeof(unsigned int), 350 .maxlen = sizeof(unsigned int),
@@ -411,6 +424,14 @@ static struct ctl_table kern_table[] = {
411 .proc_handler = &proc_dostring, 424 .proc_handler = &proc_dostring,
412 .strategy = &sysctl_string, 425 .strategy = &sysctl_string,
413 }, 426 },
427 {
428 .ctl_name = CTL_UNNUMBERED,
429 .procname = "core_pipe_limit",
430 .data = &core_pipe_limit,
431 .maxlen = sizeof(unsigned int),
432 .mode = 0644,
433 .proc_handler = &proc_dointvec,
434 },
414#ifdef CONFIG_PROC_SYSCTL 435#ifdef CONFIG_PROC_SYSCTL
415 { 436 {
416 .procname = "tainted", 437 .procname = "tainted",
@@ -712,6 +733,17 @@ static struct ctl_table kern_table[] = {
712 .mode = 0644, 733 .mode = 0644,
713 .proc_handler = &proc_dointvec, 734 .proc_handler = &proc_dointvec,
714 }, 735 },
736 {
737 .ctl_name = CTL_UNNUMBERED,
738 .procname = "printk_delay",
739 .data = &printk_delay_msec,
740 .maxlen = sizeof(int),
741 .mode = 0644,
742 .proc_handler = &proc_dointvec_minmax,
743 .strategy = &sysctl_intvec,
744 .extra1 = &zero,
745 .extra2 = &ten_thousand,
746 },
715#endif 747#endif
716 { 748 {
717 .ctl_name = KERN_NGROUPS_MAX, 749 .ctl_name = KERN_NGROUPS_MAX,
@@ -954,28 +986,28 @@ static struct ctl_table kern_table[] = {
954 .child = slow_work_sysctls, 986 .child = slow_work_sysctls,
955 }, 987 },
956#endif 988#endif
957#ifdef CONFIG_PERF_COUNTERS 989#ifdef CONFIG_PERF_EVENTS
958 { 990 {
959 .ctl_name = CTL_UNNUMBERED, 991 .ctl_name = CTL_UNNUMBERED,
960 .procname = "perf_counter_paranoid", 992 .procname = "perf_event_paranoid",
961 .data = &sysctl_perf_counter_paranoid, 993 .data = &sysctl_perf_event_paranoid,
962 .maxlen = sizeof(sysctl_perf_counter_paranoid), 994 .maxlen = sizeof(sysctl_perf_event_paranoid),
963 .mode = 0644, 995 .mode = 0644,
964 .proc_handler = &proc_dointvec, 996 .proc_handler = &proc_dointvec,
965 }, 997 },
966 { 998 {
967 .ctl_name = CTL_UNNUMBERED, 999 .ctl_name = CTL_UNNUMBERED,
968 .procname = "perf_counter_mlock_kb", 1000 .procname = "perf_event_mlock_kb",
969 .data = &sysctl_perf_counter_mlock, 1001 .data = &sysctl_perf_event_mlock,
970 .maxlen = sizeof(sysctl_perf_counter_mlock), 1002 .maxlen = sizeof(sysctl_perf_event_mlock),
971 .mode = 0644, 1003 .mode = 0644,
972 .proc_handler = &proc_dointvec, 1004 .proc_handler = &proc_dointvec,
973 }, 1005 },
974 { 1006 {
975 .ctl_name = CTL_UNNUMBERED, 1007 .ctl_name = CTL_UNNUMBERED,
976 .procname = "perf_counter_max_sample_rate", 1008 .procname = "perf_event_max_sample_rate",
977 .data = &sysctl_perf_counter_sample_rate, 1009 .data = &sysctl_perf_event_sample_rate,
978 .maxlen = sizeof(sysctl_perf_counter_sample_rate), 1010 .maxlen = sizeof(sysctl_perf_event_sample_rate),
979 .mode = 0644, 1011 .mode = 0644,
980 .proc_handler = &proc_dointvec, 1012 .proc_handler = &proc_dointvec,
981 }, 1013 },
@@ -990,7 +1022,16 @@ static struct ctl_table kern_table[] = {
990 .proc_handler = &proc_dointvec, 1022 .proc_handler = &proc_dointvec,
991 }, 1023 },
992#endif 1024#endif
993 1025#ifdef CONFIG_BLOCK
1026 {
1027 .ctl_name = CTL_UNNUMBERED,
1028 .procname = "blk_iopoll",
1029 .data = &blk_iopoll_enabled,
1030 .maxlen = sizeof(int),
1031 .mode = 0644,
1032 .proc_handler = &proc_dointvec,
1033 },
1034#endif
994/* 1035/*
995 * NOTE: do not add new entries to this table unless you have read 1036 * NOTE: do not add new entries to this table unless you have read
996 * Documentation/sysctl/ctl_unnumbered.txt 1037 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1357,6 +1398,31 @@ static struct ctl_table vm_table[] = {
1357 .mode = 0644, 1398 .mode = 0644,
1358 .proc_handler = &scan_unevictable_handler, 1399 .proc_handler = &scan_unevictable_handler,
1359 }, 1400 },
1401#ifdef CONFIG_MEMORY_FAILURE
1402 {
1403 .ctl_name = CTL_UNNUMBERED,
1404 .procname = "memory_failure_early_kill",
1405 .data = &sysctl_memory_failure_early_kill,
1406 .maxlen = sizeof(sysctl_memory_failure_early_kill),
1407 .mode = 0644,
1408 .proc_handler = &proc_dointvec_minmax,
1409 .strategy = &sysctl_intvec,
1410 .extra1 = &zero,
1411 .extra2 = &one,
1412 },
1413 {
1414 .ctl_name = CTL_UNNUMBERED,
1415 .procname = "memory_failure_recovery",
1416 .data = &sysctl_memory_failure_recovery,
1417 .maxlen = sizeof(sysctl_memory_failure_recovery),
1418 .mode = 0644,
1419 .proc_handler = &proc_dointvec_minmax,
1420 .strategy = &sysctl_intvec,
1421 .extra1 = &zero,
1422 .extra2 = &one,
1423 },
1424#endif
1425
1360/* 1426/*
1361 * NOTE: do not add new entries to this table unless you have read 1427 * NOTE: do not add new entries to this table unless you have read
1362 * Documentation/sysctl/ctl_unnumbered.txt 1428 * Documentation/sysctl/ctl_unnumbered.txt
@@ -2185,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head)
2185#ifdef CONFIG_PROC_SYSCTL 2251#ifdef CONFIG_PROC_SYSCTL
2186 2252
2187static int _proc_do_string(void* data, int maxlen, int write, 2253static int _proc_do_string(void* data, int maxlen, int write,
2188 struct file *filp, void __user *buffer, 2254 void __user *buffer,
2189 size_t *lenp, loff_t *ppos) 2255 size_t *lenp, loff_t *ppos)
2190{ 2256{
2191 size_t len; 2257 size_t len;
@@ -2246,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write,
2246 * proc_dostring - read a string sysctl 2312 * proc_dostring - read a string sysctl
2247 * @table: the sysctl table 2313 * @table: the sysctl table
2248 * @write: %TRUE if this is a write to the sysctl file 2314 * @write: %TRUE if this is a write to the sysctl file
2249 * @filp: the file structure
2250 * @buffer: the user buffer 2315 * @buffer: the user buffer
2251 * @lenp: the size of the user buffer 2316 * @lenp: the size of the user buffer
2252 * @ppos: file position 2317 * @ppos: file position
@@ -2260,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write,
2260 * 2325 *
2261 * Returns 0 on success. 2326 * Returns 0 on success.
2262 */ 2327 */
2263int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2328int proc_dostring(struct ctl_table *table, int write,
2264 void __user *buffer, size_t *lenp, loff_t *ppos) 2329 void __user *buffer, size_t *lenp, loff_t *ppos)
2265{ 2330{
2266 return _proc_do_string(table->data, table->maxlen, write, filp, 2331 return _proc_do_string(table->data, table->maxlen, write,
2267 buffer, lenp, ppos); 2332 buffer, lenp, ppos);
2268} 2333}
2269 2334
@@ -2288,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2288} 2353}
2289 2354
2290static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2355static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2291 int write, struct file *filp, void __user *buffer, 2356 int write, void __user *buffer,
2292 size_t *lenp, loff_t *ppos, 2357 size_t *lenp, loff_t *ppos,
2293 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2358 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2294 int write, void *data), 2359 int write, void *data),
@@ -2395,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2395#undef TMPBUFLEN 2460#undef TMPBUFLEN
2396} 2461}
2397 2462
2398static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2463static int do_proc_dointvec(struct ctl_table *table, int write,
2399 void __user *buffer, size_t *lenp, loff_t *ppos, 2464 void __user *buffer, size_t *lenp, loff_t *ppos,
2400 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2465 int (*conv)(int *negp, unsigned long *lvalp, int *valp,
2401 int write, void *data), 2466 int write, void *data),
2402 void *data) 2467 void *data)
2403{ 2468{
2404 return __do_proc_dointvec(table->data, table, write, filp, 2469 return __do_proc_dointvec(table->data, table, write,
2405 buffer, lenp, ppos, conv, data); 2470 buffer, lenp, ppos, conv, data);
2406} 2471}
2407 2472
@@ -2409,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2409 * proc_dointvec - read a vector of integers 2474 * proc_dointvec - read a vector of integers
2410 * @table: the sysctl table 2475 * @table: the sysctl table
2411 * @write: %TRUE if this is a write to the sysctl file 2476 * @write: %TRUE if this is a write to the sysctl file
2412 * @filp: the file structure
2413 * @buffer: the user buffer 2477 * @buffer: the user buffer
2414 * @lenp: the size of the user buffer 2478 * @lenp: the size of the user buffer
2415 * @ppos: file position 2479 * @ppos: file position
@@ -2419,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil
2419 * 2483 *
2420 * Returns 0 on success. 2484 * Returns 0 on success.
2421 */ 2485 */
2422int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2486int proc_dointvec(struct ctl_table *table, int write,
2423 void __user *buffer, size_t *lenp, loff_t *ppos) 2487 void __user *buffer, size_t *lenp, loff_t *ppos)
2424{ 2488{
2425 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2489 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2426 NULL,NULL); 2490 NULL,NULL);
2427} 2491}
2428 2492
@@ -2430,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2430 * Taint values can only be increased 2494 * Taint values can only be increased
2431 * This means we can safely use a temporary. 2495 * This means we can safely use a temporary.
2432 */ 2496 */
2433static int proc_taint(struct ctl_table *table, int write, struct file *filp, 2497static int proc_taint(struct ctl_table *table, int write,
2434 void __user *buffer, size_t *lenp, loff_t *ppos) 2498 void __user *buffer, size_t *lenp, loff_t *ppos)
2435{ 2499{
2436 struct ctl_table t; 2500 struct ctl_table t;
@@ -2442,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2442 2506
2443 t = *table; 2507 t = *table;
2444 t.data = &tmptaint; 2508 t.data = &tmptaint;
2445 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); 2509 err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
2446 if (err < 0) 2510 if (err < 0)
2447 return err; 2511 return err;
2448 2512
@@ -2494,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2494 * proc_dointvec_minmax - read a vector of integers with min/max values 2558 * proc_dointvec_minmax - read a vector of integers with min/max values
2495 * @table: the sysctl table 2559 * @table: the sysctl table
2496 * @write: %TRUE if this is a write to the sysctl file 2560 * @write: %TRUE if this is a write to the sysctl file
2497 * @filp: the file structure
2498 * @buffer: the user buffer 2561 * @buffer: the user buffer
2499 * @lenp: the size of the user buffer 2562 * @lenp: the size of the user buffer
2500 * @ppos: file position 2563 * @ppos: file position
@@ -2507,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2507 * 2570 *
2508 * Returns 0 on success. 2571 * Returns 0 on success.
2509 */ 2572 */
2510int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2573int proc_dointvec_minmax(struct ctl_table *table, int write,
2511 void __user *buffer, size_t *lenp, loff_t *ppos) 2574 void __user *buffer, size_t *lenp, loff_t *ppos)
2512{ 2575{
2513 struct do_proc_dointvec_minmax_conv_param param = { 2576 struct do_proc_dointvec_minmax_conv_param param = {
2514 .min = (int *) table->extra1, 2577 .min = (int *) table->extra1,
2515 .max = (int *) table->extra2, 2578 .max = (int *) table->extra2,
2516 }; 2579 };
2517 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2580 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2518 do_proc_dointvec_minmax_conv, &param); 2581 do_proc_dointvec_minmax_conv, &param);
2519} 2582}
2520 2583
2521static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, 2584static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
2522 struct file *filp,
2523 void __user *buffer, 2585 void __user *buffer,
2524 size_t *lenp, loff_t *ppos, 2586 size_t *lenp, loff_t *ppos,
2525 unsigned long convmul, 2587 unsigned long convmul,
@@ -2624,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2624} 2686}
2625 2687
2626static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2688static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2627 struct file *filp,
2628 void __user *buffer, 2689 void __user *buffer,
2629 size_t *lenp, loff_t *ppos, 2690 size_t *lenp, loff_t *ppos,
2630 unsigned long convmul, 2691 unsigned long convmul,
2631 unsigned long convdiv) 2692 unsigned long convdiv)
2632{ 2693{
2633 return __do_proc_doulongvec_minmax(table->data, table, write, 2694 return __do_proc_doulongvec_minmax(table->data, table, write,
2634 filp, buffer, lenp, ppos, convmul, convdiv); 2695 buffer, lenp, ppos, convmul, convdiv);
2635} 2696}
2636 2697
2637/** 2698/**
2638 * proc_doulongvec_minmax - read a vector of long integers with min/max values 2699 * proc_doulongvec_minmax - read a vector of long integers with min/max values
2639 * @table: the sysctl table 2700 * @table: the sysctl table
2640 * @write: %TRUE if this is a write to the sysctl file 2701 * @write: %TRUE if this is a write to the sysctl file
2641 * @filp: the file structure
2642 * @buffer: the user buffer 2702 * @buffer: the user buffer
2643 * @lenp: the size of the user buffer 2703 * @lenp: the size of the user buffer
2644 * @ppos: file position 2704 * @ppos: file position
@@ -2651,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
2651 * 2711 *
2652 * Returns 0 on success. 2712 * Returns 0 on success.
2653 */ 2713 */
2654int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2714int proc_doulongvec_minmax(struct ctl_table *table, int write,
2655 void __user *buffer, size_t *lenp, loff_t *ppos) 2715 void __user *buffer, size_t *lenp, loff_t *ppos)
2656{ 2716{
2657 return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); 2717 return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
2658} 2718}
2659 2719
2660/** 2720/**
2661 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values 2721 * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
2662 * @table: the sysctl table 2722 * @table: the sysctl table
2663 * @write: %TRUE if this is a write to the sysctl file 2723 * @write: %TRUE if this is a write to the sysctl file
2664 * @filp: the file structure
2665 * @buffer: the user buffer 2724 * @buffer: the user buffer
2666 * @lenp: the size of the user buffer 2725 * @lenp: the size of the user buffer
2667 * @ppos: file position 2726 * @ppos: file position
@@ -2676,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp
2676 * Returns 0 on success. 2735 * Returns 0 on success.
2677 */ 2736 */
2678int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2737int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2679 struct file *filp,
2680 void __user *buffer, 2738 void __user *buffer,
2681 size_t *lenp, loff_t *ppos) 2739 size_t *lenp, loff_t *ppos)
2682{ 2740{
2683 return do_proc_doulongvec_minmax(table, write, filp, buffer, 2741 return do_proc_doulongvec_minmax(table, write, buffer,
2684 lenp, ppos, HZ, 1000l); 2742 lenp, ppos, HZ, 1000l);
2685} 2743}
2686 2744
@@ -2756,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2756 * proc_dointvec_jiffies - read a vector of integers as seconds 2814 * proc_dointvec_jiffies - read a vector of integers as seconds
2757 * @table: the sysctl table 2815 * @table: the sysctl table
2758 * @write: %TRUE if this is a write to the sysctl file 2816 * @write: %TRUE if this is a write to the sysctl file
2759 * @filp: the file structure
2760 * @buffer: the user buffer 2817 * @buffer: the user buffer
2761 * @lenp: the size of the user buffer 2818 * @lenp: the size of the user buffer
2762 * @ppos: file position 2819 * @ppos: file position
@@ -2768,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2768 * 2825 *
2769 * Returns 0 on success. 2826 * Returns 0 on success.
2770 */ 2827 */
2771int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2828int proc_dointvec_jiffies(struct ctl_table *table, int write,
2772 void __user *buffer, size_t *lenp, loff_t *ppos) 2829 void __user *buffer, size_t *lenp, loff_t *ppos)
2773{ 2830{
2774 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2831 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2775 do_proc_dointvec_jiffies_conv,NULL); 2832 do_proc_dointvec_jiffies_conv,NULL);
2776} 2833}
2777 2834
@@ -2779,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2779 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds 2836 * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
2780 * @table: the sysctl table 2837 * @table: the sysctl table
2781 * @write: %TRUE if this is a write to the sysctl file 2838 * @write: %TRUE if this is a write to the sysctl file
2782 * @filp: the file structure
2783 * @buffer: the user buffer 2839 * @buffer: the user buffer
2784 * @lenp: the size of the user buffer 2840 * @lenp: the size of the user buffer
2785 * @ppos: pointer to the file position 2841 * @ppos: pointer to the file position
@@ -2791,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
2791 * 2847 *
2792 * Returns 0 on success. 2848 * Returns 0 on success.
2793 */ 2849 */
2794int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2850int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2795 void __user *buffer, size_t *lenp, loff_t *ppos) 2851 void __user *buffer, size_t *lenp, loff_t *ppos)
2796{ 2852{
2797 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2853 return do_proc_dointvec(table,write,buffer,lenp,ppos,
2798 do_proc_dointvec_userhz_jiffies_conv,NULL); 2854 do_proc_dointvec_userhz_jiffies_conv,NULL);
2799} 2855}
2800 2856
@@ -2802,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2802 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds 2858 * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
2803 * @table: the sysctl table 2859 * @table: the sysctl table
2804 * @write: %TRUE if this is a write to the sysctl file 2860 * @write: %TRUE if this is a write to the sysctl file
2805 * @filp: the file structure
2806 * @buffer: the user buffer 2861 * @buffer: the user buffer
2807 * @lenp: the size of the user buffer 2862 * @lenp: the size of the user buffer
2808 * @ppos: file position 2863 * @ppos: file position
@@ -2815,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file
2815 * 2870 *
2816 * Returns 0 on success. 2871 * Returns 0 on success.
2817 */ 2872 */
2818int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2873int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2819 void __user *buffer, size_t *lenp, loff_t *ppos) 2874 void __user *buffer, size_t *lenp, loff_t *ppos)
2820{ 2875{
2821 return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, 2876 return do_proc_dointvec(table, write, buffer, lenp, ppos,
2822 do_proc_dointvec_ms_jiffies_conv, NULL); 2877 do_proc_dointvec_ms_jiffies_conv, NULL);
2823} 2878}
2824 2879
2825static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 2880static int proc_do_cad_pid(struct ctl_table *table, int write,
2826 void __user *buffer, size_t *lenp, loff_t *ppos) 2881 void __user *buffer, size_t *lenp, loff_t *ppos)
2827{ 2882{
2828 struct pid *new_pid; 2883 struct pid *new_pid;
@@ -2831,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2831 2886
2832 tmp = pid_vnr(cad_pid); 2887 tmp = pid_vnr(cad_pid);
2833 2888
2834 r = __do_proc_dointvec(&tmp, table, write, filp, buffer, 2889 r = __do_proc_dointvec(&tmp, table, write, buffer,
2835 lenp, ppos, NULL, NULL); 2890 lenp, ppos, NULL, NULL);
2836 if (r || !write) 2891 if (r || !write)
2837 return r; 2892 return r;
@@ -2846,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp
2846 2901
2847#else /* CONFIG_PROC_FS */ 2902#else /* CONFIG_PROC_FS */
2848 2903
2849int proc_dostring(struct ctl_table *table, int write, struct file *filp, 2904int proc_dostring(struct ctl_table *table, int write,
2850 void __user *buffer, size_t *lenp, loff_t *ppos) 2905 void __user *buffer, size_t *lenp, loff_t *ppos)
2851{ 2906{
2852 return -ENOSYS; 2907 return -ENOSYS;
2853} 2908}
2854 2909
2855int proc_dointvec(struct ctl_table *table, int write, struct file *filp, 2910int proc_dointvec(struct ctl_table *table, int write,
2856 void __user *buffer, size_t *lenp, loff_t *ppos) 2911 void __user *buffer, size_t *lenp, loff_t *ppos)
2857{ 2912{
2858 return -ENOSYS; 2913 return -ENOSYS;
2859} 2914}
2860 2915
2861int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, 2916int proc_dointvec_minmax(struct ctl_table *table, int write,
2862 void __user *buffer, size_t *lenp, loff_t *ppos) 2917 void __user *buffer, size_t *lenp, loff_t *ppos)
2863{ 2918{
2864 return -ENOSYS; 2919 return -ENOSYS;
2865} 2920}
2866 2921
2867int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, 2922int proc_dointvec_jiffies(struct ctl_table *table, int write,
2868 void __user *buffer, size_t *lenp, loff_t *ppos) 2923 void __user *buffer, size_t *lenp, loff_t *ppos)
2869{ 2924{
2870 return -ENOSYS; 2925 return -ENOSYS;
2871} 2926}
2872 2927
2873int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, 2928int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
2874 void __user *buffer, size_t *lenp, loff_t *ppos) 2929 void __user *buffer, size_t *lenp, loff_t *ppos)
2875{ 2930{
2876 return -ENOSYS; 2931 return -ENOSYS;
2877} 2932}
2878 2933
2879int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, 2934int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
2880 void __user *buffer, size_t *lenp, loff_t *ppos) 2935 void __user *buffer, size_t *lenp, loff_t *ppos)
2881{ 2936{
2882 return -ENOSYS; 2937 return -ENOSYS;
2883} 2938}
2884 2939
2885int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, 2940int proc_doulongvec_minmax(struct ctl_table *table, int write,
2886 void __user *buffer, size_t *lenp, loff_t *ppos) 2941 void __user *buffer, size_t *lenp, loff_t *ppos)
2887{ 2942{
2888 return -ENOSYS; 2943 return -ENOSYS;
2889} 2944}
2890 2945
2891int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, 2946int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2892 struct file *filp,
2893 void __user *buffer, 2947 void __user *buffer,
2894 size_t *lenp, loff_t *ppos) 2948 size_t *lenp, loff_t *ppos)
2895{ 2949{
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 0b0a6366c9d4..ee266620b06c 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o 1obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
2 2
3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o 3obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o 4obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..5e18c6ab2c6a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
165 162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,12 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 list_for_each_entry(cs, &clocksource_list, list)
300
301 list_for_each_entry(cs, &clocksource_list, list) {
302 if (cs->resume) 398 if (cs->resume)
303 cs->resume(); 399 cs->resume();
304 }
305 400
306 clocksource_resume_watchdog(); 401 clocksource_resume_watchdog();
307
308 spin_unlock_irqrestore(&clocksource_lock, flags);
309} 402}
310 403
311/** 404/**
@@ -320,75 +413,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 413 clocksource_resume_watchdog();
321} 414}
322 415
416#ifdef CONFIG_GENERIC_TIME
417
323/** 418/**
324 * clocksource_get_next - Returns the selected clocksource 419 * clocksource_select - Select the best clocksource available
325 * 420 *
421 * Private function. Must hold clocksource_mutex when called.
422 *
423 * Select the clocksource with the best rating, or the clocksource,
424 * which is selected by userspace override.
326 */ 425 */
327struct clocksource *clocksource_get_next(void) 426static void clocksource_select(void)
328{ 427{
329 unsigned long flags; 428 struct clocksource *best, *cs;
330 429
331 spin_lock_irqsave(&clocksource_lock, flags); 430 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 431 return;
333 curr_clocksource = next_clocksource; 432 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 433 best = list_first_entry(&clocksource_list, struct clocksource, list);
434 /* Check for the override clocksource. */
435 list_for_each_entry(cs, &clocksource_list, list) {
436 if (strcmp(cs->name, override_name) != 0)
437 continue;
438 /*
439 * Check to make sure we don't switch to a non-highres
440 * capable clocksource if the tick code is in oneshot
441 * mode (highres or nohz)
442 */
443 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
444 tick_oneshot_mode_active()) {
445 /* Override clocksource cannot be used. */
446 printk(KERN_WARNING "Override clocksource %s is not "
447 "HRT compatible. Cannot switch while in "
448 "HRT/NOHZ mode\n", cs->name);
449 override_name[0] = 0;
450 } else
451 /* Override clocksource can be used. */
452 best = cs;
453 break;
454 }
455 if (curr_clocksource != best) {
456 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
457 curr_clocksource = best;
458 timekeeping_notify(curr_clocksource);
335 } 459 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 460}
340 461
341/** 462#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 463
343 * 464static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 465
466#endif
467
468/*
469 * clocksource_done_booting - Called near the end of core bootup
345 * 470 *
346 * Select the clocksource with the best rating, or the clocksource, 471 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 472 * We use fs_initcall because we want this to start before
473 * device_initcall but after subsys_initcall.
348 */ 474 */
349static struct clocksource *select_clocksource(void) 475static int __init clocksource_done_booting(void)
350{ 476{
351 struct clocksource *next; 477 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 478
362 if (next == curr_clocksource) 479 /*
363 return NULL; 480 * Run the watchdog first to eliminate unstable clock sources
481 */
482 clocksource_watchdog_kthread(NULL);
364 483
365 return next; 484 mutex_lock(&clocksource_mutex);
485 clocksource_select();
486 mutex_unlock(&clocksource_mutex);
487 return 0;
366} 488}
489fs_initcall(clocksource_done_booting);
367 490
368/* 491/*
369 * Enqueue the clocksource sorted by rating 492 * Enqueue the clocksource sorted by rating
370 */ 493 */
371static int clocksource_enqueue(struct clocksource *c) 494static void clocksource_enqueue(struct clocksource *cs)
372{ 495{
373 struct list_head *tmp, *entry = &clocksource_list; 496 struct list_head *entry = &clocksource_list;
497 struct clocksource *tmp;
374 498
375 list_for_each(tmp, &clocksource_list) { 499 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place, where to insert */ 500 /* Keep track of the place, where to insert */
382 if (cs->rating >= c->rating) 501 if (tmp->rating >= cs->rating)
383 entry = tmp; 502 entry = &tmp->list;
384 } 503 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 504}
393 505
394/** 506/**
@@ -397,52 +509,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 509 *
398 * Returns -EBUSY if registration fails, zero otherwise. 510 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 511 */
400int clocksource_register(struct clocksource *c) 512int clocksource_register(struct clocksource *cs)
401{ 513{
402 unsigned long flags; 514 mutex_lock(&clocksource_mutex);
403 int ret; 515 clocksource_enqueue(cs);
404 516 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 517 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 518 mutex_unlock(&clocksource_mutex);
407 if (!ret) 519 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 520}
414EXPORT_SYMBOL(clocksource_register); 521EXPORT_SYMBOL(clocksource_register);
415 522
523static void __clocksource_change_rating(struct clocksource *cs, int rating)
524{
525 list_del(&cs->list);
526 cs->rating = rating;
527 clocksource_enqueue(cs);
528 clocksource_select();
529}
530
416/** 531/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 532 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 533 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 534void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 535{
422 unsigned long flags; 536 mutex_lock(&clocksource_mutex);
423 537 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 538 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 539}
540EXPORT_SYMBOL(clocksource_change_rating);
431 541
432/** 542/**
433 * clocksource_unregister - remove a registered clocksource 543 * clocksource_unregister - remove a registered clocksource
434 */ 544 */
435void clocksource_unregister(struct clocksource *cs) 545void clocksource_unregister(struct clocksource *cs)
436{ 546{
437 unsigned long flags; 547 mutex_lock(&clocksource_mutex);
438 548 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 549 list_del(&cs->list);
441 if (clocksource_override == cs) 550 clocksource_select();
442 clocksource_override = NULL; 551 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 552}
553EXPORT_SYMBOL(clocksource_unregister);
446 554
447#ifdef CONFIG_SYSFS 555#ifdef CONFIG_SYSFS
448/** 556/**
@@ -458,9 +566,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 566{
459 ssize_t count = 0; 567 ssize_t count = 0;
460 568
461 spin_lock_irq(&clocksource_lock); 569 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 570 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 571 mutex_unlock(&clocksource_mutex);
464 572
465 return count; 573 return count;
466} 574}
@@ -478,9 +586,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 586 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 587 const char *buf, size_t count)
480{ 588{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 589 size_t ret = count;
483 int len;
484 590
485 /* strings from sysfs write are not 0 terminated! */ 591 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 592 if (count >= sizeof(override_name))
@@ -490,44 +596,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 596 if (buf[count-1] == '\n')
491 count--; 597 count--;
492 598
493 spin_lock_irq(&clocksource_lock); 599 mutex_lock(&clocksource_mutex);
494 600
495 if (count > 0) 601 if (count > 0)
496 memcpy(override_name, buf, count); 602 memcpy(override_name, buf, count);
497 override_name[count] = 0; 603 override_name[count] = 0;
604 clocksource_select();
498 605
499 len = strlen(override_name); 606 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 607
532 return ret; 608 return ret;
533} 609}
@@ -547,7 +623,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 623 struct clocksource *src;
548 ssize_t count = 0; 624 ssize_t count = 0;
549 625
550 spin_lock_irq(&clocksource_lock); 626 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 627 list_for_each_entry(src, &clocksource_list, list) {
552 /* 628 /*
553 * Don't show non-HRES clocksource if the tick code is 629 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +635,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 635 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 636 "%s ", src->name);
561 } 637 }
562 spin_unlock_irq(&clocksource_lock); 638 mutex_unlock(&clocksource_mutex);
563 639
564 count += snprintf(buf + count, 640 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 641 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +690,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 690 */
615static int __init boot_override_clocksource(char* str) 691static int __init boot_override_clocksource(char* str)
616{ 692{
617 unsigned long flags; 693 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 694 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 695 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 696 mutex_unlock(&clocksource_mutex);
622 return 1; 697 return 1;
623} 698}
624 699
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
/*
 * Returns the default clocksource: the jiffies clocksource defined above.
 * Declared __weak, so presumably an architecture can supply its own
 * definition to replace this one -- TODO confirm against arch code.
 */
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
new file mode 100644
index 000000000000..86628e755f38
--- /dev/null
+++ b/kernel/time/timeconv.c
@@ -0,0 +1,127 @@
1/*
2 * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
3 * This file is part of the GNU C Library.
4 * Contributed by Paul Eggert (eggert@twinsun.com).
5 *
6 * The GNU C Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The GNU C Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with the GNU C Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22/*
23 * Converts the calendar time to broken-down time representation
24 * Based on code from glibc-2.6
25 *
26 * 2009-7-14:
27 * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com>
28 */
29
30#include <linux/time.h>
31#include <linux/module.h>
32
33/*
34 * Returns 1 if YEAR is a leap year under the Gregorian rule (every 4 years,
35 * except every 100th isn't, and every 400th is), and 0 otherwise.
36 */
37static int __isleap(long year)
38{
39 return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0);
40}
41
42/* Floor division for long: a / b rounded toward negative infinity instead of toward zero. Only correct for b > 0, which is how this file calls it. */
43static long math_div(long a, long b)
44{
45 return a / b - (a % b < 0);
46}
47
48/* Number of leap years in the half-open range [y1, y2). If y2 < y1 the result is the negated count for [y2, y1); time_to_tm() uses that when stepping backwards. */
49static long leaps_between(long y1, long y2)
50{
51 long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100)
52 + math_div(y1 - 1, 400);
53 long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100)
54 + math_div(y2 - 1, 400);
55 return leaps2 - leaps1;
56}
57
58/* Days elapsed before the start of each month, indexed [leap][month]; month is 0-11 and entry 12 is the length of the whole year. */
59static const unsigned short __mon_yday[2][13] = {
60 /* Normal years. */
61 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365},
62 /* Leap years. */
63 {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366}
64};
65
66#define SECS_PER_HOUR (60 * 60)
67#define SECS_PER_DAY (SECS_PER_HOUR * 24)
68
69/**
70 * time_to_tm - converts the calendar time plus an offset to broken-down time
71 *
72 * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970,
73 * Coordinated Universal Time (UTC).
74 * @offset: offset in seconds to add to totalsecs.
75 * @result: pointer to struct tm variable to receive broken-down time
 *
 * Fills in tm_hour, tm_min, tm_sec, tm_wday (0 = Sunday), tm_year (years
 * since 1900), tm_yday (0-based), tm_mon (0-11) and tm_mday (1-based).
76 */
77void time_to_tm(time_t totalsecs, int offset, struct tm *result)
78{
79 long days, rem, y;
80 const unsigned short *ip;
81
82 days = totalsecs / SECS_PER_DAY;
83 rem = totalsecs % SECS_PER_DAY;
84 rem += offset;
85 while (rem < 0) {
86 rem += SECS_PER_DAY;
87 --days;
88 }
89 while (rem >= SECS_PER_DAY) {
90 rem -= SECS_PER_DAY;
91 ++days;
92 }
93
94 result->tm_hour = rem / SECS_PER_HOUR;
95 rem %= SECS_PER_HOUR;
96 result->tm_min = rem / 60;
97 result->tm_sec = rem % 60;
98
99 /* January 1, 1970 was a Thursday (day 4, counting Sunday as 0). */
100 result->tm_wday = (4 + days) % 7;
101 if (result->tm_wday < 0)
102 result->tm_wday += 7;
103
104 y = 1970;
105
106 while (days < 0 || days >= (__isleap(y) ? 366 : 365)) {
107 /* Guess a corrected year, assuming 365 days per year. */
108 long yg = y + math_div(days, 365);
109
110 /* Adjust DAYS and Y to match the guessed year. */
111 days -= (yg - y) * 365 + leaps_between(y, yg);
112 y = yg;
113 }
114
115 result->tm_year = y - 1900;
116
117 result->tm_yday = days;
118 /* From here on y is reused as the month index (0-11). */
119 ip = __mon_yday[__isleap(y)];
120 for (y = 11; days < ip[y]; y--)
121 continue;
122 days -= ip[y];
123
124 result->tm_mon = y;
125 result->tm_mday = days + 1;
126}
127EXPORT_SYMBOL(time_to_tm);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/tick.h> 20#include <linux/tick.h>
21#include <linux/stop_machine.h>
22
23/* Structure holding internal timekeeping values; filled in by timekeeper_setup_internals(). */
24struct timekeeper {
25 /* Current clocksource used for timekeeping. */
26 struct clocksource *clock;
27 /* The shift value of the current clocksource. */
28 int shift;
29
30 /* Number of clock cycles in one NTP interval. */
31 cycle_t cycle_interval;
32 /* Number of clock shifted nanoseconds in one NTP interval (cycle_interval * clock mult). */
33 u64 xtime_interval;
34 /* Raw nanoseconds accumulated per NTP interval (the above shifted down by the clock shift). */
35 u32 raw_interval;
36
37 /* Clock shifted nanoseconds remainder not stored in xtime.tv_nsec. */
38 u64 xtime_nsec;
39 /* Difference between accumulated time and NTP time in ntp
40 * shifted nanoseconds. */
41 s64 ntp_error;
42 /* Shift conversion between clock shifted nanoseconds and
43 * ntp shifted nanoseconds (NTP_SCALE_SHIFT - clock shift). */
44 int ntp_error_shift;
45 /* NTP adjusted clock multiplier; starts out as the clocksource's mult. */
46 u32 mult;
47};
48
49struct timekeeper timekeeper;
50
51/**
52 * timekeeper_setup_internals - Set up internals to use clocksource clock.
53 *
54 * @clock: Pointer to clocksource.
55 *
56 * Makes @clock the timekeeper's clocksource, re-reads its cycle_last and derives the fixed
57 * per-NTP-interval cycle/nsec values from NTP_INTERVAL_LENGTH and the clock's mult/shift. Resets xtime_nsec and ntp_error to 0.
58 *
59 * Unless you're the timekeeping code, you should not be using this!
60 */
61static void timekeeper_setup_internals(struct clocksource *clock)
62{
63 cycle_t interval;
64 u64 tmp;
65
66 timekeeper.clock = clock;
67 clock->cycle_last = clock->read(clock);
21 68
69 /* Do the ns -> cycle conversion first, using original mult (adding mult/2 rounds to nearest; result is clamped to at least 1 cycle) */
70 tmp = NTP_INTERVAL_LENGTH;
71 tmp <<= clock->shift;
72 tmp += clock->mult/2;
73 do_div(tmp, clock->mult);
74 if (tmp == 0)
75 tmp = 1;
76
77 interval = (cycle_t) tmp;
78 timekeeper.cycle_interval = interval;
79
80 /* Go back from cycles -> shifted ns (xtime_interval) and plain ns (raw_interval) */
81 timekeeper.xtime_interval = (u64) interval * clock->mult;
82 timekeeper.raw_interval =
83 ((u64) interval * clock->mult) >> clock->shift;
84
85 timekeeper.xtime_nsec = 0;
86 timekeeper.shift = clock->shift;
87
88 timekeeper.ntp_error = 0;
89 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
90
91 /*
92 * The timekeeper keeps its own mult value for the currently
93 * active clocksource. This value will be adjusted via NTP
94 * to counteract clock drifting.
95 */
96 timekeeper.mult = clock->mult;
97}
98
99/* Timekeeper helper: nanoseconds elapsed on the current clocksource since clock->cycle_last, scaled with the NTP-adjusted timekeeper.mult. */
100static inline s64 timekeeping_get_ns(void)
101{
102 cycle_t cycle_now, cycle_delta;
103 struct clocksource *clock;
104
105 /* read clocksource: */
106 clock = timekeeper.clock;
107 cycle_now = clock->read(clock);
108
109 /* calculate the delta since the last update_wall_time: */
110 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
111
112 /* return the delta converted to nanoseconds using the NTP-adjusted mult. */
113 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
114 timekeeper.shift);
115}
116
/* Raw variant: nanoseconds elapsed since clock->cycle_last, scaled with the clocksource's own mult/shift rather than the NTP-adjusted timekeeper values. */
117static inline s64 timekeeping_get_ns_raw(void)
118{
119 cycle_t cycle_now, cycle_delta;
120 struct clocksource *clock;
121
122 /* read clocksource: */
123 clock = timekeeper.clock;
124 cycle_now = clock->read(clock);
125
126 /* calculate the delta since the last update_wall_time: */
127 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
128
129 /* return the delta converted to nanoseconds using the clocksource's own (not NTP-adjusted) mult. */
130 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
131}
22 132
23/* 133/*
24 * This read-write spinlock protects us from races in SMP while 134 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 154 */
45struct timespec xtime __attribute__ ((aligned (16))); 155struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 156struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 157static struct timespec total_sleep_time;
158
159/*
160 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
161 */
162struct timespec raw_time;
48 163
49/* flag for if timekeeping is suspended */ 164/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 165int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 171 timespec_add_ns(&xtime_cache, nsec);
57} 172}
58 173
59struct clocksource *clock; 174/* must hold xtime_lock */
60 175void timekeeping_leap_insert(int leapsecond)
176{
177 xtime.tv_sec += leapsecond;
178 wall_to_monotonic.tv_sec -= leapsecond;
179 update_vsyscall(&xtime, timekeeper.clock);
180}
61 181
62#ifdef CONFIG_GENERIC_TIME 182#ifdef CONFIG_GENERIC_TIME
183
63/** 184/**
64 * clocksource_forward_now - update clock to the current time 185 * timekeeping_forward_now - update clock to the current time
65 * 186 *
66 * Forward the current clock to update its state since the last call to 187 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 188 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 189 * as it avoids having to deal with this time offset explicitly.
69 */ 190 */
70static void clocksource_forward_now(void) 191static void timekeeping_forward_now(void)
71{ 192{
72 cycle_t cycle_now, cycle_delta; 193 cycle_t cycle_now, cycle_delta;
194 struct clocksource *clock;
73 s64 nsec; 195 s64 nsec;
74 196
75 cycle_now = clocksource_read(clock); 197 clock = timekeeper.clock;
198 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 200 clock->cycle_last = cycle_now;
78 201
79 nsec = cyc2ns(clock, cycle_delta); 202 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
203 timekeeper.shift);
80 204
81 /* If arch requires, add in gettimeoffset() */ 205 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 206 nsec += arch_gettimeoffset();
83 207
84 timespec_add_ns(&xtime, nsec); 208 timespec_add_ns(&xtime, nsec);
85 209
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 210 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 211 timespec_add_ns(&raw_time, nsec);
88} 212}
89 213
90/** 214/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
95 */ 219 */
96void getnstimeofday(struct timespec *ts) 220void getnstimeofday(struct timespec *ts)
97{ 221{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 222 unsigned long seq;
100 s64 nsecs; 223 s64 nsecs;
101 224
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 228 seq = read_seqbegin(&xtime_lock);
106 229
107 *ts = xtime; 230 *ts = xtime;
108 231 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 232
118 /* If arch requires, add in gettimeoffset() */ 233 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 234 nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
125 240
126EXPORT_SYMBOL(getnstimeofday); 241EXPORT_SYMBOL(getnstimeofday);
127 242
/**
 * ktime_get - get the monotonic time in ktime_t format
 *
 * Sums xtime, wall_to_monotonic and the nanoseconds reported by
 * timekeeping_get_ns(), repeating the read while read_seqretry()
 * on xtime_lock asks for a retry. Warns if timekeeping is suspended.
 */
243ktime_t ktime_get(void)
244{
245 unsigned int seq;
246 s64 secs, nsecs;
247
248 WARN_ON(timekeeping_suspended);
249
250 do {
251 seq = read_seqbegin(&xtime_lock);
252 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
253 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
254 nsecs += timekeeping_get_ns();
255
256 } while (read_seqretry(&xtime_lock, seq));
257 /*
258 * Use ktime_set/ktime_add_ns to create a proper ktime on
259 * 32-bit architectures without CONFIG_KTIME_SCALAR.
260 */
261 return ktime_add_ns(ktime_set(secs, 0), nsecs);
262}
263EXPORT_SYMBOL_GPL(ktime_get);
264
265/**
266 * ktime_get_ts - get the monotonic clock in timespec format
267 * @ts: pointer to timespec variable
268 *
269 * The function calculates the monotonic clock from xtime, the
270 * nanoseconds from timekeeping_get_ns() and the wall_to_monotonic offset, and stores the result
271 * in normalized timespec format in the variable pointed to by @ts.
272 */
273void ktime_get_ts(struct timespec *ts)
274{
275 struct timespec tomono;
276 unsigned int seq;
277 s64 nsecs;
278
279 WARN_ON(timekeeping_suspended);
280
281 do {
282 seq = read_seqbegin(&xtime_lock);
283 *ts = xtime;
284 tomono = wall_to_monotonic;
285 nsecs = timekeeping_get_ns();
286
287 } while (read_seqretry(&xtime_lock, seq));
288 /* All three pieces were sampled in the same xtime_lock read pass; combine and normalize them. */
289 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
290 ts->tv_nsec + tomono.tv_nsec + nsecs);
291}
292EXPORT_SYMBOL_GPL(ktime_get_ts);
293
128/** 294/**
129 * do_gettimeofday - Returns the time of day in a timeval 295 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 296 * @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
157 323
158 write_seqlock_irqsave(&xtime_lock, flags); 324 write_seqlock_irqsave(&xtime_lock, flags);
159 325
160 clocksource_forward_now(); 326 timekeeping_forward_now();
161 327
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 328 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 329 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
167 333
168 update_xtime_cache(0); 334 update_xtime_cache(0);
169 335
170 clock->error = 0; 336 timekeeper.ntp_error = 0;
171 ntp_clear(); 337 ntp_clear();
172 338
173 update_vsyscall(&xtime, clock); 339 update_vsyscall(&xtime, timekeeper.clock);
174 340
175 write_sequnlock_irqrestore(&xtime_lock, flags); 341 write_sequnlock_irqrestore(&xtime_lock, flags);
176 342
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 353 *
188 * Accumulates current time interval and initializes new clocksource 354 * Accumulates current time interval and initializes new clocksource
189 */ 355 */
190static void change_clocksource(void) 356static int change_clocksource(void *data)
191{ 357{
192 struct clocksource *new, *old; 358 struct clocksource *new, *old;
193 359
194 new = clocksource_get_next(); 360 new = (struct clocksource *) data;
361
362 timekeeping_forward_now();
363 if (!new->enable || new->enable(new) == 0) {
364 old = timekeeper.clock;
365 timekeeper_setup_internals(new);
366 if (old->disable)
367 old->disable(old);
368 }
369 return 0;
370}
195 371
196 if (clock == new) 372/**
373 * timekeeping_notify - Install a new clock source
374 * @clock: pointer to the clock source
375 *
376 * This function is called from clocksource.c after a new, better clock
377 * source has been registered. The caller holds the clocksource_mutex.
378 */
379void timekeeping_notify(struct clocksource *clock)
380{
381 if (timekeeper.clock == clock)
197 return; 382 return;
383 stop_machine(change_clocksource, clock, NULL);
384 tick_clock_notify();
385}
198 386
199 clocksource_forward_now(); 387#else /* GENERIC_TIME */
200 388
201 if (clocksource_enable(new)) 389static inline void timekeeping_forward_now(void) { }
202 return;
203 390
204 new->raw_time = clock->raw_time; 391/**
205 old = clock; 392 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 393 *
207 clocksource_disable(old); 394 * returns the time in ktime_t format
395 */
396ktime_t ktime_get(void)
397{
398 struct timespec now;
208 399
209 clock->cycle_last = 0; 400 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 401
215 tick_clock_notify(); 402 return timespec_to_ktime(now);
403}
404EXPORT_SYMBOL_GPL(ktime_get);
216 405
217 /* 406/**
218 * We're holding xtime lock and waking up klogd would deadlock 407 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 408 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 409 *
221 clock->name); 410 * The function calculates the monotonic clock from the realtime
222 */ 411 * clock and the wall_to_monotonic offset and stores the result
412 * in normalized timespec format in the variable pointed to by @ts.
413 */
414void ktime_get_ts(struct timespec *ts)
415{
416 struct timespec tomono;
417 unsigned long seq;
418
419 do {
420 seq = read_seqbegin(&xtime_lock);
421 getnstimeofday(ts);
422 tomono = wall_to_monotonic;
423
424 } while (read_seqretry(&xtime_lock, seq));
425
426 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
427 ts->tv_nsec + tomono.tv_nsec);
223} 428}
224#else 429EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 430
226static inline void change_clocksource(void) { } 431#endif /* !GENERIC_TIME */
227#endif 432
433/**
434 * ktime_get_real - get the real (wall-) time in ktime_t format
435 *
436 * returns the time in ktime_t format
437 */
438ktime_t ktime_get_real(void)
439{
440 struct timespec now;
441
442 getnstimeofday(&now);
443
444 return timespec_to_ktime(now);
445}
446EXPORT_SYMBOL_GPL(ktime_get_real);
228 447
229/** 448/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 449 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 455{
237 unsigned long seq; 456 unsigned long seq;
238 s64 nsecs; 457 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 458
241 do { 459 do {
242 seq = read_seqbegin(&xtime_lock); 460 seq = read_seqbegin(&xtime_lock);
243 461 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 462 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 463
255 } while (read_seqretry(&xtime_lock, seq)); 464 } while (read_seqretry(&xtime_lock, seq));
256 465
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
270 do { 479 do {
271 seq = read_seqbegin(&xtime_lock); 480 seq = read_seqbegin(&xtime_lock);
272 481
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 482 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 483
275 } while (read_seqretry(&xtime_lock, seq)); 484 } while (read_seqretry(&xtime_lock, seq));
276 485
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
278} 487}
279 488
280/** 489/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 490 * read_persistent_clock - Return time from the persistent clock.
282 * 491 *
283 * Weak dummy function for arches that do not yet support it. 492 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 493 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 494 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 495 *
287 * XXX - Do be sure to remove it once all arches implement it. 496 * XXX - Do be sure to remove it once all arches implement it.
288 */ 497 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 498void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 499{
291 return 0; 500 ts->tv_sec = 0;
501 ts->tv_nsec = 0;
502}
503
504/**
505 * read_boot_clock - Return time of the system start.
506 *
507 * Weak dummy function for arches that do not yet support it.
508 * Function to read the exact time the system has been started.
509 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
510 *
511 * XXX - Do be sure to remove it once all arches implement it.
512 */
513void __attribute__((weak)) read_boot_clock(struct timespec *ts)
514{
515 ts->tv_sec = 0;
516 ts->tv_nsec = 0;
292} 517}
293 518
294/* 519/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 521 */
297void __init timekeeping_init(void) 522void __init timekeeping_init(void)
298{ 523{
524 struct clocksource *clock;
299 unsigned long flags; 525 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 526 struct timespec now, boot;
527
528 read_persistent_clock(&now);
529 read_boot_clock(&boot);
301 530
302 write_seqlock_irqsave(&xtime_lock, flags); 531 write_seqlock_irqsave(&xtime_lock, flags);
303 532
304 ntp_init(); 533 ntp_init();
305 534
306 clock = clocksource_get_next(); 535 clock = clocksource_default_clock();
307 clocksource_enable(clock); 536 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 537 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 538 timekeeper_setup_internals(clock);
310 539
311 xtime.tv_sec = sec; 540 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 541 xtime.tv_nsec = now.tv_nsec;
542 raw_time.tv_sec = 0;
543 raw_time.tv_nsec = 0;
544 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
545 boot.tv_sec = xtime.tv_sec;
546 boot.tv_nsec = xtime.tv_nsec;
547 }
313 set_normalized_timespec(&wall_to_monotonic, 548 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 549 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 550 update_xtime_cache(0);
316 total_sleep_time = 0; 551 total_sleep_time.tv_sec = 0;
552 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 553 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 554}
319 555
320/* time in seconds when suspend began */ 556/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 557static struct timespec timekeeping_suspend_time;
322 558
323/** 559/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 560 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 567static int timekeeping_resume(struct sys_device *dev)
332{ 568{
333 unsigned long flags; 569 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 570 struct timespec ts;
571
572 read_persistent_clock(&ts);
335 573
336 clocksource_resume(); 574 clocksource_resume();
337 575
338 write_seqlock_irqsave(&xtime_lock, flags); 576 write_seqlock_irqsave(&xtime_lock, flags);
339 577
340 if (now && (now > timekeeping_suspend_time)) { 578 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 579 ts = timespec_sub(ts, timekeeping_suspend_time);
342 580 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 581 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 582 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 583 }
347 update_xtime_cache(0); 584 update_xtime_cache(0);
348 /* re-base the last cycle value */ 585 /* re-base the last cycle value */
349 clock->cycle_last = 0; 586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 587 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 588 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 589 write_sequnlock_irqrestore(&xtime_lock, flags);
354 590
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 602{
367 unsigned long flags; 603 unsigned long flags;
368 604
369 timekeeping_suspend_time = read_persistent_clock(); 605 read_persistent_clock(&timekeeping_suspend_time);
370 606
371 write_seqlock_irqsave(&xtime_lock, flags); 607 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 608 timekeeping_forward_now();
373 timekeeping_suspended = 1; 609 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 610 write_sequnlock_irqrestore(&xtime_lock, flags);
375 611
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 640 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 641 * to compensate for late or lost adjustments.
406 */ 642 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 643static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 644 s64 *offset)
409{ 645{
410 s64 tick_error, i; 646 s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 656 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 657 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 658 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 659 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 660 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 661 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 662 error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 665 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 666 * remove the single look ahead already included in the error.
431 */ 667 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 668 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 669 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 670 error = ((error - tick_error) >> look_ahead) + tick_error;
435 671
436 /* Finally calculate the adjustment shift value. */ 672 /* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 691 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 692 * for other values we can do a bit more work.
457 */ 693 */
458static void clocksource_adjust(s64 offset) 694static void timekeeping_adjust(s64 offset)
459{ 695{
460 s64 error, interval = clock->cycle_interval; 696 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 697 int adj;
462 698
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 699 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 700 if (error > interval) {
465 error >>= 2; 701 error >>= 2;
466 if (likely(error <= interval)) 702 if (likely(error <= interval))
467 adj = 1; 703 adj = 1;
468 else 704 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 705 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 706 } else if (error < -interval) {
471 error >>= 2; 707 error >>= 2;
472 if (likely(error >= -interval)) { 708 if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 710 interval = -interval;
475 offset = -offset; 711 offset = -offset;
476 } else 712 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 713 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 714 } else
479 return; 715 return;
480 716
481 clock->mult += adj; 717 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 718 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 719 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 720 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 721 timekeeper.ntp_error_shift;
486} 722}
487 723
488/** 724/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
492 */ 728 */
493void update_wall_time(void) 729void update_wall_time(void)
494{ 730{
731 struct clocksource *clock;
495 cycle_t offset; 732 cycle_t offset;
733 u64 nsecs;
496 734
497 /* Make sure we're fully resumed: */ 735 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 736 if (unlikely(timekeeping_suspended))
499 return; 737 return;
500 738
739 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 740#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 741 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 742#else
504 offset = clock->cycle_interval; 743 offset = timekeeper.cycle_interval;
505#endif 744#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 745 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 746
508 /* normally this loop will run just once, however in the 747 /* normally this loop will run just once, however in the
509 * case of lost or late ticks, it will accumulate correctly. 748 * case of lost or late ticks, it will accumulate correctly.
510 */ 749 */
511 while (offset >= clock->cycle_interval) { 750 while (offset >= timekeeper.cycle_interval) {
751 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
752
512 /* accumulate one interval */ 753 /* accumulate one interval */
513 offset -= clock->cycle_interval; 754 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 755 clock->cycle_last += timekeeper.cycle_interval;
515 756
516 clock->xtime_nsec += clock->xtime_interval; 757 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 758 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 759 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 760 xtime.tv_sec++;
520 second_overflow(); 761 second_overflow();
521 } 762 }
522 763
523 clock->raw_time.tv_nsec += clock->raw_interval; 764 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 765 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 766 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 767 raw_time.tv_sec++;
527 } 768 }
528 769
529 /* accumulate error between NTP and clock interval */ 770 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 771 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 772 timekeeper.ntp_error -= timekeeper.xtime_interval <<
773 timekeeper.ntp_error_shift;
532 } 774 }
533 775
534 /* correct the clock when NTP error is too big */ 776 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 777 timekeeping_adjust(offset);
536 778
537 /* 779 /*
538 * Since in the loop above, we accumulate any amount of time 780 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, its possible for 781 * in xtime_nsec over a second into xtime.tv_sec, its possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 782 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 783 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * its possible the required corrective factor to xtime_nsec could 784 * its possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 785 * cause it to underflow.
544 * 786 *
@@ -550,24 +792,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 792 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 793 * xtime_nsec is not as small.
552 */ 794 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 795 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 796 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 797 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 798 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 799 }
558 800
559 /* store full nanoseconds into xtime after rounding it up and 801 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 802 * add the remainder to the error difference.
561 */ 803 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 804 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 805 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 806 timekeeper.ntp_error += timekeeper.xtime_nsec <<
807 timekeeper.ntp_error_shift;
565 808
566 update_xtime_cache(cyc2ns(clock, offset)); 809 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
810 update_xtime_cache(nsecs);
567 811
568 /* check to see if there is a new clocksource to use */ 812 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 813 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 814}
572 815
573/** 816/**
@@ -583,9 +826,12 @@ void update_wall_time(void)
583 */ 826 */
584void getboottime(struct timespec *ts) 827void getboottime(struct timespec *ts)
585{ 828{
586 set_normalized_timespec(ts, 829 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 830 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 831 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
832 };
833
834 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 835}
590 836
591/** 837/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
594 */ 840 */
595void monotonic_to_bootbased(struct timespec *ts) 841void monotonic_to_bootbased(struct timespec *ts)
596{ 842{
597 ts->tv_sec += total_sleep_time; 843 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 844}
599 845
600unsigned long get_seconds(void) 846unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
603} 849}
604EXPORT_SYMBOL(get_seconds); 850EXPORT_SYMBOL(get_seconds);
605 851
852struct timespec __current_kernel_time(void)
853{
854 return xtime_cache;
855}
606 856
607struct timespec current_kernel_time(void) 857struct timespec current_kernel_time(void)
608{ 858{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
618 return now; 868 return now;
619} 869}
620EXPORT_SYMBOL(current_kernel_time); 870EXPORT_SYMBOL(current_kernel_time);
871
872struct timespec get_monotonic_coarse(void)
873{
874 struct timespec now, mono;
875 unsigned long seq;
876
877 do {
878 seq = read_seqbegin(&xtime_lock);
879
880 now = xtime_cache;
881 mono = wall_to_monotonic;
882 } while (read_seqretry(&xtime_lock, seq));
883
884 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
885 now.tv_nsec + mono.tv_nsec);
886 return now;
887}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..1b5b7aa2fdfd 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
275 return single_open(filp, timer_list_show, NULL); 275 return single_open(filp, timer_list_show, NULL);
276} 276}
277 277
278static struct file_operations timer_list_fops = { 278static const struct file_operations timer_list_fops = {
279 .open = timer_list_open, 279 .open = timer_list_open,
280 .read = seq_read, 280 .read = seq_read,
281 .llseek = seq_lseek, 281 .llseek = seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..ee5681f8d7ec 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
395 return single_open(filp, tstats_show, NULL); 395 return single_open(filp, tstats_show, NULL);
396} 396}
397 397
398static struct file_operations tstats_fops = { 398static const struct file_operations tstats_fops = {
399 .open = tstats_open, 399 .open = tstats_open,
400 .read = seq_read, 400 .read = seq_read,
401 .write = tstats_write, 401 .write = tstats_write,
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..5db5a8d26811 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42 42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -46,6 +46,9 @@
46#include <asm/timex.h> 46#include <asm/timex.h>
47#include <asm/io.h> 47#include <asm/io.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/timer.h>
51
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 52u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50 53
51EXPORT_SYMBOL(jiffies_64); 54EXPORT_SYMBOL(jiffies_64);
@@ -72,6 +75,7 @@ struct tvec_base {
72 spinlock_t lock; 75 spinlock_t lock;
73 struct timer_list *running_timer; 76 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 77 unsigned long timer_jiffies;
78 unsigned long next_timer;
75 struct tvec_root tv1; 79 struct tvec_root tv1;
76 struct tvec tv2; 80 struct tvec tv2;
77 struct tvec tv3; 81 struct tvec tv3;
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
520static inline void debug_timer_deactivate(struct timer_list *timer) { } 524static inline void debug_timer_deactivate(struct timer_list *timer) { }
521#endif 525#endif
522 526
527static inline void debug_init(struct timer_list *timer)
528{
529 debug_timer_init(timer);
530 trace_timer_init(timer);
531}
532
533static inline void
534debug_activate(struct timer_list *timer, unsigned long expires)
535{
536 debug_timer_activate(timer);
537 trace_timer_start(timer, expires);
538}
539
540static inline void debug_deactivate(struct timer_list *timer)
541{
542 debug_timer_deactivate(timer);
543 trace_timer_cancel(timer);
544}
545
523static void __init_timer(struct timer_list *timer, 546static void __init_timer(struct timer_list *timer,
524 const char *name, 547 const char *name,
525 struct lock_class_key *key) 548 struct lock_class_key *key)
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
548 const char *name, 571 const char *name,
549 struct lock_class_key *key) 572 struct lock_class_key *key)
550{ 573{
551 debug_timer_init(timer); 574 debug_init(timer);
552 __init_timer(timer, name, key); 575 __init_timer(timer, name, key);
553} 576}
554EXPORT_SYMBOL(init_timer_key); 577EXPORT_SYMBOL(init_timer_key);
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
567{ 590{
568 struct list_head *entry = &timer->entry; 591 struct list_head *entry = &timer->entry;
569 592
570 debug_timer_deactivate(timer); 593 debug_deactivate(timer);
571 594
572 __list_del(entry->prev, entry->next); 595 __list_del(entry->prev, entry->next);
573 if (clear_pending) 596 if (clear_pending)
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 645
623 if (timer_pending(timer)) { 646 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 647 detach_timer(timer, 0);
648 if (timer->expires == base->next_timer &&
649 !tbase_get_deferrable(timer->base))
650 base->next_timer = base->timer_jiffies;
625 ret = 1; 651 ret = 1;
626 } else { 652 } else {
627 if (pending_only) 653 if (pending_only)
628 goto out_unlock; 654 goto out_unlock;
629 } 655 }
630 656
631 debug_timer_activate(timer); 657 debug_activate(timer, expires);
632 658
633 new_base = __get_cpu_var(tvec_bases); 659 new_base = __get_cpu_var(tvec_bases);
634 660
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 689 }
664 690
665 timer->expires = expires; 691 timer->expires = expires;
692 if (time_before(timer->expires, base->next_timer) &&
693 !tbase_get_deferrable(timer->base))
694 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 695 internal_add_timer(base, timer);
667 696
668out_unlock: 697out_unlock:
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
780 BUG_ON(timer_pending(timer) || !timer->function); 809 BUG_ON(timer_pending(timer) || !timer->function);
781 spin_lock_irqsave(&base->lock, flags); 810 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 811 timer_set_base(timer, base);
783 debug_timer_activate(timer); 812 debug_activate(timer, timer->expires);
813 if (time_before(timer->expires, base->next_timer) &&
814 !tbase_get_deferrable(timer->base))
815 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 816 internal_add_timer(base, timer);
785 /* 817 /*
786 * Check whether the other CPU is idle and needs to be 818 * Check whether the other CPU is idle and needs to be
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 849 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 850 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 851 detach_timer(timer, 1);
852 if (timer->expires == base->next_timer &&
853 !tbase_get_deferrable(timer->base))
854 base->next_timer = base->timer_jiffies;
820 ret = 1; 855 ret = 1;
821 } 856 }
822 spin_unlock_irqrestore(&base->lock, flags); 857 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 885 ret = 0;
851 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 887 detach_timer(timer, 1);
888 if (timer->expires == base->next_timer &&
889 !tbase_get_deferrable(timer->base))
890 base->next_timer = base->timer_jiffies;
853 ret = 1; 891 ret = 1;
854 } 892 }
855out: 893out:
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
984 */ 1022 */
985 lock_map_acquire(&lockdep_map); 1023 lock_map_acquire(&lockdep_map);
986 1024
1025 trace_timer_expire_entry(timer);
987 fn(data); 1026 fn(data);
1027 trace_timer_expire_exit(timer);
988 1028
989 lock_map_release(&lockdep_map); 1029 lock_map_release(&lockdep_map);
990 1030
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1047#ifdef CONFIG_NO_HZ
1008/* 1048/*
1009 * Find out when the next timer event is due to happen. This 1049 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1050 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1051 * This function needs to be called with interrupts disabled.
1012 */ 1052 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1053static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1054{
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1174 unsigned long expires;
1135 1175
1136 spin_lock(&base->lock); 1176 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1177 if (time_before_eq(base->next_timer, base->timer_jiffies))
1178 base->next_timer = __next_timer_interrupt(base);
1179 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1180 spin_unlock(&base->lock);
1139 1181
1140 if (time_before_eq(expires, now)) 1182 if (time_before_eq(expires, now))
@@ -1156,8 +1198,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1198 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1199 account_process_tick(p, user_tick);
1158 run_local_timers(); 1200 run_local_timers();
1159 if (rcu_pending(cpu)) 1201 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1202 printk_tick();
1162 scheduler_tick(); 1203 scheduler_tick();
1163 run_posix_cpu_timers(p); 1204 run_posix_cpu_timers(p);
@@ -1170,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
1170{ 1211{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1212 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1213
1173 perf_counter_do_pending(); 1214 perf_event_do_pending();
1174 1215
1175 hrtimer_run_pending(); 1216 hrtimer_run_pending();
1176 1217
@@ -1523,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1523 INIT_LIST_HEAD(base->tv1.vec + j); 1564 INIT_LIST_HEAD(base->tv1.vec + j);
1524 1565
1525 base->timer_jiffies = jiffies; 1566 base->timer_jiffies = jiffies;
1567 base->next_timer = base->timer_jiffies;
1526 return 0; 1568 return 0;
1527} 1569}
1528 1570
@@ -1535,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1535 timer = list_first_entry(head, struct timer_list, entry); 1577 timer = list_first_entry(head, struct timer_list, entry);
1536 detach_timer(timer, 0); 1578 detach_timer(timer, 0);
1537 timer_set_base(timer, new_base); 1579 timer_set_base(timer, new_base);
1580 if (time_before(timer->expires, new_base->next_timer) &&
1581 !tbase_get_deferrable(timer->base))
1582 new_base->next_timer = timer->expires;
1538 internal_add_timer(new_base, timer); 1583 internal_add_timer(new_base, timer);
1539 } 1584 }
1540} 1585}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd764..b416512ad17f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_FTRACE_SYSCALLS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -60,15 +70,20 @@ config EVENT_TRACING
60 bool 70 bool
61 71
62config CONTEXT_SWITCH_TRACER 72config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool 73 bool
65 74
75config RING_BUFFER_ALLOW_SWAP
76 bool
77 help
78 Allow the use of ring_buffer_swap_cpu.
79 Adds a very slight overhead to tracing when enabled.
80
66# All tracer options should select GENERIC_TRACER. For those options that are 81# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING. 82# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
69# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
70# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
71# hidding of the automatic options options. 86# hidding of the automatic options.
72 87
73config TRACING 88config TRACING
74 bool 89 bool
@@ -147,6 +162,7 @@ config IRQSOFF_TRACER
147 select TRACE_IRQFLAGS 162 select TRACE_IRQFLAGS
148 select GENERIC_TRACER 163 select GENERIC_TRACER
149 select TRACER_MAX_TRACE 164 select TRACER_MAX_TRACE
165 select RING_BUFFER_ALLOW_SWAP
150 help 166 help
151 This option measures the time spent in irqs-off critical 167 This option measures the time spent in irqs-off critical
152 sections, with microsecond accuracy. 168 sections, with microsecond accuracy.
@@ -168,6 +184,7 @@ config PREEMPT_TRACER
168 depends on PREEMPT 184 depends on PREEMPT
169 select GENERIC_TRACER 185 select GENERIC_TRACER
170 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP
171 help 188 help
172 This option measures the time spent in preemption off critical 189 This option measures the time spent in preemption off critical
173 sections, with microsecond accuracy. 190 sections, with microsecond accuracy.
@@ -211,7 +228,7 @@ config ENABLE_DEFAULT_TRACERS
211 228
212config FTRACE_SYSCALLS 229config FTRACE_SYSCALLS
213 bool "Trace syscalls" 230 bool "Trace syscalls"
214 depends on HAVE_FTRACE_SYSCALLS 231 depends on HAVE_SYSCALL_TRACEPOINTS
215 select GENERIC_TRACER 232 select GENERIC_TRACER
216 select KALLSYMS 233 select KALLSYMS
217 help 234 help
@@ -462,6 +479,18 @@ config FTRACE_STARTUP_TEST
462 functioning properly. It will do tests on all the configured 479 functioning properly. It will do tests on all the configured
463 tracers of ftrace. 480 tracers of ftrace.
464 481
482config EVENT_TRACE_TEST_SYSCALLS
483 bool "Run selftest on syscall events"
484 depends on FTRACE_STARTUP_TEST
485 help
486 This option will also enable testing every syscall event.
487 It only enables the event and disables it and runs various loads
488 with the event enabled. This adds a bit more time for kernel boot
489 up since it runs this on every system call defined.
490
491 TBD - enable a way to actually call the syscalls as we test their
492 events
493
465config MMIOTRACE 494config MMIOTRACE
466 bool "Memory mapped IO tracing" 495 bool "Memory mapped IO tracing"
467 depends on HAVE_MMIOTRACE_SUPPORT && PCI 496 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 57
58libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7a34cb563fec..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -65,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
65{ 65{
66 struct blk_io_trace *t; 66 struct blk_io_trace *t;
67 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
68 int pc = 0; 69 int pc = 0;
69 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
70 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
71 72
72 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
73 pc = preempt_count(); 75 pc = preempt_count();
74 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
75 sizeof(*t) + len, 77 sizeof(*t) + len,
76 0, pc); 78 0, pc);
77 if (!event) 79 if (!event)
@@ -96,7 +98,7 @@ record_it:
96 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
97 99
98 if (blk_tracer) 100 if (blk_tracer)
99 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
100 } 102 }
101} 103}
102 104
@@ -179,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
179{ 181{
180 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
181 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
182 struct blk_io_trace *t; 185 struct blk_io_trace *t;
183 unsigned long flags = 0; 186 unsigned long flags = 0;
184 unsigned long *sequence; 187 unsigned long *sequence;
@@ -204,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
204 if (blk_tracer) { 207 if (blk_tracer) {
205 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
206 209
210 buffer = blk_tr->buffer;
207 pc = preempt_count(); 211 pc = preempt_count();
208 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
209 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
210 0, pc); 214 0, pc);
211 if (!event) 215 if (!event)
@@ -252,7 +256,7 @@ record_it:
252 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
253 257
254 if (blk_tracer) { 258 if (blk_tracer) {
255 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
256 return; 260 return;
257 } 261 }
258 } 262 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25edd5cc5935..46592feab5a6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1016,71 +1016,35 @@ static int
1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1017{ 1017{
1018 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1019 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1020 1020
1021 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1022 1022
1023 ip = rec->ip;
1024
1025 /* 1023 /*
1026 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1027 * it is not enabled then do nothing. 1025 * then disable it.
1028 * 1026 *
1029 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1030 * it is enabled then disable it.
1031 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1032 */ 1031 */
1033 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1034 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1035 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1036 else 1035 }
1037 return 0;
1038
1039 } else if (ftrace_filtered && enable) {
1040 /*
1041 * Filtering is on:
1042 */
1043
1044 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1045
1046 /* Record is filtered and enabled, do nothing */
1047 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1048 return 0;
1049
1050 /* Record is not filtered or enabled, do nothing */
1051 if (!fl)
1052 return 0;
1053
1054 /* Record is not filtered but enabled, disable it */
1055 if (fl == FTRACE_FL_ENABLED)
1056 rec->flags &= ~FTRACE_FL_ENABLED;
1057 else
1058 /* Otherwise record is filtered but not enabled, enable it */
1059 rec->flags |= FTRACE_FL_ENABLED;
1060 } else {
1061 /* Disable or not filtered */
1062
1063 if (enable) {
1064 /* if record is enabled, do nothing */
1065 if (rec->flags & FTRACE_FL_ENABLED)
1066 return 0;
1067
1068 rec->flags |= FTRACE_FL_ENABLED;
1069
1070 } else {
1071 1036
1072 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1073 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1074 return 0; 1039 return 0;
1075 1040
1076 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1077 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1078 } 1044 }
1079 1045
1080 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1081 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1082 else
1083 return ftrace_make_nop(NULL, rec, ftrace_addr);
1084} 1048}
1085 1049
1086static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
@@ -1359,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1359 1323
1360enum { 1324enum {
1361 FTRACE_ITER_FILTER = (1 << 0), 1325 FTRACE_ITER_FILTER = (1 << 0),
1362 FTRACE_ITER_CONT = (1 << 1), 1326 FTRACE_ITER_NOTRACE = (1 << 1),
1363 FTRACE_ITER_NOTRACE = (1 << 2), 1327 FTRACE_ITER_FAILURES = (1 << 2),
1364 FTRACE_ITER_FAILURES = (1 << 3), 1328 FTRACE_ITER_PRINTALL = (1 << 3),
1365 FTRACE_ITER_PRINTALL = (1 << 4), 1329 FTRACE_ITER_HASH = (1 << 4),
1366 FTRACE_ITER_HASH = (1 << 5),
1367}; 1330};
1368 1331
1369#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1332#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1373,9 +1336,7 @@ struct ftrace_iterator {
1373 int hidx; 1336 int hidx;
1374 int idx; 1337 int idx;
1375 unsigned flags; 1338 unsigned flags;
1376 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1339 struct trace_parser parser;
1377 unsigned buffer_idx;
1378 unsigned filtered;
1379}; 1340};
1380 1341
1381static void * 1342static void *
@@ -1438,18 +1399,13 @@ static int t_hash_show(struct seq_file *m, void *v)
1438{ 1399{
1439 struct ftrace_func_probe *rec; 1400 struct ftrace_func_probe *rec;
1440 struct hlist_node *hnd = v; 1401 struct hlist_node *hnd = v;
1441 char str[KSYM_SYMBOL_LEN];
1442 1402
1443 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1403 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1444 1404
1445 if (rec->ops->print) 1405 if (rec->ops->print)
1446 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1406 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1447 1407
1448 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1408 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1449 seq_printf(m, "%s:", str);
1450
1451 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1452 seq_printf(m, "%s", str);
1453 1409
1454 if (rec->data) 1410 if (rec->data)
1455 seq_printf(m, ":%p", rec->data); 1411 seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1503,6 @@ static int t_show(struct seq_file *m, void *v)
1547{ 1503{
1548 struct ftrace_iterator *iter = m->private; 1504 struct ftrace_iterator *iter = m->private;
1549 struct dyn_ftrace *rec = v; 1505 struct dyn_ftrace *rec = v;
1550 char str[KSYM_SYMBOL_LEN];
1551 1506
1552 if (iter->flags & FTRACE_ITER_HASH) 1507 if (iter->flags & FTRACE_ITER_HASH)
1553 return t_hash_show(m, v); 1508 return t_hash_show(m, v);
@@ -1560,14 +1515,12 @@ static int t_show(struct seq_file *m, void *v)
1560 if (!rec) 1515 if (!rec)
1561 return 0; 1516 return 0;
1562 1517
1563 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1518 seq_printf(m, "%ps\n", (void *)rec->ip);
1564
1565 seq_printf(m, "%s\n", str);
1566 1519
1567 return 0; 1520 return 0;
1568} 1521}
1569 1522
1570static struct seq_operations show_ftrace_seq_ops = { 1523static const struct seq_operations show_ftrace_seq_ops = {
1571 .start = t_start, 1524 .start = t_start,
1572 .next = t_next, 1525 .next = t_next,
1573 .stop = t_stop, 1526 .stop = t_stop,
@@ -1601,17 +1554,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1601 return ret; 1554 return ret;
1602} 1555}
1603 1556
1604int ftrace_avail_release(struct inode *inode, struct file *file)
1605{
1606 struct seq_file *m = (struct seq_file *)file->private_data;
1607 struct ftrace_iterator *iter = m->private;
1608
1609 seq_release(inode, file);
1610 kfree(iter);
1611
1612 return 0;
1613}
1614
1615static int 1557static int
1616ftrace_failures_open(struct inode *inode, struct file *file) 1558ftrace_failures_open(struct inode *inode, struct file *file)
1617{ 1559{
@@ -1660,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1660 if (!iter) 1602 if (!iter)
1661 return -ENOMEM; 1603 return -ENOMEM;
1662 1604
1605 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1606 kfree(iter);
1607 return -ENOMEM;
1608 }
1609
1663 mutex_lock(&ftrace_regex_lock); 1610 mutex_lock(&ftrace_regex_lock);
1664 if ((file->f_mode & FMODE_WRITE) && 1611 if ((file->f_mode & FMODE_WRITE) &&
1665 (file->f_flags & O_TRUNC)) 1612 (file->f_flags & O_TRUNC))
@@ -1674,8 +1621,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1674 if (!ret) { 1621 if (!ret) {
1675 struct seq_file *m = file->private_data; 1622 struct seq_file *m = file->private_data;
1676 m->private = iter; 1623 m->private = iter;
1677 } else 1624 } else {
1625 trace_parser_put(&iter->parser);
1678 kfree(iter); 1626 kfree(iter);
1627 }
1679 } else 1628 } else
1680 file->private_data = iter; 1629 file->private_data = iter;
1681 mutex_unlock(&ftrace_regex_lock); 1630 mutex_unlock(&ftrace_regex_lock);
@@ -2115,9 +2064,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2115 int i, len = 0; 2064 int i, len = 0;
2116 char *search; 2065 char *search;
2117 2066
2118 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2067 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2119 glob = NULL; 2068 glob = NULL;
2120 else { 2069 else if (glob) {
2121 int not; 2070 int not;
2122 2071
2123 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2072 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2252,11 +2201,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2252 size_t cnt, loff_t *ppos, int enable) 2201 size_t cnt, loff_t *ppos, int enable)
2253{ 2202{
2254 struct ftrace_iterator *iter; 2203 struct ftrace_iterator *iter;
2255 char ch; 2204 struct trace_parser *parser;
2256 size_t read = 0; 2205 ssize_t ret, read;
2257 ssize_t ret;
2258 2206
2259 if (!cnt || cnt < 0) 2207 if (!cnt)
2260 return 0; 2208 return 0;
2261 2209
2262 mutex_lock(&ftrace_regex_lock); 2210 mutex_lock(&ftrace_regex_lock);
@@ -2267,73 +2215,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2267 } else 2215 } else
2268 iter = file->private_data; 2216 iter = file->private_data;
2269 2217
2270 if (!*ppos) { 2218 parser = &iter->parser;
2271 iter->flags &= ~FTRACE_ITER_CONT; 2219 read = trace_get_user(parser, ubuf, cnt, ppos);
2272 iter->buffer_idx = 0;
2273 }
2274 2220
2275 ret = get_user(ch, ubuf++); 2221 if (read >= 0 && trace_parser_loaded(parser) &&
2276 if (ret) 2222 !trace_parser_cont(parser)) {
2277 goto out; 2223 ret = ftrace_process_regex(parser->buffer,
2278 read++; 2224 parser->idx, enable);
2279 cnt--;
2280
2281 /*
2282 * If the parser haven't finished with the last write,
2283 * continue reading the user input without skipping spaces.
2284 */
2285 if (!(iter->flags & FTRACE_ITER_CONT)) {
2286 /* skip white space */
2287 while (cnt && isspace(ch)) {
2288 ret = get_user(ch, ubuf++);
2289 if (ret)
2290 goto out;
2291 read++;
2292 cnt--;
2293 }
2294
2295 /* only spaces were written */
2296 if (isspace(ch)) {
2297 *ppos += read;
2298 ret = read;
2299 goto out;
2300 }
2301
2302 iter->buffer_idx = 0;
2303 }
2304
2305 while (cnt && !isspace(ch)) {
2306 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2307 iter->buffer[iter->buffer_idx++] = ch;
2308 else {
2309 ret = -EINVAL;
2310 goto out;
2311 }
2312 ret = get_user(ch, ubuf++);
2313 if (ret) 2225 if (ret)
2314 goto out; 2226 goto out;
2315 read++;
2316 cnt--;
2317 }
2318 2227
2319 if (isspace(ch)) { 2228 trace_parser_clear(parser);
2320 iter->filtered++;
2321 iter->buffer[iter->buffer_idx] = 0;
2322 ret = ftrace_process_regex(iter->buffer,
2323 iter->buffer_idx, enable);
2324 if (ret)
2325 goto out;
2326 iter->buffer_idx = 0;
2327 } else {
2328 iter->flags |= FTRACE_ITER_CONT;
2329 iter->buffer[iter->buffer_idx++] = ch;
2330 } 2229 }
2331 2230
2332 *ppos += read;
2333 ret = read; 2231 ret = read;
2334 out:
2335 mutex_unlock(&ftrace_regex_lock);
2336 2232
2233 mutex_unlock(&ftrace_regex_lock);
2234out:
2337 return ret; 2235 return ret;
2338} 2236}
2339 2237
@@ -2438,6 +2336,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2438{ 2336{
2439 struct seq_file *m = (struct seq_file *)file->private_data; 2337 struct seq_file *m = (struct seq_file *)file->private_data;
2440 struct ftrace_iterator *iter; 2338 struct ftrace_iterator *iter;
2339 struct trace_parser *parser;
2441 2340
2442 mutex_lock(&ftrace_regex_lock); 2341 mutex_lock(&ftrace_regex_lock);
2443 if (file->f_mode & FMODE_READ) { 2342 if (file->f_mode & FMODE_READ) {
@@ -2447,10 +2346,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2447 } else 2346 } else
2448 iter = file->private_data; 2347 iter = file->private_data;
2449 2348
2450 if (iter->buffer_idx) { 2349 parser = &iter->parser;
2451 iter->filtered++; 2350 if (trace_parser_loaded(parser)) {
2452 iter->buffer[iter->buffer_idx] = 0; 2351 parser->buffer[parser->idx] = 0;
2453 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2352 ftrace_match_records(parser->buffer, parser->idx, enable);
2454 } 2353 }
2455 2354
2456 mutex_lock(&ftrace_lock); 2355 mutex_lock(&ftrace_lock);
@@ -2458,7 +2357,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2458 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2357 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2459 mutex_unlock(&ftrace_lock); 2358 mutex_unlock(&ftrace_lock);
2460 2359
2360 trace_parser_put(parser);
2461 kfree(iter); 2361 kfree(iter);
2362
2462 mutex_unlock(&ftrace_regex_lock); 2363 mutex_unlock(&ftrace_regex_lock);
2463 return 0; 2364 return 0;
2464} 2365}
@@ -2479,14 +2380,14 @@ static const struct file_operations ftrace_avail_fops = {
2479 .open = ftrace_avail_open, 2380 .open = ftrace_avail_open,
2480 .read = seq_read, 2381 .read = seq_read,
2481 .llseek = seq_lseek, 2382 .llseek = seq_lseek,
2482 .release = ftrace_avail_release, 2383 .release = seq_release_private,
2483}; 2384};
2484 2385
2485static const struct file_operations ftrace_failures_fops = { 2386static const struct file_operations ftrace_failures_fops = {
2486 .open = ftrace_failures_open, 2387 .open = ftrace_failures_open,
2487 .read = seq_read, 2388 .read = seq_read,
2488 .llseek = seq_lseek, 2389 .llseek = seq_lseek,
2489 .release = ftrace_avail_release, 2390 .release = seq_release_private,
2490}; 2391};
2491 2392
2492static const struct file_operations ftrace_filter_fops = { 2393static const struct file_operations ftrace_filter_fops = {
@@ -2515,11 +2416,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2515static void * 2416static void *
2516__g_next(struct seq_file *m, loff_t *pos) 2417__g_next(struct seq_file *m, loff_t *pos)
2517{ 2418{
2518 unsigned long *array = m->private;
2519
2520 if (*pos >= ftrace_graph_count) 2419 if (*pos >= ftrace_graph_count)
2521 return NULL; 2420 return NULL;
2522 return &array[*pos]; 2421 return &ftrace_graph_funcs[*pos];
2523} 2422}
2524 2423
2525static void * 2424static void *
@@ -2548,7 +2447,6 @@ static void g_stop(struct seq_file *m, void *p)
2548static int g_show(struct seq_file *m, void *v) 2447static int g_show(struct seq_file *m, void *v)
2549{ 2448{
2550 unsigned long *ptr = v; 2449 unsigned long *ptr = v;
2551 char str[KSYM_SYMBOL_LEN];
2552 2450
2553 if (!ptr) 2451 if (!ptr)
2554 return 0; 2452 return 0;
@@ -2558,14 +2456,12 @@ static int g_show(struct seq_file *m, void *v)
2558 return 0; 2456 return 0;
2559 } 2457 }
2560 2458
2561 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2459 seq_printf(m, "%ps\n", (void *)*ptr);
2562
2563 seq_printf(m, "%s\n", str);
2564 2460
2565 return 0; 2461 return 0;
2566} 2462}
2567 2463
2568static struct seq_operations ftrace_graph_seq_ops = { 2464static const struct seq_operations ftrace_graph_seq_ops = {
2569 .start = g_start, 2465 .start = g_start,
2570 .next = g_next, 2466 .next = g_next,
2571 .stop = g_stop, 2467 .stop = g_stop,
@@ -2586,16 +2482,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2586 ftrace_graph_count = 0; 2482 ftrace_graph_count = 0;
2587 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2483 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2588 } 2484 }
2485 mutex_unlock(&graph_lock);
2589 2486
2590 if (file->f_mode & FMODE_READ) { 2487 if (file->f_mode & FMODE_READ)
2591 ret = seq_open(file, &ftrace_graph_seq_ops); 2488 ret = seq_open(file, &ftrace_graph_seq_ops);
2592 if (!ret) {
2593 struct seq_file *m = file->private_data;
2594 m->private = ftrace_graph_funcs;
2595 }
2596 } else
2597 file->private_data = ftrace_graph_funcs;
2598 mutex_unlock(&graph_lock);
2599 2489
2600 return ret; 2490 return ret;
2601} 2491}
@@ -2663,12 +2553,8 @@ static ssize_t
2663ftrace_graph_write(struct file *file, const char __user *ubuf, 2553ftrace_graph_write(struct file *file, const char __user *ubuf,
2664 size_t cnt, loff_t *ppos) 2554 size_t cnt, loff_t *ppos)
2665{ 2555{
2666 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2556 struct trace_parser parser;
2667 unsigned long *array; 2557 ssize_t read, ret;
2668 size_t read = 0;
2669 ssize_t ret;
2670 int index = 0;
2671 char ch;
2672 2558
2673 if (!cnt || cnt < 0) 2559 if (!cnt || cnt < 0)
2674 return 0; 2560 return 0;
@@ -2677,60 +2563,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2677 2563
2678 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { 2564 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2679 ret = -EBUSY; 2565 ret = -EBUSY;
2680 goto out; 2566 goto out_unlock;
2681 } 2567 }
2682 2568
2683 if (file->f_mode & FMODE_READ) { 2569 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2684 struct seq_file *m = file->private_data; 2570 ret = -ENOMEM;
2685 array = m->private; 2571 goto out_unlock;
2686 } else
2687 array = file->private_data;
2688
2689 ret = get_user(ch, ubuf++);
2690 if (ret)
2691 goto out;
2692 read++;
2693 cnt--;
2694
2695 /* skip white space */
2696 while (cnt && isspace(ch)) {
2697 ret = get_user(ch, ubuf++);
2698 if (ret)
2699 goto out;
2700 read++;
2701 cnt--;
2702 } 2572 }
2703 2573
2704 if (isspace(ch)) { 2574 read = trace_get_user(&parser, ubuf, cnt, ppos);
2705 *ppos += read;
2706 ret = read;
2707 goto out;
2708 }
2709 2575
2710 while (cnt && !isspace(ch)) { 2576 if (read >= 0 && trace_parser_loaded((&parser))) {
2711 if (index < FTRACE_BUFF_MAX) 2577 parser.buffer[parser.idx] = 0;
2712 buffer[index++] = ch; 2578
2713 else { 2579 /* we allow only one expression at a time */
2714 ret = -EINVAL; 2580 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2715 goto out; 2581 parser.buffer);
2716 }
2717 ret = get_user(ch, ubuf++);
2718 if (ret) 2582 if (ret)
2719 goto out; 2583 goto out_free;
2720 read++;
2721 cnt--;
2722 } 2584 }
2723 buffer[index] = 0;
2724
2725 /* we allow only one expression at a time */
2726 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2727 if (ret)
2728 goto out;
2729
2730 file->f_pos += read;
2731 2585
2732 ret = read; 2586 ret = read;
2733 out: 2587
2588out_free:
2589 trace_parser_put(&parser);
2590out_unlock:
2734 mutex_unlock(&graph_lock); 2591 mutex_unlock(&graph_lock);
2735 2592
2736 return ret; 2593 return ret;
@@ -3161,7 +3018,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
3161 3018
3162int 3019int
3163ftrace_enable_sysctl(struct ctl_table *table, int write, 3020ftrace_enable_sysctl(struct ctl_table *table, int write,
3164 struct file *file, void __user *buffer, size_t *lenp, 3021 void __user *buffer, size_t *lenp,
3165 loff_t *ppos) 3022 loff_t *ppos)
3166{ 3023{
3167 int ret; 3024 int ret;
@@ -3171,7 +3028,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3171 3028
3172 mutex_lock(&ftrace_lock); 3029 mutex_lock(&ftrace_lock);
3173 3030
3174 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3031 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3175 3032
3176 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) 3033 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3177 goto out; 3034 goto out;
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu(cpu, cpu_possible_mask) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The two other following provide a more minimalistic output */ 335/* The two other following provide a more minimalistic output */
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
348 /* Node */ 390 /* Node and call site*/
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
404 /* Skip node */ 441 /* Skip node and print call site*/
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (!register_tracer(&kmem_tracer)) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a330513d96ce..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -218,17 +216,12 @@ enum {
218 216
219static inline int rb_null_event(struct ring_buffer_event *event) 217static inline int rb_null_event(struct ring_buffer_event *event)
220{ 218{
221 return event->type_len == RINGBUF_TYPE_PADDING 219 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
222 && event->time_delta == 0;
223}
224
225static inline int rb_discarded_event(struct ring_buffer_event *event)
226{
227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
228} 220}
229 221
230static void rb_event_set_padding(struct ring_buffer_event *event) 222static void rb_event_set_padding(struct ring_buffer_event *event)
231{ 223{
224 /* padding has a NULL time_delta */
232 event->type_len = RINGBUF_TYPE_PADDING; 225 event->type_len = RINGBUF_TYPE_PADDING;
233 event->time_delta = 0; 226 event->time_delta = 0;
234} 227}
@@ -322,6 +315,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 315 unsigned char data[]; /* data of buffer page */
323}; 316};
324 317
318/*
319 * Note, the buffer_page list must be first. The buffer pages
320 * are allocated in cache lines, which means that each buffer
321 * page will be at the beginning of a cache line, and thus
322 * the least significant bits will be zero. We use this to
323 * add flags in the list struct pointers, to make the ring buffer
324 * lockless.
325 */
325struct buffer_page { 326struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 327 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 328 local_t write; /* index for next write */
@@ -330,6 +331,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 331 struct buffer_data_page *page; /* Actual data page */
331}; 332};
332 333
334/*
335 * The buffer page counters, write and entries, must be reset
336 * atomically when crossing page boundaries. To synchronize this
337 * update, two counters are inserted into the number. One is
338 * the actual counter for the write position or count on the page.
339 *
340 * The other is a counter of updaters. Before an update happens
341 * the update partition of the counter is incremented. This will
342 * allow the updater to update the counter atomically.
343 *
344 * The counter is 20 bits, and the state data is 12.
345 */
346#define RB_WRITE_MASK 0xfffff
347#define RB_WRITE_INTCNT (1 << 20)
348
333static void rb_init_page(struct buffer_data_page *bpage) 349static void rb_init_page(struct buffer_data_page *bpage)
334{ 350{
335 local_set(&bpage->commit, 0); 351 local_set(&bpage->commit, 0);
@@ -403,21 +419,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 419struct ring_buffer_per_cpu {
404 int cpu; 420 int cpu;
405 struct ring_buffer *buffer; 421 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 422 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 423 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 424 struct lock_class_key lock_key;
409 struct list_head pages; 425 struct list_head *pages;
410 struct buffer_page *head_page; /* read from head */ 426 struct buffer_page *head_page; /* read from head */
411 struct buffer_page *tail_page; /* write to tail */ 427 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 428 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 429 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 430 local_t commit_overrun;
415 unsigned long commit_overrun; 431 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 432 local_t entries;
419 local_t committing; 433 local_t committing;
420 local_t commits; 434 local_t commits;
435 unsigned long read;
421 u64 write_stamp; 436 u64 write_stamp;
422 u64 read_stamp; 437 u64 read_stamp;
423 atomic_t record_disabled; 438 atomic_t record_disabled;
@@ -450,14 +465,19 @@ struct ring_buffer_iter {
450}; 465};
451 466
452/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 467/* buffer may be either ring_buffer or ring_buffer_per_cpu */
453#define RB_WARN_ON(buffer, cond) \ 468#define RB_WARN_ON(b, cond) \
454 ({ \ 469 ({ \
455 int _____ret = unlikely(cond); \ 470 int _____ret = unlikely(cond); \
456 if (_____ret) { \ 471 if (_____ret) { \
457 atomic_inc(&buffer->record_disabled); \ 472 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
458 WARN_ON(1); \ 473 struct ring_buffer_per_cpu *__b = \
459 } \ 474 (void *)b; \
460 _____ret; \ 475 atomic_inc(&__b->buffer->record_disabled); \
476 } else \
477 atomic_inc(&b->record_disabled); \
478 WARN_ON(1); \
479 } \
480 _____ret; \
461 }) 481 })
462 482
463/* Up this if you want to test the TIME_EXTENTS and normalization */ 483/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -489,6 +509,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 509}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 510EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 511
512/*
513 * Making the ring buffer lockless makes things tricky.
514 * Although writes only happen on the CPU that they are on,
515 * and they only need to worry about interrupts, reads can
516 * happen on any CPU.
517 *
518 * The reader page is always off the ring buffer, but when the
519 * reader finishes with a page, it needs to swap its page with
520 * a new one from the buffer. The reader needs to take from
521 * the head (writes go to the tail). But if a writer is in overwrite
522 * mode and wraps, it must push the head page forward.
523 *
524 * Here lies the problem.
525 *
526 * The reader must be careful to replace only the head page, and
527 * not another one. As described at the top of the file in the
528 * ASCII art, the reader sets its old page to point to the next
529 * page after head. It then sets the page after head to point to
530 * the old reader page. But if the writer moves the head page
531 * during this operation, the reader could end up with the tail.
532 *
533 * We use cmpxchg to help prevent this race. We also do something
534 * special with the page before head. We set the LSB to 1.
535 *
536 * When the writer must push the page forward, it will clear the
537 * bit that points to the head page, move the head, and then set
538 * the bit that points to the new head page.
539 *
540 * We also don't want an interrupt coming in and moving the head
541 * page on another writer. Thus we use the second LSB to catch
542 * that too. Thus:
543 *
544 * head->list->prev->next bit 1 bit 0
545 * ------- -------
546 * Normal page 0 0
547 * Points to head page 0 1
548 * New head page 1 0
549 *
550 * Note we can not trust the prev pointer of the head page, because:
551 *
552 * +----+ +-----+ +-----+
553 * | |------>| T |---X--->| N |
554 * | |<------| | | |
555 * +----+ +-----+ +-----+
556 * ^ ^ |
557 * | +-----+ | |
558 * +----------| R |----------+ |
559 * | |<-----------+
560 * +-----+
561 *
562 * Key: ---X--> HEAD flag set in pointer
563 * T Tail page
564 * R Reader page
565 * N Next page
566 *
567 * (see __rb_reserve_next() to see where this happens)
568 *
569 * What the above shows is that the reader just swapped out
570 * the reader page with a page in the buffer, but before it
571 * could make the new header point back to the new page added
572 * it was preempted by a writer. The writer moved forward onto
573 * the new page added by the reader and is about to move forward
574 * again.
575 *
576 * You can see, it is legitimate for the previous pointer of
577 * the head (or any page) not to point back to itself. But only
578 * temporarily.
579 */
580
581#define RB_PAGE_NORMAL 0UL
582#define RB_PAGE_HEAD 1UL
583#define RB_PAGE_UPDATE 2UL
584
585
586#define RB_FLAG_MASK 3UL
587
588/* PAGE_MOVED is not part of the mask */
589#define RB_PAGE_MOVED 4UL
590
591/*
592 * rb_list_head - remove any bit
593 */
594static struct list_head *rb_list_head(struct list_head *list)
595{
596 unsigned long val = (unsigned long)list;
597
598 return (struct list_head *)(val & ~RB_FLAG_MASK);
599}
600
601/*
602 * rb_is_head_page - test if the given page is the head page
603 *
604 * Because the reader may move the head_page pointer, we can
605 * not trust what the head page is (it may be pointing to
606 * the reader page). But if the next page is a header page,
607 * its flags will be non zero.
608 */
609static int inline
610rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
611 struct buffer_page *page, struct list_head *list)
612{
613 unsigned long val;
614
615 val = (unsigned long)list->next;
616
617 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
618 return RB_PAGE_MOVED;
619
620 return val & RB_FLAG_MASK;
621}
622
623/*
624 * rb_is_reader_page
625 *
626 * The unique thing about the reader page, is that, if the
627 * writer is ever on it, the previous pointer never points
628 * back to the reader page.
629 */
630static int rb_is_reader_page(struct buffer_page *page)
631{
632 struct list_head *list = page->list.prev;
633
634 return rb_list_head(list->next) != &page->list;
635}
636
637/*
638 * rb_set_list_to_head - set a list_head to be pointing to head.
639 */
640static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
641 struct list_head *list)
642{
643 unsigned long *ptr;
644
645 ptr = (unsigned long *)&list->next;
646 *ptr |= RB_PAGE_HEAD;
647 *ptr &= ~RB_PAGE_UPDATE;
648}
649
650/*
651 * rb_head_page_activate - sets up head page
652 */
653static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
654{
655 struct buffer_page *head;
656
657 head = cpu_buffer->head_page;
658 if (!head)
659 return;
660
661 /*
662 * Set the previous list pointer to have the HEAD flag.
663 */
664 rb_set_list_to_head(cpu_buffer, head->list.prev);
665}
666
667static void rb_list_head_clear(struct list_head *list)
668{
669 unsigned long *ptr = (unsigned long *)&list->next;
670
671 *ptr &= ~RB_FLAG_MASK;
672}
673
674/*
675 * rb_head_page_deactivate - clears head page ptr (for free list)
676 */
677static void
678rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
679{
680 struct list_head *hd;
681
682 /* Go through the whole list and clear any pointers found. */
683 rb_list_head_clear(cpu_buffer->pages);
684
685 list_for_each(hd, cpu_buffer->pages)
686 rb_list_head_clear(hd);
687}
688
689static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
690 struct buffer_page *head,
691 struct buffer_page *prev,
692 int old_flag, int new_flag)
693{
694 struct list_head *list;
695 unsigned long val = (unsigned long)&head->list;
696 unsigned long ret;
697
698 list = &prev->list;
699
700 val &= ~RB_FLAG_MASK;
701
702 ret = cmpxchg((unsigned long *)&list->next,
703 val | old_flag, val | new_flag);
704
705 /* check if the reader took the page */
706 if ((ret & ~RB_FLAG_MASK) != val)
707 return RB_PAGE_MOVED;
708
709 return ret & RB_FLAG_MASK;
710}
711
712static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
713 struct buffer_page *head,
714 struct buffer_page *prev,
715 int old_flag)
716{
717 return rb_head_page_set(cpu_buffer, head, prev,
718 old_flag, RB_PAGE_UPDATE);
719}
720
721static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
722 struct buffer_page *head,
723 struct buffer_page *prev,
724 int old_flag)
725{
726 return rb_head_page_set(cpu_buffer, head, prev,
727 old_flag, RB_PAGE_HEAD);
728}
729
730static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
731 struct buffer_page *head,
732 struct buffer_page *prev,
733 int old_flag)
734{
735 return rb_head_page_set(cpu_buffer, head, prev,
736 old_flag, RB_PAGE_NORMAL);
737}
738
739static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
740 struct buffer_page **bpage)
741{
742 struct list_head *p = rb_list_head((*bpage)->list.next);
743
744 *bpage = list_entry(p, struct buffer_page, list);
745}
746
747static struct buffer_page *
748rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
749{
750 struct buffer_page *head;
751 struct buffer_page *page;
752 struct list_head *list;
753 int i;
754
755 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
756 return NULL;
757
758 /* sanity check */
759 list = cpu_buffer->pages;
760 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
761 return NULL;
762
763 page = head = cpu_buffer->head_page;
764 /*
765 * It is possible that the writer moves the header behind
766 * where we started, and we miss in one loop.
767 * A second loop should grab the header, but we'll do
768 * three loops just because I'm paranoid.
769 */
770 for (i = 0; i < 3; i++) {
771 do {
772 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
773 cpu_buffer->head_page = page;
774 return page;
775 }
776 rb_inc_page(cpu_buffer, &page);
777 } while (page != head);
778 }
779
780 RB_WARN_ON(cpu_buffer, 1);
781
782 return NULL;
783}
784
785static int rb_head_page_replace(struct buffer_page *old,
786 struct buffer_page *new)
787{
788 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
789 unsigned long val;
790 unsigned long ret;
791
792 val = *ptr & ~RB_FLAG_MASK;
793 val |= RB_PAGE_HEAD;
794
795 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
796
797 return ret == val;
798}
799
800/*
801 * rb_tail_page_update - move the tail page forward
802 *
803 * Returns 1 if moved tail page, 0 if someone else did.
804 */
805static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
806 struct buffer_page *tail_page,
807 struct buffer_page *next_page)
808{
809 struct buffer_page *old_tail;
810 unsigned long old_entries;
811 unsigned long old_write;
812 int ret = 0;
813
814 /*
815 * The tail page now needs to be moved forward.
816 *
817 * We need to reset the tail page, but without messing
818 * with possible erasing of data brought in by interrupts
819 * that have moved the tail page and are currently on it.
820 *
821 * We add a counter to the write field to denote this.
822 */
823 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
824 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
825
826 /*
827 * Just make sure we have seen our old_write and synchronize
828 * with any interrupts that come in.
829 */
830 barrier();
831
832 /*
833 * If the tail page is still the same as what we think
834 * it is, then it is up to us to update the tail
835 * pointer.
836 */
837 if (tail_page == cpu_buffer->tail_page) {
838 /* Zero the write counter */
839 unsigned long val = old_write & ~RB_WRITE_MASK;
840 unsigned long eval = old_entries & ~RB_WRITE_MASK;
841
842 /*
843 * This will only succeed if an interrupt did
844 * not come in and change it. In which case, we
845 * do not want to modify it.
846 *
847 * We add (void) to let the compiler know that we do not care
848 * about the return value of these functions. We use the
849 * cmpxchg to only update if an interrupt did not already
850 * do it for us. If the cmpxchg fails, we don't care.
851 */
852 (void)local_cmpxchg(&next_page->write, old_write, val);
853 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
854
855 /*
856 * No need to worry about races with clearing out the commit.
857 * it only can increment when a commit takes place. But that
858 * only happens in the outer most nested commit.
859 */
860 local_set(&next_page->page->commit, 0);
861
862 old_tail = cmpxchg(&cpu_buffer->tail_page,
863 tail_page, next_page);
864
865 if (old_tail == tail_page)
866 ret = 1;
867 }
868
869 return ret;
870}
871
872static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
873 struct buffer_page *bpage)
874{
875 unsigned long val = (unsigned long)bpage;
876
877 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
878 return 1;
879
880 return 0;
881}
882
883/**
884 * rb_check_list - make sure a pointer to a list has the last bits zero
885 */
886static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
887 struct list_head *list)
888{
889 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
890 return 1;
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
892 return 1;
893 return 0;
894}
895
492/** 896/**
493 * check_pages - integrity check of buffer pages 897 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 898 * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +902,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
498 */ 902 */
499static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 903static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
500{ 904{
501 struct list_head *head = &cpu_buffer->pages; 905 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 906 struct buffer_page *bpage, *tmp;
503 907
908 rb_head_page_deactivate(cpu_buffer);
909
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 910 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 911 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 912 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 913 return -1;
508 914
915 if (rb_check_list(cpu_buffer, head))
916 return -1;
917
509 list_for_each_entry_safe(bpage, tmp, head, list) { 918 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 919 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 920 bpage->list.next->prev != &bpage->list))
@@ -513,25 +922,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 922 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 923 bpage->list.prev->next != &bpage->list))
515 return -1; 924 return -1;
925 if (rb_check_list(cpu_buffer, &bpage->list))
926 return -1;
516 } 927 }
517 928
929 rb_head_page_activate(cpu_buffer);
930
518 return 0; 931 return 0;
519} 932}
520 933
521static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 934static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
522 unsigned nr_pages) 935 unsigned nr_pages)
523{ 936{
524 struct list_head *head = &cpu_buffer->pages;
525 struct buffer_page *bpage, *tmp; 937 struct buffer_page *bpage, *tmp;
526 unsigned long addr; 938 unsigned long addr;
527 LIST_HEAD(pages); 939 LIST_HEAD(pages);
528 unsigned i; 940 unsigned i;
529 941
942 WARN_ON(!nr_pages);
943
530 for (i = 0; i < nr_pages; i++) { 944 for (i = 0; i < nr_pages; i++) {
531 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 945 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
532 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 946 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
533 if (!bpage) 947 if (!bpage)
534 goto free_pages; 948 goto free_pages;
949
950 rb_check_bpage(cpu_buffer, bpage);
951
535 list_add(&bpage->list, &pages); 952 list_add(&bpage->list, &pages);
536 953
537 addr = __get_free_page(GFP_KERNEL); 954 addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +958,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
541 rb_init_page(bpage->page); 958 rb_init_page(bpage->page);
542 } 959 }
543 960
544 list_splice(&pages, head); 961 /*
962 * The ring buffer page list is a circular list that does not
963 * start and end with a list head. All page list items point to
964 * other pages.
965 */
966 cpu_buffer->pages = pages.next;
967 list_del(&pages);
545 968
546 rb_check_pages(cpu_buffer); 969 rb_check_pages(cpu_buffer);
547 970
@@ -573,13 +996,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
573 spin_lock_init(&cpu_buffer->reader_lock); 996 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 997 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 998 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
576 INIT_LIST_HEAD(&cpu_buffer->pages);
577 999
578 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1000 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
579 GFP_KERNEL, cpu_to_node(cpu)); 1001 GFP_KERNEL, cpu_to_node(cpu));
580 if (!bpage) 1002 if (!bpage)
581 goto fail_free_buffer; 1003 goto fail_free_buffer;
582 1004
1005 rb_check_bpage(cpu_buffer, bpage);
1006
583 cpu_buffer->reader_page = bpage; 1007 cpu_buffer->reader_page = bpage;
584 addr = __get_free_page(GFP_KERNEL); 1008 addr = __get_free_page(GFP_KERNEL);
585 if (!addr) 1009 if (!addr)
@@ -594,9 +1018,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
594 goto fail_free_reader; 1018 goto fail_free_reader;
595 1019
596 cpu_buffer->head_page 1020 cpu_buffer->head_page
597 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1021 = list_entry(cpu_buffer->pages, struct buffer_page, list);
598 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1022 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
599 1023
1024 rb_head_page_activate(cpu_buffer);
1025
600 return cpu_buffer; 1026 return cpu_buffer;
601 1027
602 fail_free_reader: 1028 fail_free_reader:
@@ -609,15 +1035,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
609 1035
610static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1036static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
611{ 1037{
612 struct list_head *head = &cpu_buffer->pages; 1038 struct list_head *head = cpu_buffer->pages;
613 struct buffer_page *bpage, *tmp; 1039 struct buffer_page *bpage, *tmp;
614 1040
615 free_buffer_page(cpu_buffer->reader_page); 1041 free_buffer_page(cpu_buffer->reader_page);
616 1042
617 list_for_each_entry_safe(bpage, tmp, head, list) { 1043 rb_head_page_deactivate(cpu_buffer);
618 list_del_init(&bpage->list); 1044
1045 if (head) {
1046 list_for_each_entry_safe(bpage, tmp, head, list) {
1047 list_del_init(&bpage->list);
1048 free_buffer_page(bpage);
1049 }
1050 bpage = list_entry(head, struct buffer_page, list);
619 free_buffer_page(bpage); 1051 free_buffer_page(bpage);
620 } 1052 }
1053
621 kfree(cpu_buffer); 1054 kfree(cpu_buffer);
622} 1055}
623 1056
@@ -760,15 +1193,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
760 atomic_inc(&cpu_buffer->record_disabled); 1193 atomic_inc(&cpu_buffer->record_disabled);
761 synchronize_sched(); 1194 synchronize_sched();
762 1195
1196 rb_head_page_deactivate(cpu_buffer);
1197
763 for (i = 0; i < nr_pages; i++) { 1198 for (i = 0; i < nr_pages; i++) {
764 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1199 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
765 return; 1200 return;
766 p = cpu_buffer->pages.next; 1201 p = cpu_buffer->pages->next;
767 bpage = list_entry(p, struct buffer_page, list); 1202 bpage = list_entry(p, struct buffer_page, list);
768 list_del_init(&bpage->list); 1203 list_del_init(&bpage->list);
769 free_buffer_page(bpage); 1204 free_buffer_page(bpage);
770 } 1205 }
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1206 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1207 return;
773 1208
774 rb_reset_cpu(cpu_buffer); 1209 rb_reset_cpu(cpu_buffer);
@@ -790,15 +1225,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
790 atomic_inc(&cpu_buffer->record_disabled); 1225 atomic_inc(&cpu_buffer->record_disabled);
791 synchronize_sched(); 1226 synchronize_sched();
792 1227
1228 spin_lock_irq(&cpu_buffer->reader_lock);
1229 rb_head_page_deactivate(cpu_buffer);
1230
793 for (i = 0; i < nr_pages; i++) { 1231 for (i = 0; i < nr_pages; i++) {
794 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1232 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
795 return; 1233 return;
796 p = pages->next; 1234 p = pages->next;
797 bpage = list_entry(p, struct buffer_page, list); 1235 bpage = list_entry(p, struct buffer_page, list);
798 list_del_init(&bpage->list); 1236 list_del_init(&bpage->list);
799 list_add_tail(&bpage->list, &cpu_buffer->pages); 1237 list_add_tail(&bpage->list, cpu_buffer->pages);
800 } 1238 }
801 rb_reset_cpu(cpu_buffer); 1239 rb_reset_cpu(cpu_buffer);
1240 spin_unlock_irq(&cpu_buffer->reader_lock);
802 1241
803 rb_check_pages(cpu_buffer); 1242 rb_check_pages(cpu_buffer);
804 1243
@@ -949,21 +1388,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
949} 1388}
950 1389
951static inline struct ring_buffer_event * 1390static inline struct ring_buffer_event *
952rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
953{
954 return __rb_page_index(cpu_buffer->head_page,
955 cpu_buffer->head_page->read);
956}
957
958static inline struct ring_buffer_event *
959rb_iter_head_event(struct ring_buffer_iter *iter) 1391rb_iter_head_event(struct ring_buffer_iter *iter)
960{ 1392{
961 return __rb_page_index(iter->head_page, iter->head); 1393 return __rb_page_index(iter->head_page, iter->head);
962} 1394}
963 1395
964static inline unsigned rb_page_write(struct buffer_page *bpage) 1396static inline unsigned long rb_page_write(struct buffer_page *bpage)
965{ 1397{
966 return local_read(&bpage->write); 1398 return local_read(&bpage->write) & RB_WRITE_MASK;
967} 1399}
968 1400
969static inline unsigned rb_page_commit(struct buffer_page *bpage) 1401static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1403,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
971 return local_read(&bpage->page->commit); 1403 return local_read(&bpage->page->commit);
972} 1404}
973 1405
1406static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1407{
1408 return local_read(&bpage->entries) & RB_WRITE_MASK;
1409}
1410
974/* Size is determined by what has been committed */ 1411/* Size is determined by what has been committed */
975static inline unsigned rb_page_size(struct buffer_page *bpage) 1412static inline unsigned rb_page_size(struct buffer_page *bpage)
976{ 1413{
@@ -983,22 +1420,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
983 return rb_page_commit(cpu_buffer->commit_page); 1420 return rb_page_commit(cpu_buffer->commit_page);
984} 1421}
985 1422
986static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
987{
988 return rb_page_commit(cpu_buffer->head_page);
989}
990
991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
992 struct buffer_page **bpage)
993{
994 struct list_head *p = (*bpage)->list.next;
995
996 if (p == &cpu_buffer->pages)
997 p = p->next;
998
999 *bpage = list_entry(p, struct buffer_page, list);
1000}
1001
1002static inline unsigned 1423static inline unsigned
1003rb_event_index(struct ring_buffer_event *event) 1424rb_event_index(struct ring_buffer_event *event)
1004{ 1425{
@@ -1024,6 +1445,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1024static void 1445static void
1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1446rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1026{ 1447{
1448 unsigned long max_count;
1449
1027 /* 1450 /*
1028 * We only race with interrupts and NMIs on this CPU. 1451 * We only race with interrupts and NMIs on this CPU.
1029 * If we own the commit event, then we can commit 1452 * If we own the commit event, then we can commit
@@ -1033,9 +1456,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1033 * assign the commit to the tail. 1456 * assign the commit to the tail.
1034 */ 1457 */
1035 again: 1458 again:
1459 max_count = cpu_buffer->buffer->pages * 100;
1460
1036 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1461 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1037 cpu_buffer->commit_page->page->commit = 1462 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1038 cpu_buffer->commit_page->write; 1463 return;
1464 if (RB_WARN_ON(cpu_buffer,
1465 rb_is_reader_page(cpu_buffer->tail_page)))
1466 return;
1467 local_set(&cpu_buffer->commit_page->page->commit,
1468 rb_page_write(cpu_buffer->commit_page));
1039 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1469 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1040 cpu_buffer->write_stamp = 1470 cpu_buffer->write_stamp =
1041 cpu_buffer->commit_page->page->time_stamp; 1471 cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1474,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1044 } 1474 }
1045 while (rb_commit_index(cpu_buffer) != 1475 while (rb_commit_index(cpu_buffer) !=
1046 rb_page_write(cpu_buffer->commit_page)) { 1476 rb_page_write(cpu_buffer->commit_page)) {
1047 cpu_buffer->commit_page->page->commit = 1477
1048 cpu_buffer->commit_page->write; 1478 local_set(&cpu_buffer->commit_page->page->commit,
1479 rb_page_write(cpu_buffer->commit_page));
1480 RB_WARN_ON(cpu_buffer,
1481 local_read(&cpu_buffer->commit_page->page->commit) &
1482 ~RB_WRITE_MASK);
1049 barrier(); 1483 barrier();
1050 } 1484 }
1051 1485
@@ -1078,7 +1512,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1078 * to the head page instead of next. 1512 * to the head page instead of next.
1079 */ 1513 */
1080 if (iter->head_page == cpu_buffer->reader_page) 1514 if (iter->head_page == cpu_buffer->reader_page)
1081 iter->head_page = cpu_buffer->head_page; 1515 iter->head_page = rb_set_head_page(cpu_buffer);
1082 else 1516 else
1083 rb_inc_page(cpu_buffer, &iter->head_page); 1517 rb_inc_page(cpu_buffer, &iter->head_page);
1084 1518
@@ -1122,6 +1556,163 @@ rb_update_event(struct ring_buffer_event *event,
1122 } 1556 }
1123} 1557}
1124 1558
1559/*
1560 * rb_handle_head_page - writer hit the head page
1561 *
1562 * Returns: +1 to retry page
1563 * 0 to continue
1564 * -1 on error
1565 */
1566static int
1567rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1568 struct buffer_page *tail_page,
1569 struct buffer_page *next_page)
1570{
1571 struct buffer_page *new_head;
1572 int entries;
1573 int type;
1574 int ret;
1575
1576 entries = rb_page_entries(next_page);
1577
1578 /*
1579 * The hard part is here. We need to move the head
1580 * forward, and protect against both readers on
1581 * other CPUs and writers coming in via interrupts.
1582 */
1583 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1584 RB_PAGE_HEAD);
1585
1586 /*
1587 * type can be one of four:
1588 * NORMAL - an interrupt already moved it for us
1589 * HEAD - we are the first to get here.
1590 * UPDATE - we are the interrupt interrupting
1591 * a current move.
1592 * MOVED - a reader on another CPU moved the next
1593 * pointer to its reader page. Give up
1594 * and try again.
1595 */
1596
1597 switch (type) {
1598 case RB_PAGE_HEAD:
1599 /*
1600 * We changed the head to UPDATE, thus
1601 * it is our responsibility to update
1602 * the counters.
1603 */
1604 local_add(entries, &cpu_buffer->overrun);
1605
1606 /*
1607 * The entries will be zeroed out when we move the
1608 * tail page.
1609 */
1610
1611 /* still more to do */
1612 break;
1613
1614 case RB_PAGE_UPDATE:
1615 /*
1616 * This is an interrupt that interrupted the
1617 * previous update. Still more to do.
1618 */
1619 break;
1620 case RB_PAGE_NORMAL:
1621 /*
1622 * An interrupt came in before the update
1623 * and processed this for us.
1624 * Nothing left to do.
1625 */
1626 return 1;
1627 case RB_PAGE_MOVED:
1628 /*
1629 * The reader is on another CPU and just did
1630 * a swap with our next_page.
1631 * Try again.
1632 */
1633 return 1;
1634 default:
1635 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1636 return -1;
1637 }
1638
1639 /*
1640 * Now that we are here, the old head pointer is
1641 * set to UPDATE. This will keep the reader from
1642 * swapping the head page with the reader page.
1643 * The reader (on another CPU) will spin till
1644 * we are finished.
1645 *
1646 * We just need to protect against interrupts
1647 * doing the job. We will set the next pointer
1648 * to HEAD. After that, we set the old pointer
1649 * to NORMAL, but only if it was HEAD before.
1650 * otherwise we are an interrupt, and only
1651 * want the outer most commit to reset it.
1652 */
1653 new_head = next_page;
1654 rb_inc_page(cpu_buffer, &new_head);
1655
1656 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1657 RB_PAGE_NORMAL);
1658
1659 /*
1660 * Valid returns are:
1661 * HEAD - an interrupt came in and already set it.
1662 * NORMAL - One of two things:
1663 * 1) We really set it.
1664 * 2) A bunch of interrupts came in and moved
1665 * the page forward again.
1666 */
1667 switch (ret) {
1668 case RB_PAGE_HEAD:
1669 case RB_PAGE_NORMAL:
1670 /* OK */
1671 break;
1672 default:
1673 RB_WARN_ON(cpu_buffer, 1);
1674 return -1;
1675 }
1676
1677 /*
1678 * It is possible that an interrupt came in,
1679 * set the head up, then more interrupts came in
1680 * and moved it again. When we get back here,
1681 * the page would have been set to NORMAL but we
1682 * just set it back to HEAD.
1683 *
1684 * How do you detect this? Well, if that happened
1685 * the tail page would have moved.
1686 */
1687 if (ret == RB_PAGE_NORMAL) {
1688 /*
1689 * If the tail had moved past next, then we need
1690 * to reset the pointer.
1691 */
1692 if (cpu_buffer->tail_page != tail_page &&
1693 cpu_buffer->tail_page != next_page)
1694 rb_head_page_set_normal(cpu_buffer, new_head,
1695 next_page,
1696 RB_PAGE_HEAD);
1697 }
1698
1699 /*
1700 * If this was the outer most commit (the one that
1701 * changed the original pointer from HEAD to UPDATE),
1702 * then it is up to us to reset it to NORMAL.
1703 */
1704 if (type == RB_PAGE_HEAD) {
1705 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1706 tail_page,
1707 RB_PAGE_UPDATE);
1708 if (RB_WARN_ON(cpu_buffer,
1709 ret != RB_PAGE_UPDATE))
1710 return -1;
1711 }
1712
1713 return 0;
1714}
1715
1125static unsigned rb_calculate_event_length(unsigned length) 1716static unsigned rb_calculate_event_length(unsigned length)
1126{ 1717{
1127 struct ring_buffer_event event; /* Used only for sizeof array */ 1718 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1185,9 +1776,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1185 event->type_len = RINGBUF_TYPE_PADDING; 1776 event->type_len = RINGBUF_TYPE_PADDING;
1186 /* time delta must be non zero */ 1777 /* time delta must be non zero */
1187 event->time_delta = 1; 1778 event->time_delta = 1;
1188 /* Account for this as an entry */
1189 local_inc(&tail_page->entries);
1190 local_inc(&cpu_buffer->entries);
1191 1779
1192 /* Set write to end of buffer */ 1780 /* Set write to end of buffer */
1193 length = (tail + length) - BUF_PAGE_SIZE; 1781 length = (tail + length) - BUF_PAGE_SIZE;
@@ -1200,96 +1788,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1200 struct buffer_page *commit_page, 1788 struct buffer_page *commit_page,
1201 struct buffer_page *tail_page, u64 *ts) 1789 struct buffer_page *tail_page, u64 *ts)
1202{ 1790{
1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer; 1791 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false; 1792 struct buffer_page *next_page;
1206 unsigned long flags; 1793 int ret;
1207 1794
1208 next_page = tail_page; 1795 next_page = tail_page;
1209 1796
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1231 goto out_reset;
1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1235
1236 lock_taken = true;
1237
1238 rb_inc_page(cpu_buffer, &next_page); 1797 rb_inc_page(cpu_buffer, &next_page);
1239 1798
1240 head_page = cpu_buffer->head_page;
1241 reader_page = cpu_buffer->reader_page;
1242
1243 /* we grabbed the lock before incrementing */
1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1245 goto out_reset;
1246
1247 /* 1799 /*
1248 * If for some reason, we had an interrupt storm that made 1800 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn 1801 * it all the way around the buffer, bail, and warn
1250 * about it. 1802 * about it.
1251 */ 1803 */
1252 if (unlikely(next_page == commit_page)) { 1804 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++; 1805 local_inc(&cpu_buffer->commit_overrun);
1254 goto out_reset; 1806 goto out_reset;
1255 } 1807 }
1256 1808
1257 if (next_page == head_page) { 1809 /*
1258 if (!(buffer->flags & RB_FL_OVERWRITE)) 1810 * This is where the fun begins!
1259 goto out_reset; 1811 *
1260 1812 * We are fighting against races between a reader that
1261 /* tail_page has not moved yet? */ 1813 * could be on another CPU trying to swap its reader
1262 if (tail_page == cpu_buffer->tail_page) { 1814 * page with the buffer head.
1263 /* count overflows */ 1815 *
1264 cpu_buffer->overrun += 1816 * We are also fighting against interrupts coming in and
1265 local_read(&head_page->entries); 1817 * moving the head or tail on us as well.
1818 *
1819 * If the next page is the head page then we have filled
1820 * the buffer, unless the commit page is still on the
1821 * reader page.
1822 */
1823 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1266 1824
1267 rb_inc_page(cpu_buffer, &head_page); 1825 /*
1268 cpu_buffer->head_page = head_page; 1826 * If the commit is not on the reader page, then
1269 cpu_buffer->head_page->read = 0; 1827 * move the header page.
1828 */
1829 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1830 /*
1831 * If we are not in overwrite mode,
1832 * this is easy, just stop here.
1833 */
1834 if (!(buffer->flags & RB_FL_OVERWRITE))
1835 goto out_reset;
1836
1837 ret = rb_handle_head_page(cpu_buffer,
1838 tail_page,
1839 next_page);
1840 if (ret < 0)
1841 goto out_reset;
1842 if (ret)
1843 goto out_again;
1844 } else {
1845 /*
1846 * We need to be careful here too. The
1847 * commit page could still be on the reader
1848 * page. We could have a small buffer, and
1849 * have filled up the buffer with events
1850 * from interrupts and such, and wrapped.
1851 *
1852 * Note, if the tail page is also the on the
1853 * reader_page, we let it move out.
1854 */
1855 if (unlikely((cpu_buffer->commit_page !=
1856 cpu_buffer->tail_page) &&
1857 (cpu_buffer->commit_page ==
1858 cpu_buffer->reader_page))) {
1859 local_inc(&cpu_buffer->commit_overrun);
1860 goto out_reset;
1861 }
1270 } 1862 }
1271 } 1863 }
1272 1864
1273 /* 1865 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1274 * If the tail page is still the same as what we think 1866 if (ret) {
1275 * it is, then it is up to us to update the tail 1867 /*
1276 * pointer. 1868 * Nested commits always have zero deltas, so
1277 */ 1869 * just reread the time stamp
1278 if (tail_page == cpu_buffer->tail_page) { 1870 */
1279 local_set(&next_page->write, 0);
1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1871 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts; 1872 next_page->page->time_stamp = *ts;
1287 } 1873 }
1288 1874
1289 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1875 out_again:
1290 1876
1291 __raw_spin_unlock(&cpu_buffer->lock); 1877 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1292 local_irq_restore(flags);
1293 1878
1294 /* fail and let the caller try again */ 1879 /* fail and let the caller try again */
1295 return ERR_PTR(-EAGAIN); 1880 return ERR_PTR(-EAGAIN);
@@ -1298,9 +1883,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1298 /* reset write */ 1883 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1884 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1300 1885
1301 if (likely(lock_taken))
1302 __raw_spin_unlock(&cpu_buffer->lock);
1303 local_irq_restore(flags);
1304 return NULL; 1886 return NULL;
1305} 1887}
1306 1888
@@ -1317,6 +1899,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1317 barrier(); 1899 barrier();
1318 tail_page = cpu_buffer->tail_page; 1900 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write); 1901 write = local_add_return(length, &tail_page->write);
1902
1903 /* set write to only the index of the write */
1904 write &= RB_WRITE_MASK;
1320 tail = write - length; 1905 tail = write - length;
1321 1906
1322 /* See if we shot pass the end of this buffer page */ 1907 /* See if we shot pass the end of this buffer page */
@@ -1361,12 +1946,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1361 bpage = cpu_buffer->tail_page; 1946 bpage = cpu_buffer->tail_page;
1362 1947
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1948 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1949 unsigned long write_mask =
1950 local_read(&bpage->write) & ~RB_WRITE_MASK;
1364 /* 1951 /*
1365 * This is on the tail page. It is possible that 1952 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page 1953 * a write could come in and move the tail page
1367 * and write to the next page. That is fine 1954 * and write to the next page. That is fine
1368 * because we just shorten what is on this page. 1955 * because we just shorten what is on this page.
1369 */ 1956 */
1957 old_index += write_mask;
1958 new_index += write_mask;
1370 index = local_cmpxchg(&bpage->write, old_index, new_index); 1959 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index) 1960 if (index == old_index)
1372 return 1; 1961 return 1;
@@ -1482,7 +2071,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1482} 2071}
1483 2072
1484static struct ring_buffer_event * 2073static struct ring_buffer_event *
1485rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2074rb_reserve_next_event(struct ring_buffer *buffer,
2075 struct ring_buffer_per_cpu *cpu_buffer,
1486 unsigned long length) 2076 unsigned long length)
1487{ 2077{
1488 struct ring_buffer_event *event; 2078 struct ring_buffer_event *event;
@@ -1492,6 +2082,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1492 2082
1493 rb_start_commit(cpu_buffer); 2083 rb_start_commit(cpu_buffer);
1494 2084
2085#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2086 /*
2087 * Due to the ability to swap a cpu buffer from a buffer
2088 * it is possible it was swapped before we committed.
2089 * (committing stops a swap). We check for it here and
2090 * if it happened, we have to fail the write.
2091 */
2092 barrier();
2093 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2094 local_dec(&cpu_buffer->committing);
2095 local_dec(&cpu_buffer->commits);
2096 return NULL;
2097 }
2098#endif
2099
1495 length = rb_calculate_event_length(length); 2100 length = rb_calculate_event_length(length);
1496 again: 2101 again:
1497 /* 2102 /*
@@ -1652,7 +2257,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1652 if (length > BUF_MAX_DATA_SIZE) 2257 if (length > BUF_MAX_DATA_SIZE)
1653 goto out; 2258 goto out;
1654 2259
1655 event = rb_reserve_next_event(cpu_buffer, length); 2260 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1656 if (!event) 2261 if (!event)
1657 goto out; 2262 goto out;
1658 2263
@@ -1675,18 +2280,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1675} 2280}
1676EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2281EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1677 2282
1678static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2283static void
2284rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1679 struct ring_buffer_event *event) 2285 struct ring_buffer_event *event)
1680{ 2286{
1681 local_inc(&cpu_buffer->entries);
1682
1683 /* 2287 /*
1684 * The event first in the commit queue updates the 2288 * The event first in the commit queue updates the
1685 * time stamp. 2289 * time stamp.
1686 */ 2290 */
1687 if (rb_event_is_commit(cpu_buffer, event)) 2291 if (rb_event_is_commit(cpu_buffer, event))
1688 cpu_buffer->write_stamp += event->time_delta; 2292 cpu_buffer->write_stamp += event->time_delta;
2293}
1689 2294
2295static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2296 struct ring_buffer_event *event)
2297{
2298 local_inc(&cpu_buffer->entries);
2299 rb_update_write_stamp(cpu_buffer, event);
1690 rb_end_commit(cpu_buffer); 2300 rb_end_commit(cpu_buffer);
1691} 2301}
1692 2302
@@ -1733,32 +2343,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1733 event->time_delta = 1; 2343 event->time_delta = 1;
1734} 2344}
1735 2345
1736/** 2346/*
1737 * ring_buffer_event_discard - discard any event in the ring buffer 2347 * Decrement the entries to the page that an event is on.
1738 * @event: the event to discard 2348 * The event does not even need to exist, only the pointer
1739 * 2349 * to the page it is on. This may only be called before the commit
1740 * Sometimes a event that is in the ring buffer needs to be ignored. 2350 * takes place.
1741 * This function lets the user discard an event in the ring buffer
1742 * and then that event will not be read later.
1743 *
1744 * Note, it is up to the user to be careful with this, and protect
1745 * against races. If the user discards an event that has been consumed
1746 * it is possible that it could corrupt the ring buffer.
1747 */ 2351 */
1748void ring_buffer_event_discard(struct ring_buffer_event *event) 2352static inline void
2353rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2354 struct ring_buffer_event *event)
1749{ 2355{
1750 rb_event_discard(event); 2356 unsigned long addr = (unsigned long)event;
2357 struct buffer_page *bpage = cpu_buffer->commit_page;
2358 struct buffer_page *start;
2359
2360 addr &= PAGE_MASK;
2361
2362 /* Do the likely case first */
2363 if (likely(bpage->page == (void *)addr)) {
2364 local_dec(&bpage->entries);
2365 return;
2366 }
2367
2368 /*
2369 * Because the commit page may be on the reader page we
2370 * start with the next page and check the end loop there.
2371 */
2372 rb_inc_page(cpu_buffer, &bpage);
2373 start = bpage;
2374 do {
2375 if (bpage->page == (void *)addr) {
2376 local_dec(&bpage->entries);
2377 return;
2378 }
2379 rb_inc_page(cpu_buffer, &bpage);
2380 } while (bpage != start);
2381
2382 /* commit not part of this buffer?? */
2383 RB_WARN_ON(cpu_buffer, 1);
1751} 2384}
1752EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1753 2385
1754/** 2386/**
1755 * ring_buffer_commit_discard - discard an event that has not been committed 2387 * ring_buffer_commit_discard - discard an event that has not been committed
1756 * @buffer: the ring buffer 2388 * @buffer: the ring buffer
1757 * @event: non committed event to discard 2389 * @event: non committed event to discard
1758 * 2390 *
1759 * This is similar to ring_buffer_event_discard but must only be 2391 * Sometimes an event that is in the ring buffer needs to be ignored.
1760 * performed on an event that has not been committed yet. The difference 2392 * This function lets the user discard an event in the ring buffer
1761 * is that this will also try to free the event from the ring buffer 2393 * and then that event will not be read later.
2394 *
2395 * This function only works if it is called before the item has been
2396 * committed. It will try to free the event from the ring buffer
1762 * if another event has not been added behind it. 2397 * if another event has not been added behind it.
1763 * 2398 *
1764 * If another event has been added behind it, it will set the event 2399 * If another event has been added behind it, it will set the event
@@ -1786,14 +2421,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1786 */ 2421 */
1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 2422 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1788 2423
2424 rb_decrement_entry(cpu_buffer, event);
1789 if (rb_try_to_discard(cpu_buffer, event)) 2425 if (rb_try_to_discard(cpu_buffer, event))
1790 goto out; 2426 goto out;
1791 2427
1792 /* 2428 /*
1793 * The commit is still visible by the reader, so we 2429 * The commit is still visible by the reader, so we
1794 * must increment entries. 2430 * must still update the timestamp.
1795 */ 2431 */
1796 local_inc(&cpu_buffer->entries); 2432 rb_update_write_stamp(cpu_buffer, event);
1797 out: 2433 out:
1798 rb_end_commit(cpu_buffer); 2434 rb_end_commit(cpu_buffer);
1799 2435
@@ -1854,7 +2490,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1854 if (length > BUF_MAX_DATA_SIZE) 2490 if (length > BUF_MAX_DATA_SIZE)
1855 goto out; 2491 goto out;
1856 2492
1857 event = rb_reserve_next_event(cpu_buffer, length); 2493 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1858 if (!event) 2494 if (!event)
1859 goto out; 2495 goto out;
1860 2496
@@ -1875,9 +2511,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1875static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2511static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1876{ 2512{
1877 struct buffer_page *reader = cpu_buffer->reader_page; 2513 struct buffer_page *reader = cpu_buffer->reader_page;
1878 struct buffer_page *head = cpu_buffer->head_page; 2514 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1879 struct buffer_page *commit = cpu_buffer->commit_page; 2515 struct buffer_page *commit = cpu_buffer->commit_page;
1880 2516
2517 /* In case of error, head will be NULL */
2518 if (unlikely(!head))
2519 return 1;
2520
1881 return reader->read == rb_page_commit(reader) && 2521 return reader->read == rb_page_commit(reader) &&
1882 (commit == reader || 2522 (commit == reader ||
1883 (commit == head && 2523 (commit == head &&
@@ -1968,7 +2608,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1968 return 0; 2608 return 0;
1969 2609
1970 cpu_buffer = buffer->buffers[cpu]; 2610 cpu_buffer = buffer->buffers[cpu];
1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2611 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1972 - cpu_buffer->read; 2612 - cpu_buffer->read;
1973 2613
1974 return ret; 2614 return ret;
@@ -1989,33 +2629,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1989 return 0; 2629 return 0;
1990 2630
1991 cpu_buffer = buffer->buffers[cpu]; 2631 cpu_buffer = buffer->buffers[cpu];
1992 ret = cpu_buffer->overrun; 2632 ret = local_read(&cpu_buffer->overrun);
1993 2633
1994 return ret; 2634 return ret;
1995} 2635}
1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2636EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1997 2637
1998/** 2638/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
2001 * @cpu: The per CPU buffer to get the number of overruns from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2639 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer 2640 * @buffer: The ring buffer
2021 * @cpu: The per CPU buffer to get the number of overruns from 2641 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2030,7 +2650,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2030 return 0; 2650 return 0;
2031 2651
2032 cpu_buffer = buffer->buffers[cpu]; 2652 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun; 2653 ret = local_read(&cpu_buffer->commit_overrun);
2034 2654
2035 return ret; 2655 return ret;
2036} 2656}
@@ -2053,7 +2673,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2053 for_each_buffer_cpu(buffer, cpu) { 2673 for_each_buffer_cpu(buffer, cpu) {
2054 cpu_buffer = buffer->buffers[cpu]; 2674 cpu_buffer = buffer->buffers[cpu];
2055 entries += (local_read(&cpu_buffer->entries) - 2675 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read; 2676 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2057 } 2677 }
2058 2678
2059 return entries; 2679 return entries;
@@ -2076,7 +2696,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2076 /* if you care about this being correct, lock the buffer */ 2696 /* if you care about this being correct, lock the buffer */
2077 for_each_buffer_cpu(buffer, cpu) { 2697 for_each_buffer_cpu(buffer, cpu) {
2078 cpu_buffer = buffer->buffers[cpu]; 2698 cpu_buffer = buffer->buffers[cpu];
2079 overruns += cpu_buffer->overrun; 2699 overruns += local_read(&cpu_buffer->overrun);
2080 } 2700 }
2081 2701
2082 return overruns; 2702 return overruns;
@@ -2089,8 +2709,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2089 2709
2090 /* Iterator usage is expected to have record disabled */ 2710 /* Iterator usage is expected to have record disabled */
2091 if (list_empty(&cpu_buffer->reader_page->list)) { 2711 if (list_empty(&cpu_buffer->reader_page->list)) {
2092 iter->head_page = cpu_buffer->head_page; 2712 iter->head_page = rb_set_head_page(cpu_buffer);
2093 iter->head = cpu_buffer->head_page->read; 2713 if (unlikely(!iter->head_page))
2714 return;
2715 iter->head = iter->head_page->read;
2094 } else { 2716 } else {
2095 iter->head_page = cpu_buffer->reader_page; 2717 iter->head_page = cpu_buffer->reader_page;
2096 iter->head = cpu_buffer->reader_page->read; 2718 iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2829,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 struct buffer_page *reader = NULL; 2829 struct buffer_page *reader = NULL;
2208 unsigned long flags; 2830 unsigned long flags;
2209 int nr_loops = 0; 2831 int nr_loops = 0;
2832 int ret;
2210 2833
2211 local_irq_save(flags); 2834 local_irq_save(flags);
2212 __raw_spin_lock(&cpu_buffer->lock); 2835 __raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2863,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2240 goto out; 2863 goto out;
2241 2864
2242 /* 2865 /*
2243 * Splice the empty reader page into the list around the head.
2244 * Reset the reader page to size zero. 2866 * Reset the reader page to size zero.
2245 */ 2867 */
2868 local_set(&cpu_buffer->reader_page->write, 0);
2869 local_set(&cpu_buffer->reader_page->entries, 0);
2870 local_set(&cpu_buffer->reader_page->page->commit, 0);
2246 2871
2247 reader = cpu_buffer->head_page; 2872 spin:
2873 /*
2874 * Splice the empty reader page into the list around the head.
2875 */
2876 reader = rb_set_head_page(cpu_buffer);
2248 cpu_buffer->reader_page->list.next = reader->list.next; 2877 cpu_buffer->reader_page->list.next = reader->list.next;
2249 cpu_buffer->reader_page->list.prev = reader->list.prev; 2878 cpu_buffer->reader_page->list.prev = reader->list.prev;
2250 2879
2251 local_set(&cpu_buffer->reader_page->write, 0); 2880 /*
2252 local_set(&cpu_buffer->reader_page->entries, 0); 2881 * cpu_buffer->pages just needs to point to the buffer, it
2253 local_set(&cpu_buffer->reader_page->page->commit, 0); 2882 * has no specific buffer page to point to. Let's move it out
2883 * of our way so we don't accidentally swap it.
2884 */
2885 cpu_buffer->pages = reader->list.prev;
2254 2886
2255 /* Make the reader page now replace the head */ 2887 /* The reader page will be pointing to the new head */
2256 reader->list.prev->next = &cpu_buffer->reader_page->list; 2888 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2257 reader->list.next->prev = &cpu_buffer->reader_page->list;
2258 2889
2259 /* 2890 /*
2260 * If the tail is on the reader, then we must set the head 2891 * Here's the tricky part.
2261 * to the inserted page, otherwise we set it one before. 2892 *
2893 * We need to move the pointer past the header page.
2894 * But we can only do that if a writer is not currently
2895 * moving it. The page before the header page has the
2896 * flag bit '1' set if it is pointing to the page we want.
2897 * But if the writer is in the process of moving it
2898 * then it will be '2' or already moved '0'.
2262 */ 2899 */
2263 cpu_buffer->head_page = cpu_buffer->reader_page;
2264 2900
2265 if (cpu_buffer->commit_page != reader) 2901 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2266 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2902
2903 /*
2904 * If we did not convert it, then we must try again.
2905 */
2906 if (!ret)
2907 goto spin;
2908
2909 /*
2910 * Yeah! We succeeded in replacing the page.
2911 *
2912 * Now make the new head point back to the reader page.
2913 */
2914 reader->list.next->prev = &cpu_buffer->reader_page->list;
2915 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2267 2916
2268 /* Finally update the reader page to the new head */ 2917 /* Finally update the reader page to the new head */
2269 cpu_buffer->reader_page = reader; 2918 cpu_buffer->reader_page = reader;
@@ -2292,8 +2941,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2292 2941
2293 event = rb_reader_event(cpu_buffer); 2942 event = rb_reader_event(cpu_buffer);
2294 2943
2295 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2944 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2296 || rb_discarded_event(event))
2297 cpu_buffer->read++; 2945 cpu_buffer->read++;
2298 2946
2299 rb_update_read_stamp(cpu_buffer, event); 2947 rb_update_read_stamp(cpu_buffer, event);
@@ -2347,15 +2995,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2347} 2995}
2348 2996
2349static struct ring_buffer_event * 2997static struct ring_buffer_event *
2350rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 2998rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
2351{ 2999{
2352 struct ring_buffer_per_cpu *cpu_buffer;
2353 struct ring_buffer_event *event; 3000 struct ring_buffer_event *event;
2354 struct buffer_page *reader; 3001 struct buffer_page *reader;
2355 int nr_loops = 0; 3002 int nr_loops = 0;
2356 3003
2357 cpu_buffer = buffer->buffers[cpu];
2358
2359 again: 3004 again:
2360 /* 3005 /*
2361 * We repeat when a timestamp is encountered. It is possible 3006 * We repeat when a timestamp is encountered. It is possible
@@ -2399,7 +3044,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2399 case RINGBUF_TYPE_DATA: 3044 case RINGBUF_TYPE_DATA:
2400 if (ts) { 3045 if (ts) {
2401 *ts = cpu_buffer->read_stamp + event->time_delta; 3046 *ts = cpu_buffer->read_stamp + event->time_delta;
2402 ring_buffer_normalize_time_stamp(buffer, 3047 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
2403 cpu_buffer->cpu, ts); 3048 cpu_buffer->cpu, ts);
2404 } 3049 }
2405 return event; 3050 return event;
@@ -2518,17 +3163,15 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2518 local_irq_save(flags); 3163 local_irq_save(flags);
2519 if (dolock) 3164 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock); 3165 spin_lock(&cpu_buffer->reader_lock);
2521 event = rb_buffer_peek(buffer, cpu, ts); 3166 event = rb_buffer_peek(cpu_buffer, ts);
2522 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3167 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer); 3168 rb_advance_reader(cpu_buffer);
2524 if (dolock) 3169 if (dolock)
2525 spin_unlock(&cpu_buffer->reader_lock); 3170 spin_unlock(&cpu_buffer->reader_lock);
2526 local_irq_restore(flags); 3171 local_irq_restore(flags);
2527 3172
2528 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3173 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2529 cpu_relax();
2530 goto again; 3174 goto again;
2531 }
2532 3175
2533 return event; 3176 return event;
2534} 3177}
@@ -2553,10 +3196,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2553 event = rb_iter_peek(iter, ts); 3196 event = rb_iter_peek(iter, ts);
2554 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3197 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2555 3198
2556 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3199 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2557 cpu_relax();
2558 goto again; 3200 goto again;
2559 }
2560 3201
2561 return event; 3202 return event;
2562} 3203}
@@ -2591,7 +3232,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2591 if (dolock) 3232 if (dolock)
2592 spin_lock(&cpu_buffer->reader_lock); 3233 spin_lock(&cpu_buffer->reader_lock);
2593 3234
2594 event = rb_buffer_peek(buffer, cpu, ts); 3235 event = rb_buffer_peek(cpu_buffer, ts);
2595 if (event) 3236 if (event)
2596 rb_advance_reader(cpu_buffer); 3237 rb_advance_reader(cpu_buffer);
2597 3238
@@ -2602,10 +3243,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2602 out: 3243 out:
2603 preempt_enable(); 3244 preempt_enable();
2604 3245
2605 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3246 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2606 cpu_relax();
2607 goto again; 3247 goto again;
2608 }
2609 3248
2610 return event; 3249 return event;
2611} 3250}
@@ -2685,21 +3324,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2685 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3324 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2686 unsigned long flags; 3325 unsigned long flags;
2687 3326
2688 again:
2689 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3327 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3328 again:
2690 event = rb_iter_peek(iter, ts); 3329 event = rb_iter_peek(iter, ts);
2691 if (!event) 3330 if (!event)
2692 goto out; 3331 goto out;
2693 3332
3333 if (event->type_len == RINGBUF_TYPE_PADDING)
3334 goto again;
3335
2694 rb_advance_iter(iter); 3336 rb_advance_iter(iter);
2695 out: 3337 out:
2696 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3338 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2697 3339
2698 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2699 cpu_relax();
2700 goto again;
2701 }
2702
2703 return event; 3340 return event;
2704} 3341}
2705EXPORT_SYMBOL_GPL(ring_buffer_read); 3342EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2717,8 +3354,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2717static void 3354static void
2718rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3355rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2719{ 3356{
3357 rb_head_page_deactivate(cpu_buffer);
3358
2720 cpu_buffer->head_page 3359 cpu_buffer->head_page
2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3360 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2722 local_set(&cpu_buffer->head_page->write, 0); 3361 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0); 3362 local_set(&cpu_buffer->head_page->entries, 0);
2724 local_set(&cpu_buffer->head_page->page->commit, 0); 3363 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3373,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2734 local_set(&cpu_buffer->reader_page->page->commit, 0); 3373 local_set(&cpu_buffer->reader_page->page->commit, 0);
2735 cpu_buffer->reader_page->read = 0; 3374 cpu_buffer->reader_page->read = 0;
2736 3375
2737 cpu_buffer->nmi_dropped = 0; 3376 local_set(&cpu_buffer->commit_overrun, 0);
2738 cpu_buffer->commit_overrun = 0; 3377 local_set(&cpu_buffer->overrun, 0);
2739 cpu_buffer->overrun = 0;
2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0); 3378 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0); 3379 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0); 3380 local_set(&cpu_buffer->commits, 0);
3381 cpu_buffer->read = 0;
2744 3382
2745 cpu_buffer->write_stamp = 0; 3383 cpu_buffer->write_stamp = 0;
2746 cpu_buffer->read_stamp = 0; 3384 cpu_buffer->read_stamp = 0;
3385
3386 rb_head_page_activate(cpu_buffer);
2747} 3387}
2748 3388
2749/** 3389/**
@@ -2763,12 +3403,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2763 3403
2764 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3404 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2765 3405
3406 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3407 goto out;
3408
2766 __raw_spin_lock(&cpu_buffer->lock); 3409 __raw_spin_lock(&cpu_buffer->lock);
2767 3410
2768 rb_reset_cpu(cpu_buffer); 3411 rb_reset_cpu(cpu_buffer);
2769 3412
2770 __raw_spin_unlock(&cpu_buffer->lock); 3413 __raw_spin_unlock(&cpu_buffer->lock);
2771 3414
3415 out:
2772 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3416 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2773 3417
2774 atomic_dec(&cpu_buffer->record_disabled); 3418 atomic_dec(&cpu_buffer->record_disabled);
@@ -2851,6 +3495,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2851} 3495}
2852EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3496EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2853 3497
3498#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2854/** 3499/**
2855 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3500 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2856 * @buffer_a: One buffer to swap with 3501 * @buffer_a: One buffer to swap with
@@ -2905,20 +3550,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2905 atomic_inc(&cpu_buffer_a->record_disabled); 3550 atomic_inc(&cpu_buffer_a->record_disabled);
2906 atomic_inc(&cpu_buffer_b->record_disabled); 3551 atomic_inc(&cpu_buffer_b->record_disabled);
2907 3552
3553 ret = -EBUSY;
3554 if (local_read(&cpu_buffer_a->committing))
3555 goto out_dec;
3556 if (local_read(&cpu_buffer_b->committing))
3557 goto out_dec;
3558
2908 buffer_a->buffers[cpu] = cpu_buffer_b; 3559 buffer_a->buffers[cpu] = cpu_buffer_b;
2909 buffer_b->buffers[cpu] = cpu_buffer_a; 3560 buffer_b->buffers[cpu] = cpu_buffer_a;
2910 3561
2911 cpu_buffer_b->buffer = buffer_a; 3562 cpu_buffer_b->buffer = buffer_a;
2912 cpu_buffer_a->buffer = buffer_b; 3563 cpu_buffer_a->buffer = buffer_b;
2913 3564
3565 ret = 0;
3566
3567out_dec:
2914 atomic_dec(&cpu_buffer_a->record_disabled); 3568 atomic_dec(&cpu_buffer_a->record_disabled);
2915 atomic_dec(&cpu_buffer_b->record_disabled); 3569 atomic_dec(&cpu_buffer_b->record_disabled);
2916
2917 ret = 0;
2918out: 3570out:
2919 return ret; 3571 return ret;
2920} 3572}
2921EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3573EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3574#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2922 3575
2923/** 3576/**
2924 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3577 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3091,7 +3744,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3091 read = 0; 3744 read = 0;
3092 } else { 3745 } else {
3093 /* update the entry counter */ 3746 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries); 3747 cpu_buffer->read += rb_page_entries(reader);
3095 3748
3096 /* swap the pages */ 3749 /* swap the pages */
3097 rb_init_page(bpage); 3750 rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8c358395d338..45068269ebb1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,14 +43,11 @@
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45 45
46unsigned long __read_mostly tracing_max_latency;
47unsigned long __read_mostly tracing_thresh;
48
49/* 46/*
50 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
51 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
52 */ 49 */
53static int ring_buffer_expanded; 50int ring_buffer_expanded;
54 51
55/* 52/*
56 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -64,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
64/* 61/*
65 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
66 */ 63 */
67static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
68 65
69/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
70static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
89 */ 86 */
90static int tracing_disabled = 1; 87static int tracing_disabled = 1;
91 88
92static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
93 90
94static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
95{ 92{
@@ -128,13 +125,13 @@ int ftrace_dump_on_oops;
128 125
129static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
130 127
131#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
132static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
133static char *default_bootup_tracer; 130static char *default_bootup_tracer;
134 131
135static int __init set_ftrace(char *str) 132static int __init set_ftrace(char *str)
136{ 133{
137 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
138 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
139 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
140 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
@@ -172,10 +169,11 @@ static struct trace_array global_trace;
172 169
173static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
174 171
175int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
176 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
177{ 175{
178 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
179} 177}
180EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
181 179
@@ -244,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
244static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
245 243
246/* 244/*
247 * max_tracer_type_len is used to simplify the allocating of
248 * buffers to read userspace tracer names. We keep track of
249 * the longest tracer name registered.
250 */
251static int max_tracer_type_len;
252
253/*
254 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
255 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
256 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -266,6 +257,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
266 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 257 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
267 TRACE_ITER_GRAPH_TIME; 258 TRACE_ITER_GRAPH_TIME;
268 259
260static int trace_stop_count;
261static DEFINE_SPINLOCK(tracing_start_lock);
262
269/** 263/**
270 * trace_wake_up - wake up tasks waiting for trace input 264 * trace_wake_up - wake up tasks waiting for trace input
271 * 265 *
@@ -274,12 +268,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
274 */ 268 */
275void trace_wake_up(void) 269void trace_wake_up(void)
276{ 270{
271 int cpu;
272
273 if (trace_flags & TRACE_ITER_BLOCK)
274 return;
277 /* 275 /*
278 * The runqueue_is_locked() can fail, but this is the best we 276 * The runqueue_is_locked() can fail, but this is the best we
279 * have for now: 277 * have for now:
280 */ 278 */
281 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) 279 cpu = get_cpu();
280 if (!runqueue_is_locked(cpu))
282 wake_up(&trace_wait); 281 wake_up(&trace_wait);
282 put_cpu();
283} 283}
284 284
285static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
@@ -323,49 +323,125 @@ static const char *trace_options[] = {
323 "printk-msg-only", 323 "printk-msg-only",
324 "context-info", 324 "context-info",
325 "latency-format", 325 "latency-format",
326 "global-clock",
327 "sleep-time", 326 "sleep-time",
328 "graph-time", 327 "graph-time",
329 NULL 328 NULL
330}; 329};
331 330
331static struct {
332 u64 (*func)(void);
333 const char *name;
334} trace_clocks[] = {
335 { trace_clock_local, "local" },
336 { trace_clock_global, "global" },
337};
338
339int trace_clock_id;
340
332/* 341/*
333 * ftrace_max_lock is used to protect the swapping of buffers 342 * trace_parser_get_init - gets the buffer for trace parser
334 * when taking a max snapshot. The buffers themselves are
335 * protected by per_cpu spinlocks. But the action of the swap
336 * needs its own lock.
337 *
338 * This is defined as a raw_spinlock_t in order to help
339 * with performance when lockdep debugging is enabled.
340 */ 343 */
341static raw_spinlock_t ftrace_max_lock = 344int trace_parser_get_init(struct trace_parser *parser, int size)
342 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 345{
346 memset(parser, 0, sizeof(*parser));
347
348 parser->buffer = kmalloc(size, GFP_KERNEL);
349 if (!parser->buffer)
350 return 1;
351
352 parser->size = size;
353 return 0;
354}
343 355
344/* 356/*
345 * Copy the new maximum trace into the separate maximum-trace 357 * trace_parser_put - frees the buffer for trace parser
346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */ 358 */
349static void 359void trace_parser_put(struct trace_parser *parser)
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
351{ 360{
352 struct trace_array_cpu *data = tr->data[cpu]; 361 kfree(parser->buffer);
362}
353 363
354 max_tr.cpu = cpu; 364/*
355 max_tr.time_start = data->preempt_timestamp; 365 * trace_get_user - reads the user input string separated by space
366 * (matched by isspace(ch))
367 *
368 * For each string found the 'struct trace_parser' is updated,
369 * and the function returns.
370 *
371 * Returns number of bytes read.
372 *
373 * See kernel/trace/trace.h for 'struct trace_parser' details.
374 */
375int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
376 size_t cnt, loff_t *ppos)
377{
378 char ch;
379 size_t read = 0;
380 ssize_t ret;
356 381
357 data = max_tr.data[cpu]; 382 if (!*ppos)
358 data->saved_latency = tracing_max_latency; 383 trace_parser_clear(parser);
359 384
360 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 385 ret = get_user(ch, ubuf++);
361 data->pid = tsk->pid; 386 if (ret)
362 data->uid = task_uid(tsk); 387 goto out;
363 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
364 data->policy = tsk->policy;
365 data->rt_priority = tsk->rt_priority;
366 388
367 /* record this tasks comm */ 389 read++;
368 tracing_record_cmdline(tsk); 390 cnt--;
391
392 /*
393 * The parser is not finished with the last write,
394 * continue reading the user input without skipping spaces.
395 */
396 if (!parser->cont) {
397 /* skip white space */
398 while (cnt && isspace(ch)) {
399 ret = get_user(ch, ubuf++);
400 if (ret)
401 goto out;
402 read++;
403 cnt--;
404 }
405
406 /* only spaces were written */
407 if (isspace(ch)) {
408 *ppos += read;
409 ret = read;
410 goto out;
411 }
412
413 parser->idx = 0;
414 }
415
416 /* read the non-space input */
417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size - 1)
419 parser->buffer[parser->idx++] = ch;
420 else {
421 ret = -EINVAL;
422 goto out;
423 }
424 ret = get_user(ch, ubuf++);
425 if (ret)
426 goto out;
427 read++;
428 cnt--;
429 }
430
431 /* We either got finished input or we have to wait for another call. */
432 if (isspace(ch)) {
433 parser->buffer[parser->idx] = 0;
434 parser->cont = false;
435 } else {
436 parser->cont = true;
437 parser->buffer[parser->idx++] = ch;
438 }
439
440 *ppos += read;
441 ret = read;
442
443out:
444 return ret;
369} 445}
370 446
371ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 447ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
@@ -411,6 +487,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
411 return cnt; 487 return cnt;
412} 488}
413 489
490/*
491 * ftrace_max_lock is used to protect the swapping of buffers
492 * when taking a max snapshot. The buffers themselves are
493 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock.
495 *
496 * This is defined as a raw_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled.
498 *
499 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE.
502 */
503static raw_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
505
506#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency;
508unsigned long __read_mostly tracing_thresh;
509
510/*
511 * Copy the new maximum trace into the separate maximum-trace
512 * structure. (this way the maximum trace is permanently saved,
513 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
514 */
515static void
516__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
517{
518 struct trace_array_cpu *data = tr->data[cpu];
519 struct trace_array_cpu *max_data = tr->data[cpu];
520
521 max_tr.cpu = cpu;
522 max_tr.time_start = data->preempt_timestamp;
523
524 max_data = max_tr.data[cpu];
525 max_data->saved_latency = tracing_max_latency;
526 max_data->critical_start = data->critical_start;
527 max_data->critical_end = data->critical_end;
528
529 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
530 max_data->pid = tsk->pid;
531 max_data->uid = task_uid(tsk);
532 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
533 max_data->policy = tsk->policy;
534 max_data->rt_priority = tsk->rt_priority;
535
536 /* record this tasks comm */
537 tracing_record_cmdline(tsk);
538}
539
414/** 540/**
415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 541 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
416 * @tr: tracer 542 * @tr: tracer
@@ -425,16 +551,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
425{ 551{
426 struct ring_buffer *buf = tr->buffer; 552 struct ring_buffer *buf = tr->buffer;
427 553
554 if (trace_stop_count)
555 return;
556
428 WARN_ON_ONCE(!irqs_disabled()); 557 WARN_ON_ONCE(!irqs_disabled());
429 __raw_spin_lock(&ftrace_max_lock); 558 __raw_spin_lock(&ftrace_max_lock);
430 559
431 tr->buffer = max_tr.buffer; 560 tr->buffer = max_tr.buffer;
432 max_tr.buffer = buf; 561 max_tr.buffer = buf;
433 562
434 ftrace_disable_cpu();
435 ring_buffer_reset(tr->buffer);
436 ftrace_enable_cpu();
437
438 __update_max_tr(tr, tsk, cpu); 563 __update_max_tr(tr, tsk, cpu);
439 __raw_spin_unlock(&ftrace_max_lock); 564 __raw_spin_unlock(&ftrace_max_lock);
440} 565}
@@ -452,21 +577,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
452{ 577{
453 int ret; 578 int ret;
454 579
580 if (trace_stop_count)
581 return;
582
455 WARN_ON_ONCE(!irqs_disabled()); 583 WARN_ON_ONCE(!irqs_disabled());
456 __raw_spin_lock(&ftrace_max_lock); 584 __raw_spin_lock(&ftrace_max_lock);
457 585
458 ftrace_disable_cpu(); 586 ftrace_disable_cpu();
459 587
460 ring_buffer_reset(max_tr.buffer);
461 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 588 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
462 589
590 if (ret == -EBUSY) {
591 /*
592 * We failed to swap the buffer due to a commit taking
593 * place on this CPU. We fail to record, but we reset
594 * the max trace buffer (no one writes directly to it)
595 * and flag that it failed.
596 */
597 trace_array_printk(&max_tr, _THIS_IP_,
598 "Failed to swap buffers due to commit in progress\n");
599 }
600
463 ftrace_enable_cpu(); 601 ftrace_enable_cpu();
464 602
465 WARN_ON_ONCE(ret && ret != -EAGAIN); 603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
466 604
467 __update_max_tr(tr, tsk, cpu); 605 __update_max_tr(tr, tsk, cpu);
468 __raw_spin_unlock(&ftrace_max_lock); 606 __raw_spin_unlock(&ftrace_max_lock);
469} 607}
608#endif /* CONFIG_TRACER_MAX_TRACE */
470 609
471/** 610/**
472 * register_tracer - register a tracer with the ftrace system. 611 * register_tracer - register a tracer with the ftrace system.
@@ -479,7 +618,6 @@ __releases(kernel_lock)
479__acquires(kernel_lock) 618__acquires(kernel_lock)
480{ 619{
481 struct tracer *t; 620 struct tracer *t;
482 int len;
483 int ret = 0; 621 int ret = 0;
484 622
485 if (!type->name) { 623 if (!type->name) {
@@ -487,6 +625,11 @@ __acquires(kernel_lock)
487 return -1; 625 return -1;
488 } 626 }
489 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
490 /* 633 /*
491 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
492 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -501,7 +644,7 @@ __acquires(kernel_lock)
501 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
502 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
503 /* already found */ 646 /* already found */
504 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
505 type->name); 648 type->name);
506 ret = -1; 649 ret = -1;
507 goto out; 650 goto out;
@@ -523,7 +666,6 @@ __acquires(kernel_lock)
523 if (type->selftest && !tracing_selftest_disabled) { 666 if (type->selftest && !tracing_selftest_disabled) {
524 struct tracer *saved_tracer = current_trace; 667 struct tracer *saved_tracer = current_trace;
525 struct trace_array *tr = &global_trace; 668 struct trace_array *tr = &global_trace;
526 int i;
527 669
528 /* 670 /*
529 * Run a selftest on this tracer. 671 * Run a selftest on this tracer.
@@ -532,8 +674,7 @@ __acquires(kernel_lock)
532 * internal tracing to verify that everything is in order. 674 * internal tracing to verify that everything is in order.
533 * If we fail, we do not register this tracer. 675 * If we fail, we do not register this tracer.
534 */ 676 */
535 for_each_tracing_cpu(i) 677 tracing_reset_online_cpus(tr);
536 tracing_reset(tr, i);
537 678
538 current_trace = type; 679 current_trace = type;
539 /* the test is responsible for initializing and enabling */ 680 /* the test is responsible for initializing and enabling */
@@ -546,8 +687,7 @@ __acquires(kernel_lock)
546 goto out; 687 goto out;
547 } 688 }
548 /* Only reset on passing, to avoid touching corrupted buffers */ 689 /* Only reset on passing, to avoid touching corrupted buffers */
549 for_each_tracing_cpu(i) 690 tracing_reset_online_cpus(tr);
550 tracing_reset(tr, i);
551 691
552 printk(KERN_CONT "PASSED\n"); 692 printk(KERN_CONT "PASSED\n");
553 } 693 }
@@ -555,9 +695,6 @@ __acquires(kernel_lock)
555 695
556 type->next = trace_types; 696 type->next = trace_types;
557 trace_types = type; 697 trace_types = type;
558 len = strlen(type->name);
559 if (len > max_tracer_type_len)
560 max_tracer_type_len = len;
561 698
562 out: 699 out:
563 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -566,7 +703,7 @@ __acquires(kernel_lock)
566 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
567 goto out_unlock; 704 goto out_unlock;
568 705
569 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
570 goto out_unlock; 707 goto out_unlock;
571 708
572 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -588,14 +725,13 @@ __acquires(kernel_lock)
588void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
589{ 726{
590 struct tracer **t; 727 struct tracer **t;
591 int len;
592 728
593 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
594 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
595 if (*t == type) 731 if (*t == type)
596 goto found; 732 goto found;
597 } 733 }
598 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
599 goto out; 735 goto out;
600 736
601 found: 737 found:
@@ -608,35 +744,46 @@ void unregister_tracer(struct tracer *type)
608 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
609 current_trace = &nop_trace; 745 current_trace = &nop_trace;
610 } 746 }
611 747out:
612 if (strlen(type->name) != max_tracer_type_len)
613 goto out;
614
615 max_tracer_type_len = 0;
616 for (t = &trace_types; *t; t = &(*t)->next) {
617 len = strlen((*t)->name);
618 if (len > max_tracer_type_len)
619 max_tracer_type_len = len;
620 }
621 out:
622 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
623} 749}
624 750
625void tracing_reset(struct trace_array *tr, int cpu) 751static void __tracing_reset(struct trace_array *tr, int cpu)
626{ 752{
627 ftrace_disable_cpu(); 753 ftrace_disable_cpu();
628 ring_buffer_reset_cpu(tr->buffer, cpu); 754 ring_buffer_reset_cpu(tr->buffer, cpu);
629 ftrace_enable_cpu(); 755 ftrace_enable_cpu();
630} 756}
631 757
758void tracing_reset(struct trace_array *tr, int cpu)
759{
760 struct ring_buffer *buffer = tr->buffer;
761
762 ring_buffer_record_disable(buffer);
763
764 /* Make sure all commits have finished */
765 synchronize_sched();
766 __tracing_reset(tr, cpu);
767
768 ring_buffer_record_enable(buffer);
769}
770
632void tracing_reset_online_cpus(struct trace_array *tr) 771void tracing_reset_online_cpus(struct trace_array *tr)
633{ 772{
773 struct ring_buffer *buffer = tr->buffer;
634 int cpu; 774 int cpu;
635 775
776 ring_buffer_record_disable(buffer);
777
778 /* Make sure all commits have finished */
779 synchronize_sched();
780
636 tr->time_start = ftrace_now(tr->cpu); 781 tr->time_start = ftrace_now(tr->cpu);
637 782
638 for_each_online_cpu(cpu) 783 for_each_online_cpu(cpu)
639 tracing_reset(tr, cpu); 784 __tracing_reset(tr, cpu);
785
786 ring_buffer_record_enable(buffer);
640} 787}
641 788
642void tracing_reset_current(int cpu) 789void tracing_reset_current(int cpu)
@@ -667,8 +814,10 @@ static void trace_init_cmdlines(void)
667 cmdline_idx = 0; 814 cmdline_idx = 0;
668} 815}
669 816
670static int trace_stop_count; 817int is_tracing_stopped(void)
671static DEFINE_SPINLOCK(tracing_start_lock); 818{
819 return trace_stop_count;
820}
672 821
673/** 822/**
674 * ftrace_off_permanent - disable all ftrace code permanently 823 * ftrace_off_permanent - disable all ftrace code permanently
@@ -837,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
837 986
838 entry->preempt_count = pc & 0xff; 987 entry->preempt_count = pc & 0xff;
839 entry->pid = (tsk) ? tsk->pid : 0; 988 entry->pid = (tsk) ? tsk->pid : 0;
840 entry->tgid = (tsk) ? tsk->tgid : 0; 989 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
841 entry->flags = 990 entry->flags =
842#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 991#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
843 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 992 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -850,14 +999,15 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
850} 999}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1000EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
852 1001
853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 1002struct ring_buffer_event *
854 int type, 1003trace_buffer_lock_reserve(struct ring_buffer *buffer,
855 unsigned long len, 1004 int type,
856 unsigned long flags, int pc) 1005 unsigned long len,
1006 unsigned long flags, int pc)
857{ 1007{
858 struct ring_buffer_event *event; 1008 struct ring_buffer_event *event;
859 1009
860 event = ring_buffer_lock_reserve(tr->buffer, len); 1010 event = ring_buffer_lock_reserve(buffer, len);
861 if (event != NULL) { 1011 if (event != NULL) {
862 struct trace_entry *ent = ring_buffer_event_data(event); 1012 struct trace_entry *ent = ring_buffer_event_data(event);
863 1013
@@ -867,58 +1017,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
867 1017
868 return event; 1018 return event;
869} 1019}
870static void ftrace_trace_stack(struct trace_array *tr,
871 unsigned long flags, int skip, int pc);
872static void ftrace_trace_userstack(struct trace_array *tr,
873 unsigned long flags, int pc);
874 1020
875static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 1021static inline void
876 struct ring_buffer_event *event, 1022__trace_buffer_unlock_commit(struct ring_buffer *buffer,
877 unsigned long flags, int pc, 1023 struct ring_buffer_event *event,
878 int wake) 1024 unsigned long flags, int pc,
1025 int wake)
879{ 1026{
880 ring_buffer_unlock_commit(tr->buffer, event); 1027 ring_buffer_unlock_commit(buffer, event);
881 1028
882 ftrace_trace_stack(tr, flags, 6, pc); 1029 ftrace_trace_stack(buffer, flags, 6, pc);
883 ftrace_trace_userstack(tr, flags, pc); 1030 ftrace_trace_userstack(buffer, flags, pc);
884 1031
885 if (wake) 1032 if (wake)
886 trace_wake_up(); 1033 trace_wake_up();
887} 1034}
888 1035
889void trace_buffer_unlock_commit(struct trace_array *tr, 1036void trace_buffer_unlock_commit(struct ring_buffer *buffer,
890 struct ring_buffer_event *event, 1037 struct ring_buffer_event *event,
891 unsigned long flags, int pc) 1038 unsigned long flags, int pc)
892{ 1039{
893 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 1040 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
894} 1041}
895 1042
896struct ring_buffer_event * 1043struct ring_buffer_event *
897trace_current_buffer_lock_reserve(int type, unsigned long len, 1044trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1045 int type, unsigned long len,
898 unsigned long flags, int pc) 1046 unsigned long flags, int pc)
899{ 1047{
900 return trace_buffer_lock_reserve(&global_trace, 1048 *current_rb = global_trace.buffer;
1049 return trace_buffer_lock_reserve(*current_rb,
901 type, len, flags, pc); 1050 type, len, flags, pc);
902} 1051}
903EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 1052EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
904 1053
905void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 1054void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1055 struct ring_buffer_event *event,
906 unsigned long flags, int pc) 1056 unsigned long flags, int pc)
907{ 1057{
908 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 1058 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
909} 1059}
910EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1060EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
911 1061
912void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 1062void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
913 unsigned long flags, int pc) 1063 struct ring_buffer_event *event,
1064 unsigned long flags, int pc)
914{ 1065{
915 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 1066 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
916} 1067}
917EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1068EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
918 1069
919void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 1070void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1071 struct ring_buffer_event *event)
920{ 1072{
921 ring_buffer_discard_commit(global_trace.buffer, event); 1073 ring_buffer_discard_commit(buffer, event);
922} 1074}
923EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 1075EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
924 1076
@@ -928,6 +1080,7 @@ trace_function(struct trace_array *tr,
928 int pc) 1080 int pc)
929{ 1081{
930 struct ftrace_event_call *call = &event_function; 1082 struct ftrace_event_call *call = &event_function;
1083 struct ring_buffer *buffer = tr->buffer;
931 struct ring_buffer_event *event; 1084 struct ring_buffer_event *event;
932 struct ftrace_entry *entry; 1085 struct ftrace_entry *entry;
933 1086
@@ -935,7 +1088,7 @@ trace_function(struct trace_array *tr,
935 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
936 return; 1089 return;
937 1090
938 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
939 flags, pc); 1092 flags, pc);
940 if (!event) 1093 if (!event)
941 return; 1094 return;
@@ -943,58 +1096,10 @@ trace_function(struct trace_array *tr,
943 entry->ip = ip; 1096 entry->ip = ip;
944 entry->parent_ip = parent_ip; 1097 entry->parent_ip = parent_ip;
945 1098
946 if (!filter_check_discard(call, entry, tr->buffer, event)) 1099 if (!filter_check_discard(call, entry, buffer, event))
947 ring_buffer_unlock_commit(tr->buffer, event); 1100 ring_buffer_unlock_commit(buffer, event);
948}
949
950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
951static int __trace_graph_entry(struct trace_array *tr,
952 struct ftrace_graph_ent *trace,
953 unsigned long flags,
954 int pc)
955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
957 struct ring_buffer_event *event;
958 struct ftrace_graph_ent_entry *entry;
959
960 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
961 return 0;
962
963 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
964 sizeof(*entry), flags, pc);
965 if (!event)
966 return 0;
967 entry = ring_buffer_event_data(event);
968 entry->graph_ent = *trace;
969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
971
972 return 1;
973} 1101}
974 1102
975static void __trace_graph_return(struct trace_array *tr,
976 struct ftrace_graph_ret *trace,
977 unsigned long flags,
978 int pc)
979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
981 struct ring_buffer_event *event;
982 struct ftrace_graph_ret_entry *entry;
983
984 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
985 return;
986
987 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
988 sizeof(*entry), flags, pc);
989 if (!event)
990 return;
991 entry = ring_buffer_event_data(event);
992 entry->ret = *trace;
993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
995}
996#endif
997
998void 1103void
999ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1104ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1000 unsigned long ip, unsigned long parent_ip, unsigned long flags, 1105 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,17 +1109,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1004 trace_function(tr, ip, parent_ip, flags, pc); 1109 trace_function(tr, ip, parent_ip, flags, pc);
1005} 1110}
1006 1111
1007static void __ftrace_trace_stack(struct trace_array *tr, 1112#ifdef CONFIG_STACKTRACE
1113static void __ftrace_trace_stack(struct ring_buffer *buffer,
1008 unsigned long flags, 1114 unsigned long flags,
1009 int skip, int pc) 1115 int skip, int pc)
1010{ 1116{
1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack; 1117 struct ftrace_event_call *call = &event_kernel_stack;
1013 struct ring_buffer_event *event; 1118 struct ring_buffer_event *event;
1014 struct stack_entry *entry; 1119 struct stack_entry *entry;
1015 struct stack_trace trace; 1120 struct stack_trace trace;
1016 1121
1017 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1122 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1018 sizeof(*entry), flags, pc); 1123 sizeof(*entry), flags, pc);
1019 if (!event) 1124 if (!event)
1020 return; 1125 return;
@@ -1027,32 +1132,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1027 trace.entries = entry->caller; 1132 trace.entries = entry->caller;
1028 1133
1029 save_stack_trace(&trace); 1134 save_stack_trace(&trace);
1030 if (!filter_check_discard(call, entry, tr->buffer, event)) 1135 if (!filter_check_discard(call, entry, buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event); 1136 ring_buffer_unlock_commit(buffer, event);
1032#endif
1033} 1137}
1034 1138
1035static void ftrace_trace_stack(struct trace_array *tr, 1139void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1036 unsigned long flags, 1140 int skip, int pc)
1037 int skip, int pc)
1038{ 1141{
1039 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1142 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1040 return; 1143 return;
1041 1144
1042 __ftrace_trace_stack(tr, flags, skip, pc); 1145 __ftrace_trace_stack(buffer, flags, skip, pc);
1043} 1146}
1044 1147
1045void __trace_stack(struct trace_array *tr, 1148void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1046 unsigned long flags, 1149 int pc)
1047 int skip, int pc)
1048{ 1150{
1049 __ftrace_trace_stack(tr, flags, skip, pc); 1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1050} 1152}
1051 1153
1052static void ftrace_trace_userstack(struct trace_array *tr, 1154void
1053 unsigned long flags, int pc) 1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1054{ 1156{
1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack; 1157 struct ftrace_event_call *call = &event_user_stack;
1057 struct ring_buffer_event *event; 1158 struct ring_buffer_event *event;
1058 struct userstack_entry *entry; 1159 struct userstack_entry *entry;
@@ -1061,12 +1162,13 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1061 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1162 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1062 return; 1163 return;
1063 1164
1064 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1165 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1065 sizeof(*entry), flags, pc); 1166 sizeof(*entry), flags, pc);
1066 if (!event) 1167 if (!event)
1067 return; 1168 return;
1068 entry = ring_buffer_event_data(event); 1169 entry = ring_buffer_event_data(event);
1069 1170
1171 entry->tgid = current->tgid;
1070 memset(&entry->caller, 0, sizeof(entry->caller)); 1172 memset(&entry->caller, 0, sizeof(entry->caller));
1071 1173
1072 trace.nr_entries = 0; 1174 trace.nr_entries = 0;
@@ -1075,9 +1177,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1075 trace.entries = entry->caller; 1177 trace.entries = entry->caller;
1076 1178
1077 save_stack_trace_user(&trace); 1179 save_stack_trace_user(&trace);
1078 if (!filter_check_discard(call, entry, tr->buffer, event)) 1180 if (!filter_check_discard(call, entry, buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event); 1181 ring_buffer_unlock_commit(buffer, event);
1080#endif
1081} 1182}
1082 1183
1083#ifdef UNUSED 1184#ifdef UNUSED
@@ -1087,16 +1188,20 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1087} 1188}
1088#endif /* UNUSED */ 1189#endif /* UNUSED */
1089 1190
1191#endif /* CONFIG_STACKTRACE */
1192
1090static void 1193static void
1091ftrace_trace_special(void *__tr, 1194ftrace_trace_special(void *__tr,
1092 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1195 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1093 int pc) 1196 int pc)
1094{ 1197{
1198 struct ftrace_event_call *call = &event_special;
1095 struct ring_buffer_event *event; 1199 struct ring_buffer_event *event;
1096 struct trace_array *tr = __tr; 1200 struct trace_array *tr = __tr;
1201 struct ring_buffer *buffer = tr->buffer;
1097 struct special_entry *entry; 1202 struct special_entry *entry;
1098 1203
1099 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1204 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1100 sizeof(*entry), 0, pc); 1205 sizeof(*entry), 0, pc);
1101 if (!event) 1206 if (!event)
1102 return; 1207 return;
@@ -1104,7 +1209,9 @@ ftrace_trace_special(void *__tr,
1104 entry->arg1 = arg1; 1209 entry->arg1 = arg1;
1105 entry->arg2 = arg2; 1210 entry->arg2 = arg2;
1106 entry->arg3 = arg3; 1211 entry->arg3 = arg3;
1107 trace_buffer_unlock_commit(tr, event, 0, pc); 1212
1213 if (!filter_check_discard(call, entry, buffer, event))
1214 trace_buffer_unlock_commit(buffer, event, 0, pc);
1108} 1215}
1109 1216
1110void 1217void
@@ -1115,62 +1222,6 @@ __trace_special(void *__tr, void *__data,
1115} 1222}
1116 1223
1117void 1224void
1118tracing_sched_switch_trace(struct trace_array *tr,
1119 struct task_struct *prev,
1120 struct task_struct *next,
1121 unsigned long flags, int pc)
1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1124 struct ring_buffer_event *event;
1125 struct ctx_switch_entry *entry;
1126
1127 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1128 sizeof(*entry), flags, pc);
1129 if (!event)
1130 return;
1131 entry = ring_buffer_event_data(event);
1132 entry->prev_pid = prev->pid;
1133 entry->prev_prio = prev->prio;
1134 entry->prev_state = prev->state;
1135 entry->next_pid = next->pid;
1136 entry->next_prio = next->prio;
1137 entry->next_state = next->state;
1138 entry->next_cpu = task_cpu(next);
1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1142}
1143
1144void
1145tracing_sched_wakeup_trace(struct trace_array *tr,
1146 struct task_struct *wakee,
1147 struct task_struct *curr,
1148 unsigned long flags, int pc)
1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1151 struct ring_buffer_event *event;
1152 struct ctx_switch_entry *entry;
1153
1154 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1155 sizeof(*entry), flags, pc);
1156 if (!event)
1157 return;
1158 entry = ring_buffer_event_data(event);
1159 entry->prev_pid = curr->pid;
1160 entry->prev_prio = curr->prio;
1161 entry->prev_state = curr->state;
1162 entry->next_pid = wakee->pid;
1163 entry->next_prio = wakee->prio;
1164 entry->next_state = wakee->state;
1165 entry->next_cpu = task_cpu(wakee);
1166
1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1169 ftrace_trace_stack(tr, flags, 6, pc);
1170 ftrace_trace_userstack(tr, flags, pc);
1171}
1172
1173void
1174ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1225ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1175{ 1226{
1176 struct trace_array *tr = &global_trace; 1227 struct trace_array *tr = &global_trace;
@@ -1194,68 +1245,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1194 local_irq_restore(flags); 1245 local_irq_restore(flags);
1195} 1246}
1196 1247
1197#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1198int trace_graph_entry(struct ftrace_graph_ent *trace)
1199{
1200 struct trace_array *tr = &global_trace;
1201 struct trace_array_cpu *data;
1202 unsigned long flags;
1203 long disabled;
1204 int ret;
1205 int cpu;
1206 int pc;
1207
1208 if (!ftrace_trace_task(current))
1209 return 0;
1210
1211 if (!ftrace_graph_addr(trace->func))
1212 return 0;
1213
1214 local_irq_save(flags);
1215 cpu = raw_smp_processor_id();
1216 data = tr->data[cpu];
1217 disabled = atomic_inc_return(&data->disabled);
1218 if (likely(disabled == 1)) {
1219 pc = preempt_count();
1220 ret = __trace_graph_entry(tr, trace, flags, pc);
1221 } else {
1222 ret = 0;
1223 }
1224 /* Only do the atomic if it is not already set */
1225 if (!test_tsk_trace_graph(current))
1226 set_tsk_trace_graph(current);
1227
1228 atomic_dec(&data->disabled);
1229 local_irq_restore(flags);
1230
1231 return ret;
1232}
1233
1234void trace_graph_return(struct ftrace_graph_ret *trace)
1235{
1236 struct trace_array *tr = &global_trace;
1237 struct trace_array_cpu *data;
1238 unsigned long flags;
1239 long disabled;
1240 int cpu;
1241 int pc;
1242
1243 local_irq_save(flags);
1244 cpu = raw_smp_processor_id();
1245 data = tr->data[cpu];
1246 disabled = atomic_inc_return(&data->disabled);
1247 if (likely(disabled == 1)) {
1248 pc = preempt_count();
1249 __trace_graph_return(tr, trace, flags, pc);
1250 }
1251 if (!trace->depth)
1252 clear_tsk_trace_graph(current);
1253 atomic_dec(&data->disabled);
1254 local_irq_restore(flags);
1255}
1256#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1257
1258
1259/** 1248/**
1260 * trace_vbprintk - write binary msg to tracing buffer 1249 * trace_vbprintk - write binary msg to tracing buffer
1261 * 1250 *
@@ -1268,6 +1257,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1268 1257
1269 struct ftrace_event_call *call = &event_bprint; 1258 struct ftrace_event_call *call = &event_bprint;
1270 struct ring_buffer_event *event; 1259 struct ring_buffer_event *event;
1260 struct ring_buffer *buffer;
1271 struct trace_array *tr = &global_trace; 1261 struct trace_array *tr = &global_trace;
1272 struct trace_array_cpu *data; 1262 struct trace_array_cpu *data;
1273 struct bprint_entry *entry; 1263 struct bprint_entry *entry;
@@ -1300,7 +1290,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1300 goto out_unlock; 1290 goto out_unlock;
1301 1291
1302 size = sizeof(*entry) + sizeof(u32) * len; 1292 size = sizeof(*entry) + sizeof(u32) * len;
1303 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1293 buffer = tr->buffer;
1294 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1295 flags, pc);
1304 if (!event) 1296 if (!event)
1305 goto out_unlock; 1297 goto out_unlock;
1306 entry = ring_buffer_event_data(event); 1298 entry = ring_buffer_event_data(event);
@@ -1308,8 +1300,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1308 entry->fmt = fmt; 1300 entry->fmt = fmt;
1309 1301
1310 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1302 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1311 if (!filter_check_discard(call, entry, tr->buffer, event)) 1303 if (!filter_check_discard(call, entry, buffer, event))
1312 ring_buffer_unlock_commit(tr->buffer, event); 1304 ring_buffer_unlock_commit(buffer, event);
1313 1305
1314out_unlock: 1306out_unlock:
1315 __raw_spin_unlock(&trace_buf_lock); 1307 __raw_spin_unlock(&trace_buf_lock);
@@ -1324,14 +1316,30 @@ out:
1324} 1316}
1325EXPORT_SYMBOL_GPL(trace_vbprintk); 1317EXPORT_SYMBOL_GPL(trace_vbprintk);
1326 1318
1327int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1319int trace_array_printk(struct trace_array *tr,
1320 unsigned long ip, const char *fmt, ...)
1321{
1322 int ret;
1323 va_list ap;
1324
1325 if (!(trace_flags & TRACE_ITER_PRINTK))
1326 return 0;
1327
1328 va_start(ap, fmt);
1329 ret = trace_array_vprintk(tr, ip, fmt, ap);
1330 va_end(ap);
1331 return ret;
1332}
1333
1334int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args)
1328{ 1336{
1329 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1330 static char trace_buf[TRACE_BUF_SIZE]; 1338 static char trace_buf[TRACE_BUF_SIZE];
1331 1339
1332 struct ftrace_event_call *call = &event_print; 1340 struct ftrace_event_call *call = &event_print;
1333 struct ring_buffer_event *event; 1341 struct ring_buffer_event *event;
1334 struct trace_array *tr = &global_trace; 1342 struct ring_buffer *buffer;
1335 struct trace_array_cpu *data; 1343 struct trace_array_cpu *data;
1336 int cpu, len = 0, size, pc; 1344 int cpu, len = 0, size, pc;
1337 struct print_entry *entry; 1345 struct print_entry *entry;
@@ -1359,7 +1367,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1359 trace_buf[len] = 0; 1367 trace_buf[len] = 0;
1360 1368
1361 size = sizeof(*entry) + len + 1; 1369 size = sizeof(*entry) + len + 1;
1362 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1370 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1372 irq_flags, pc);
1363 if (!event) 1373 if (!event)
1364 goto out_unlock; 1374 goto out_unlock;
1365 entry = ring_buffer_event_data(event); 1375 entry = ring_buffer_event_data(event);
@@ -1367,8 +1377,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1367 1377
1368 memcpy(&entry->buf, trace_buf, len); 1378 memcpy(&entry->buf, trace_buf, len);
1369 entry->buf[len] = 0; 1379 entry->buf[len] = 0;
1370 if (!filter_check_discard(call, entry, tr->buffer, event)) 1380 if (!filter_check_discard(call, entry, buffer, event))
1371 ring_buffer_unlock_commit(tr->buffer, event); 1381 ring_buffer_unlock_commit(buffer, event);
1372 1382
1373 out_unlock: 1383 out_unlock:
1374 __raw_spin_unlock(&trace_buf_lock); 1384 __raw_spin_unlock(&trace_buf_lock);
@@ -1380,6 +1390,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1380 1390
1381 return len; 1391 return len;
1382} 1392}
1393
1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1395{
1396 return trace_array_printk(&global_trace, ip, fmt, args);
1397}
1383EXPORT_SYMBOL_GPL(trace_vprintk); 1398EXPORT_SYMBOL_GPL(trace_vprintk);
1384 1399
1385enum trace_file_type { 1400enum trace_file_type {
@@ -1519,6 +1534,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1519 return ent; 1534 return ent;
1520} 1535}
1521 1536
1537static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1538{
1539 struct trace_array *tr = iter->tr;
1540 struct ring_buffer_event *event;
1541 struct ring_buffer_iter *buf_iter;
1542 unsigned long entries = 0;
1543 u64 ts;
1544
1545 tr->data[cpu]->skipped_entries = 0;
1546
1547 if (!iter->buffer_iter[cpu])
1548 return;
1549
1550 buf_iter = iter->buffer_iter[cpu];
1551 ring_buffer_iter_reset(buf_iter);
1552
1553 /*
1554 * We could have the case with the max latency tracers
1555 * that a reset never took place on a cpu. This is evident
1556 * by the timestamp being before the start of the buffer.
1557 */
1558 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1559 if (ts >= iter->tr->time_start)
1560 break;
1561 entries++;
1562 ring_buffer_read(buf_iter, NULL);
1563 }
1564
1565 tr->data[cpu]->skipped_entries = entries;
1566}
1567
1522/* 1568/*
1523 * No necessary locking here. The worst thing which can 1569 * No necessary locking here. The worst thing which can
1524 * happen is loosing events consumed at the same time 1570 * happen is loosing events consumed at the same time
@@ -1557,10 +1603,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1557 1603
1558 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1604 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1559 for_each_tracing_cpu(cpu) 1605 for_each_tracing_cpu(cpu)
1560 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1606 tracing_iter_reset(iter, cpu);
1561 } else 1607 } else
1562 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1608 tracing_iter_reset(iter, cpu_file);
1563
1564 1609
1565 ftrace_enable_cpu(); 1610 ftrace_enable_cpu();
1566 1611
@@ -1589,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m)
1589 seq_puts(m, "# | / _----=> need-resched \n"); 1634 seq_puts(m, "# | / _----=> need-resched \n");
1590 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1635 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1591 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1636 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1592 seq_puts(m, "# |||| / \n"); 1637 seq_puts(m, "# |||| /_--=> lock-depth \n");
1593 seq_puts(m, "# ||||| delay \n"); 1638 seq_puts(m, "# |||||/ delay \n");
1594 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1639 seq_puts(m, "# cmd pid |||||| time | caller \n");
1595 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1640 seq_puts(m, "# \\ / |||||| \\ | / \n");
1596} 1641}
1597 1642
1598static void print_func_help_header(struct seq_file *m) 1643static void print_func_help_header(struct seq_file *m)
@@ -1609,16 +1654,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1609 struct trace_array *tr = iter->tr; 1654 struct trace_array *tr = iter->tr;
1610 struct trace_array_cpu *data = tr->data[tr->cpu]; 1655 struct trace_array_cpu *data = tr->data[tr->cpu];
1611 struct tracer *type = current_trace; 1656 struct tracer *type = current_trace;
1612 unsigned long total; 1657 unsigned long entries = 0;
1613 unsigned long entries; 1658 unsigned long total = 0;
1659 unsigned long count;
1614 const char *name = "preemption"; 1660 const char *name = "preemption";
1661 int cpu;
1615 1662
1616 if (type) 1663 if (type)
1617 name = type->name; 1664 name = type->name;
1618 1665
1619 entries = ring_buffer_entries(iter->tr->buffer); 1666
1620 total = entries + 1667 for_each_tracing_cpu(cpu) {
1621 ring_buffer_overruns(iter->tr->buffer); 1668 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1669 /*
1670 * If this buffer has skipped entries, then we hold all
1671 * entries for the trace and we need to ignore the
1672 * ones before the time stamp.
1673 */
1674 if (tr->data[cpu]->skipped_entries) {
1675 count -= tr->data[cpu]->skipped_entries;
1676 /* total is the same as the entries */
1677 total += count;
1678 } else
1679 total += count +
1680 ring_buffer_overrun_cpu(tr->buffer, cpu);
1681 entries += count;
1682 }
1622 1683
1623 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1684 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1624 name, UTS_RELEASE); 1685 name, UTS_RELEASE);
@@ -1660,7 +1721,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1660 seq_puts(m, "\n# => ended at: "); 1721 seq_puts(m, "\n# => ended at: ");
1661 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1722 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1662 trace_print_seq(m, &iter->seq); 1723 trace_print_seq(m, &iter->seq);
1663 seq_puts(m, "#\n"); 1724 seq_puts(m, "\n#\n");
1664 } 1725 }
1665 1726
1666 seq_puts(m, "#\n"); 1727 seq_puts(m, "#\n");
@@ -1679,6 +1740,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1679 if (cpumask_test_cpu(iter->cpu, iter->started)) 1740 if (cpumask_test_cpu(iter->cpu, iter->started))
1680 return; 1741 return;
1681 1742
1743 if (iter->tr->data[iter->cpu]->skipped_entries)
1744 return;
1745
1682 cpumask_set_cpu(iter->cpu, iter->started); 1746 cpumask_set_cpu(iter->cpu, iter->started);
1683 1747
1684 /* Don't print started cpu buffer for the first entry of the trace */ 1748 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1885,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
1885 return 0; 1949 return 0;
1886} 1950}
1887 1951
1888static struct seq_operations tracer_seq_ops = { 1952static const struct seq_operations tracer_seq_ops = {
1889 .start = s_start, 1953 .start = s_start,
1890 .next = s_next, 1954 .next = s_next,
1891 .stop = s_stop, 1955 .stop = s_stop,
@@ -1920,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file)
1920 if (current_trace) 1984 if (current_trace)
1921 *iter->trace = *current_trace; 1985 *iter->trace = *current_trace;
1922 1986
1923 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) 1987 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
1924 goto fail; 1988 goto fail;
1925 1989
1926 cpumask_clear(iter->started);
1927
1928 if (current_trace && current_trace->print_max) 1990 if (current_trace && current_trace->print_max)
1929 iter->tr = &max_tr; 1991 iter->tr = &max_tr;
1930 else 1992 else
@@ -1941,19 +2003,23 @@ __tracing_open(struct inode *inode, struct file *file)
1941 if (ring_buffer_overruns(iter->tr->buffer)) 2003 if (ring_buffer_overruns(iter->tr->buffer))
1942 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2004 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1943 2005
2006 /* stop the trace while dumping */
2007 tracing_stop();
2008
1944 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2009 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1945 for_each_tracing_cpu(cpu) { 2010 for_each_tracing_cpu(cpu) {
1946 2011
1947 iter->buffer_iter[cpu] = 2012 iter->buffer_iter[cpu] =
1948 ring_buffer_read_start(iter->tr->buffer, cpu); 2013 ring_buffer_read_start(iter->tr->buffer, cpu);
2014 tracing_iter_reset(iter, cpu);
1949 } 2015 }
1950 } else { 2016 } else {
1951 cpu = iter->cpu_file; 2017 cpu = iter->cpu_file;
1952 iter->buffer_iter[cpu] = 2018 iter->buffer_iter[cpu] =
1953 ring_buffer_read_start(iter->tr->buffer, cpu); 2019 ring_buffer_read_start(iter->tr->buffer, cpu);
2020 tracing_iter_reset(iter, cpu);
1954 } 2021 }
1955 2022
1956 /* TODO stop tracer */
1957 ret = seq_open(file, &tracer_seq_ops); 2023 ret = seq_open(file, &tracer_seq_ops);
1958 if (ret < 0) { 2024 if (ret < 0) {
1959 fail_ret = ERR_PTR(ret); 2025 fail_ret = ERR_PTR(ret);
@@ -1963,9 +2029,6 @@ __tracing_open(struct inode *inode, struct file *file)
1963 m = file->private_data; 2029 m = file->private_data;
1964 m->private = iter; 2030 m->private = iter;
1965 2031
1966 /* stop the trace while dumping */
1967 tracing_stop();
1968
1969 mutex_unlock(&trace_types_lock); 2032 mutex_unlock(&trace_types_lock);
1970 2033
1971 return iter; 2034 return iter;
@@ -1976,6 +2039,7 @@ __tracing_open(struct inode *inode, struct file *file)
1976 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2039 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1977 } 2040 }
1978 free_cpumask_var(iter->started); 2041 free_cpumask_var(iter->started);
2042 tracing_start();
1979 fail: 2043 fail:
1980 mutex_unlock(&trace_types_lock); 2044 mutex_unlock(&trace_types_lock);
1981 kfree(iter->trace); 2045 kfree(iter->trace);
@@ -2097,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v)
2097 return 0; 2161 return 0;
2098} 2162}
2099 2163
2100static struct seq_operations show_traces_seq_ops = { 2164static const struct seq_operations show_traces_seq_ops = {
2101 .start = t_start, 2165 .start = t_start,
2102 .next = t_next, 2166 .next = t_next,
2103 .stop = t_stop, 2167 .stop = t_stop,
@@ -2257,8 +2321,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2257 len += 3; /* "no" and newline */ 2321 len += 3; /* "no" and newline */
2258 } 2322 }
2259 2323
2260 /* +2 for \n and \0 */ 2324 /* +1 for \0 */
2261 buf = kmalloc(len + 2, GFP_KERNEL); 2325 buf = kmalloc(len + 1, GFP_KERNEL);
2262 if (!buf) { 2326 if (!buf) {
2263 mutex_unlock(&trace_types_lock); 2327 mutex_unlock(&trace_types_lock);
2264 return -ENOMEM; 2328 return -ENOMEM;
@@ -2281,7 +2345,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2281 } 2345 }
2282 mutex_unlock(&trace_types_lock); 2346 mutex_unlock(&trace_types_lock);
2283 2347
2284 WARN_ON(r >= len + 2); 2348 WARN_ON(r >= len + 1);
2285 2349
2286 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2350 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2287 2351
@@ -2292,23 +2356,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2292/* Try to assign a tracer specific option */ 2356/* Try to assign a tracer specific option */
2293static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2357static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2294{ 2358{
2295 struct tracer_flags *trace_flags = trace->flags; 2359 struct tracer_flags *tracer_flags = trace->flags;
2296 struct tracer_opt *opts = NULL; 2360 struct tracer_opt *opts = NULL;
2297 int ret = 0, i = 0; 2361 int ret = 0, i = 0;
2298 int len; 2362 int len;
2299 2363
2300 for (i = 0; trace_flags->opts[i].name; i++) { 2364 for (i = 0; tracer_flags->opts[i].name; i++) {
2301 opts = &trace_flags->opts[i]; 2365 opts = &tracer_flags->opts[i];
2302 len = strlen(opts->name); 2366 len = strlen(opts->name);
2303 2367
2304 if (strncmp(cmp, opts->name, len) == 0) { 2368 if (strncmp(cmp, opts->name, len) == 0) {
2305 ret = trace->set_flag(trace_flags->val, 2369 ret = trace->set_flag(tracer_flags->val,
2306 opts->bit, !neg); 2370 opts->bit, !neg);
2307 break; 2371 break;
2308 } 2372 }
2309 } 2373 }
2310 /* Not found */ 2374 /* Not found */
2311 if (!trace_flags->opts[i].name) 2375 if (!tracer_flags->opts[i].name)
2312 return -EINVAL; 2376 return -EINVAL;
2313 2377
2314 /* Refused to handle */ 2378 /* Refused to handle */
@@ -2316,9 +2380,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2316 return ret; 2380 return ret;
2317 2381
2318 if (neg) 2382 if (neg)
2319 trace_flags->val &= ~opts->bit; 2383 tracer_flags->val &= ~opts->bit;
2320 else 2384 else
2321 trace_flags->val |= opts->bit; 2385 tracer_flags->val |= opts->bit;
2322 2386
2323 return 0; 2387 return 0;
2324} 2388}
@@ -2333,22 +2397,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2333 trace_flags |= mask; 2397 trace_flags |= mask;
2334 else 2398 else
2335 trace_flags &= ~mask; 2399 trace_flags &= ~mask;
2336
2337 if (mask == TRACE_ITER_GLOBAL_CLK) {
2338 u64 (*func)(void);
2339
2340 if (enabled)
2341 func = trace_clock_global;
2342 else
2343 func = trace_clock_local;
2344
2345 mutex_lock(&trace_types_lock);
2346 ring_buffer_set_clock(global_trace.buffer, func);
2347
2348 if (max_tr.buffer)
2349 ring_buffer_set_clock(max_tr.buffer, func);
2350 mutex_unlock(&trace_types_lock);
2351 }
2352} 2400}
2353 2401
2354static ssize_t 2402static ssize_t
@@ -2543,7 +2591,7 @@ static ssize_t
2543tracing_set_trace_read(struct file *filp, char __user *ubuf, 2591tracing_set_trace_read(struct file *filp, char __user *ubuf,
2544 size_t cnt, loff_t *ppos) 2592 size_t cnt, loff_t *ppos)
2545{ 2593{
2546 char buf[max_tracer_type_len+2]; 2594 char buf[MAX_TRACER_SIZE+2];
2547 int r; 2595 int r;
2548 2596
2549 mutex_lock(&trace_types_lock); 2597 mutex_lock(&trace_types_lock);
@@ -2693,15 +2741,15 @@ static ssize_t
2693tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2741tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2694 size_t cnt, loff_t *ppos) 2742 size_t cnt, loff_t *ppos)
2695{ 2743{
2696 char buf[max_tracer_type_len+1]; 2744 char buf[MAX_TRACER_SIZE+1];
2697 int i; 2745 int i;
2698 size_t ret; 2746 size_t ret;
2699 int err; 2747 int err;
2700 2748
2701 ret = cnt; 2749 ret = cnt;
2702 2750
2703 if (cnt > max_tracer_type_len) 2751 if (cnt > MAX_TRACER_SIZE)
2704 cnt = max_tracer_type_len; 2752 cnt = MAX_TRACER_SIZE;
2705 2753
2706 if (copy_from_user(&buf, ubuf, cnt)) 2754 if (copy_from_user(&buf, ubuf, cnt))
2707 return -EFAULT; 2755 return -EFAULT;
@@ -3316,6 +3364,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3316 return cnt; 3364 return cnt;
3317} 3365}
3318 3366
3367static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3368 size_t cnt, loff_t *ppos)
3369{
3370 char buf[64];
3371 int bufiter = 0;
3372 int i;
3373
3374 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3375 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3376 "%s%s%s%s", i ? " " : "",
3377 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3378 i == trace_clock_id ? "]" : "");
3379 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3380
3381 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3382}
3383
3384static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3385 size_t cnt, loff_t *fpos)
3386{
3387 char buf[64];
3388 const char *clockstr;
3389 int i;
3390
3391 if (cnt >= sizeof(buf))
3392 return -EINVAL;
3393
3394 if (copy_from_user(&buf, ubuf, cnt))
3395 return -EFAULT;
3396
3397 buf[cnt] = 0;
3398
3399 clockstr = strstrip(buf);
3400
3401 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3402 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3403 break;
3404 }
3405 if (i == ARRAY_SIZE(trace_clocks))
3406 return -EINVAL;
3407
3408 trace_clock_id = i;
3409
3410 mutex_lock(&trace_types_lock);
3411
3412 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3413 if (max_tr.buffer)
3414 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3415
3416 mutex_unlock(&trace_types_lock);
3417
3418 *fpos += cnt;
3419
3420 return cnt;
3421}
3422
3319static const struct file_operations tracing_max_lat_fops = { 3423static const struct file_operations tracing_max_lat_fops = {
3320 .open = tracing_open_generic, 3424 .open = tracing_open_generic,
3321 .read = tracing_max_lat_read, 3425 .read = tracing_max_lat_read,
@@ -3353,6 +3457,12 @@ static const struct file_operations tracing_mark_fops = {
3353 .write = tracing_mark_write, 3457 .write = tracing_mark_write,
3354}; 3458};
3355 3459
3460static const struct file_operations trace_clock_fops = {
3461 .open = tracing_open_generic,
3462 .read = tracing_clock_read,
3463 .write = tracing_clock_write,
3464};
3465
3356struct ftrace_buffer_info { 3466struct ftrace_buffer_info {
3357 struct trace_array *tr; 3467 struct trace_array *tr;
3358 void *spare; 3468 void *spare;
@@ -3633,9 +3743,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3743 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3744 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635 3745
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3746 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640 3747
3641 kfree(s); 3748 kfree(s);
@@ -4066,11 +4173,13 @@ static __init int tracer_init_debugfs(void)
4066 trace_create_file("current_tracer", 0644, d_tracer, 4173 trace_create_file("current_tracer", 0644, d_tracer,
4067 &global_trace, &set_tracer_fops); 4174 &global_trace, &set_tracer_fops);
4068 4175
4176#ifdef CONFIG_TRACER_MAX_TRACE
4069 trace_create_file("tracing_max_latency", 0644, d_tracer, 4177 trace_create_file("tracing_max_latency", 0644, d_tracer,
4070 &tracing_max_latency, &tracing_max_lat_fops); 4178 &tracing_max_latency, &tracing_max_lat_fops);
4071 4179
4072 trace_create_file("tracing_thresh", 0644, d_tracer, 4180 trace_create_file("tracing_thresh", 0644, d_tracer,
4073 &tracing_thresh, &tracing_max_lat_fops); 4181 &tracing_thresh, &tracing_max_lat_fops);
4182#endif
4074 4183
4075 trace_create_file("README", 0444, d_tracer, 4184 trace_create_file("README", 0444, d_tracer,
4076 NULL, &tracing_readme_fops); 4185 NULL, &tracing_readme_fops);
@@ -4087,6 +4196,9 @@ static __init int tracer_init_debugfs(void)
4087 trace_create_file("saved_cmdlines", 0444, d_tracer, 4196 trace_create_file("saved_cmdlines", 0444, d_tracer,
4088 NULL, &tracing_saved_cmdlines_fops); 4197 NULL, &tracing_saved_cmdlines_fops);
4089 4198
4199 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4200 &trace_clock_fops);
4201
4090#ifdef CONFIG_DYNAMIC_FTRACE 4202#ifdef CONFIG_DYNAMIC_FTRACE
4091 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4203 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4092 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4204 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4265,7 +4377,6 @@ void ftrace_dump(void)
4265 4377
4266__init static int tracer_alloc_buffers(void) 4378__init static int tracer_alloc_buffers(void)
4267{ 4379{
4268 struct trace_array_cpu *data;
4269 int ring_buf_size; 4380 int ring_buf_size;
4270 int i; 4381 int i;
4271 int ret = -ENOMEM; 4382 int ret = -ENOMEM;
@@ -4276,7 +4387,7 @@ __init static int tracer_alloc_buffers(void)
4276 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4387 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4277 goto out_free_buffer_mask; 4388 goto out_free_buffer_mask;
4278 4389
4279 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) 4390 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4280 goto out_free_tracing_cpumask; 4391 goto out_free_tracing_cpumask;
4281 4392
4282 /* To save memory, keep the ring buffer size to its minimum */ 4393 /* To save memory, keep the ring buffer size to its minimum */
@@ -4287,7 +4398,6 @@ __init static int tracer_alloc_buffers(void)
4287 4398
4288 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4399 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
4289 cpumask_copy(tracing_cpumask, cpu_all_mask); 4400 cpumask_copy(tracing_cpumask, cpu_all_mask);
4290 cpumask_clear(tracing_reader_cpumask);
4291 4401
4292 /* TODO: make the number of buffers hot pluggable with CPUS */ 4402 /* TODO: make the number of buffers hot pluggable with CPUS */
4293 global_trace.buffer = ring_buffer_alloc(ring_buf_size, 4403 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
@@ -4315,7 +4425,7 @@ __init static int tracer_alloc_buffers(void)
4315 4425
4316 /* Allocate the first page for all buffers */ 4426 /* Allocate the first page for all buffers */
4317 for_each_tracing_cpu(i) { 4427 for_each_tracing_cpu(i) {
4318 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4428 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4319 max_tr.data[i] = &per_cpu(max_data, i); 4429 max_tr.data[i] = &per_cpu(max_data, i);
4320 } 4430 }
4321 4431
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e9559..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,10 +7,10 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 16#include <linux/ftrace_event.h>
@@ -34,167 +34,61 @@ enum trace_type {
34 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
35 TRACE_USER_STACK, 35 TRACE_USER_STACK,
36 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
37 TRACE_SYSCALL_ENTER,
38 TRACE_SYSCALL_EXIT,
39 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
40 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
41 TRACE_POWER,
42 TRACE_BLK, 39 TRACE_BLK,
43 40
44 __TRACE_LAST_TYPE, 41 __TRACE_LAST_TYPE,
45}; 42};
46 43
47/* 44enum kmemtrace_type_id {
48 * Function trace entry - function address and parent function addres: 45 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
49 */ 46 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
50struct ftrace_entry { 47 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
51 struct trace_entry ent;
52 unsigned long ip;
53 unsigned long parent_ip;
54};
55
56/* Function call entry */
57struct ftrace_graph_ent_entry {
58 struct trace_entry ent;
59 struct ftrace_graph_ent graph_ent;
60}; 48};
61 49
62/* Function return entry */
63struct ftrace_graph_ret_entry {
64 struct trace_entry ent;
65 struct ftrace_graph_ret ret;
66};
67extern struct tracer boot_tracer; 50extern struct tracer boot_tracer;
68 51
69/* 52#undef __field
70 * Context switch trace entry - which task (and prio) we switched from/to: 53#define __field(type, item) type item;
71 */
72struct ctx_switch_entry {
73 struct trace_entry ent;
74 unsigned int prev_pid;
75 unsigned char prev_prio;
76 unsigned char prev_state;
77 unsigned int next_pid;
78 unsigned char next_prio;
79 unsigned char next_state;
80 unsigned int next_cpu;
81};
82 54
83/* 55#undef __field_struct
84 * Special (free-form) trace entry: 56#define __field_struct(type, item) __field(type, item)
85 */
86struct special_entry {
87 struct trace_entry ent;
88 unsigned long arg1;
89 unsigned long arg2;
90 unsigned long arg3;
91};
92 57
93/* 58#undef __field_desc
94 * Stack-trace entry: 59#define __field_desc(type, container, item)
95 */
96 60
97#define FTRACE_STACK_ENTRIES 8 61#undef __array
62#define __array(type, item, size) type item[size];
98 63
99struct stack_entry { 64#undef __array_desc
100 struct trace_entry ent; 65#define __array_desc(type, container, item, size)
101 unsigned long caller[FTRACE_STACK_ENTRIES];
102};
103 66
104struct userstack_entry { 67#undef __dynamic_array
105 struct trace_entry ent; 68#define __dynamic_array(type, item) type item[];
106 unsigned long caller[FTRACE_STACK_ENTRIES];
107};
108 69
109/* 70#undef F_STRUCT
110 * trace_printk entry: 71#define F_STRUCT(args...) args
111 */
112struct bprint_entry {
113 struct trace_entry ent;
114 unsigned long ip;
115 const char *fmt;
116 u32 buf[];
117};
118 72
119struct print_entry { 73#undef FTRACE_ENTRY
120 struct trace_entry ent; 74#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121 unsigned long ip; 75 struct struct_name { \
122 char buf[]; 76 struct trace_entry ent; \
123}; 77 tstruct \
124 78 }
125#define TRACE_OLD_SIZE 88
126
127struct trace_field_cont {
128 unsigned char type;
129 /* Temporary till we get rid of this completely */
130 char buf[TRACE_OLD_SIZE - 1];
131};
132
133struct trace_mmiotrace_rw {
134 struct trace_entry ent;
135 struct mmiotrace_rw rw;
136};
137
138struct trace_mmiotrace_map {
139 struct trace_entry ent;
140 struct mmiotrace_map map;
141};
142
143struct trace_boot_call {
144 struct trace_entry ent;
145 struct boot_trace_call boot_call;
146};
147
148struct trace_boot_ret {
149 struct trace_entry ent;
150 struct boot_trace_ret boot_ret;
151};
152
153#define TRACE_FUNC_SIZE 30
154#define TRACE_FILE_SIZE 20
155struct trace_branch {
156 struct trace_entry ent;
157 unsigned line;
158 char func[TRACE_FUNC_SIZE+1];
159 char file[TRACE_FILE_SIZE+1];
160 char correct;
161};
162
163struct hw_branch_entry {
164 struct trace_entry ent;
165 u64 from;
166 u64 to;
167};
168 79
169struct trace_power { 80#undef TP_ARGS
170 struct trace_entry ent; 81#define TP_ARGS(args...) args
171 struct power_trace state_data;
172};
173 82
174enum kmemtrace_type_id { 83#undef FTRACE_ENTRY_DUP
175 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 84#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
176 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
177 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
178};
179
180struct kmemtrace_alloc_entry {
181 struct trace_entry ent;
182 enum kmemtrace_type_id type_id;
183 unsigned long call_site;
184 const void *ptr;
185 size_t bytes_req;
186 size_t bytes_alloc;
187 gfp_t gfp_flags;
188 int node;
189};
190 85
191struct kmemtrace_free_entry { 86#include "trace_entries.h"
192 struct trace_entry ent;
193 enum kmemtrace_type_id type_id;
194 unsigned long call_site;
195 const void *ptr;
196};
197 87
88/*
89 * syscalls are special, and need special handling, this is why
90 * they are not included in trace_entries.h
91 */
198struct syscall_trace_enter { 92struct syscall_trace_enter {
199 struct trace_entry ent; 93 struct trace_entry ent;
200 int nr; 94 int nr;
@@ -207,13 +101,12 @@ struct syscall_trace_exit {
207 unsigned long ret; 101 unsigned long ret;
208}; 102};
209 103
210
211/* 104/*
212 * trace_flag_type is an enumeration that holds different 105 * trace_flag_type is an enumeration that holds different
213 * states when a trace occurs. These are: 106 * states when a trace occurs. These are:
214 * IRQS_OFF - interrupts were disabled 107 * IRQS_OFF - interrupts were disabled
215 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 108 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
216 * NEED_RESCED - reschedule is requested 109 * NEED_RESCHED - reschedule is requested
217 * HARDIRQ - inside an interrupt handler 110 * HARDIRQ - inside an interrupt handler
218 * SOFTIRQ - inside a softirq handler 111 * SOFTIRQ - inside a softirq handler
219 */ 112 */
@@ -236,9 +129,6 @@ struct trace_array_cpu {
236 atomic_t disabled; 129 atomic_t disabled;
237 void *buffer_page; /* ring buffer spare */ 130 void *buffer_page; /* ring buffer spare */
238 131
239 /* these fields get copied into max-trace: */
240 unsigned long trace_idx;
241 unsigned long overrun;
242 unsigned long saved_latency; 132 unsigned long saved_latency;
243 unsigned long critical_start; 133 unsigned long critical_start;
244 unsigned long critical_end; 134 unsigned long critical_end;
@@ -246,6 +136,7 @@ struct trace_array_cpu {
246 unsigned long nice; 136 unsigned long nice;
247 unsigned long policy; 137 unsigned long policy;
248 unsigned long rt_priority; 138 unsigned long rt_priority;
139 unsigned long skipped_entries;
249 cycle_t preempt_timestamp; 140 cycle_t preempt_timestamp;
250 pid_t pid; 141 pid_t pid;
251 uid_t uid; 142 uid_t uid;
@@ -314,15 +205,10 @@ extern void __ftrace_bad_type(void);
314 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 205 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
315 TRACE_GRAPH_RET); \ 206 TRACE_GRAPH_RET); \
316 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 207 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
317 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
318 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 208 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
319 TRACE_KMEM_ALLOC); \ 209 TRACE_KMEM_ALLOC); \
320 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
321 TRACE_KMEM_FREE); \ 211 TRACE_KMEM_FREE); \
322 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
323 TRACE_SYSCALL_ENTER); \
324 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
325 TRACE_SYSCALL_EXIT); \
326 __ftrace_bad_type(); \ 212 __ftrace_bad_type(); \
327 } while (0) 213 } while (0)
328 214
@@ -398,7 +284,6 @@ struct tracer {
398 struct tracer *next; 284 struct tracer *next;
399 int print_max; 285 int print_max;
400 struct tracer_flags *flags; 286 struct tracer_flags *flags;
401 struct tracer_stat *stats;
402}; 287};
403 288
404 289
@@ -423,12 +308,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
423 308
424struct ring_buffer_event; 309struct ring_buffer_event;
425 310
426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 311struct ring_buffer_event *
427 int type, 312trace_buffer_lock_reserve(struct ring_buffer *buffer,
428 unsigned long len, 313 int type,
429 unsigned long flags, 314 unsigned long len,
430 int pc); 315 unsigned long flags,
431void trace_buffer_unlock_commit(struct trace_array *tr, 316 int pc);
317void trace_buffer_unlock_commit(struct ring_buffer *buffer,
432 struct ring_buffer_event *event, 318 struct ring_buffer_event *event,
433 unsigned long flags, int pc); 319 unsigned long flags, int pc);
434 320
@@ -467,6 +353,7 @@ void trace_function(struct trace_array *tr,
467 353
468void trace_graph_return(struct ftrace_graph_ret *trace); 354void trace_graph_return(struct ftrace_graph_ret *trace);
469int trace_graph_entry(struct ftrace_graph_ent *trace); 355int trace_graph_entry(struct ftrace_graph_ent *trace);
356void set_graph_array(struct trace_array *tr);
470 357
471void tracing_start_cmdline_record(void); 358void tracing_start_cmdline_record(void);
472void tracing_stop_cmdline_record(void); 359void tracing_stop_cmdline_record(void);
@@ -475,35 +362,46 @@ void tracing_stop_sched_switch_record(void);
475void tracing_start_sched_switch_record(void); 362void tracing_start_sched_switch_record(void);
476int register_tracer(struct tracer *type); 363int register_tracer(struct tracer *type);
477void unregister_tracer(struct tracer *type); 364void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void);
478 366
479extern unsigned long nsecs_to_usecs(unsigned long nsecs); 367extern unsigned long nsecs_to_usecs(unsigned long nsecs);
480 368
369#ifdef CONFIG_TRACER_MAX_TRACE
481extern unsigned long tracing_max_latency; 370extern unsigned long tracing_max_latency;
482extern unsigned long tracing_thresh; 371extern unsigned long tracing_thresh;
483 372
484void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 373void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
485void update_max_tr_single(struct trace_array *tr, 374void update_max_tr_single(struct trace_array *tr,
486 struct task_struct *tsk, int cpu); 375 struct task_struct *tsk, int cpu);
376#endif /* CONFIG_TRACER_MAX_TRACE */
487 377
488void __trace_stack(struct trace_array *tr, 378#ifdef CONFIG_STACKTRACE
489 unsigned long flags, 379void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
490 int skip, int pc); 380 int skip, int pc);
491 381
492extern cycle_t ftrace_now(int cpu); 382void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
383 int pc);
493 384
494#ifdef CONFIG_CONTEXT_SWITCH_TRACER 385void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
495typedef void 386 int pc);
496(*tracer_switch_func_t)(void *private, 387#else
497 void *__rq, 388static inline void ftrace_trace_stack(struct trace_array *tr,
498 struct task_struct *prev, 389 unsigned long flags, int skip, int pc)
499 struct task_struct *next); 390{
500 391}
501struct tracer_switch_ops { 392
502 tracer_switch_func_t func; 393static inline void ftrace_trace_userstack(struct trace_array *tr,
503 void *private; 394 unsigned long flags, int pc)
504 struct tracer_switch_ops *next; 395{
505}; 396}
506#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 397
398static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
399 int skip, int pc)
400{
401}
402#endif /* CONFIG_STACKTRACE */
403
404extern cycle_t ftrace_now(int cpu);
507 405
508extern void trace_find_cmdline(int pid, char comm[]); 406extern void trace_find_cmdline(int pid, char comm[]);
509 407
@@ -513,6 +411,10 @@ extern unsigned long ftrace_update_tot_cnt;
513extern int DYN_FTRACE_TEST_NAME(void); 411extern int DYN_FTRACE_TEST_NAME(void);
514#endif 412#endif
515 413
414extern int ring_buffer_expanded;
415extern bool tracing_selftest_disabled;
416DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
417
516#ifdef CONFIG_FTRACE_STARTUP_TEST 418#ifdef CONFIG_FTRACE_STARTUP_TEST
517extern int trace_selftest_startup_function(struct tracer *trace, 419extern int trace_selftest_startup_function(struct tracer *trace,
518 struct trace_array *tr); 420 struct trace_array *tr);
@@ -544,9 +446,16 @@ extern int
544trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 446trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
545extern int 447extern int
546trace_vprintk(unsigned long ip, const char *fmt, va_list args); 448trace_vprintk(unsigned long ip, const char *fmt, va_list args);
449extern int
450trace_array_vprintk(struct trace_array *tr,
451 unsigned long ip, const char *fmt, va_list args);
452int trace_array_printk(struct trace_array *tr,
453 unsigned long ip, const char *fmt, ...);
547 454
548extern unsigned long trace_flags; 455extern unsigned long trace_flags;
549 456
457extern int trace_clock_id;
458
550/* Standard output formatting function used for function return traces */ 459/* Standard output formatting function used for function return traces */
551#ifdef CONFIG_FUNCTION_GRAPH_TRACER 460#ifdef CONFIG_FUNCTION_GRAPH_TRACER
552extern enum print_line_t print_graph_function(struct trace_iterator *iter); 461extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -609,6 +518,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
609#endif 518#endif
610 519
611/* 520/*
521 * struct trace_parser - serves for reading the user input separated by spaces
522 * @cont: set if the input is not complete - no final space char was found
523 * @buffer: holds the parsed user input
524 * @idx: user input length
525 * @size: buffer size
526 */
527struct trace_parser {
528 bool cont;
529 char *buffer;
530 unsigned idx;
531 unsigned size;
532};
533
534static inline bool trace_parser_loaded(struct trace_parser *parser)
535{
536 return (parser->idx != 0);
537}
538
539static inline bool trace_parser_cont(struct trace_parser *parser)
540{
541 return parser->cont;
542}
543
544static inline void trace_parser_clear(struct trace_parser *parser)
545{
546 parser->cont = false;
547 parser->idx = 0;
548}
549
550extern int trace_parser_get_init(struct trace_parser *parser, int size);
551extern void trace_parser_put(struct trace_parser *parser);
552extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
553 size_t cnt, loff_t *ppos);
554
555/*
612 * trace_iterator_flags is an enumeration that defines bit 556 * trace_iterator_flags is an enumeration that defines bit
613 * positions into trace_flags that controls the output. 557 * positions into trace_flags that controls the output.
614 * 558 *
@@ -635,9 +579,8 @@ enum trace_iterator_flags {
635 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 579 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
636 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 580 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
637 TRACE_ITER_LATENCY_FMT = 0x40000, 581 TRACE_ITER_LATENCY_FMT = 0x40000,
638 TRACE_ITER_GLOBAL_CLK = 0x80000, 582 TRACE_ITER_SLEEP_TIME = 0x80000,
639 TRACE_ITER_SLEEP_TIME = 0x100000, 583 TRACE_ITER_GRAPH_TIME = 0x100000,
640 TRACE_ITER_GRAPH_TIME = 0x200000,
641}; 584};
642 585
643/* 586/*
@@ -734,6 +677,7 @@ struct ftrace_event_field {
734 struct list_head link; 677 struct list_head link;
735 char *name; 678 char *name;
736 char *type; 679 char *type;
680 int filter_type;
737 int offset; 681 int offset;
738 int size; 682 int size;
739 int is_signed; 683 int is_signed;
@@ -743,13 +687,15 @@ struct event_filter {
743 int n_preds; 687 int n_preds;
744 struct filter_pred **preds; 688 struct filter_pred **preds;
745 char *filter_string; 689 char *filter_string;
690 bool no_reset;
746}; 691};
747 692
748struct event_subsystem { 693struct event_subsystem {
749 struct list_head list; 694 struct list_head list;
750 const char *name; 695 const char *name;
751 struct dentry *entry; 696 struct dentry *entry;
752 void *filter; 697 struct event_filter *filter;
698 int nr_events;
753}; 699};
754 700
755struct filter_pred; 701struct filter_pred;
@@ -777,6 +723,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
777 char *filter_string); 723 char *filter_string);
778extern void print_subsystem_event_filter(struct event_subsystem *system, 724extern void print_subsystem_event_filter(struct event_subsystem *system,
779 struct trace_seq *s); 725 struct trace_seq *s);
726extern int filter_assign_type(const char *type);
780 727
781static inline int 728static inline int
782filter_check_discard(struct ftrace_event_call *call, void *rec, 729filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -791,58 +738,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
791 return 0; 738 return 0;
792} 739}
793 740
794#define DEFINE_COMPARISON_PRED(type) \
795static int filter_pred_##type(struct filter_pred *pred, void *event, \
796 int val1, int val2) \
797{ \
798 type *addr = (type *)(event + pred->offset); \
799 type val = (type)pred->val; \
800 int match = 0; \
801 \
802 switch (pred->op) { \
803 case OP_LT: \
804 match = (*addr < val); \
805 break; \
806 case OP_LE: \
807 match = (*addr <= val); \
808 break; \
809 case OP_GT: \
810 match = (*addr > val); \
811 break; \
812 case OP_GE: \
813 match = (*addr >= val); \
814 break; \
815 default: \
816 break; \
817 } \
818 \
819 return match; \
820}
821
822#define DEFINE_EQUALITY_PRED(size) \
823static int filter_pred_##size(struct filter_pred *pred, void *event, \
824 int val1, int val2) \
825{ \
826 u##size *addr = (u##size *)(event + pred->offset); \
827 u##size val = (u##size)pred->val; \
828 int match; \
829 \
830 match = (val == *addr) ^ pred->not; \
831 \
832 return match; \
833}
834
835extern struct mutex event_mutex; 741extern struct mutex event_mutex;
836extern struct list_head ftrace_events; 742extern struct list_head ftrace_events;
837 743
838extern const char *__start___trace_bprintk_fmt[]; 744extern const char *__start___trace_bprintk_fmt[];
839extern const char *__stop___trace_bprintk_fmt[]; 745extern const char *__stop___trace_bprintk_fmt[];
840 746
841#undef TRACE_EVENT_FORMAT 747#undef FTRACE_ENTRY
842#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 748#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
843 extern struct ftrace_event_call event_##call; 749 extern struct ftrace_event_call event_##call;
844#undef TRACE_EVENT_FORMAT_NOFILTER 750#undef FTRACE_ENTRY_DUP
845#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 751#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
846#include "trace_event_types.h" 752 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
753#include "trace_entries.h"
847 754
848#endif /* _LINUX_KERNEL_TRACE_H */ 755#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -131,7 +129,9 @@ struct tracer boot_tracer __read_mostly =
131 129
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
134 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 136 struct trace_array *tr = boot_trace;
137 137
@@ -144,20 +144,24 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 145 preempt_disable();
146 146
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 149 sizeof(*entry), 0, 0);
149 if (!event) 150 if (!event)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 166 struct trace_array *tr = boot_trace;
163 167
@@ -167,13 +171,15 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 171 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 172 preempt_disable();
169 173
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 176 sizeof(*entry), 0, 0);
172 if (!event) 177 if (!event)
173 goto out; 178 goto out;
174 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 183 out:
178 preempt_enable(); 184 preempt_enable();
179} 185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..ead3d724599d
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,366 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
 *
 * bprint records an instruction pointer, a pointer to the format
 * string, and a dynamically sized array of u32 words (buf). Only ip
 * and the fmt pointer appear in the print format; buf is not printed
 * here.
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
/*
 * print entry (TRACE_PRINT):
 *
 * Records an instruction pointer followed by a dynamically sized char
 * array (buf). The print format shows ip as zero-padded hex and buf
 * as a string.
 */
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
/*
 * mmiotrace_rw entry (TRACE_MMIO_RW):
 *
 * Declares one struct mmiotrace_rw member named rw, then lists six of
 * its members (phys, value, pc, map_id, opcode, width) with
 * __field_desc.
 * NOTE(review): __field_struct/__field_desc are defined outside this
 * view; presumably the former embeds the struct and the latter only
 * describes its members for the format file -- confirm.
 *
 * phys is a resource_size_t and is cast to unsigned long so it can be
 * printed with %lx.
 */
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
/*
 * mmiotrace_map entry (TRACE_MMIO_MAP):
 *
 * Declares one struct mmiotrace_map member named map and lists five of
 * its members (phys, virt, len, map_id, opcode) with __field_desc.
 * As with mmiotrace_rw, phys is a resource_size_t cast to unsigned long
 * for the %lx conversion.
 */
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
/*
 * boot_call entry (TRACE_BOOT_CALL):
 *
 * Declares one struct boot_trace_call member named boot_call and
 * describes two of its members: caller (a pid_t) and func (a char
 * array of KSYM_SYMBOL_LEN bytes). Printed as "<caller> <func>".
 */
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
/*
 * boot_ret entry (TRACE_BOOT_RET):
 *
 * Declares one struct boot_trace_ret member named boot_ret and
 * describes three of its members: func (char array of KSYM_SYMBOL_LEN
 * bytes), result (int) and duration (unsigned long, printed in hex).
 */
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
/*
 * branch entry (TRACE_BRANCH):
 *
 * Records a source line number, a function name and a file name held in
 * fixed-size char arrays, and a one-byte "correct" value.
 * The arrays are one byte larger than TRACE_FUNC_SIZE/TRACE_FILE_SIZE
 * -- presumably room for the terminating NUL; confirm against the
 * code that fills them in.
 * "correct" is a char but is printed with %u.
 */
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
/*
 * hw_branch entry (TRACE_HW_BRANCHES):
 *
 * Two 64-bit values, from and to, both printed in hex.
 */
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
/*
 * kmem_alloc entry (TRACE_KMEM_ALLOC):
 *
 * Records the kmemtrace type id, the call site, the returned pointer,
 * the requested and allocated byte counts, the gfp flags and the node.
 *
 * NOTE(review): bytes_req and bytes_alloc are size_t but are printed
 * with %zi, the signed conversion; %zu would match the unsigned type.
 * Left unchanged here because the format string is visible to user
 * space through the event format file.
 */
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
/*
 * kmem_free entry (TRACE_KMEM_FREE):
 *
 * Carries the first three fields of the kmem_alloc entry: the kmemtrace
 * type id, the call site and the pointer being freed.
 */
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..dd44b8768867 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,8 +5,60 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
11/*
12 * alloc_percpu() takes a type, not a byte count, so wrap a
13 * FTRACE_MAX_PROFILE_SIZE-byte array in a dummy type of the desired size.
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
/*
 * Profiling buffer allocated with alloc_percpu(); published with
 * rcu_assign_pointer() when the first event enables profiling.
 */
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
/*
 * Second buffer, allocated and published alongside trace_profile_buf.
 * NOTE(review): presumably used from NMI context, going by the name --
 * the users are outside this file.
 */
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count++) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable();
49 if (!ret)
50 return 0;
51
52 kfree(trace_profile_buf_nmi);
53fail_buf_nmi:
54 kfree(trace_profile_buf);
55fail_buf:
56 total_profile_count--;
57 atomic_dec(&event->profile_count);
58
59 return ret;
60}
61
10int ftrace_profile_enable(int event_id) 62int ftrace_profile_enable(int event_id)
11{ 63{
12 struct ftrace_event_call *event; 64 struct ftrace_event_call *event;
@@ -14,8 +66,9 @@ int ftrace_profile_enable(int event_id)
14 66
15 mutex_lock(&event_mutex); 67 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 68 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 69 if (event->id == event_id && event->profile_enable &&
18 ret = event->profile_enable(event); 70 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event);
19 break; 72 break;
20 } 73 }
21 } 74 }
@@ -24,6 +77,33 @@ int ftrace_profile_enable(int event_id)
24 return ret; 77 return ret;
25} 78}
26 79
80static void ftrace_profile_disable_event(struct ftrace_event_call *event)
81{
82 char *buf, *nmi_buf;
83
84 if (!atomic_add_negative(-1, &event->profile_count))
85 return;
86
87 event->profile_disable();
88
89 if (!--total_profile_count) {
90 buf = trace_profile_buf;
91 rcu_assign_pointer(trace_profile_buf, NULL);
92
93 nmi_buf = trace_profile_buf_nmi;
94 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
95
96 /*
97 * Ensure every events in profiling have finished before
98 * releasing the buffers
99 */
100 synchronize_sched();
101
102 free_percpu(buf);
103 free_percpu(nmi_buf);
104 }
105}
106
27void ftrace_profile_disable(int event_id) 107void ftrace_profile_disable(int event_id)
28{ 108{
29 struct ftrace_event_call *event; 109 struct ftrace_event_call *event;
@@ -31,7 +111,8 @@ void ftrace_profile_disable(int event_id)
31 mutex_lock(&event_mutex); 111 mutex_lock(&event_mutex);
32 list_for_each_entry(event, &ftrace_events, list) { 112 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 113 if (event->id == event_id) {
34 event->profile_disable(event); 114 ftrace_profile_disable_event(event);
115 module_put(event->mod);
35 break; 116 break;
36 } 117 }
37 } 118 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e12487..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO_CHAR(buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO_CHAR(buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..d128f65778e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,16 +17,20 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
24#undef TRACE_SYSTEM
22#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
23 26
24DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
25 28
26LIST_HEAD(ftrace_events); 29LIST_HEAD(ftrace_events);
27 30
28int trace_define_field(struct ftrace_event_call *call, char *type, 31int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 32 const char *name, int offset, int size, int is_signed,
33 int filter_type)
30{ 34{
31 struct ftrace_event_field *field; 35 struct ftrace_event_field *field;
32 36
@@ -42,9 +46,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 46 if (!field->type)
43 goto err; 47 goto err;
44 48
49 if (filter_type == FILTER_OTHER)
50 field->filter_type = filter_assign_type(type);
51 else
52 field->filter_type = filter_type;
53
45 field->offset = offset; 54 field->offset = offset;
46 field->size = size; 55 field->size = size;
47 field->is_signed = is_signed; 56 field->is_signed = is_signed;
57
48 list_add(&field->link, &call->fields); 58 list_add(&field->link, &call->fields);
49 59
50 return 0; 60 return 0;
@@ -60,6 +70,29 @@ err:
60} 70}
61EXPORT_SYMBOL_GPL(trace_define_field); 71EXPORT_SYMBOL_GPL(trace_define_field);
62 72
73#define __common_field(type, item) \
74 ret = trace_define_field(call, #type, "common_" #item, \
75 offsetof(typeof(ent), item), \
76 sizeof(ent.item), \
77 is_signed_type(type), FILTER_OTHER); \
78 if (ret) \
79 return ret;
80
81int trace_define_common_fields(struct ftrace_event_call *call)
82{
83 int ret;
84 struct trace_entry ent;
85
86 __common_field(unsigned short, type);
87 __common_field(unsigned char, flags);
88 __common_field(unsigned char, preempt_count);
89 __common_field(int, pid);
90 __common_field(int, lock_depth);
91
92 return ret;
93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95
63#ifdef CONFIG_MODULES 96#ifdef CONFIG_MODULES
64 97
65static void trace_destroy_fields(struct ftrace_event_call *call) 98static void trace_destroy_fields(struct ftrace_event_call *call)
@@ -84,14 +117,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 117 if (call->enabled) {
85 call->enabled = 0; 118 call->enabled = 0;
86 tracing_stop_cmdline_record(); 119 tracing_stop_cmdline_record();
87 call->unregfunc(); 120 call->unregfunc(call->data);
88 } 121 }
89 break; 122 break;
90 case 1: 123 case 1:
91 if (!call->enabled) { 124 if (!call->enabled) {
92 call->enabled = 1; 125 call->enabled = 1;
93 tracing_start_cmdline_record(); 126 tracing_start_cmdline_record();
94 call->regfunc(); 127 call->regfunc(call->data);
95 } 128 }
96 break; 129 break;
97 } 130 }
@@ -198,73 +231,38 @@ static ssize_t
198ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
199 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
200{ 233{
201 size_t read = 0; 234 struct trace_parser parser;
202 int i, set = 1; 235 ssize_t read, ret;
203 ssize_t ret;
204 char *buf;
205 char ch;
206 236
207 if (!cnt || cnt < 0) 237 if (!cnt)
208 return 0; 238 return 0;
209 239
210 ret = tracing_update_buffers(); 240 ret = tracing_update_buffers();
211 if (ret < 0) 241 if (ret < 0)
212 return ret; 242 return ret;
213 243
214 ret = get_user(ch, ubuf++); 244 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
215 if (ret)
216 return ret;
217 read++;
218 cnt--;
219
220 /* skip white space */
221 while (cnt && isspace(ch)) {
222 ret = get_user(ch, ubuf++);
223 if (ret)
224 return ret;
225 read++;
226 cnt--;
227 }
228
229 /* Only white space found? */
230 if (isspace(ch)) {
231 file->f_pos += read;
232 ret = read;
233 return ret;
234 }
235
236 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
237 if (!buf)
238 return -ENOMEM; 245 return -ENOMEM;
239 246
240 if (cnt > EVENT_BUF_SIZE) 247 read = trace_get_user(&parser, ubuf, cnt, ppos);
241 cnt = EVENT_BUF_SIZE;
242 248
243 i = 0; 249 if (read >= 0 && trace_parser_loaded((&parser))) {
244 while (cnt && !isspace(ch)) { 250 int set = 1;
245 if (!i && ch == '!') 251
252 if (*parser.buffer == '!')
246 set = 0; 253 set = 0;
247 else
248 buf[i++] = ch;
249 254
250 ret = get_user(ch, ubuf++); 255 parser.buffer[parser.idx] = 0;
256
257 ret = ftrace_set_clr_event(parser.buffer + !set, set);
251 if (ret) 258 if (ret)
252 goto out_free; 259 goto out_put;
253 read++;
254 cnt--;
255 } 260 }
256 buf[i] = 0;
257
258 file->f_pos += read;
259
260 ret = ftrace_set_clr_event(buf, set);
261 if (ret)
262 goto out_free;
263 261
264 ret = read; 262 ret = read;
265 263
266 out_free: 264 out_put:
267 kfree(buf); 265 trace_parser_put(&parser);
268 266
269 return ret; 267 return ret;
270} 268}
@@ -272,42 +270,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
272static void * 270static void *
273t_next(struct seq_file *m, void *v, loff_t *pos) 271t_next(struct seq_file *m, void *v, loff_t *pos)
274{ 272{
275 struct list_head *list = m->private; 273 struct ftrace_event_call *call = v;
276 struct ftrace_event_call *call;
277 274
278 (*pos)++; 275 (*pos)++;
279 276
280 for (;;) { 277 list_for_each_entry_continue(call, &ftrace_events, list) {
281 if (list == &ftrace_events)
282 return NULL;
283
284 call = list_entry(list, struct ftrace_event_call, list);
285
286 /* 278 /*
287 * The ftrace subsystem is for showing formats only. 279 * The ftrace subsystem is for showing formats only.
288 * They can not be enabled or disabled via the event files. 280 * They can not be enabled or disabled via the event files.
289 */ 281 */
290 if (call->regfunc) 282 if (call->regfunc)
291 break; 283 return call;
292
293 list = list->next;
294 } 284 }
295 285
296 m->private = list->next; 286 return NULL;
297
298 return call;
299} 287}
300 288
301static void *t_start(struct seq_file *m, loff_t *pos) 289static void *t_start(struct seq_file *m, loff_t *pos)
302{ 290{
303 struct ftrace_event_call *call = NULL; 291 struct ftrace_event_call *call;
304 loff_t l; 292 loff_t l;
305 293
306 mutex_lock(&event_mutex); 294 mutex_lock(&event_mutex);
307 295
308 m->private = ftrace_events.next; 296 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
309 for (l = 0; l <= *pos; ) { 297 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l); 298 call = t_next(m, call, &l);
311 if (!call) 299 if (!call)
312 break; 300 break;
313 } 301 }
@@ -317,37 +305,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
317static void * 305static void *
318s_next(struct seq_file *m, void *v, loff_t *pos) 306s_next(struct seq_file *m, void *v, loff_t *pos)
319{ 307{
320 struct list_head *list = m->private; 308 struct ftrace_event_call *call = v;
321 struct ftrace_event_call *call;
322 309
323 (*pos)++; 310 (*pos)++;
324 311
325 retry: 312 list_for_each_entry_continue(call, &ftrace_events, list) {
326 if (list == &ftrace_events) 313 if (call->enabled)
327 return NULL; 314 return call;
328
329 call = list_entry(list, struct ftrace_event_call, list);
330
331 if (!call->enabled) {
332 list = list->next;
333 goto retry;
334 } 315 }
335 316
336 m->private = list->next; 317 return NULL;
337
338 return call;
339} 318}
340 319
341static void *s_start(struct seq_file *m, loff_t *pos) 320static void *s_start(struct seq_file *m, loff_t *pos)
342{ 321{
343 struct ftrace_event_call *call = NULL; 322 struct ftrace_event_call *call;
344 loff_t l; 323 loff_t l;
345 324
346 mutex_lock(&event_mutex); 325 mutex_lock(&event_mutex);
347 326
348 m->private = ftrace_events.next; 327 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
349 for (l = 0; l <= *pos; ) { 328 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l); 329 call = s_next(m, call, &l);
351 if (!call) 330 if (!call)
352 break; 331 break;
353 } 332 }
@@ -546,7 +525,7 @@ static int trace_write_header(struct trace_seq *s)
546 FIELD(unsigned char, flags), 525 FIELD(unsigned char, flags),
547 FIELD(unsigned char, preempt_count), 526 FIELD(unsigned char, preempt_count),
548 FIELD(int, pid), 527 FIELD(int, pid),
549 FIELD(int, tgid)); 528 FIELD(int, lock_depth));
550} 529}
551 530
552static ssize_t 531static ssize_t
@@ -574,7 +553,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
574 trace_seq_printf(s, "format:\n"); 553 trace_seq_printf(s, "format:\n");
575 trace_write_header(s); 554 trace_write_header(s);
576 555
577 r = call->show_format(s); 556 r = call->show_format(call, s);
578 if (!r) { 557 if (!r) {
579 /* 558 /*
580 * ug! The format output is bigger than a PAGE!! 559 * ug! The format output is bigger than a PAGE!!
@@ -849,8 +828,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
849 828
850 /* First see if we did not already create this dir */ 829 /* First see if we did not already create this dir */
851 list_for_each_entry(system, &event_subsystems, list) { 830 list_for_each_entry(system, &event_subsystems, list) {
852 if (strcmp(system->name, name) == 0) 831 if (strcmp(system->name, name) == 0) {
832 system->nr_events++;
853 return system->entry; 833 return system->entry;
834 }
854 } 835 }
855 836
856 /* need to create new entry */ 837 /* need to create new entry */
@@ -869,6 +850,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
869 return d_events; 850 return d_events;
870 } 851 }
871 852
853 system->nr_events = 1;
872 system->name = kstrdup(name, GFP_KERNEL); 854 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) { 855 if (!system->name) {
874 debugfs_remove(system->entry); 856 debugfs_remove(system->entry);
@@ -920,15 +902,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 902 if (strcmp(call->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 903 d_events = event_subsystem_dir(call->system, d_events);
922 904
923 if (call->raw_init) {
924 ret = call->raw_init();
925 if (ret < 0) {
926 pr_warning("Could not initialize trace point"
927 " events/%s\n", call->name);
928 return ret;
929 }
930 }
931
932 call->dir = debugfs_create_dir(call->name, d_events); 905 call->dir = debugfs_create_dir(call->name, d_events);
933 if (!call->dir) { 906 if (!call->dir) {
934 pr_warning("Could not create debugfs " 907 pr_warning("Could not create debugfs "
@@ -945,7 +918,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
945 id); 918 id);
946 919
947 if (call->define_fields) { 920 if (call->define_fields) {
948 ret = call->define_fields(); 921 ret = call->define_fields(call);
949 if (ret < 0) { 922 if (ret < 0) {
950 pr_warning("Could not initialize trace point" 923 pr_warning("Could not initialize trace point"
951 " events/%s\n", call->name); 924 " events/%s\n", call->name);
@@ -987,6 +960,32 @@ struct ftrace_module_file_ops {
987 struct file_operations filter; 960 struct file_operations filter;
988}; 961};
989 962
963static void remove_subsystem_dir(const char *name)
964{
965 struct event_subsystem *system;
966
967 if (strcmp(name, TRACE_SYSTEM) == 0)
968 return;
969
970 list_for_each_entry(system, &event_subsystems, list) {
971 if (strcmp(system->name, name) == 0) {
972 if (!--system->nr_events) {
973 struct event_filter *filter = system->filter;
974
975 debugfs_remove_recursive(system->entry);
976 list_del(&system->list);
977 if (filter) {
978 kfree(filter->filter_string);
979 kfree(filter);
980 }
981 kfree(system->name);
982 kfree(system);
983 }
984 break;
985 }
986 }
987}
988
990static struct ftrace_module_file_ops * 989static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 990trace_create_file_ops(struct module *mod)
992{ 991{
@@ -1027,6 +1026,7 @@ static void trace_module_add_events(struct module *mod)
1027 struct ftrace_module_file_ops *file_ops = NULL; 1026 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end; 1027 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events; 1028 struct dentry *d_events;
1029 int ret;
1030 1030
1031 start = mod->trace_events; 1031 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events; 1032 end = mod->trace_events + mod->num_trace_events;
@@ -1042,7 +1042,15 @@ static void trace_module_add_events(struct module *mod)
1042 /* The linker may leave blanks */ 1042 /* The linker may leave blanks */
1043 if (!call->name) 1043 if (!call->name)
1044 continue; 1044 continue;
1045 1045 if (call->raw_init) {
1046 ret = call->raw_init();
1047 if (ret < 0) {
1048 if (ret != -ENOSYS)
1049 pr_warning("Could not initialize trace "
1050 "point events/%s\n", call->name);
1051 continue;
1052 }
1053 }
1046 /* 1054 /*
1047 * This module has events, create file ops for this module 1055 * This module has events, create file ops for this module
1048 * if not already done. 1056 * if not already done.
@@ -1077,6 +1085,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_del(&call->list); 1085 list_del(&call->list);
1078 trace_destroy_fields(call); 1086 trace_destroy_fields(call);
1079 destroy_preds(call); 1087 destroy_preds(call);
1088 remove_subsystem_dir(call->system);
1080 } 1089 }
1081 } 1090 }
1082 1091
@@ -1125,7 +1134,7 @@ static int trace_module_notify(struct notifier_block *self,
1125} 1134}
1126#endif /* CONFIG_MODULES */ 1135#endif /* CONFIG_MODULES */
1127 1136
1128struct notifier_block trace_module_nb = { 1137static struct notifier_block trace_module_nb = {
1129 .notifier_call = trace_module_notify, 1138 .notifier_call = trace_module_notify,
1130 .priority = 0, 1139 .priority = 0,
1131}; 1140};
@@ -1133,6 +1142,18 @@ struct notifier_block trace_module_nb = {
1133extern struct ftrace_event_call __start_ftrace_events[]; 1142extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[]; 1143extern struct ftrace_event_call __stop_ftrace_events[];
1135 1144
1145static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1146
1147static __init int setup_trace_event(char *str)
1148{
1149 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1150 ring_buffer_expanded = 1;
1151 tracing_selftest_disabled = 1;
1152
1153 return 1;
1154}
1155__setup("trace_event=", setup_trace_event);
1156
1136static __init int event_trace_init(void) 1157static __init int event_trace_init(void)
1137{ 1158{
1138 struct ftrace_event_call *call; 1159 struct ftrace_event_call *call;
@@ -1140,6 +1161,8 @@ static __init int event_trace_init(void)
1140 struct dentry *entry; 1161 struct dentry *entry;
1141 struct dentry *d_events; 1162 struct dentry *d_events;
1142 int ret; 1163 int ret;
1164 char *buf = bootup_event_buf;
1165 char *token;
1143 1166
1144 d_tracer = tracing_init_dentry(); 1167 d_tracer = tracing_init_dentry();
1145 if (!d_tracer) 1168 if (!d_tracer)
@@ -1179,12 +1202,34 @@ static __init int event_trace_init(void)
1179 /* The linker may leave blanks */ 1202 /* The linker may leave blanks */
1180 if (!call->name) 1203 if (!call->name)
1181 continue; 1204 continue;
1205 if (call->raw_init) {
1206 ret = call->raw_init();
1207 if (ret < 0) {
1208 if (ret != -ENOSYS)
1209 pr_warning("Could not initialize trace "
1210 "point events/%s\n", call->name);
1211 continue;
1212 }
1213 }
1182 list_add(&call->list, &ftrace_events); 1214 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops, 1215 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops, 1216 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops); 1217 &ftrace_event_format_fops);
1186 } 1218 }
1187 1219
1220 while (true) {
1221 token = strsep(&buf, ",");
1222
1223 if (!token)
1224 break;
1225 if (!*token)
1226 continue;
1227
1228 ret = ftrace_set_clr_event(token, 1);
1229 if (ret)
1230 pr_warning("Failed to enable trace event: %s\n", token);
1231 }
1232
1188 ret = register_module_notifier(&trace_module_nb); 1233 ret = register_module_notifier(&trace_module_nb);
1189 if (ret) 1234 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n"); 1235 pr_warning("Failed to register trace events module notifier\n");
@@ -1261,6 +1306,18 @@ static __init void event_trace_self_tests(void)
1261 if (!call->regfunc) 1306 if (!call->regfunc)
1262 continue; 1307 continue;
1263 1308
1309/*
1310 * Testing syscall events here is pretty useless, but
1311 * we still do it if configured. But this is time consuming.
1312 * What we really need is a user thread to perform the
1313 * syscalls as we test.
1314 */
1315#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1316 if (call->system &&
1317 strcmp(call->system, "syscalls") == 0)
1318 continue;
1319#endif
1320
1264 pr_info("Testing event %s: ", call->name); 1321 pr_info("Testing event %s: ", call->name);
1265 1322
1266 /* 1323 /*
@@ -1334,12 +1391,13 @@ static __init void event_trace_self_tests(void)
1334 1391
1335#ifdef CONFIG_FUNCTION_TRACER 1392#ifdef CONFIG_FUNCTION_TRACER
1336 1393
1337static DEFINE_PER_CPU(atomic_t, test_event_disable); 1394static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1338 1395
1339static void 1396static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1397function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{ 1398{
1342 struct ring_buffer_event *event; 1399 struct ring_buffer_event *event;
1400 struct ring_buffer *buffer;
1343 struct ftrace_entry *entry; 1401 struct ftrace_entry *entry;
1344 unsigned long flags; 1402 unsigned long flags;
1345 long disabled; 1403 long disabled;
@@ -1350,14 +1408,15 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1350 pc = preempt_count(); 1408 pc = preempt_count();
1351 resched = ftrace_preempt_disable(); 1409 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id(); 1410 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1411 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1354 1412
1355 if (disabled != 1) 1413 if (disabled != 1)
1356 goto out; 1414 goto out;
1357 1415
1358 local_save_flags(flags); 1416 local_save_flags(flags);
1359 1417
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1418 event = trace_current_buffer_lock_reserve(&buffer,
1419 TRACE_FN, sizeof(*entry),
1361 flags, pc); 1420 flags, pc);
1362 if (!event) 1421 if (!event)
1363 goto out; 1422 goto out;
@@ -1365,10 +1424,10 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1365 entry->ip = ip; 1424 entry->ip = ip;
1366 entry->parent_ip = parent_ip; 1425 entry->parent_ip = parent_ip;
1367 1426
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1427 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1369 1428
1370 out: 1429 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1430 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1372 ftrace_preempt_enable(resched); 1431 ftrace_preempt_enable(resched);
1373} 1432}
1374 1433
@@ -1392,10 +1451,10 @@ static __init void event_trace_self_test_with_function(void)
1392 1451
1393static __init int event_trace_self_tests_init(void) 1452static __init int event_trace_self_tests_init(void)
1394{ 1453{
1395 1454 if (!tracing_selftest_disabled) {
1396 event_trace_self_tests(); 1455 event_trace_self_tests();
1397 1456 event_trace_self_test_with_function();
1398 event_trace_self_test_with_function(); 1457 }
1399 1458
1400 return 0; 1459 return 0;
1401} 1460}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f32dc9d1ea7b..23245785927f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
121 } operand; 121 } operand;
122}; 122};
123 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
124DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
@@ -163,6 +204,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
163 return match; 204 return match;
164} 205}
165 206
207/* Filter predicate for char * pointers */
208static int filter_pred_pchar(struct filter_pred *pred, void *event,
209 int val1, int val2)
210{
211 char **addr = (char **)(event + pred->offset);
212 int cmp, match;
213
214 cmp = strncmp(*addr, pred->str_val, pred->str_len);
215
216 match = (!cmp) ^ pred->not;
217
218 return match;
219}
220
166/* 221/*
167 * Filter predicate for dynamic sized arrays of characters. 222 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end 223 * These are implemented through a list of strings at the end
@@ -176,11 +231,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
176static int filter_pred_strloc(struct filter_pred *pred, void *event, 231static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2) 232 int val1, int val2)
178{ 233{
179 unsigned short str_loc = *(unsigned short *)(event + pred->offset); 234 u32 str_item = *(u32 *)(event + pred->offset);
235 int str_loc = str_item & 0xffff;
236 int str_len = str_item >> 16;
180 char *addr = (char *)(event + str_loc); 237 char *addr = (char *)(event + str_loc);
181 int cmp, match; 238 int cmp, match;
182 239
183 cmp = strncmp(addr, pred->str_val, pred->str_len); 240 cmp = strncmp(addr, pred->str_val, str_len);
184 241
185 match = (!cmp) ^ pred->not; 242 match = (!cmp) ^ pred->not;
186 243
@@ -293,7 +350,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
293 struct event_filter *filter = call->filter; 350 struct event_filter *filter = call->filter;
294 351
295 mutex_lock(&event_mutex); 352 mutex_lock(&event_mutex);
296 if (filter->filter_string) 353 if (filter && filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string); 354 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else 355 else
299 trace_seq_printf(s, "none\n"); 356 trace_seq_printf(s, "none\n");
@@ -306,7 +363,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
306 struct event_filter *filter = system->filter; 363 struct event_filter *filter = system->filter;
307 364
308 mutex_lock(&event_mutex); 365 mutex_lock(&event_mutex);
309 if (filter->filter_string) 366 if (filter && filter->filter_string)
310 trace_seq_printf(s, "%s\n", filter->filter_string); 367 trace_seq_printf(s, "%s\n", filter->filter_string);
311 else 368 else
312 trace_seq_printf(s, "none\n"); 369 trace_seq_printf(s, "none\n");
@@ -374,6 +431,9 @@ void destroy_preds(struct ftrace_event_call *call)
374 struct event_filter *filter = call->filter; 431 struct event_filter *filter = call->filter;
375 int i; 432 int i;
376 433
434 if (!filter)
435 return;
436
377 for (i = 0; i < MAX_FILTER_PRED; i++) { 437 for (i = 0; i < MAX_FILTER_PRED; i++) {
378 if (filter->preds[i]) 438 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]); 439 filter_free_pred(filter->preds[i]);
@@ -384,17 +444,19 @@ void destroy_preds(struct ftrace_event_call *call)
384 call->filter = NULL; 444 call->filter = NULL;
385} 445}
386 446
387int init_preds(struct ftrace_event_call *call) 447static int init_preds(struct ftrace_event_call *call)
388{ 448{
389 struct event_filter *filter; 449 struct event_filter *filter;
390 struct filter_pred *pred; 450 struct filter_pred *pred;
391 int i; 451 int i;
392 452
453 if (call->filter)
454 return 0;
455
393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
394 if (!call->filter) 457 if (!call->filter)
395 return -ENOMEM; 458 return -ENOMEM;
396 459
397 call->filter_active = 0;
398 filter->n_preds = 0; 460 filter->n_preds = 0;
399 461
400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 462 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -416,30 +478,55 @@ oom:
416 478
417 return -ENOMEM; 479 return -ENOMEM;
418} 480}
419EXPORT_SYMBOL_GPL(init_preds);
420 481
421static void filter_free_subsystem_preds(struct event_subsystem *system) 482static int init_subsystem_preds(struct event_subsystem *system)
422{ 483{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call; 484 struct ftrace_event_call *call;
425 int i; 485 int err;
426 486
427 if (filter->n_preds) { 487 list_for_each_entry(call, &ftrace_events, list) {
428 for (i = 0; i < filter->n_preds; i++) 488 if (!call->define_fields)
429 filter_free_pred(filter->preds[i]); 489 continue;
430 kfree(filter->preds); 490
431 filter->preds = NULL; 491 if (strcmp(call->system, system->name) != 0)
432 filter->n_preds = 0; 492 continue;
493
494 err = init_preds(call);
495 if (err)
496 return err;
433 } 497 }
434 498
499 return 0;
500}
501
502enum {
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{
511 struct ftrace_event_call *call;
512
435 list_for_each_entry(call, &ftrace_events, list) { 513 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields) 514 if (!call->define_fields)
437 continue; 515 continue;
438 516
439 if (!strcmp(call->system, system->name)) { 517 if (strcmp(call->system, system->name) != 0)
440 filter_disable_preds(call); 518 continue;
441 remove_filter_string(call->filter); 519
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
442 } 523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call);
529 remove_filter_string(call->filter);
443 } 530 }
444} 531}
445 532
@@ -468,12 +555,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
468 return 0; 555 return 0;
469} 556}
470 557
471enum { 558int filter_assign_type(const char *type)
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
476static int is_string_field(const char *type)
477{ 559{
478 if (strstr(type, "__data_loc") && strstr(type, "char")) 560 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING; 561 return FILTER_DYN_STRING;
@@ -481,12 +563,19 @@ static int is_string_field(const char *type)
481 if (strchr(type, '[') && strstr(type, "char")) 563 if (strchr(type, '[') && strstr(type, "char"))
482 return FILTER_STATIC_STRING; 564 return FILTER_STATIC_STRING;
483 565
484 return 0; 566 return FILTER_OTHER;
567}
568
569static bool is_string_field(struct ftrace_event_field *field)
570{
571 return field->filter_type == FILTER_DYN_STRING ||
572 field->filter_type == FILTER_STATIC_STRING ||
573 field->filter_type == FILTER_PTR_STRING;
485} 574}
486 575
487static int is_legal_op(struct ftrace_event_field *field, int op) 576static int is_legal_op(struct ftrace_event_field *field, int op)
488{ 577{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
490 return 0; 579 return 0;
491 580
492 return 1; 581 return 1;
@@ -537,22 +626,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
537 626
538static int filter_add_pred(struct filter_parse_state *ps, 627static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call, 628 struct ftrace_event_call *call,
540 struct filter_pred *pred) 629 struct filter_pred *pred,
630 bool dry_run)
541{ 631{
542 struct ftrace_event_field *field; 632 struct ftrace_event_field *field;
543 filter_pred_fn_t fn; 633 filter_pred_fn_t fn;
544 unsigned long long val; 634 unsigned long long val;
545 int string_type;
546 int ret; 635 int ret;
547 636
548 pred->fn = filter_pred_none; 637 pred->fn = filter_pred_none;
549 638
550 if (pred->op == OP_AND) { 639 if (pred->op == OP_AND) {
551 pred->pop_n = 2; 640 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 641 fn = filter_pred_and;
642 goto add_pred_fn;
553 } else if (pred->op == OP_OR) { 643 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2; 644 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 645 fn = filter_pred_or;
646 goto add_pred_fn;
556 } 647 }
557 648
558 field = find_event_field(call, pred->field_name); 649 field = find_event_field(call, pred->field_name);
@@ -568,16 +659,17 @@ static int filter_add_pred(struct filter_parse_state *ps,
568 return -EINVAL; 659 return -EINVAL;
569 } 660 }
570 661
571 string_type = is_string_field(field->type); 662 if (is_string_field(field)) {
572 if (string_type) { 663 pred->str_len = field->size;
573 if (string_type == FILTER_STATIC_STRING) 664
665 if (field->filter_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string; 666 fn = filter_pred_string;
575 else 667 else if (field->filter_type == FILTER_DYN_STRING)
576 fn = filter_pred_strloc; 668 fn = filter_pred_strloc;
577 pred->str_len = field->size; 669 else {
578 if (pred->op == OP_NE) 670 fn = filter_pred_pchar;
579 pred->not = 1; 671 pred->str_len = strlen(pred->str_val);
580 return filter_add_pred_fn(ps, call, pred, fn); 672 }
581 } else { 673 } else {
582 if (field->is_signed) 674 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val); 675 ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,41 +680,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
588 return -EINVAL; 680 return -EINVAL;
589 } 681 }
590 pred->val = val; 682 pred->val = val;
591 }
592 683
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 684 fn = select_comparison_fn(pred->op, field->size,
594 if (!fn) { 685 field->is_signed);
595 parse_error(ps, FILT_ERR_INVALID_OP, 0); 686 if (!fn) {
596 return -EINVAL; 687 parse_error(ps, FILT_ERR_INVALID_OP, 0);
688 return -EINVAL;
689 }
597 } 690 }
598 691
599 if (pred->op == OP_NE) 692 if (pred->op == OP_NE)
600 pred->not = 1; 693 pred->not = 1;
601 694
602 return filter_add_pred_fn(ps, call, pred, fn); 695add_pred_fn:
696 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn);
698 return 0;
603} 699}
604 700
605static int filter_add_subsystem_pred(struct filter_parse_state *ps, 701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system, 702 struct event_subsystem *system,
607 struct filter_pred *pred, 703 struct filter_pred *pred,
608 char *filter_string) 704 char *filter_string,
705 bool dry_run)
609{ 706{
610 struct event_filter *filter = system->filter;
611 struct ftrace_event_call *call; 707 struct ftrace_event_call *call;
612 int err = 0; 708 int err = 0;
613 709 bool fail = true;
614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
617
618 if (!filter->preds)
619 return -ENOMEM;
620 }
621
622 if (filter->n_preds == MAX_FILTER_PRED) {
623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
624 return -ENOSPC;
625 }
626 710
627 list_for_each_entry(call, &ftrace_events, list) { 711 list_for_each_entry(call, &ftrace_events, list) {
628 712
@@ -632,19 +716,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
632 if (strcmp(call->system, system->name)) 716 if (strcmp(call->system, system->name))
633 continue; 717 continue;
634 718
635 err = filter_add_pred(ps, call, pred); 719 if (call->filter->no_reset)
636 if (err) { 720 continue;
637 filter_free_subsystem_preds(system); 721
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 722 err = filter_add_pred(ps, call, pred, dry_run);
639 goto out; 723 if (err)
640 } 724 call->filter->no_reset = true;
641 replace_filter_string(call->filter, filter_string); 725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
642 } 730 }
643 731
644 filter->preds[filter->n_preds] = pred; 732 if (fail) {
645 filter->n_preds++; 733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
646out: 734 return err;
647 return err; 735 }
736 return 0;
648} 737}
649 738
650static void parse_init(struct filter_parse_state *ps, 739static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1092,14 @@ static int check_preds(struct filter_parse_state *ps)
1003static int replace_preds(struct event_subsystem *system, 1092static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call, 1093 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps, 1094 struct filter_parse_state *ps,
1006 char *filter_string) 1095 char *filter_string,
1096 bool dry_run)
1007{ 1097{
1008 char *operand1 = NULL, *operand2 = NULL; 1098 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred; 1099 struct filter_pred *pred;
1010 struct postfix_elt *elt; 1100 struct postfix_elt *elt;
1011 int err; 1101 int err;
1102 int n_preds = 0;
1012 1103
1013 err = check_preds(ps); 1104 err = check_preds(ps);
1014 if (err) 1105 if (err)
@@ -1027,24 +1118,14 @@ static int replace_preds(struct event_subsystem *system,
1027 continue; 1118 continue;
1028 } 1119 }
1029 1120
1121 if (n_preds++ == MAX_FILTER_PRED) {
1122 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1123 return -ENOSPC;
1124 }
1125
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1126 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1127 pred = create_logical_pred(elt->op);
1032 if (!pred) 1128 goto add_pred;
1033 return -ENOMEM;
1034 if (call) {
1035 err = filter_add_pred(ps, call, pred);
1036 filter_free_pred(pred);
1037 } else {
1038 err = filter_add_subsystem_pred(ps, system,
1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
1047 continue;
1048 } 1129 }
1049 1130
1050 if (!operand1 || !operand2) { 1131 if (!operand1 || !operand2) {
@@ -1053,17 +1134,15 @@ static int replace_preds(struct event_subsystem *system,
1053 } 1134 }
1054 1135
1055 pred = create_pred(elt->op, operand1, operand2); 1136 pred = create_pred(elt->op, operand1, operand2);
1137add_pred:
1056 if (!pred) 1138 if (!pred)
1057 return -ENOMEM; 1139 return -ENOMEM;
1058 if (call) { 1140 if (call)
1059 err = filter_add_pred(ps, call, pred); 1141 err = filter_add_pred(ps, call, pred, false);
1060 filter_free_pred(pred); 1142 else
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred, 1143 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string); 1144 filter_string, dry_run);
1064 if (err) 1145 filter_free_pred(pred);
1065 filter_free_pred(pred);
1066 }
1067 if (err) 1146 if (err)
1068 return err; 1147 return err;
1069 1148
@@ -1081,6 +1160,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1081 1160
1082 mutex_lock(&event_mutex); 1161 mutex_lock(&event_mutex);
1083 1162
1163 err = init_preds(call);
1164 if (err)
1165 goto out_unlock;
1166
1084 if (!strcmp(strstrip(filter_string), "0")) { 1167 if (!strcmp(strstrip(filter_string), "0")) {
1085 filter_disable_preds(call); 1168 filter_disable_preds(call);
1086 remove_filter_string(call->filter); 1169 remove_filter_string(call->filter);
@@ -1103,7 +1186,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1103 goto out; 1186 goto out;
1104 } 1187 }
1105 1188
1106 err = replace_preds(NULL, call, ps, filter_string); 1189 err = replace_preds(NULL, call, ps, filter_string, false);
1107 if (err) 1190 if (err)
1108 append_filter_err(ps, call->filter); 1191 append_filter_err(ps, call->filter);
1109 1192
@@ -1126,8 +1209,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1126 1209
1127 mutex_lock(&event_mutex); 1210 mutex_lock(&event_mutex);
1128 1211
1212 err = init_subsystem_preds(system);
1213 if (err)
1214 goto out_unlock;
1215
1129 if (!strcmp(strstrip(filter_string), "0")) { 1216 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system); 1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1131 remove_filter_string(system->filter); 1218 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex); 1219 mutex_unlock(&event_mutex);
1133 return 0; 1220 return 0;
@@ -1138,7 +1225,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1138 if (!ps) 1225 if (!ps)
1139 goto out_unlock; 1226 goto out_unlock;
1140 1227
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string); 1228 replace_filter_string(system->filter, filter_string);
1143 1229
1144 parse_init(ps, filter_ops, filter_string); 1230 parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1234,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1148 goto out; 1234 goto out;
1149 } 1235 }
1150 1236
1151 err = replace_preds(system, NULL, ps, filter_string); 1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1152 if (err) 1238
1239 /* dry run: find out which events the filter can be applied to */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1153 append_filter_err(ps, system->filter); 1242 append_filter_err(ps, system->filter);
1243 goto out;
1244 }
1245
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1247
1248 /* really apply the filter to the events */
1249 err = replace_preds(system, NULL, ps, filter_string, false);
1250 if (err) {
1251 append_filter_err(ps, system->filter);
1252 filter_free_subsystem_preds(system, 2);
1253 }
1154 1254
1155out: 1255out:
1156 filter_opstack_clear(ps); 1256 filter_opstack_clear(ps);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..9753fcc61bc5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,116 +15,209 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
83
84#undef __array
85#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \
88 offsetof(typeof(field), item), \
89 sizeof(field.item)); \
90 if (!ret) \
91 return 0;
35 92
36#undef TRACE_FIELD_SPECIAL 93#undef __array_desc
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 94#define __array_desc(type, container, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 97 offsetof(typeof(field), container.item), \
41 (unsigned int)sizeof(field.item)); \ 98 sizeof(field.container.item)); \
42 if (!ret) \ 99 if (!ret) \
43 return 0; 100 return 0;
44 101
45#undef TRACE_FIELD_ZERO_CHAR 102#undef __dynamic_array
46#define TRACE_FIELD_ZERO_CHAR(item) \ 103#define __dynamic_array(type, item) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
48 "offset:%u;\tsize:0;\n", \ 105 "offset:%zu;\tsize:0;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 106 offsetof(typeof(field), item)); \
50 if (!ret) \ 107 if (!ret) \
51 return 0; 108 return 0;
52 109
53#undef TRACE_FIELD_SIGN 110#undef F_printk
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
55 TRACE_FIELD(type, item, assign)
56 112
57#undef TP_RAW_FMT 113#undef __entry
58#define TP_RAW_FMT(args...) args 114#define __entry REC
59 115
60#undef TRACE_EVENT_FORMAT 116#undef FTRACE_ENTRY
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
62static int \ 118static int \
63ftrace_format_##call(struct trace_seq *s) \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
120 struct trace_seq *s) \
64{ \ 121{ \
65 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
66 int ret; \ 123 int ret = 0; \
67 \ 124 \
68 tstruct; \ 125 tstruct; \
69 \ 126 \
70 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
71 \ 128 \
72 return ret; \ 129 return ret; \
73} 130}
74 131
75#undef TRACE_EVENT_FORMAT_NOFILTER 132#include "trace_entries.h"
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 133
77 tpfmt) \ 134
78static int \ 135#undef __field
79ftrace_format_##call(struct trace_seq *s) \ 136#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \
138 offsetof(typeof(field), item), \
139 sizeof(field.item), \
140 is_signed_type(type), FILTER_OTHER); \
141 if (ret) \
142 return ret;
143
144#undef __field_desc
145#define __field_desc(type, container, item) \
146 ret = trace_define_field(event_call, #type, #item, \
147 offsetof(typeof(field), \
148 container.item), \
149 sizeof(field.container.item), \
150 is_signed_type(type), FILTER_OTHER); \
151 if (ret) \
152 return ret;
153
154#undef __array
155#define __array(type, item, len) \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), 0, FILTER_OTHER); \
160 if (ret) \
161 return ret;
162
163#undef __array_desc
164#define __array_desc(type, container, item, len) \
165 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
167 offsetof(typeof(field), \
168 container.item), \
169 sizeof(field.container.item), 0, \
170 FILTER_OTHER); \
171 if (ret) \
172 return ret;
173
174#undef __dynamic_array
175#define __dynamic_array(type, item)
176
177#undef FTRACE_ENTRY
178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
179int \
180ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
80{ \ 181{ \
81 struct args field; \ 182 struct struct_name field; \
82 int ret; \ 183 int ret; \
83 \ 184 \
84 tstruct; \ 185 ret = trace_define_common_fields(event_call); \
186 if (ret) \
187 return ret; \
85 \ 188 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 189 tstruct; \
87 \ 190 \
88 return ret; \ 191 return ret; \
89} 192}
90 193
91#include "trace_event_types.h" 194#include "trace_entries.h"
92
93#undef TRACE_ZERO_CHAR
94#define TRACE_ZERO_CHAR(arg)
95 195
96#undef TRACE_FIELD
97#define TRACE_FIELD(type, item, assign)\
98 entry->item = assign;
99 196
100#undef TRACE_FIELD 197#undef __field
101#define TRACE_FIELD(type, item, assign)\ 198#define __field(type, item)
102 entry->item = assign;
103 199
104#undef TRACE_FIELD_SIGN 200#undef __field_desc
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 201#define __field_desc(type, container, item)
106 TRACE_FIELD(type, item, assign)
107 202
108#undef TP_CMD 203#undef __array
109#define TP_CMD(cmd...) cmd 204#define __array(type, item, len)
110 205
111#undef TRACE_ENTRY 206#undef __array_desc
112#define TRACE_ENTRY entry 207#define __array_desc(type, container, item, len)
113 208
114#undef TRACE_FIELD_SPECIAL 209#undef __dynamic_array
115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 210#define __dynamic_array(type, item)
116 cmd;
117 211
118#undef TRACE_EVENT_FORMAT 212#undef FTRACE_ENTRY
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \ 214static int ftrace_raw_init_event_##call(void); \
122 \ 215 \
123struct ftrace_event_call __used \ 216struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \ 217__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \ 218__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \ 219 .name = #call, \
127 .id = proto, \ 220 .id = type, \
128 .system = __stringify(TRACE_SYSTEM), \ 221 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \ 222 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \ 223 .show_format = ftrace_format_##call, \
@@ -133,74 +226,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 226static int ftrace_raw_init_event_##call(void) \
134{ \ 227{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 228 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 229 return 0; \
138} \ 230} \
139 231
140#undef TRACE_EVENT_FORMAT_NOFILTER 232#include "trace_entries.h"
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
143 \
144struct ftrace_event_call __used \
145__attribute__((__aligned__(4))) \
146__attribute__((section("_ftrace_events"))) event_##call = { \
147 .name = #call, \
148 .id = proto, \
149 .system = __stringify(TRACE_SYSTEM), \
150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
206#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 75ef000613c3..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
289 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
290{ 290{
291 char str[KSYM_SYMBOL_LEN];
292 long count = (long)data; 291 long count = (long)data;
293 292
294 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%ps:", (void *)ip);
295 seq_printf(m, "%s:", str);
296 294
297 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
298 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -166,10 +166,123 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
166 return ret; 166 return ret;
167} 167}
168 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
169static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
170{ 280{
171 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
172 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
173 if (ret) 286 if (ret)
174 return ret; 287 return ret;
175 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -177,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
177 return 0; 290 return 0;
178} 291}
179 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
180static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
181{ 299{
182 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
183 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
184} 302}
185 303
186static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
187{
188 if (nb / 100)
189 return 3;
190 if (nb / 10)
191 return 2;
192 return 1;
193}
194 305
195static enum print_line_t 306static enum print_line_t
196print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
197{ 308{
198 int i;
199 int ret; 309 int ret;
200 int log10_this = log10_cpu(cpu);
201 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
202
203 310
204 /* 311 /*
205 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
206 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
207 * email: 314 * email:
208 */ 315 */
209 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
210
211 /*
212 * Tricky - we space the CPU field according to the max
213 * number of online CPUs. On a 2-cpu system it would take
214 * a maximum of 1 digit - on a 128 cpu system it would
215 * take up to 3 digits:
216 */
217 for (i = 0; i < log10_all - log10_this; i++) {
218 ret = trace_seq_printf(s, " ");
219 if (!ret)
220 return TRACE_TYPE_PARTIAL_LINE;
221 }
222 ret = trace_seq_printf(s, "%d) ", cpu);
223 if (!ret) 317 if (!ret)
224 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
225 319
@@ -270,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
270} 364}
271 365
272 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
273/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
274static enum print_line_t 377static enum print_line_t
275verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -427,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
427 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
428 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
429 } 532 }
533
430 /* Proc */ 534 /* Proc */
431 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
432 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -565,11 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
565 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
566 } 670 }
567 671
568 ret = seq_print_ip_sym(s, call->func, 0); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
569 if (!ret)
570 return TRACE_TYPE_PARTIAL_LINE;
571
572 ret = trace_seq_printf(s, "();\n");
573 if (!ret) 673 if (!ret)
574 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
575 675
@@ -612,11 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
612 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
613 } 713 }
614 714
615 ret = seq_print_ip_sym(s, call->func, 0); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
616 if (!ret)
617 return TRACE_TYPE_PARTIAL_LINE;
618
619 ret = trace_seq_printf(s, "() {\n");
620 if (!ret) 716 if (!ret)
621 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
622 718
@@ -672,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
672 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
673 } 769 }
674 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
675 return 0; 778 return 0;
676} 779}
677 780
@@ -866,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
866 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
867} 970}
868 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
869static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
870{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
871 /* 1st line */ 1001 /* 1st line */
872 seq_printf(s, "# "); 1002 seq_printf(s, "#");
873 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
874 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
875 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
876 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
877 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
878 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
879 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
880 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
881 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
882 1014
883 /* 2nd line */ 1015 /* 2nd line */
884 seq_printf(s, "# "); 1016 seq_printf(s, "#");
885 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
886 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
887 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
888 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
889 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
890 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
891 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
892 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
893 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
@@ -934,6 +1068,8 @@ static struct tracer graph_trace __read_mostly = {
934 1068
935static __init int init_graph_trace(void) 1069static __init int init_graph_trace(void)
936{ 1070{
1071 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1072
937 return register_tracer(&graph_trace); 1073 return register_tracer(&graph_trace);
938} 1074}
939 1075
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ca7d7c4d0c2a..23b63859130e 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
155 seq_print_ip_sym(seq, it->from, symflags) && 155 seq_print_ip_sym(seq, it->from, symflags) &&
156 trace_seq_printf(seq, "\n")) 156 trace_seq_printf(seq, "\n"))
157 return TRACE_TYPE_HANDLED; 157 return TRACE_TYPE_HANDLED;
158 return TRACE_TYPE_PARTIAL_LINE;; 158 return TRACE_TYPE_PARTIAL_LINE;
159 } 159 }
160 return TRACE_TYPE_UNHANDLED; 160 return TRACE_TYPE_UNHANDLED;
161} 161}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
@@ -178,7 +170,6 @@ out_unlock:
178out: 170out:
179 data->critical_sequence = max_sequence; 171 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 172 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 173 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 174}
184 175
@@ -208,7 +199,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 199 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 200 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 201 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 202
213 local_save_flags(flags); 203 local_save_flags(flags);
214 204
@@ -379,6 +369,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 369 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 370 /* make sure that the tracer is visible */
381 smp_wmb(); 371 smp_wmb();
372 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 373 start_irqsoff_tracer(tr);
383} 374}
384 375
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
311 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 314 int pc = preempt_count();
313 315
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 316 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 317 sizeof(*entry), 0, pc);
316 if (!event) { 318 if (!event) {
317 atomic_inc(&dropped_count); 319 atomic_inc(&dropped_count);
@@ -319,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 321 }
320 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 323 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 327}
324 328
325void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +337,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
335{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
341 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 344 int pc = preempt_count();
339 345
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 346 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 347 sizeof(*entry), 0, pc);
342 if (!event) { 348 if (!event) {
343 atomic_inc(&dropped_count); 349 atomic_inc(&dropped_count);
@@ -345,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 351 }
346 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
347 entry->map = *map; 353 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 357}
350 358
351void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..f572f44c6e1e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 407 * since individual threads might have already quit!
408 */ 408 */
409 rcu_read_lock(); 409 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
411 if (task) 411 if (task)
412 mm = get_task_mm(task); 412 mm = get_task_mm(task);
413 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 460 return ret;
461} 461}
462 462
463static int 463/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 472{
466 int hardirq, softirq; 473 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 474 int ret;
468 475
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 478
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 482 'X' : '.',
@@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
481 hardirq ? 'h' : softirq ? 's' : '.')) 486 hardirq ? 'h' : softirq ? 's' : '.'))
482 return 0; 487 return 0;
483 488
489 if (entry->lock_depth < 0)
490 ret = trace_seq_putc(s, '.');
491 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth);
493 if (!ret)
494 return 0;
495
484 if (entry->preempt_count) 496 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 497 return trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 498 return trace_seq_putc(s, '.');
499}
500
501static int
502lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
503{
504 char comm[TASK_COMM_LEN];
505
506 trace_find_cmdline(entry->pid, comm);
507
508 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
509 comm, entry->pid, cpu))
510 return 0;
511
512 return trace_print_lat_fmt(s, entry);
487} 513}
488 514
489static unsigned long preempt_mark_thresh = 100; 515static unsigned long preempt_mark_thresh = 100;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index 8a30d9874cd4..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,214 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct trace_power *entry;
42 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace;
44
45 if (!trace_power_enabled)
46 return;
47
48 preempt_disable();
49 it->end = ktime_get();
50 data = tr->data[smp_processor_id()];
51
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
53 sizeof(*entry), 0, 0);
54 if (!event)
55 goto out;
56 entry = ring_buffer_event_data(event);
57 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0);
60 out:
61 preempt_enable();
62}
63
64static void probe_power_mark(struct power_trace *it, unsigned int type,
65 unsigned int level)
66{
67 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event;
69 struct trace_power *entry;
70 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace;
72
73 if (!trace_power_enabled)
74 return;
75
76 memset(it, 0, sizeof(struct power_trace));
77 it->state = level;
78 it->type = type;
79 it->stamp = ktime_get();
80 preempt_disable();
81 it->end = it->stamp;
82 data = tr->data[smp_processor_id()];
83
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
85 sizeof(*entry), 0, 0);
86 if (!event)
87 goto out;
88 entry = ring_buffer_event_data(event);
89 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0);
92 out:
93 preempt_enable();
94}
95
96static int tracing_power_register(void)
97{
98 int ret;
99
100 ret = register_trace_power_start(probe_power_start);
101 if (ret) {
102 pr_info("power trace: Couldn't activate tracepoint"
103 " probe to trace_power_start\n");
104 return ret;
105 }
106 ret = register_trace_power_end(probe_power_end);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_end\n");
110 goto fail_start;
111 }
112 ret = register_trace_power_mark(probe_power_mark);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_mark\n");
116 goto fail_end;
117 }
118 return ret;
119fail_end:
120 unregister_trace_power_end(probe_power_end);
121fail_start:
122 unregister_trace_power_start(probe_power_start);
123 return ret;
124}
125
126static void start_power_trace(struct trace_array *tr)
127{
128 trace_power_enabled = 1;
129}
130
131static void stop_power_trace(struct trace_array *tr)
132{
133 trace_power_enabled = 0;
134}
135
136static void power_trace_reset(struct trace_array *tr)
137{
138 trace_power_enabled = 0;
139 unregister_trace_power_start(probe_power_start);
140 unregister_trace_power_end(probe_power_end);
141 unregister_trace_power_mark(probe_power_mark);
142}
143
144
145static int power_trace_init(struct trace_array *tr)
146{
147 int cpu;
148 power_trace = tr;
149
150 trace_power_enabled = 1;
151 tracing_power_register();
152
153 for_each_cpu(cpu, cpu_possible_mask)
154 tracing_reset(tr, cpu);
155 return 0;
156}
157
158static enum print_line_t power_print_line(struct trace_iterator *iter)
159{
160 int ret = 0;
161 struct trace_entry *entry = iter->ent;
162 struct trace_power *field ;
163 struct power_trace *it;
164 struct trace_seq *s = &iter->seq;
165 struct timespec stamp;
166 struct timespec duration;
167
168 trace_assign_type(field, entry);
169 it = &field->state_data;
170 stamp = ktime_to_timespec(it->stamp);
171 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
172
173 if (entry->type == TRACE_POWER) {
174 if (it->type == POWER_CSTATE)
175 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
176 stamp.tv_sec,
177 stamp.tv_nsec,
178 it->state, iter->cpu,
179 duration.tv_sec,
180 duration.tv_nsec);
181 if (it->type == POWER_PSTATE)
182 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
183 stamp.tv_sec,
184 stamp.tv_nsec,
185 it->state, iter->cpu);
186 if (!ret)
187 return TRACE_TYPE_PARTIAL_LINE;
188 return TRACE_TYPE_HANDLED;
189 }
190 return TRACE_TYPE_UNHANDLED;
191}
192
193static void power_print_header(struct seq_file *s)
194{
195 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
196 seq_puts(s, "# | | |\n");
197}
198
199static struct tracer power_tracer __read_mostly =
200{
201 .name = "power",
202 .init = power_trace_init,
203 .start = start_power_trace,
204 .stop = stop_power_trace,
205 .reset = power_trace_reset,
206 .print_line = power_print_line,
207 .print_header = power_print_header,
208};
209
210static int init_power_trace(void)
211{
212 return register_tracer(&power_tracer);
213}
214device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -186,11 +177,6 @@ out:
186 177
187static void __wakeup_reset(struct trace_array *tr) 178static void __wakeup_reset(struct trace_array *tr)
188{ 179{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 180 wakeup_cpu = -1;
195 wakeup_prio = -1; 181 wakeup_prio = -1;
196 182
@@ -204,6 +190,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 190{
205 unsigned long flags; 191 unsigned long flags;
206 192
193 tracing_reset_online_cpus(tr);
194
207 local_irq_save(flags); 195 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 196 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 197 __wakeup_reset(tr);
@@ -247,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
247 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
248 236
249 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
250 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
251 240
252 wakeup_task = p; 241 wakeup_task = p;
@@ -296,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
296 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
297 } 286 }
298 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
299 wakeup_reset(tr); 295 wakeup_reset(tr);
300 296
301 /* 297 /*
@@ -328,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
328 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
329 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
330 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
331} 328}
332 329
333static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 6a2a9d484cd6..8504ac71e4e8 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192 192
193 (*pos)++; 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201
202 if (i >= max_stack_trace.nr_entries ||
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239
240 sprint_symbol(str, addr);
241 227
242 return seq_printf(m, "%s\n", str); 228 return seq_printf(m, "%pF\n", (void *)addr);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
@@ -313,14 +296,14 @@ static const struct file_operations stack_trace_fops = {
313 296
314int 297int
315stack_trace_sysctl(struct ctl_table *table, int write, 298stack_trace_sysctl(struct ctl_table *table, int write,
316 struct file *file, void __user *buffer, size_t *lenp, 299 void __user *buffer, size_t *lenp,
317 loff_t *ppos) 300 loff_t *ppos)
318{ 301{
319 int ret; 302 int ret;
320 303
321 mutex_lock(&stack_sysctl_mutex); 304 mutex_lock(&stack_sysctl_mutex);
322 305
323 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, buffer, lenp, ppos);
324 307
325 if (ret || !write || 308 if (ret || !write ||
326 (last_stack_tracer_enabled == !!stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index aea321c82fa0..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
@@ -200,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
200{ 203{
201 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
202 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
203 int i; 207 int i;
204 208
205 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
206 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
207 211
208 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
209 if (!*pos && session->ts->stat_headers) 213 if (session->ts->stat_headers) {
210 return SEQ_START_TOKEN; 214 if (n == 0)
215 return SEQ_START_TOKEN;
216 n--;
217 }
211 218
212 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
213 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
214 node = rb_next(node); 221 node = rb_next(node);
215 222
216 return node; 223 return node;
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..9fbce6c9d2e1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_event.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
@@ -193,58 +280,303 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
302 pr_info("event trace: Could not activate"
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec;
389 unsigned long flags;
390 char *raw_data;
391 int syscall_nr;
392 int size;
393 int cpu;
233 394
234 ret = register_ftrace_event(&syscall_enter_event); 395 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 397 return;
237 syscall_enter_event.type); 398
238 WARN_ON_ONCE(1); 399 sys_data = syscall_nr_to_meta(syscall_nr);
400 if (!sys_data)
401 return;
402
403 /* get the size after alignment with the u32 buffer size field */
404 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
405 size = ALIGN(size + sizeof(u32), sizeof(u64));
406 size -= sizeof(u32);
407
408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
409 "profile buffer not large enough"))
410 return;
411
412 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags);
414
415 cpu = smp_processor_id();
416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
424
425 raw_data = per_cpu_ptr(raw_data, cpu);
426
427 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429
430 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
437
438end:
439 local_irq_restore(flags);
440}
441
442int reg_prof_syscall_enter(char *name)
443{
444 int ret = 0;
445 int num;
446
447 num = syscall_name_to_nr(name);
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450
451 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter)
453 ret = register_trace_sys_enter(prof_syscall_enter);
454 if (ret) {
455 pr_info("event trace: Could not activate"
456 "syscall entry trace point");
457 } else {
458 set_bit(num, enabled_prof_enter_syscalls);
459 sys_prof_refcount_enter++;
239 } 460 }
461 mutex_unlock(&syscall_trace_lock);
462 return ret;
463}
240 464
241 ret = register_ftrace_event(&syscall_exit_event); 465void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 466{
243 printk(KERN_WARNING "event %d failed to register\n", 467 int num;
244 syscall_exit_event.type); 468
245 WARN_ON_ONCE(1); 469 num = syscall_name_to_nr(name);
470 if (num < 0 || num >= NR_syscalls)
471 return;
472
473 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--;
475 clear_bit(num, enabled_prof_enter_syscalls);
476 if (!sys_prof_refcount_enter)
477 unregister_trace_sys_enter(prof_syscall_enter);
478 mutex_unlock(&syscall_trace_lock);
479}
480
481static void prof_syscall_exit(struct pt_regs *regs, long ret)
482{
483 struct syscall_metadata *sys_data;
484 struct syscall_trace_exit *rec;
485 unsigned long flags;
486 int syscall_nr;
487 char *raw_data;
488 int size;
489 int cpu;
490
491 syscall_nr = syscall_get_nr(current, regs);
492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
493 return;
494
495 sys_data = syscall_nr_to_meta(syscall_nr);
496 if (!sys_data)
497 return;
498
499 /* We can probably do that at build time */
500 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
501 size -= sizeof(u32);
502
503 /*
504 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime?
506 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
508 "exit event has grown above profile buffer size"))
509 return;
510
511 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags);
513 cpu = smp_processor_id();
514
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs);
534
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
536
537end:
538 local_irq_restore(flags);
539}
540
541int reg_prof_syscall_exit(char *name)
542{
543 int ret = 0;
544 int num;
545
546 num = syscall_name_to_nr(name);
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549
550 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit)
552 ret = register_trace_sys_exit(prof_syscall_exit);
553 if (ret) {
554 pr_info("event trace: Could not activate"
555 "syscall entry trace point");
556 } else {
557 set_bit(num, enabled_prof_exit_syscalls);
558 sys_prof_refcount_exit++;
246 } 559 }
560 mutex_unlock(&syscall_trace_lock);
561 return ret;
562}
563
564void unreg_prof_syscall_exit(char *name)
565{
566 int num;
567
568 num = syscall_name_to_nr(name);
569 if (num < 0 || num >= NR_syscalls)
570 return;
247 571
248 return register_tracer(&syscall_tracer); 572 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--;
574 clear_bit(num, enabled_prof_exit_syscalls);
575 if (!sys_prof_refcount_exit)
576 unregister_trace_sys_exit(prof_syscall_exit);
577 mutex_unlock(&syscall_trace_lock);
249} 578}
250device_initcall(register_ftrace_syscalls); 579
580#endif
581
582
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
137 144
138 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 145 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
139 146
140 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) 147 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
141 ret = list_entry(workqueue_cpu_stat(cpu)->list.next, 148 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
142 struct cpu_workqueue_stats, list); 149 struct cpu_workqueue_stats, list);
150 kref_get(&ret->kref);
151 }
143 152
144 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 153 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
145 154
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
162static void *workqueue_stat_next(void *prev, int idx) 171static void *workqueue_stat_next(void *prev, int idx)
163{ 172{
164 struct cpu_workqueue_stats *prev_cws = prev; 173 struct cpu_workqueue_stats *prev_cws = prev;
174 struct cpu_workqueue_stats *ret;
165 int cpu = prev_cws->cpu; 175 int cpu = prev_cws->cpu;
166 unsigned long flags; 176 unsigned long flags;
167 void *ret = NULL;
168 177
169 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 178 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
170 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { 179 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
175 return NULL; 184 return NULL;
176 } while (!(ret = workqueue_stat_start_cpu(cpu))); 185 } while (!(ret = workqueue_stat_start_cpu(cpu)));
177 return ret; 186 return ret;
187 } else {
188 ret = list_entry(prev_cws->list.next,
189 struct cpu_workqueue_stats, list);
190 kref_get(&ret->kref);
178 } 191 }
179 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 192 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
180 193
181 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, 194 return ret;
182 list);
183} 195}
184 196
185static int workqueue_stat_show(struct seq_file *s, void *p) 197static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
203 return 0; 215 return 0;
204} 216}
205 217
218static void workqueue_stat_release(void *stat)
219{
220 struct cpu_workqueue_stats *node = stat;
221
222 kref_put(&node->kref, cpu_workqueue_stat_free);
223}
224
206static int workqueue_stat_headers(struct seq_file *s) 225static int workqueue_stat_headers(struct seq_file *s)
207{ 226{
208 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); 227 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
215 .stat_start = workqueue_stat_start, 234 .stat_start = workqueue_stat_start,
216 .stat_next = workqueue_stat_next, 235 .stat_next = workqueue_stat_next,
217 .stat_show = workqueue_stat_show, 236 .stat_show = workqueue_stat_show,
237 .stat_release = workqueue_stat_release,
218 .stat_headers = workqueue_stat_headers 238 .stat_headers = workqueue_stat_headers
219}; 239};
220 240
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..cc89be5bc0f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
24#include <linux/tracepoint.h> 24#include <linux/tracepoint.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h>
27 28
28extern struct tracepoint __start___tracepoints[]; 29extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[]; 30extern struct tracepoint __stop___tracepoints[];
@@ -47,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
47 48
48/* 49/*
49 * Note about RCU : 50 * Note about RCU :
50 * It is used to to delay the free of multiple probes array until a quiescent 51 * It is used to delay the free of multiple probes array until a quiescent
51 * state is reached. 52 * state is reached.
52 * Tracepoint entries modifications are protected by the tracepoints_mutex. 53 * Tracepoint entries modifications are protected by the tracepoints_mutex.
53 */ 54 */
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
242{ 243{
243 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 244 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
244 245
246 if (elem->regfunc && !elem->state && active)
247 elem->regfunc();
248 else if (elem->unregfunc && elem->state && !active)
249 elem->unregfunc();
250
245 /* 251 /*
246 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 252 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
247 * probe callbacks array is consistent before setting a pointer to it. 253 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
261 */ 267 */
262static void disable_tracepoint(struct tracepoint *elem) 268static void disable_tracepoint(struct tracepoint *elem)
263{ 269{
270 if (elem->unregfunc && elem->state)
271 elem->unregfunc();
272
264 elem->state = 0; 273 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL); 274 rcu_assign_pointer(elem->funcs, NULL);
266} 275}
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
554 563
555 switch (val) { 564 switch (val) {
556 case MODULE_STATE_COMING: 565 case MODULE_STATE_COMING:
557 tracepoint_update_probe_range(mod->tracepoints,
558 mod->tracepoints + mod->num_tracepoints);
559 break;
560 case MODULE_STATE_GOING: 566 case MODULE_STATE_GOING:
561 tracepoint_update_probe_range(mod->tracepoints, 567 tracepoint_update_probe_range(mod->tracepoints,
562 mod->tracepoints + mod->num_tracepoints); 568 mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
577__initcall(init_tracepoints); 583__initcall(init_tracepoints);
578 584
579#endif /* CONFIG_MODULES */ 585#endif /* CONFIG_MODULES */
586
587#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
588
589/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
590static int sys_tracepoint_refcount;
591
592void syscall_regfunc(void)
593{
594 unsigned long flags;
595 struct task_struct *g, *t;
596
597 if (!sys_tracepoint_refcount) {
598 read_lock_irqsave(&tasklist_lock, flags);
599 do_each_thread(g, t) {
600 /* Skip kernel threads. */
601 if (t->mm)
602 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
603 } while_each_thread(g, t);
604 read_unlock_irqrestore(&tasklist_lock, flags);
605 }
606 sys_tracepoint_refcount++;
607}
608
609void syscall_unregfunc(void)
610{
611 unsigned long flags;
612 struct task_struct *g, *t;
613
614 sys_tracepoint_refcount--;
615 if (!sys_tracepoint_refcount) {
616 read_lock_irqsave(&tasklist_lock, flags);
617 do_each_thread(g, t) {
618 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
619 } while_each_thread(g, t);
620 read_unlock_irqrestore(&tasklist_lock, flags);
621 }
622}
623#endif
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 0314501688b9..419209893d87 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -4,7 +4,6 @@
4 */ 4 */
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/utsname.h>
8#include <linux/mman.h> 7#include <linux/mman.h>
9#include <linux/notifier.h> 8#include <linux/notifier.h>
10#include <linux/reboot.h> 9#include <linux/reboot.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 92359cc747a7..69eae358a726 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which)
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, 45static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47{ 47{
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55} 55}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
600 * schedule_work - put work task in global workqueue 598 * schedule_work - put work task in global workqueue
601 * @work: job to be done 599 * @work: job to be done
602 * 600 *
603 * This puts a job in the kernel-global workqueue. 601 * Returns zero if @work was already on the kernel-global workqueue and
602 * non-zero otherwise.
603 *
604 * This puts a job in the kernel-global workqueue if it was not already
605 * queued and leaves it in the same position on the kernel-global
606 * workqueue otherwise.
604 */ 607 */
605int schedule_work(struct work_struct *work) 608int schedule_work(struct work_struct *work)
606{ 609{