aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorFrederic Weisbecker <fweisbec@gmail.com>2009-09-23 17:08:43 -0400
committerFrederic Weisbecker <fweisbec@gmail.com>2009-09-23 17:08:43 -0400
commitd7a4b414eed51f1653bb05ebe84122bf9a7ae18b (patch)
treebd6603a0c27de4c138a1767871897e9cd3e1a1d2 /kernel
parent1f0ab40976460bc4673fa204ce917a725185d8f2 (diff)
parenta724eada8c2a7b62463b73ccf73fd0bb6e928aeb (diff)
Merge commit 'linus/master' into tracing/kprobes
Conflicts: kernel/trace/Makefile kernel/trace/trace.h kernel/trace/trace_event_types.h kernel/trace/trace_export.c Merge reason: Sync with latest significant tracing core changes.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/acct.c8
-rw-r--r--kernel/cgroup.c9
-rw-r--r--kernel/cpu.c15
-rw-r--r--kernel/cred.c296
-rw-r--r--kernel/delayacct.c1
-rw-r--r--kernel/dma-coherent.c176
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/fork.c52
-rw-r--r--kernel/futex.c47
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/hrtimer.c97
-rw-r--r--kernel/irq/chip.c74
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/internals.h13
-rw-r--r--kernel/irq/manage.c102
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/itimer.c169
-rw-r--r--kernel/kallsyms.c3
-rw-r--r--kernel/kfifo.c2
-rw-r--r--kernel/kmod.c18
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/lockdep.c795
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c130
-rw-r--r--kernel/marker.c930
-rw-r--r--kernel/module.c29
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/perf_counter.c4861
-rw-r--r--kernel/perf_event.c5000
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/posix-cpu-timers.c155
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/Kconfig14
-rw-r--r--kernel/power/console.c63
-rw-r--r--kernel/power/hibernate.c21
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/snapshot.c414
-rw-r--r--kernel/printk.c208
-rw-r--r--kernel/profile.c45
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcuclassic.c807
-rw-r--r--kernel/rcupdate.c92
-rw-r--r--kernel/rcupreempt.c1539
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c241
-rw-r--r--kernel/rcutree.c369
-rw-r--r--kernel/rcutree.h253
-rw-r--r--kernel/rcutree_plugin.h566
-rw-r--r--kernel/rcutree_trace.c90
-rw-r--r--kernel/resource.c23
-rw-r--r--kernel/sched.c1697
-rw-r--r--kernel/sched_clock.c122
-rw-r--r--kernel/sched_cpupri.c30
-rw-r--r--kernel/sched_debug.c5
-rw-r--r--kernel/sched_fair.c522
-rw-r--r--kernel/sched_features.h122
-rw-r--r--kernel/sched_idletask.c11
-rw-r--r--kernel/sched_rt.c82
-rw-r--r--kernel/smp.c69
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/spinlock.c230
-rw-r--r--kernel/sys.c24
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c75
-rw-r--r--kernel/taskstats.c10
-rw-r--r--kernel/time.c9
-rw-r--r--kernel/time/clocksource.c529
-rw-r--r--kernel/time/jiffies.c6
-rw-r--r--kernel/time/ntp.c7
-rw-r--r--kernel/time/timekeeping.c535
-rw-r--r--kernel/timer.c67
-rw-r--r--kernel/trace/Kconfig30
-rw-r--r--kernel/trace/Makefile2
-rw-r--r--kernel/trace/ftrace.c183
-rw-r--r--kernel/trace/power-traces.c20
-rw-r--r--kernel/trace/ring_buffer.c19
-rw-r--r--kernel/trace/trace.c188
-rw-r--r--kernel/trace/trace.h280
-rw-r--r--kernel/trace/trace_boot.c8
-rw-r--r--kernel/trace/trace_clock.c24
-rw-r--r--kernel/trace/trace_entries.h366
-rw-r--r--kernel/trace/trace_event_profile.c87
-rw-r--r--kernel/trace/trace_event_types.h178
-rw-r--r--kernel/trace/trace_events.c134
-rw-r--r--kernel/trace/trace_events_filter.c41
-rw-r--r--kernel/trace/trace_export.c285
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c66
-rw-r--r--kernel/trace/trace_hw_branches.c2
-rw-r--r--kernel/trace/trace_irqsoff.c16
-rw-r--r--kernel/trace/trace_kprobe.c15
-rw-r--r--kernel/trace/trace_mmiotrace.c10
-rw-r--r--kernel/trace/trace_output.c42
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_power.c218
-rw-r--r--kernel/trace/trace_printk.c1
-rw-r--r--kernel/trace/trace_sched_wakeup.c52
-rw-r--r--kernel/trace/trace_syscalls.c99
-rw-r--r--kernel/tracepoint.c2
-rw-r--r--kernel/workqueue.c9
106 files changed, 12088 insertions, 12648 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..187c89b4783d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,26 +80,22 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
81obj-$(CONFIG_SECCOMP) += seccomp.o 81obj-$(CONFIG_SECCOMP) += seccomp.o
82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
84obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
88obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 89obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
92obj-$(CONFIG_MARKERS) += marker.o
93obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 90obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
94obj-$(CONFIG_LATENCYTOP) += latencytop.o 91obj-$(CONFIG_LATENCYTOP) += latencytop.o
95obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
96obj-$(CONFIG_FUNCTION_TRACER) += trace/ 92obj-$(CONFIG_FUNCTION_TRACER) += trace/
97obj-$(CONFIG_TRACING) += trace/ 93obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/ 94obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/ 95obj-$(CONFIG_RING_BUFFER) += trace/
100obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
101obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 98obj-$(CONFIG_PERF_EVENTS) += perf_event.o
103 99
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -119,7 +115,7 @@ $(obj)/config_data.gz: .config FORCE
119 $(call if_changed,gzip) 115 $(call if_changed,gzip)
120 116
121quiet_cmd_ikconfiggz = IKCFG $@ 117quiet_cmd_ikconfiggz = IKCFG $@
122 cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ 118 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
123targets += config_data.h 119targets += config_data.h
124$(obj)/config_data.h: $(obj)/config_data.gz FORCE 120$(obj)/config_data.h: $(obj)/config_data.gz FORCE
125 $(call if_changed,ikconfiggz) 121 $(call if_changed,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index 9f3391090b3e..9a4715a2f6bf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
491 u64 run_time; 491 u64 run_time;
492 struct timespec uptime; 492 struct timespec uptime;
493 struct tty_struct *tty; 493 struct tty_struct *tty;
494 const struct cred *orig_cred;
495
496 /* Perform file operations on behalf of whoever enabled accounting */
497 orig_cred = override_creds(file->f_cred);
494 498
495 /* 499 /*
496 * First check to see if there is enough free_space to continue 500 * First check to see if there is enough free_space to continue
497 * the process accounting system. 501 * the process accounting system.
498 */ 502 */
499 if (!check_free_space(acct, file)) 503 if (!check_free_space(acct, file))
500 return; 504 goto out;
501 505
502 /* 506 /*
503 * Fill the accounting struct with the needed info as recorded 507 * Fill the accounting struct with the needed info as recorded
@@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
578 sizeof(acct_t), &file->f_pos); 582 sizeof(acct_t), &file->f_pos);
579 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 583 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
580 set_fs(fs); 584 set_fs(fs);
585out:
586 revert_creds(orig_cred);
581} 587}
582 588
583/** 589/**
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..cd83d9933b6b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -596,10 +596,11 @@ void cgroup_unlock(void)
596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
598static int cgroup_populate_dir(struct cgroup *cgrp); 598static int cgroup_populate_dir(struct cgroup *cgrp);
599static struct inode_operations cgroup_dir_inode_operations; 599static const struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 600static struct file_operations proc_cgroupstats_operations;
601 601
602static struct backing_dev_info cgroup_backing_dev_info = { 602static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup",
603 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 604 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
604}; 605};
605 606
@@ -960,7 +961,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
960 return ret; 961 return ret;
961} 962}
962 963
963static struct super_operations cgroup_ops = { 964static const struct super_operations cgroup_ops = {
964 .statfs = simple_statfs, 965 .statfs = simple_statfs,
965 .drop_inode = generic_delete_inode, 966 .drop_inode = generic_delete_inode,
966 .show_options = cgroup_show_options, 967 .show_options = cgroup_show_options,
@@ -1710,7 +1711,7 @@ static struct file_operations cgroup_file_operations = {
1710 .release = cgroup_file_release, 1711 .release = cgroup_file_release,
1711}; 1712};
1712 1713
1713static struct inode_operations cgroup_dir_inode_operations = { 1714static const struct inode_operations cgroup_dir_inode_operations = {
1714 .lookup = simple_lookup, 1715 .lookup = simple_lookup,
1715 .mkdir = cgroup_mkdir, 1716 .mkdir = cgroup_mkdir,
1716 .rmdir = cgroup_rmdir, 1717 .rmdir = cgroup_rmdir,
@@ -2313,7 +2314,7 @@ static int cgroup_tasks_show(struct seq_file *s, void *v)
2313 return seq_printf(s, "%d\n", *(int *)v); 2314 return seq_printf(s, "%d\n", *(int *)v);
2314} 2315}
2315 2316
2316static struct seq_operations cgroup_tasks_seq_operations = { 2317static const struct seq_operations cgroup_tasks_seq_operations = {
2317 .start = cgroup_tasks_start, 2318 .start = cgroup_tasks_start,
2318 .stop = cgroup_tasks_stop, 2319 .stop = cgroup_tasks_stop,
2319 .next = cgroup_tasks_next, 2320 .next = cgroup_tasks_next,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
401 break; 401 break;
402 } 402 }
403 } 403 }
404
404 if (!error) { 405 if (!error) {
405 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
406 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
413 return error; 414 return error;
414} 415}
415 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
416void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
417{ 426{
418 int cpu, error; 427 int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
424 goto out; 433 goto out;
425 434
426 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
427 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
428 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
429 if (!error) { 441 if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
432 } 444 }
433 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
434 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
435 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
436out: 451out:
437 cpu_maps_update_done(); 452 cpu_maps_update_done();
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..d7f7a01082eb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
18#include <linux/cn_proc.h> 18#include <linux/cn_proc.h>
19#include "cred-internals.h" 19#include "cred-internals.h"
20 20
21#if 0
22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif
32
21static struct kmem_cache *cred_jar; 33static struct kmem_cache *cred_jar;
22 34
23/* 35/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
36 */ 48 */
37struct cred init_cred = { 49struct cred init_cred = {
38 .usage = ATOMIC_INIT(4), 50 .usage = ATOMIC_INIT(4),
51#ifdef CONFIG_DEBUG_CREDENTIALS
52 .subscribers = ATOMIC_INIT(2),
53 .magic = CRED_MAGIC,
54#endif
39 .securebits = SECUREBITS_DEFAULT, 55 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET, 56 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET, 57 .cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
48#endif 64#endif
49}; 65};
50 66
67static inline void set_cred_subscribers(struct cred *cred, int n)
68{
69#ifdef CONFIG_DEBUG_CREDENTIALS
70 atomic_set(&cred->subscribers, n);
71#endif
72}
73
74static inline int read_cred_subscribers(const struct cred *cred)
75{
76#ifdef CONFIG_DEBUG_CREDENTIALS
77 return atomic_read(&cred->subscribers);
78#else
79 return 0;
80#endif
81}
82
83static inline void alter_cred_subscribers(const struct cred *_cred, int n)
84{
85#ifdef CONFIG_DEBUG_CREDENTIALS
86 struct cred *cred = (struct cred *) _cred;
87
88 atomic_add(n, &cred->subscribers);
89#endif
90}
91
51/* 92/*
52 * Dispose of the shared task group credentials 93 * Dispose of the shared task group credentials
53 */ 94 */
@@ -85,15 +126,29 @@ static void put_cred_rcu(struct rcu_head *rcu)
85{ 126{
86 struct cred *cred = container_of(rcu, struct cred, rcu); 127 struct cred *cred = container_of(rcu, struct cred, rcu);
87 128
129 kdebug("put_cred_rcu(%p)", cred);
130
131#ifdef CONFIG_DEBUG_CREDENTIALS
132 if (cred->magic != CRED_MAGIC_DEAD ||
133 atomic_read(&cred->usage) != 0 ||
134 read_cred_subscribers(cred) != 0)
135 panic("CRED: put_cred_rcu() sees %p with"
136 " mag %x, put %p, usage %d, subscr %d\n",
137 cred, cred->magic, cred->put_addr,
138 atomic_read(&cred->usage),
139 read_cred_subscribers(cred));
140#else
88 if (atomic_read(&cred->usage) != 0) 141 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n", 142 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage)); 143 cred, atomic_read(&cred->usage));
144#endif
91 145
92 security_cred_free(cred); 146 security_cred_free(cred);
93 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth); 148 key_put(cred->request_key_auth);
95 release_tgcred(cred); 149 release_tgcred(cred);
96 put_group_info(cred->group_info); 150 if (cred->group_info)
151 put_group_info(cred->group_info);
97 free_uid(cred->user); 152 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred); 153 kmem_cache_free(cred_jar, cred);
99} 154}
@@ -106,12 +161,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
106 */ 161 */
107void __put_cred(struct cred *cred) 162void __put_cred(struct cred *cred)
108{ 163{
164 kdebug("__put_cred(%p{%d,%d})", cred,
165 atomic_read(&cred->usage),
166 read_cred_subscribers(cred));
167
109 BUG_ON(atomic_read(&cred->usage) != 0); 168 BUG_ON(atomic_read(&cred->usage) != 0);
169#ifdef CONFIG_DEBUG_CREDENTIALS
170 BUG_ON(read_cred_subscribers(cred) != 0);
171 cred->magic = CRED_MAGIC_DEAD;
172 cred->put_addr = __builtin_return_address(0);
173#endif
174 BUG_ON(cred == current->cred);
175 BUG_ON(cred == current->real_cred);
110 176
111 call_rcu(&cred->rcu, put_cred_rcu); 177 call_rcu(&cred->rcu, put_cred_rcu);
112} 178}
113EXPORT_SYMBOL(__put_cred); 179EXPORT_SYMBOL(__put_cred);
114 180
181/*
182 * Clean up a task's credentials when it exits
183 */
184void exit_creds(struct task_struct *tsk)
185{
186 struct cred *cred;
187
188 kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
189 atomic_read(&tsk->cred->usage),
190 read_cred_subscribers(tsk->cred));
191
192 cred = (struct cred *) tsk->real_cred;
193 tsk->real_cred = NULL;
194 validate_creds(cred);
195 alter_cred_subscribers(cred, -1);
196 put_cred(cred);
197
198 cred = (struct cred *) tsk->cred;
199 tsk->cred = NULL;
200 validate_creds(cred);
201 alter_cred_subscribers(cred, -1);
202 put_cred(cred);
203
204 cred = (struct cred *) tsk->replacement_session_keyring;
205 if (cred) {
206 tsk->replacement_session_keyring = NULL;
207 validate_creds(cred);
208 put_cred(cred);
209 }
210}
211
212/*
213 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM.
215 */
216struct cred *cred_alloc_blank(void)
217{
218 struct cred *new;
219
220 new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
221 if (!new)
222 return NULL;
223
224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) {
227 kfree(new);
228 return NULL;
229 }
230 atomic_set(&new->tgcred->usage, 1);
231#endif
232
233 atomic_set(&new->usage, 1);
234
235 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
236 goto error;
237
238#ifdef CONFIG_DEBUG_CREDENTIALS
239 new->magic = CRED_MAGIC;
240#endif
241 return new;
242
243error:
244 abort_creds(new);
245 return NULL;
246}
247
115/** 248/**
116 * prepare_creds - Prepare a new set of credentials for modification 249 * prepare_creds - Prepare a new set of credentials for modification
117 * 250 *
@@ -132,16 +265,19 @@ struct cred *prepare_creds(void)
132 const struct cred *old; 265 const struct cred *old;
133 struct cred *new; 266 struct cred *new;
134 267
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1); 268 validate_process_creds();
136 269
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL); 270 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new) 271 if (!new)
139 return NULL; 272 return NULL;
140 273
274 kdebug("prepare_creds() alloc %p", new);
275
141 old = task->cred; 276 old = task->cred;
142 memcpy(new, old, sizeof(struct cred)); 277 memcpy(new, old, sizeof(struct cred));
143 278
144 atomic_set(&new->usage, 1); 279 atomic_set(&new->usage, 1);
280 set_cred_subscribers(new, 0);
145 get_group_info(new->group_info); 281 get_group_info(new->group_info);
146 get_uid(new->user); 282 get_uid(new->user);
147 283
@@ -157,6 +293,7 @@ struct cred *prepare_creds(void)
157 293
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 294 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error; 295 goto error;
296 validate_creds(new);
160 return new; 297 return new;
161 298
162error: 299error:
@@ -229,9 +366,12 @@ struct cred *prepare_usermodehelper_creds(void)
229 if (!new) 366 if (!new)
230 return NULL; 367 return NULL;
231 368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
232 memcpy(new, &init_cred, sizeof(struct cred)); 371 memcpy(new, &init_cred, sizeof(struct cred));
233 372
234 atomic_set(&new->usage, 1); 373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
235 get_group_info(new->group_info); 375 get_group_info(new->group_info);
236 get_uid(new->user); 376 get_uid(new->user);
237 377
@@ -250,6 +390,7 @@ struct cred *prepare_usermodehelper_creds(void)
250#endif 390#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) 391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error; 392 goto error;
393 validate_creds(new);
253 394
254 BUG_ON(atomic_read(&new->usage) != 1); 395 BUG_ON(atomic_read(&new->usage) != 1);
255 return new; 396 return new;
@@ -286,6 +427,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
286 ) { 427 ) {
287 p->real_cred = get_cred(p->cred); 428 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred); 429 get_cred(p->cred);
430 alter_cred_subscribers(p->cred, 2);
431 kdebug("share_creds(%p{%d,%d})",
432 p->cred, atomic_read(&p->cred->usage),
433 read_cred_subscribers(p->cred));
289 atomic_inc(&p->cred->user->processes); 434 atomic_inc(&p->cred->user->processes);
290 return 0; 435 return 0;
291 } 436 }
@@ -331,6 +476,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
331 476
332 atomic_inc(&new->user->processes); 477 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new); 478 p->cred = p->real_cred = get_cred(new);
479 alter_cred_subscribers(new, 2);
480 validate_creds(new);
334 return 0; 481 return 0;
335 482
336error_put: 483error_put:
@@ -355,13 +502,20 @@ error_put:
355int commit_creds(struct cred *new) 502int commit_creds(struct cred *new)
356{ 503{
357 struct task_struct *task = current; 504 struct task_struct *task = current;
358 const struct cred *old; 505 const struct cred *old = task->real_cred;
359 506
360 BUG_ON(task->cred != task->real_cred); 507 kdebug("commit_creds(%p{%d,%d})", new,
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2); 508 atomic_read(&new->usage),
509 read_cred_subscribers(new));
510
511 BUG_ON(task->cred != old);
512#ifdef CONFIG_DEBUG_CREDENTIALS
513 BUG_ON(read_cred_subscribers(old) < 2);
514 validate_creds(old);
515 validate_creds(new);
516#endif
362 BUG_ON(atomic_read(&new->usage) < 1); 517 BUG_ON(atomic_read(&new->usage) < 1);
363 518
364 old = task->real_cred;
365 security_commit_creds(new, old); 519 security_commit_creds(new, old);
366 520
367 get_cred(new); /* we will require a ref for the subj creds too */ 521 get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +544,14 @@ int commit_creds(struct cred *new)
390 * cheaply with the new uid cache, so if it matters 544 * cheaply with the new uid cache, so if it matters
391 * we should be checking for it. -DaveM 545 * we should be checking for it. -DaveM
392 */ 546 */
547 alter_cred_subscribers(new, 2);
393 if (new->user != old->user) 548 if (new->user != old->user)
394 atomic_inc(&new->user->processes); 549 atomic_inc(&new->user->processes);
395 rcu_assign_pointer(task->real_cred, new); 550 rcu_assign_pointer(task->real_cred, new);
396 rcu_assign_pointer(task->cred, new); 551 rcu_assign_pointer(task->cred, new);
397 if (new->user != old->user) 552 if (new->user != old->user)
398 atomic_dec(&old->user->processes); 553 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2);
399 555
400 sched_switch_user(task); 556 sched_switch_user(task);
401 557
@@ -428,6 +584,13 @@ EXPORT_SYMBOL(commit_creds);
428 */ 584 */
429void abort_creds(struct cred *new) 585void abort_creds(struct cred *new)
430{ 586{
587 kdebug("abort_creds(%p{%d,%d})", new,
588 atomic_read(&new->usage),
589 read_cred_subscribers(new));
590
591#ifdef CONFIG_DEBUG_CREDENTIALS
592 BUG_ON(read_cred_subscribers(new) != 0);
593#endif
431 BUG_ON(atomic_read(&new->usage) < 1); 594 BUG_ON(atomic_read(&new->usage) < 1);
432 put_cred(new); 595 put_cred(new);
433} 596}
@@ -444,7 +607,20 @@ const struct cred *override_creds(const struct cred *new)
444{ 607{
445 const struct cred *old = current->cred; 608 const struct cred *old = current->cred;
446 609
447 rcu_assign_pointer(current->cred, get_cred(new)); 610 kdebug("override_creds(%p{%d,%d})", new,
611 atomic_read(&new->usage),
612 read_cred_subscribers(new));
613
614 validate_creds(old);
615 validate_creds(new);
616 get_cred(new);
617 alter_cred_subscribers(new, 1);
618 rcu_assign_pointer(current->cred, new);
619 alter_cred_subscribers(old, -1);
620
621 kdebug("override_creds() = %p{%d,%d}", old,
622 atomic_read(&old->usage),
623 read_cred_subscribers(old));
448 return old; 624 return old;
449} 625}
450EXPORT_SYMBOL(override_creds); 626EXPORT_SYMBOL(override_creds);
@@ -460,7 +636,15 @@ void revert_creds(const struct cred *old)
460{ 636{
461 const struct cred *override = current->cred; 637 const struct cred *override = current->cred;
462 638
639 kdebug("revert_creds(%p{%d,%d})", old,
640 atomic_read(&old->usage),
641 read_cred_subscribers(old));
642
643 validate_creds(old);
644 validate_creds(override);
645 alter_cred_subscribers(old, 1);
463 rcu_assign_pointer(current->cred, old); 646 rcu_assign_pointer(current->cred, old);
647 alter_cred_subscribers(override, -1);
464 put_cred(override); 648 put_cred(override);
465} 649}
466EXPORT_SYMBOL(revert_creds); 650EXPORT_SYMBOL(revert_creds);
@@ -502,11 +686,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
502 if (!new) 686 if (!new)
503 return NULL; 687 return NULL;
504 688
689 kdebug("prepare_kernel_cred() alloc %p", new);
690
505 if (daemon) 691 if (daemon)
506 old = get_task_cred(daemon); 692 old = get_task_cred(daemon);
507 else 693 else
508 old = get_cred(&init_cred); 694 old = get_cred(&init_cred);
509 695
696 validate_creds(old);
697
510 *new = *old; 698 *new = *old;
511 get_uid(new->user); 699 get_uid(new->user);
512 get_group_info(new->group_info); 700 get_group_info(new->group_info);
@@ -526,7 +714,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
526 goto error; 714 goto error;
527 715
528 atomic_set(&new->usage, 1); 716 atomic_set(&new->usage, 1);
717 set_cred_subscribers(new, 0);
529 put_cred(old); 718 put_cred(old);
719 validate_creds(new);
530 return new; 720 return new;
531 721
532error: 722error:
@@ -589,3 +779,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
589 return security_kernel_create_files_as(new, inode); 779 return security_kernel_create_files_as(new, inode);
590} 780}
591EXPORT_SYMBOL(set_create_files_as); 781EXPORT_SYMBOL(set_create_files_as);
782
783#ifdef CONFIG_DEBUG_CREDENTIALS
784
785/*
786 * dump invalid credentials
787 */
788static void dump_invalid_creds(const struct cred *cred, const char *label,
789 const struct task_struct *tsk)
790{
791 printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
792 label, cred,
793 cred == &init_cred ? "[init]" : "",
794 cred == tsk->real_cred ? "[real]" : "",
795 cred == tsk->cred ? "[eff]" : "");
796 printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
797 cred->magic, cred->put_addr);
798 printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
799 atomic_read(&cred->usage),
800 read_cred_subscribers(cred));
801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
802 cred->uid, cred->euid, cred->suid, cred->fsuid);
803 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
804 cred->gid, cred->egid, cred->sgid, cred->fsgid);
805#ifdef CONFIG_SECURITY
806 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
807 if ((unsigned long) cred->security >= PAGE_SIZE &&
808 (((unsigned long) cred->security & 0xffffff00) !=
809 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
810 printk(KERN_ERR "CRED: ->security {%x, %x}\n",
811 ((u32*)cred->security)[0],
812 ((u32*)cred->security)[1]);
813#endif
814}
815
816/*
817 * report use of invalid credentials
818 */
819void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
820{
821 printk(KERN_ERR "CRED: Invalid credentials\n");
822 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
823 dump_invalid_creds(cred, "Specified", current);
824 BUG();
825}
826EXPORT_SYMBOL(__invalid_creds);
827
828/*
829 * check the credentials on a process
830 */
831void __validate_process_creds(struct task_struct *tsk,
832 const char *file, unsigned line)
833{
834 if (tsk->cred == tsk->real_cred) {
835 if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
836 creds_are_invalid(tsk->cred)))
837 goto invalid_creds;
838 } else {
839 if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
840 read_cred_subscribers(tsk->cred) < 1 ||
841 creds_are_invalid(tsk->real_cred) ||
842 creds_are_invalid(tsk->cred)))
843 goto invalid_creds;
844 }
845 return;
846
847invalid_creds:
848 printk(KERN_ERR "CRED: Invalid process credentials\n");
849 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
850
851 dump_invalid_creds(tsk->real_cred, "Real", tsk);
852 if (tsk->cred != tsk->real_cred)
853 dump_invalid_creds(tsk->cred, "Effective", tsk);
854 else
855 printk(KERN_ERR "CRED: Effective creds == Real creds\n");
856 BUG();
857}
858EXPORT_SYMBOL(__validate_process_creds);
859
860/*
861 * check creds for do_exit()
862 */
863void validate_creds_for_do_exit(struct task_struct *tsk)
864{
865 kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
866 tsk->real_cred, tsk->cred,
867 atomic_read(&tsk->cred->usage),
868 read_cred_subscribers(tsk->cred));
869
870 __validate_process_creds(tsk, __FILE__, __LINE__);
871}
872
873#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733e..60d6fdcc9265 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -47,7 +47,7 @@
47#include <linux/tracehook.h> 47#include <linux/tracehook.h>
48#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
49#include <linux/init_task.h> 49#include <linux/init_task.h>
50#include <linux/perf_counter.h> 50#include <linux/perf_event.h>
51#include <trace/events/sched.h> 51#include <trace/events/sched.h>
52 52
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
@@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
154{ 154{
155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
156 156
157#ifdef CONFIG_PERF_COUNTERS 157#ifdef CONFIG_PERF_EVENTS
158 WARN_ON_ONCE(tsk->perf_counter_ctxp); 158 WARN_ON_ONCE(tsk->perf_event_ctxp);
159#endif 159#endif
160 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
161 put_task_struct(tsk); 161 put_task_struct(tsk);
@@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid)
359{ 359{
360 struct task_struct *curr = current->group_leader; 360 struct task_struct *curr = current->group_leader;
361 361
362 if (task_session(curr) != pid) 362 if (task_session(curr) != pid) {
363 change_pid(curr, PIDTYPE_SID, pid); 363 change_pid(curr, PIDTYPE_SID, pid);
364 proc_sid_connector(curr);
365 }
364 366
365 if (task_pgrp(curr) != pid) 367 if (task_pgrp(curr) != pid)
366 change_pid(curr, PIDTYPE_PGID, pid); 368 change_pid(curr, PIDTYPE_PGID, pid);
@@ -901,6 +903,8 @@ NORET_TYPE void do_exit(long code)
901 903
902 tracehook_report_exit(&code); 904 tracehook_report_exit(&code);
903 905
906 validate_creds_for_do_exit(tsk);
907
904 /* 908 /*
905 * We're taking recursive faults here in do_exit. Safest is to just 909 * We're taking recursive faults here in do_exit. Safest is to just
906 * leave this task alone and wait for reboot. 910 * leave this task alone and wait for reboot.
@@ -943,6 +947,8 @@ NORET_TYPE void do_exit(long code)
943 if (group_dead) { 947 if (group_dead) {
944 hrtimer_cancel(&tsk->signal->real_timer); 948 hrtimer_cancel(&tsk->signal->real_timer);
945 exit_itimers(tsk->signal); 949 exit_itimers(tsk->signal);
950 if (tsk->mm)
951 setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
946 } 952 }
947 acct_collect(code, group_dead); 953 acct_collect(code, group_dead);
948 if (group_dead) 954 if (group_dead)
@@ -979,7 +985,7 @@ NORET_TYPE void do_exit(long code)
979 * Flush inherited counters to the parent - before the parent 985 * Flush inherited counters to the parent - before the parent
980 * gets woken up by child-exit notifications. 986 * gets woken up by child-exit notifications.
981 */ 987 */
982 perf_counter_exit_task(tsk); 988 perf_event_exit_task(tsk);
983 989
984 exit_notify(tsk, group_dead); 990 exit_notify(tsk, group_dead);
985#ifdef CONFIG_NUMA 991#ifdef CONFIG_NUMA
@@ -1009,7 +1015,10 @@ NORET_TYPE void do_exit(long code)
1009 if (tsk->splice_pipe) 1015 if (tsk->splice_pipe)
1010 __free_pipe_info(tsk->splice_pipe); 1016 __free_pipe_info(tsk->splice_pipe);
1011 1017
1018 validate_creds_for_do_exit(tsk);
1019
1012 preempt_disable(); 1020 preempt_disable();
1021 exit_rcu();
1013 /* causes final put_task_struct in finish_task_switch(). */ 1022 /* causes final put_task_struct in finish_task_switch(). */
1014 tsk->state = TASK_DEAD; 1023 tsk->state = TASK_DEAD;
1015 schedule(); 1024 schedule();
@@ -1203,6 +1212,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1203 if (likely(!traced) && likely(!task_detached(p))) { 1212 if (likely(!traced) && likely(!task_detached(p))) {
1204 struct signal_struct *psig; 1213 struct signal_struct *psig;
1205 struct signal_struct *sig; 1214 struct signal_struct *sig;
1215 unsigned long maxrss;
1206 1216
1207 /* 1217 /*
1208 * The resource counters for the group leader are in its 1218 * The resource counters for the group leader are in its
@@ -1251,6 +1261,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1251 psig->coublock += 1261 psig->coublock +=
1252 task_io_get_oublock(p) + 1262 task_io_get_oublock(p) +
1253 sig->oublock + sig->coublock; 1263 sig->oublock + sig->coublock;
1264 maxrss = max(sig->maxrss, sig->cmaxrss);
1265 if (psig->cmaxrss < maxrss)
1266 psig->cmaxrss = maxrss;
1254 task_io_accounting_add(&psig->ioac, &p->ioac); 1267 task_io_accounting_add(&psig->ioac, &p->ioac);
1255 task_io_accounting_add(&psig->ioac, &sig->ioac); 1268 task_io_accounting_add(&psig->ioac, &sig->ioac);
1256 spin_unlock_irq(&p->real_parent->sighand->siglock); 1269 spin_unlock_irq(&p->real_parent->sighand->siglock);
diff --git a/kernel/fork.c b/kernel/fork.c
index e6c04d462ab2..51ad0b0b7266 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -49,6 +49,7 @@
49#include <linux/ftrace.h> 49#include <linux/ftrace.h>
50#include <linux/profile.h> 50#include <linux/profile.h>
51#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
53#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
54#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -61,7 +62,8 @@
61#include <linux/blkdev.h> 62#include <linux/blkdev.h>
62#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
63#include <linux/magic.h> 64#include <linux/magic.h>
64#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66#include <linux/posix-timers.h>
65 67
66#include <asm/pgtable.h> 68#include <asm/pgtable.h>
67#include <asm/pgalloc.h> 69#include <asm/pgalloc.h>
@@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep;
136/* SLAB cache for mm_struct structures (tsk->mm) */ 138/* SLAB cache for mm_struct structures (tsk->mm) */
137static struct kmem_cache *mm_cachep; 139static struct kmem_cache *mm_cachep;
138 140
141static void account_kernel_stack(struct thread_info *ti, int account)
142{
143 struct zone *zone = page_zone(virt_to_page(ti));
144
145 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
146}
147
139void free_task(struct task_struct *tsk) 148void free_task(struct task_struct *tsk)
140{ 149{
141 prop_local_destroy_single(&tsk->dirties); 150 prop_local_destroy_single(&tsk->dirties);
151 account_kernel_stack(tsk->stack, -1);
142 free_thread_info(tsk->stack); 152 free_thread_info(tsk->stack);
143 rt_mutex_debug_task_free(tsk); 153 rt_mutex_debug_task_free(tsk);
144 ftrace_graph_exit_task(tsk); 154 ftrace_graph_exit_task(tsk);
@@ -152,8 +162,7 @@ void __put_task_struct(struct task_struct *tsk)
152 WARN_ON(atomic_read(&tsk->usage)); 162 WARN_ON(atomic_read(&tsk->usage));
153 WARN_ON(tsk == current); 163 WARN_ON(tsk == current);
154 164
155 put_cred(tsk->real_cred); 165 exit_creds(tsk);
156 put_cred(tsk->cred);
157 delayacct_tsk_free(tsk); 166 delayacct_tsk_free(tsk);
158 167
159 if (!profile_handoff_task(tsk)) 168 if (!profile_handoff_task(tsk))
@@ -254,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
254 tsk->btrace_seq = 0; 263 tsk->btrace_seq = 0;
255#endif 264#endif
256 tsk->splice_pipe = NULL; 265 tsk->splice_pipe = NULL;
266
267 account_kernel_stack(ti, 1);
268
257 return tsk; 269 return tsk;
258 270
259out: 271out:
@@ -289,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
289 rb_link = &mm->mm_rb.rb_node; 301 rb_link = &mm->mm_rb.rb_node;
290 rb_parent = NULL; 302 rb_parent = NULL;
291 pprev = &mm->mmap; 303 pprev = &mm->mmap;
304 retval = ksm_fork(mm, oldmm);
305 if (retval)
306 goto out;
292 307
293 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 308 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
294 struct file *file; 309 struct file *file;
@@ -425,7 +440,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
425 atomic_set(&mm->mm_count, 1); 440 atomic_set(&mm->mm_count, 1);
426 init_rwsem(&mm->mmap_sem); 441 init_rwsem(&mm->mmap_sem);
427 INIT_LIST_HEAD(&mm->mmlist); 442 INIT_LIST_HEAD(&mm->mmlist);
428 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 443 mm->flags = (current->mm) ?
444 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
429 mm->core_state = NULL; 445 mm->core_state = NULL;
430 mm->nr_ptes = 0; 446 mm->nr_ptes = 0;
431 set_mm_counter(mm, file_rss, 0); 447 set_mm_counter(mm, file_rss, 0);
@@ -486,6 +502,7 @@ void mmput(struct mm_struct *mm)
486 502
487 if (atomic_dec_and_test(&mm->mm_users)) { 503 if (atomic_dec_and_test(&mm->mm_users)) {
488 exit_aio(mm); 504 exit_aio(mm);
505 ksm_exit(mm);
489 exit_mmap(mm); 506 exit_mmap(mm);
490 set_mm_exe_file(mm, NULL); 507 set_mm_exe_file(mm, NULL);
491 if (!list_empty(&mm->mmlist)) { 508 if (!list_empty(&mm->mmlist)) {
@@ -789,10 +806,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
789 thread_group_cputime_init(sig); 806 thread_group_cputime_init(sig);
790 807
791 /* Expiration times and increments. */ 808 /* Expiration times and increments. */
792 sig->it_virt_expires = cputime_zero; 809 sig->it[CPUCLOCK_PROF].expires = cputime_zero;
793 sig->it_virt_incr = cputime_zero; 810 sig->it[CPUCLOCK_PROF].incr = cputime_zero;
794 sig->it_prof_expires = cputime_zero; 811 sig->it[CPUCLOCK_VIRT].expires = cputime_zero;
795 sig->it_prof_incr = cputime_zero; 812 sig->it[CPUCLOCK_VIRT].incr = cputime_zero;
796 813
797 /* Cached expiration times. */ 814 /* Cached expiration times. */
798 sig->cputime_expires.prof_exp = cputime_zero; 815 sig->cputime_expires.prof_exp = cputime_zero;
@@ -850,6 +867,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
850 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 867 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
851 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 868 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
852 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 869 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
870 sig->maxrss = sig->cmaxrss = 0;
853 task_io_accounting_init(&sig->ioac); 871 task_io_accounting_init(&sig->ioac);
854 sig->sum_sched_runtime = 0; 872 sig->sum_sched_runtime = 0;
855 taskstats_tgid_init(sig); 873 taskstats_tgid_init(sig);
@@ -864,6 +882,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
864 882
865 tty_audit_fork(sig); 883 tty_audit_fork(sig);
866 884
885 sig->oom_adj = current->signal->oom_adj;
886
867 return 0; 887 return 0;
868} 888}
869 889
@@ -1008,10 +1028,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1008 copy_flags(clone_flags, p); 1028 copy_flags(clone_flags, p);
1009 INIT_LIST_HEAD(&p->children); 1029 INIT_LIST_HEAD(&p->children);
1010 INIT_LIST_HEAD(&p->sibling); 1030 INIT_LIST_HEAD(&p->sibling);
1011#ifdef CONFIG_PREEMPT_RCU 1031 rcu_copy_process(p);
1012 p->rcu_read_lock_nesting = 0;
1013 p->rcu_flipctr_idx = 0;
1014#endif /* #ifdef CONFIG_PREEMPT_RCU */
1015 p->vfork_done = NULL; 1032 p->vfork_done = NULL;
1016 spin_lock_init(&p->alloc_lock); 1033 spin_lock_init(&p->alloc_lock);
1017 1034
@@ -1079,10 +1096,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1079 1096
1080 p->bts = NULL; 1097 p->bts = NULL;
1081 1098
1099 p->stack_start = stack_start;
1100
1082 /* Perform scheduler related setup. Assign this task to a CPU. */ 1101 /* Perform scheduler related setup. Assign this task to a CPU. */
1083 sched_fork(p, clone_flags); 1102 sched_fork(p, clone_flags);
1084 1103
1085 retval = perf_counter_init_task(p); 1104 retval = perf_event_init_task(p);
1086 if (retval) 1105 if (retval)
1087 goto bad_fork_cleanup_policy; 1106 goto bad_fork_cleanup_policy;
1088 1107
@@ -1257,7 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1257 write_unlock_irq(&tasklist_lock); 1276 write_unlock_irq(&tasklist_lock);
1258 proc_fork_connector(p); 1277 proc_fork_connector(p);
1259 cgroup_post_fork(p); 1278 cgroup_post_fork(p);
1260 perf_counter_fork(p); 1279 perf_event_fork(p);
1261 return p; 1280 return p;
1262 1281
1263bad_fork_free_pid: 1282bad_fork_free_pid:
@@ -1284,7 +1303,7 @@ bad_fork_cleanup_semundo:
1284bad_fork_cleanup_audit: 1303bad_fork_cleanup_audit:
1285 audit_free(p); 1304 audit_free(p);
1286bad_fork_cleanup_policy: 1305bad_fork_cleanup_policy:
1287 perf_counter_free_task(p); 1306 perf_event_free_task(p);
1288#ifdef CONFIG_NUMA 1307#ifdef CONFIG_NUMA
1289 mpol_put(p->mempolicy); 1308 mpol_put(p->mempolicy);
1290bad_fork_cleanup_cgroup: 1309bad_fork_cleanup_cgroup:
@@ -1297,8 +1316,7 @@ bad_fork_cleanup_put_domain:
1297 module_put(task_thread_info(p)->exec_domain->module); 1316 module_put(task_thread_info(p)->exec_domain->module);
1298bad_fork_cleanup_count: 1317bad_fork_cleanup_count:
1299 atomic_dec(&p->cred->user->processes); 1318 atomic_dec(&p->cred->user->processes);
1300 put_cred(p->real_cred); 1319 exit_creds(p);
1301 put_cred(p->cred);
1302bad_fork_free: 1320bad_fork_free:
1303 free_task(p); 1321 free_task(p);
1304fork_out: 1322fork_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc7190..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
115 /* rt_waiter storage for requeue_pi: */ 115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 116 struct rt_mutex_waiter *rt_waiter;
117 117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key;
120
118 /* Bitset for the optional bitmasked wakeup */ 121 /* Bitset for the optional bitmasked wakeup */
119 u32 bitset; 122 u32 bitset;
120}; 123};
@@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1089 if (!top_waiter) 1092 if (!top_waiter)
1090 return 0; 1093 return 0;
1091 1094
1095 /* Ensure we requeue to the expected futex. */
1096 if (!match_futex(top_waiter->requeue_pi_key, key2))
1097 return -EINVAL;
1098
1092 /* 1099 /*
1093 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1100 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1094 * the contended case or if set_waiters is 1. The pi_state is returned 1101 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1283,12 @@ retry_private:
1276 continue; 1283 continue;
1277 } 1284 }
1278 1285
1286 /* Ensure we requeue to the expected futex for requeue_pi. */
1287 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1288 ret = -EINVAL;
1289 break;
1290 }
1291
1279 /* 1292 /*
1280 * Requeue nr_requeue waiters and possibly one more in the case 1293 * Requeue nr_requeue waiters and possibly one more in the case
1281 * of requeue_pi if we couldn't acquire the lock atomically. 1294 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1751 q.pi_state = NULL; 1764 q.pi_state = NULL;
1752 q.bitset = bitset; 1765 q.bitset = bitset;
1753 q.rt_waiter = NULL; 1766 q.rt_waiter = NULL;
1767 q.requeue_pi_key = NULL;
1754 1768
1755 if (abs_time) { 1769 if (abs_time) {
1756 to = &timeout; 1770 to = &timeout;
@@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1858 1872
1859 q.pi_state = NULL; 1873 q.pi_state = NULL;
1860 q.rt_waiter = NULL; 1874 q.rt_waiter = NULL;
1875 q.requeue_pi_key = NULL;
1861retry: 1876retry:
1862 q.key = FUTEX_KEY_INIT; 1877 q.key = FUTEX_KEY_INIT;
1863 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1878 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2118 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2133 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2119 * via the following: 2134 * via the following:
2120 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2135 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2121 * 2) wakeup on uaddr2 after a requeue and subsequent unlock 2136 * 2) wakeup on uaddr2 after a requeue
2122 * 3) signal (before or after requeue) 2137 * 3) signal
2123 * 4) timeout (before or after requeue) 2138 * 4) timeout
2124 * 2139 *
2125 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. 2140 * If 3, cleanup and return -ERESTARTNOINTR.
2126 * 2141 *
2127 * If 2, we may then block on trying to take the rt_mutex and return via: 2142 * If 2, we may then block on trying to take the rt_mutex and return via:
2128 * 5) successful lock 2143 * 5) successful lock
@@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2130 * 7) timeout 2145 * 7) timeout
2131 * 8) other lock acquisition failure 2146 * 8) other lock acquisition failure
2132 * 2147 *
2133 * If 6, we setup a restart_block with futex_lock_pi() as the function. 2148 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2134 * 2149 *
2135 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2150 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2136 * 2151 *
@@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2169 debug_rt_mutex_init_waiter(&rt_waiter); 2184 debug_rt_mutex_init_waiter(&rt_waiter);
2170 rt_waiter.task = NULL; 2185 rt_waiter.task = NULL;
2171 2186
2172 q.pi_state = NULL;
2173 q.bitset = bitset;
2174 q.rt_waiter = &rt_waiter;
2175
2176 key2 = FUTEX_KEY_INIT; 2187 key2 = FUTEX_KEY_INIT;
2177 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2188 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2178 if (unlikely(ret != 0)) 2189 if (unlikely(ret != 0))
2179 goto out; 2190 goto out;
2180 2191
2192 q.pi_state = NULL;
2193 q.bitset = bitset;
2194 q.rt_waiter = &rt_waiter;
2195 q.requeue_pi_key = &key2;
2196
2181 /* Prepare to wait on uaddr. */ 2197 /* Prepare to wait on uaddr. */
2182 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2198 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2183 if (ret) 2199 if (ret)
@@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2248 rt_mutex_unlock(pi_mutex); 2264 rt_mutex_unlock(pi_mutex);
2249 } else if (ret == -EINTR) { 2265 } else if (ret == -EINTR) {
2250 /* 2266 /*
2251 * We've already been requeued, but we have no way to 2267 * We've already been requeued, but cannot restart by calling
2252 * restart by calling futex_lock_pi() directly. We 2268 * futex_lock_pi() directly. We could restart this syscall, but
2253 * could restart the syscall, but that will look at 2269 * it would detect that the user space "val" changed and return
2254 * the user space value and return right away. So we 2270 * -EWOULDBLOCK. Save the overhead of the restart and return
2255 * drop back with EWOULDBLOCK to tell user space that 2271 * -EWOULDBLOCK directly.
2256 * "val" has been changed. That's the same what the
2257 * restart of the syscall would do in
2258 * futex_wait_setup().
2259 */ 2272 */
2260 ret = -EWOULDBLOCK; 2273 ret = -EWOULDBLOCK;
2261 } 2274 }
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..654efd09f6a9 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 37 depends on S390 || X86 || (PPC && EXPERIMENTAL)
38 default n 38 default n
39 ---help--- 39 ---help---
40 This options activates profiling for the entire kernel. 40 This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab8486..e5d98ce50f89 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,36 +48,7 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/** 51#include <trace/events/timer.h>
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81 52
82/* 53/*
83 * The timer bases: 54 * The timer bases:
@@ -106,31 +77,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 77 }
107}; 78};
108 79
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 80/*
135 * Get the coarse grained time at the softirq based on xtime and 81 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 82 * wall_to_monotonic.
@@ -485,6 +431,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
485 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 431 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
486 __hrtimer_init(timer, clock_id, mode); 432 __hrtimer_init(timer, clock_id, mode);
487} 433}
434EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
488 435
489void destroy_hrtimer_on_stack(struct hrtimer *timer) 436void destroy_hrtimer_on_stack(struct hrtimer *timer)
490{ 437{
@@ -497,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { }
497static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } 444static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
498#endif 445#endif
499 446
447static inline void
448debug_init(struct hrtimer *timer, clockid_t clockid,
449 enum hrtimer_mode mode)
450{
451 debug_hrtimer_init(timer);
452 trace_hrtimer_init(timer, clockid, mode);
453}
454
455static inline void debug_activate(struct hrtimer *timer)
456{
457 debug_hrtimer_activate(timer);
458 trace_hrtimer_start(timer);
459}
460
461static inline void debug_deactivate(struct hrtimer *timer)
462{
463 debug_hrtimer_deactivate(timer);
464 trace_hrtimer_cancel(timer);
465}
466
500/* High resolution timer related functions */ 467/* High resolution timer related functions */
501#ifdef CONFIG_HIGH_RES_TIMERS 468#ifdef CONFIG_HIGH_RES_TIMERS
502 469
@@ -853,7 +820,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
853 struct hrtimer *entry; 820 struct hrtimer *entry;
854 int leftmost = 1; 821 int leftmost = 1;
855 822
856 debug_hrtimer_activate(timer); 823 debug_activate(timer);
857 824
858 /* 825 /*
859 * Find the right place in the rbtree: 826 * Find the right place in the rbtree:
@@ -939,7 +906,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
939 * reprogramming happens in the interrupt handler. This is a 906 * reprogramming happens in the interrupt handler. This is a
940 * rare case and less expensive than a smp call. 907 * rare case and less expensive than a smp call.
941 */ 908 */
942 debug_hrtimer_deactivate(timer); 909 debug_deactivate(timer);
943 timer_stats_hrtimer_clear_start_info(timer); 910 timer_stats_hrtimer_clear_start_info(timer);
944 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 911 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
945 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 912 __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
@@ -1154,7 +1121,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1154 clock_id = CLOCK_MONOTONIC; 1121 clock_id = CLOCK_MONOTONIC;
1155 1122
1156 timer->base = &cpu_base->clock_base[clock_id]; 1123 timer->base = &cpu_base->clock_base[clock_id];
1157 INIT_LIST_HEAD(&timer->cb_entry);
1158 hrtimer_init_timer_hres(timer); 1124 hrtimer_init_timer_hres(timer);
1159 1125
1160#ifdef CONFIG_TIMER_STATS 1126#ifdef CONFIG_TIMER_STATS
@@ -1173,7 +1139,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1173void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, 1139void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1174 enum hrtimer_mode mode) 1140 enum hrtimer_mode mode)
1175{ 1141{
1176 debug_hrtimer_init(timer); 1142 debug_init(timer, clock_id, mode);
1177 __hrtimer_init(timer, clock_id, mode); 1143 __hrtimer_init(timer, clock_id, mode);
1178} 1144}
1179EXPORT_SYMBOL_GPL(hrtimer_init); 1145EXPORT_SYMBOL_GPL(hrtimer_init);
@@ -1197,7 +1163,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1197} 1163}
1198EXPORT_SYMBOL_GPL(hrtimer_get_res); 1164EXPORT_SYMBOL_GPL(hrtimer_get_res);
1199 1165
1200static void __run_hrtimer(struct hrtimer *timer) 1166static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1201{ 1167{
1202 struct hrtimer_clock_base *base = timer->base; 1168 struct hrtimer_clock_base *base = timer->base;
1203 struct hrtimer_cpu_base *cpu_base = base->cpu_base; 1169 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
@@ -1206,7 +1172,7 @@ static void __run_hrtimer(struct hrtimer *timer)
1206 1172
1207 WARN_ON(!irqs_disabled()); 1173 WARN_ON(!irqs_disabled());
1208 1174
1209 debug_hrtimer_deactivate(timer); 1175 debug_deactivate(timer);
1210 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); 1176 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1211 timer_stats_account_hrtimer(timer); 1177 timer_stats_account_hrtimer(timer);
1212 fn = timer->function; 1178 fn = timer->function;
@@ -1217,7 +1183,9 @@ static void __run_hrtimer(struct hrtimer *timer)
1217 * the timer base. 1183 * the timer base.
1218 */ 1184 */
1219 spin_unlock(&cpu_base->lock); 1185 spin_unlock(&cpu_base->lock);
1186 trace_hrtimer_expire_entry(timer, now);
1220 restart = fn(timer); 1187 restart = fn(timer);
1188 trace_hrtimer_expire_exit(timer);
1221 spin_lock(&cpu_base->lock); 1189 spin_lock(&cpu_base->lock);
1222 1190
1223 /* 1191 /*
@@ -1328,7 +1296,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1328 break; 1296 break;
1329 } 1297 }
1330 1298
1331 __run_hrtimer(timer); 1299 __run_hrtimer(timer, &basenow);
1332 } 1300 }
1333 base++; 1301 base++;
1334 } 1302 }
@@ -1450,7 +1418,7 @@ void hrtimer_run_queues(void)
1450 hrtimer_get_expires_tv64(timer)) 1418 hrtimer_get_expires_tv64(timer))
1451 break; 1419 break;
1452 1420
1453 __run_hrtimer(timer); 1421 __run_hrtimer(timer, &base->softirq_time);
1454 } 1422 }
1455 spin_unlock(&cpu_base->lock); 1423 spin_unlock(&cpu_base->lock);
1456 } 1424 }
@@ -1477,6 +1445,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1477 sl->timer.function = hrtimer_wakeup; 1445 sl->timer.function = hrtimer_wakeup;
1478 sl->task = task; 1446 sl->task = task;
1479} 1447}
1448EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1480 1449
1481static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1450static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1482{ 1451{
@@ -1626,7 +1595,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1626 while ((node = rb_first(&old_base->active))) { 1595 while ((node = rb_first(&old_base->active))) {
1627 timer = rb_entry(node, struct hrtimer, node); 1596 timer = rb_entry(node, struct hrtimer, node);
1628 BUG_ON(hrtimer_callback_running(timer)); 1597 BUG_ON(hrtimer_callback_running(timer));
1629 debug_hrtimer_deactivate(timer); 1598 debug_deactivate(timer);
1630 1599
1631 /* 1600 /*
1632 * Mark it as STATE_MIGRATE not INACTIVE otherwise the 1601 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
222} 222}
223EXPORT_SYMBOL(set_irq_chip_data); 223EXPORT_SYMBOL(set_irq_chip_data);
224 224
225/**
226 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
227 *
228 * @irq: Interrupt number
229 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
230 *
231 * The IRQ_NESTED_THREAD flag indicates that on
232 * request_threaded_irq() no separate interrupt thread should be
233 * created for the irq as the handler are called nested in the
234 * context of a demultiplexing interrupt handler thread.
235 */
236void set_irq_nested_thread(unsigned int irq, int nest)
237{
238 struct irq_desc *desc = irq_to_desc(irq);
239 unsigned long flags;
240
241 if (!desc)
242 return;
243
244 spin_lock_irqsave(&desc->lock, flags);
245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD;
247 else
248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags);
250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252
225/* 253/*
226 * default enable function 254 * default enable function
227 */ 255 */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
299 } 327 }
300} 328}
301 329
330/*
331 * handle_nested_irq - Handle a nested irq from a irq thread
332 * @irq: the interrupt number
333 *
334 * Handle interrupts which are nested into a threaded interrupt
335 * handler. The handler function is called inside the calling
336 * threads context.
337 */
338void handle_nested_irq(unsigned int irq)
339{
340 struct irq_desc *desc = irq_to_desc(irq);
341 struct irqaction *action;
342 irqreturn_t action_ret;
343
344 might_sleep();
345
346 spin_lock_irq(&desc->lock);
347
348 kstat_incr_irqs_this_cpu(irq, desc);
349
350 action = desc->action;
351 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
352 goto out_unlock;
353
354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock);
356
357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret);
360
361 spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS;
363
364out_unlock:
365 spin_unlock_irq(&desc->lock);
366}
367EXPORT_SYMBOL_GPL(handle_nested_irq);
368
302/** 369/**
303 * handle_simple_irq - Simple and software-decoded IRQs. 370 * handle_simple_irq - Simple and software-decoded IRQs.
304 * @irq: the interrupt number 371 * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
382 449
383 spin_lock(&desc->lock); 450 spin_lock(&desc->lock);
384 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
385 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 452
453 if (unlikely(desc->status & IRQ_ONESHOT))
454 desc->status |= IRQ_MASKED;
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
386 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
387out_unlock: 457out_unlock:
388 spin_unlock(&desc->lock); 458 spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
572 desc->chip = &dummy_irq_chip; 642 desc->chip = &dummy_irq_chip;
573 } 643 }
574 644
645 chip_bus_lock(irq, desc);
575 spin_lock_irqsave(&desc->lock, flags); 646 spin_lock_irqsave(&desc->lock, flags);
576 647
577 /* Uninstall? */ 648 /* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
591 desc->chip->startup(irq); 662 desc->chip->startup(irq);
592 } 663 }
593 spin_unlock_irqrestore(&desc->lock, flags); 664 spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc);
594} 666}
595EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
596 668
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
161 161
162 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
163 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node; 164 node = first_online_node;
165 165
166 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
172 172
173 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
174 desc[i].irq = i; 174 desc[i].irq = i;
175#ifdef CONFIG_SMP
176 desc[i].node = node;
177#endif
175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 178 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 179 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
177 alloc_desc_masks(&desc[i], node, true); 180 alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb9..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void irq_set_thread_affinity(struct irq_desc *desc); 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46 46
47/* Inline functions for support of irq chips on slow busses */
48static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
49{
50 if (unlikely(desc->chip->bus_lock))
51 desc->chip->bus_lock(irq);
52}
53
54static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
55{
56 if (unlikely(desc->chip->bus_sync_unlock))
57 desc->chip->bus_sync_unlock(irq);
58}
59
47/* 60/*
48 * Debugging printout: 61 * Debugging printout:
49 */ 62 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0ec9ed831737..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
230 if (!desc) 230 if (!desc)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc);
233 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
234 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
235 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
236} 238}
237EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
238 240
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
294 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
295 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
296 * 298 *
297 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
298 */ 301 */
299void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
300{ 303{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
304 if (!desc) 307 if (!desc)
305 return; 308 return;
306 309
310 chip_bus_lock(irq, desc);
307 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
308 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
309 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
310} 315}
311EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
312 317
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 return ret; 441 return ret;
437} 442}
438 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
439static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
440{ 465{
441 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
451 return -1; 476 return -1;
452} 477}
453 478
479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler finished. unmask if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
454#ifdef CONFIG_SMP 496#ifdef CONFIG_SMP
455/* 497/*
456 * Check whether we need to change the affinity of the interrupt thread. 498 * Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
492 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
493 struct irqaction *action = data; 535 struct irqaction *action = data;
494 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
495 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
496 538
497 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
498 current->irqaction = action; 540 current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
518 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
519 561
520 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
521 } 566 }
522 567
523 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
565 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
566 const char *old_name = NULL; 611 const char *old_name = NULL;
567 unsigned long flags; 612 unsigned long flags;
568 int shared = 0; 613 int nested, shared = 0;
569 int ret; 614 int ret;
570 615
571 if (!desc) 616 if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
590 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
591 } 636 }
592 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided from
652 * the driver for non nested interrupt handling by the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
593 /* 658 /*
594 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
595 */ 662 */
596 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
597 struct task_struct *t; 664 struct task_struct *t;
598 665
599 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
662 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
663#endif 730#endif
664 731
665 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
666 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
667 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
668 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
669 desc->depth = 0; 739 desc->depth = 0;
670 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
875 */ 945 */
876void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
877{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
878 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
879} 956}
880EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
881 958
@@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
884 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
885 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
886 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
887 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
888 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
889 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
963 1042
964 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
965 return -EINVAL; 1044 return -EINVAL;
966 if (!handler) 1045
967 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
968 1051
969 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
970 if (!action) 1053 if (!action)
@@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
976 action->name = devname; 1059 action->name = devname;
977 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
978 1061
1062 chip_bus_lock(irq, desc);
979 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
980 if (retval) 1066 if (retval)
981 kfree(action); 1067 kfree(action);
982 1068
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 58762f7077ec..b03451ede528 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/posix-timers.h> 13#include <linux/posix-timers.h>
14#include <linux/hrtimer.h> 14#include <linux/hrtimer.h>
15#include <trace/events/timer.h>
15 16
16#include <asm/uaccess.h> 17#include <asm/uaccess.h>
17 18
@@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer)
41 return ktime_to_timeval(rem); 42 return ktime_to_timeval(rem);
42} 43}
43 44
45static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
46 struct itimerval *const value)
47{
48 cputime_t cval, cinterval;
49 struct cpu_itimer *it = &tsk->signal->it[clock_id];
50
51 spin_lock_irq(&tsk->sighand->siglock);
52
53 cval = it->expires;
54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) {
56 struct task_cputime cputime;
57 cputime_t t;
58
59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime);
62 else
63 /* CPUCLOCK_VIRT */
64 t = cputime.utime;
65
66 if (cputime_le(cval, t))
67 /* about to fire */
68 cval = cputime_one_jiffy;
69 else
70 cval = cputime_sub(cval, t);
71 }
72
73 spin_unlock_irq(&tsk->sighand->siglock);
74
75 cputime_to_timeval(cval, &value->it_value);
76 cputime_to_timeval(cinterval, &value->it_interval);
77}
78
44int do_getitimer(int which, struct itimerval *value) 79int do_getitimer(int which, struct itimerval *value)
45{ 80{
46 struct task_struct *tsk = current; 81 struct task_struct *tsk = current;
47 cputime_t cinterval, cval;
48 82
49 switch (which) { 83 switch (which) {
50 case ITIMER_REAL: 84 case ITIMER_REAL:
@@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 89 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 90 break;
57 case ITIMER_VIRTUAL: 91 case ITIMER_VIRTUAL:
58 spin_lock_irq(&tsk->sighand->siglock); 92 get_cpu_itimer(tsk, CPUCLOCK_VIRT, value);
59 cval = tsk->signal->it_virt_expires;
60 cinterval = tsk->signal->it_virt_incr;
61 if (!cputime_eq(cval, cputime_zero)) {
62 struct task_cputime cputime;
63 cputime_t utime;
64
65 thread_group_cputimer(tsk, &cputime);
66 utime = cputime.utime;
67 if (cputime_le(cval, utime)) { /* about to fire */
68 cval = jiffies_to_cputime(1);
69 } else {
70 cval = cputime_sub(cval, utime);
71 }
72 }
73 spin_unlock_irq(&tsk->sighand->siglock);
74 cputime_to_timeval(cval, &value->it_value);
75 cputime_to_timeval(cinterval, &value->it_interval);
76 break; 93 break;
77 case ITIMER_PROF: 94 case ITIMER_PROF:
78 spin_lock_irq(&tsk->sighand->siglock); 95 get_cpu_itimer(tsk, CPUCLOCK_PROF, value);
79 cval = tsk->signal->it_prof_expires;
80 cinterval = tsk->signal->it_prof_incr;
81 if (!cputime_eq(cval, cputime_zero)) {
82 struct task_cputime times;
83 cputime_t ptime;
84
85 thread_group_cputimer(tsk, &times);
86 ptime = cputime_add(times.utime, times.stime);
87 if (cputime_le(cval, ptime)) { /* about to fire */
88 cval = jiffies_to_cputime(1);
89 } else {
90 cval = cputime_sub(cval, ptime);
91 }
92 }
93 spin_unlock_irq(&tsk->sighand->siglock);
94 cputime_to_timeval(cval, &value->it_value);
95 cputime_to_timeval(cinterval, &value->it_interval);
96 break; 96 break;
97 default: 97 default:
98 return(-EINVAL); 98 return(-EINVAL);
@@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
123 struct signal_struct *sig = 123 struct signal_struct *sig =
124 container_of(timer, struct signal_struct, real_timer); 124 container_of(timer, struct signal_struct, real_timer);
125 125
126 trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
126 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); 127 kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
127 128
128 return HRTIMER_NORESTART; 129 return HRTIMER_NORESTART;
129} 130}
130 131
132static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns)
133{
134 struct timespec ts;
135 s64 cpu_ns;
136
137 cputime_to_timespec(ct, &ts);
138 cpu_ns = timespec_to_ns(&ts);
139
140 return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns;
141}
142
143static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
144 const struct itimerval *const value,
145 struct itimerval *const ovalue)
146{
147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150
151 nval = timeval_to_cputime(&value->it_value);
152 ns_nval = timeval_to_ns(&value->it_value);
153 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval);
155
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval);
157 it->error = cputime_sub_ns(nval, ns_nval);
158
159 spin_lock_irq(&tsk->sighand->siglock);
160
161 cval = it->expires;
162 cinterval = it->incr;
163 if (!cputime_eq(cval, cputime_zero) ||
164 !cputime_eq(nval, cputime_zero)) {
165 if (cputime_gt(nval, cputime_zero))
166 nval = cputime_add(nval, cputime_one_jiffy);
167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
168 }
169 it->expires = nval;
170 it->incr = ninterval;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173
174 spin_unlock_irq(&tsk->sighand->siglock);
175
176 if (ovalue) {
177 cputime_to_timeval(cval, &ovalue->it_value);
178 cputime_to_timeval(cinterval, &ovalue->it_interval);
179 }
180}
181
131/* 182/*
132 * Returns true if the timeval is in canonical form 183 * Returns true if the timeval is in canonical form
133 */ 184 */
@@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
139 struct task_struct *tsk = current; 190 struct task_struct *tsk = current;
140 struct hrtimer *timer; 191 struct hrtimer *timer;
141 ktime_t expires; 192 ktime_t expires;
142 cputime_t cval, cinterval, nval, ninterval;
143 193
144 /* 194 /*
145 * Validate the timevals in value. 195 * Validate the timevals in value.
@@ -171,51 +221,14 @@ again:
171 } else 221 } else
172 tsk->signal->it_real_incr.tv64 = 0; 222 tsk->signal->it_real_incr.tv64 = 0;
173 223
224 trace_itimer_state(ITIMER_REAL, value, 0);
174 spin_unlock_irq(&tsk->sighand->siglock); 225 spin_unlock_irq(&tsk->sighand->siglock);
175 break; 226 break;
176 case ITIMER_VIRTUAL: 227 case ITIMER_VIRTUAL:
177 nval = timeval_to_cputime(&value->it_value); 228 set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue);
178 ninterval = timeval_to_cputime(&value->it_interval);
179 spin_lock_irq(&tsk->sighand->siglock);
180 cval = tsk->signal->it_virt_expires;
181 cinterval = tsk->signal->it_virt_incr;
182 if (!cputime_eq(cval, cputime_zero) ||
183 !cputime_eq(nval, cputime_zero)) {
184 if (cputime_gt(nval, cputime_zero))
185 nval = cputime_add(nval,
186 jiffies_to_cputime(1));
187 set_process_cpu_timer(tsk, CPUCLOCK_VIRT,
188 &nval, &cval);
189 }
190 tsk->signal->it_virt_expires = nval;
191 tsk->signal->it_virt_incr = ninterval;
192 spin_unlock_irq(&tsk->sighand->siglock);
193 if (ovalue) {
194 cputime_to_timeval(cval, &ovalue->it_value);
195 cputime_to_timeval(cinterval, &ovalue->it_interval);
196 }
197 break; 229 break;
198 case ITIMER_PROF: 230 case ITIMER_PROF:
199 nval = timeval_to_cputime(&value->it_value); 231 set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue);
200 ninterval = timeval_to_cputime(&value->it_interval);
201 spin_lock_irq(&tsk->sighand->siglock);
202 cval = tsk->signal->it_prof_expires;
203 cinterval = tsk->signal->it_prof_incr;
204 if (!cputime_eq(cval, cputime_zero) ||
205 !cputime_eq(nval, cputime_zero)) {
206 if (cputime_gt(nval, cputime_zero))
207 nval = cputime_add(nval,
208 jiffies_to_cputime(1));
209 set_process_cpu_timer(tsk, CPUCLOCK_PROF,
210 &nval, &cval);
211 }
212 tsk->signal->it_prof_expires = nval;
213 tsk->signal->it_prof_incr = ninterval;
214 spin_unlock_irq(&tsk->sighand->siglock);
215 if (ovalue) {
216 cputime_to_timeval(cval, &ovalue->it_value);
217 cputime_to_timeval(cinterval, &ovalue->it_interval);
218 }
219 break; 232 break;
220 default: 233 default:
221 return -EINVAL; 234 return -EINVAL;
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3a29dbe7898e..8b6b8b697c68 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr)
59 59
60static inline int is_kernel_text(unsigned long addr) 60static inline int is_kernel_text(unsigned long addr)
61{ 61{
62 if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) 62 if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
63 arch_is_kernel_text(addr))
63 return 1; 64 return 1;
64 return in_gate_area_no_task(addr); 65 return in_gate_area_no_task(addr);
65} 66}
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a92280870e30..689d20f39305 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
83 va_start(args, fmt); 87 va_start(args, fmt);
84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
85 va_end(args); 89 va_end(args);
@@ -139,6 +143,7 @@ struct subprocess_info {
139static int ____call_usermodehelper(void *data) 143static int ____call_usermodehelper(void *data)
140{ 144{
141 struct subprocess_info *sub_info = data; 145 struct subprocess_info *sub_info = data;
146 enum umh_wait wait = sub_info->wait;
142 int retval; 147 int retval;
143 148
144 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 149 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
@@ -180,10 +185,14 @@ static int ____call_usermodehelper(void *data)
180 */ 185 */
181 set_user_nice(current, 0); 186 set_user_nice(current, 0);
182 187
188 if (wait == UMH_WAIT_EXEC)
189 complete(sub_info->complete);
190
183 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 191 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
184 192
185 /* Exec failed? */ 193 /* Exec failed? */
186 sub_info->retval = retval; 194 if (wait != UMH_WAIT_EXEC)
195 sub_info->retval = retval;
187 do_exit(0); 196 do_exit(0);
188} 197}
189 198
@@ -262,16 +271,14 @@ static void __call_usermodehelper(struct work_struct *work)
262 271
263 switch (wait) { 272 switch (wait) {
264 case UMH_NO_WAIT: 273 case UMH_NO_WAIT:
274 case UMH_WAIT_EXEC:
265 break; 275 break;
266 276
267 case UMH_WAIT_PROC: 277 case UMH_WAIT_PROC:
268 if (pid > 0) 278 if (pid > 0)
269 break; 279 break;
270 sub_info->retval = pid; 280 sub_info->retval = pid;
271 /* FALLTHROUGH */ 281 break;
272
273 case UMH_WAIT_EXEC:
274 complete(sub_info->complete);
275 } 282 }
276} 283}
277 284
@@ -466,6 +473,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
466 int retval = 0; 473 int retval = 0;
467 474
468 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 475 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
476 validate_creds(sub_info->cred);
469 477
470 helper_lock(); 478 helper_lock();
471 if (sub_info->path[0] == '\0') 479 if (sub_info->path[0] == '\0')
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b946761f84bd..b466afa4e148 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1349,7 +1349,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
1349 return 0; 1349 return 0;
1350} 1350}
1351 1351
1352static struct seq_operations kprobes_seq_ops = { 1352static const struct seq_operations kprobes_seq_ops = {
1353 .start = kprobe_seq_start, 1353 .start = kprobe_seq_start,
1354 .next = kprobe_seq_next, 1354 .next = kprobe_seq_next,
1355 .stop = kprobe_seq_stop, 1355 .stop = kprobe_seq_stop,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 220 ignore_signals(tsk);
224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
227 223
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..3815ac1d58b2 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
371 * Some daft arches put -1 at the end to indicate it's a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -636,6 +578,9 @@ static int static_obj(void *obj)
636 if ((addr >= start) && (addr < end)) 578 if ((addr >= start) && (addr < end))
637 return 1; 579 return 1;
638 580
581 if (arch_is_kernel_data(addr))
582 return 1;
583
639#ifdef CONFIG_SMP 584#ifdef CONFIG_SMP
640 /* 585 /*
641 * percpu var? 586 * percpu var?
@@ -898,22 +843,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 843}
899 844
900/* 845/*
846 * For efficient modulo arithmetic, the queue size is a power of 2
847 */
848#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
849#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
850
851/*
852 * The circular_queue and its helpers are used to implement the
853 * breadth-first search (BFS) algorithm, by which we can build
854 * the shortest path from the next lock to be acquired to the
855 * previously held lock if there is a circular dependency between them.
856 */
857struct circular_queue {
858 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
859 unsigned int front, rear;
860};
861
862static struct circular_queue lock_cq;
863
864unsigned int max_bfs_queue_depth;
865
866static unsigned int lockdep_dependency_gen_id;
867
868static inline void __cq_init(struct circular_queue *cq)
869{
870 cq->front = cq->rear = 0;
871 lockdep_dependency_gen_id++;
872}
873
874static inline int __cq_empty(struct circular_queue *cq)
875{
876 return (cq->front == cq->rear);
877}
878
879static inline int __cq_full(struct circular_queue *cq)
880{
881 return ((cq->rear + 1) & CQ_MASK) == cq->front;
882}
883
884static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
885{
886 if (__cq_full(cq))
887 return -1;
888
889 cq->element[cq->rear] = elem;
890 cq->rear = (cq->rear + 1) & CQ_MASK;
891 return 0;
892}
893
894static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
895{
896 if (__cq_empty(cq))
897 return -1;
898
899 *elem = cq->element[cq->front];
900 cq->front = (cq->front + 1) & CQ_MASK;
901 return 0;
902}
903
904static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
905{
906 return (cq->rear - cq->front) & CQ_MASK;
907}
908
909static inline void mark_lock_accessed(struct lock_list *lock,
910 struct lock_list *parent)
911{
912 unsigned long nr;
913
914 nr = lock - list_entries;
915 WARN_ON(nr >= nr_list_entries);
916 lock->parent = parent;
917 lock->class->dep_gen_id = lockdep_dependency_gen_id;
918}
919
920static inline unsigned long lock_accessed(struct lock_list *lock)
921{
922 unsigned long nr;
923
924 nr = lock - list_entries;
925 WARN_ON(nr >= nr_list_entries);
926 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
927}
928
929static inline struct lock_list *get_lock_parent(struct lock_list *child)
930{
931 return child->parent;
932}
933
934static inline int get_lock_depth(struct lock_list *child)
935{
936 int depth = 0;
937 struct lock_list *parent;
938
939 while ((parent = get_lock_parent(child))) {
940 child = parent;
941 depth++;
942 }
943 return depth;
944}
945
946static int __bfs(struct lock_list *source_entry,
947 void *data,
948 int (*match)(struct lock_list *entry, void *data),
949 struct lock_list **target_entry,
950 int forward)
951{
952 struct lock_list *entry;
953 struct list_head *head;
954 struct circular_queue *cq = &lock_cq;
955 int ret = 1;
956
957 if (match(source_entry, data)) {
958 *target_entry = source_entry;
959 ret = 0;
960 goto exit;
961 }
962
963 if (forward)
964 head = &source_entry->class->locks_after;
965 else
966 head = &source_entry->class->locks_before;
967
968 if (list_empty(head))
969 goto exit;
970
971 __cq_init(cq);
972 __cq_enqueue(cq, (unsigned long)source_entry);
973
974 while (!__cq_empty(cq)) {
975 struct lock_list *lock;
976
977 __cq_dequeue(cq, (unsigned long *)&lock);
978
979 if (!lock->class) {
980 ret = -2;
981 goto exit;
982 }
983
984 if (forward)
985 head = &lock->class->locks_after;
986 else
987 head = &lock->class->locks_before;
988
989 list_for_each_entry(entry, head, entry) {
990 if (!lock_accessed(entry)) {
991 unsigned int cq_depth;
992 mark_lock_accessed(entry, lock);
993 if (match(entry, data)) {
994 *target_entry = entry;
995 ret = 0;
996 goto exit;
997 }
998
999 if (__cq_enqueue(cq, (unsigned long)entry)) {
1000 ret = -1;
1001 goto exit;
1002 }
1003 cq_depth = __cq_get_elem_count(cq);
1004 if (max_bfs_queue_depth < cq_depth)
1005 max_bfs_queue_depth = cq_depth;
1006 }
1007 }
1008 }
1009exit:
1010 return ret;
1011}
1012
1013static inline int __bfs_forwards(struct lock_list *src_entry,
1014 void *data,
1015 int (*match)(struct lock_list *entry, void *data),
1016 struct lock_list **target_entry)
1017{
1018 return __bfs(src_entry, data, match, target_entry, 1);
1019
1020}
1021
1022static inline int __bfs_backwards(struct lock_list *src_entry,
1023 void *data,
1024 int (*match)(struct lock_list *entry, void *data),
1025 struct lock_list **target_entry)
1026{
1027 return __bfs(src_entry, data, match, target_entry, 0);
1028
1029}
1030
1031/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1032 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1033 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1034 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1035 */
909static struct held_lock *check_source, *check_target;
910 1036
911/* 1037/*
912 * Print a dependency chain entry (this is only done when a deadlock 1038 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1039 * has been detected):
914 */ 1040 */
915static noinline int 1041static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1042print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1043{
918 if (debug_locks_silent) 1044 if (debug_locks_silent)
919 return 0; 1045 return 0;
@@ -930,11 +1056,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1056 * header first:
931 */ 1057 */
932static noinline int 1058static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1059print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1060 struct held_lock *check_src,
1061 struct held_lock *check_tgt)
934{ 1062{
935 struct task_struct *curr = current; 1063 struct task_struct *curr = current;
936 1064
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1065 if (debug_locks_silent)
938 return 0; 1066 return 0;
939 1067
940 printk("\n=======================================================\n"); 1068 printk("\n=======================================================\n");
@@ -943,9 +1071,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1071 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1072 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1073 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1074 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1075 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1076 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1077 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1078 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1079
@@ -954,19 +1082,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1082 return 0;
955} 1083}
956 1084
957static noinline int print_circular_bug_tail(void) 1085static inline int class_equal(struct lock_list *entry, void *data)
1086{
1087 return entry->class == data;
1088}
1089
1090static noinline int print_circular_bug(struct lock_list *this,
1091 struct lock_list *target,
1092 struct held_lock *check_src,
1093 struct held_lock *check_tgt)
958{ 1094{
959 struct task_struct *curr = current; 1095 struct task_struct *curr = current;
960 struct lock_list this; 1096 struct lock_list *parent;
1097 int depth;
961 1098
962 if (debug_locks_silent) 1099 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1100 return 0;
964 1101
965 this.class = hlock_class(check_source); 1102 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1103 return 0;
968 1104
969 print_circular_bug_entry(&this, 0); 1105 depth = get_lock_depth(target);
1106
1107 print_circular_bug_header(target, depth, check_src, check_tgt);
1108
1109 parent = get_lock_parent(target);
1110
1111 while (parent) {
1112 print_circular_bug_entry(parent, --depth);
1113 parent = get_lock_parent(parent);
1114 }
970 1115
971 printk("\nother info that might help us debug this:\n\n"); 1116 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1117 lockdep_print_held_locks(curr);
@@ -977,73 +1122,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1122 return 0;
978} 1123}
979 1124
980#define RECURSION_LIMIT 40 1125static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1126{
984 if (!debug_locks_off_graph_unlock()) 1127 if (!debug_locks_off_graph_unlock())
985 return 0; 1128 return 0;
986 1129
987 WARN_ON(1); 1130 WARN(1, "lockdep bfs error:%d\n", ret);
988 1131
989 return 0; 1132 return 0;
990} 1133}
991 1134
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1135static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1136{
995 struct lock_list *entry; 1137 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1138 return 0;
1139}
997 1140
998 if (lockdep_dependency_visit(class, depth)) 1141unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1142{
1143 unsigned long count = 0;
1144 struct lock_list *uninitialized_var(target_entry);
1000 1145
1001 /* 1146 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1147
1007 return ret; 1148 return count;
1008} 1149}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1150unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1151{
1012 unsigned long ret, flags; 1152 unsigned long ret, flags;
1153 struct lock_list this;
1154
1155 this.parent = NULL;
1156 this.class = class;
1013 1157
1014 local_irq_save(flags); 1158 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1159 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1160 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1161 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1162 local_irq_restore(flags);
1019 1163
1020 return ret; 1164 return ret;
1021} 1165}
1022 1166
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1167unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1168{
1026 struct lock_list *entry; 1169 unsigned long count = 0;
1027 unsigned long ret = 1; 1170 struct lock_list *uninitialized_var(target_entry);
1028 1171
1029 if (lockdep_dependency_visit(class, depth)) 1172 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1173
1037 return ret; 1174 return count;
1038} 1175}
1039 1176
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1177unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1178{
1042 unsigned long ret, flags; 1179 unsigned long ret, flags;
1180 struct lock_list this;
1181
1182 this.parent = NULL;
1183 this.class = class;
1043 1184
1044 local_irq_save(flags); 1185 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1186 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1187 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1188 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1189 local_irq_restore(flags);
1049 1190
@@ -1055,29 +1196,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1196 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1197 */
1057static noinline int 1198static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1199check_noncircular(struct lock_list *root, struct lock_class *target,
1200 struct lock_list **target_entry)
1059{ 1201{
1060 struct lock_list *entry; 1202 int result;
1061 1203
1062 if (lockdep_dependency_visit(source, depth)) 1204 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1205
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1206 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1207
1067 max_recursion_depth = depth; 1208 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1209}
1082 1210
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1211#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1214,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1214 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1215 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1216 */
1089static enum lock_usage_bit find_usage_bit; 1217
1090static struct lock_class *forwards_match, *backwards_match; 1218static inline int usage_match(struct lock_list *entry, void *bit)
1219{
1220 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1221}
1222
1223
1091 1224
1092/* 1225/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1226 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1227 * at @root->class that matches @bit.
1095 * 1228 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1229 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1230 * into *@target_entry.
1098 * 1231 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1232 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1233 * Return <0 on error.
1101 */ 1234 */
1102static noinline int 1235static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1236find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1237 struct lock_list **target_entry)
1104{ 1238{
1105 struct lock_list *entry; 1239 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1240
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1241 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1242
1122 /* 1243 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1244
1124 */ 1245 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1246}
1133 1247
1134/* 1248/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1249 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1250 * at @root->class that matches @bit.
1137 * 1251 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1252 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1253 * into *@target_entry.
1140 * 1254 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1255 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1256 * Return <0 on error.
1143 */ 1257 */
1144static noinline int 1258static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1259find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1260 struct lock_list **target_entry)
1146{ 1261{
1147 struct lock_list *entry; 1262 int result;
1148 int ret;
1149 1263
1150 if (lockdep_dependency_visit(source, depth)) 1264 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1265
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1266 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1267
1156 if (depth > max_recursion_depth) 1268 return result;
1157 max_recursion_depth = depth; 1269}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1270
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1271static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1272{
1163 backwards_match = source; 1273 int bit;
1164 return 2;
1165 }
1166 1274
1167 if (!source && debug_locks_off_graph_unlock()) { 1275 printk("%*s->", depth, "");
1168 WARN_ON(1); 1276 print_lock_name(class);
1169 return 0; 1277 printk(" ops: %lu", class->ops);
1170 } 1278 printk(" {\n");
1171 1279
1172 /* 1280 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1281 if (class->usage_mask & (1 << bit)) {
1174 */ 1282 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1283
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1284 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1285 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1286 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1287 }
1180 } 1288 }
1181 return 1; 1289 printk("%*s }\n", depth, "");
1290
1291 printk("%*s ... key at: ",depth,"");
1292 print_ip_sym((unsigned long)class->key);
1293}
1294
1295/*
1296 * printk the shortest lock dependencies from @root to @leaf in reverse order:
1297 */
1298static void __used
1299print_shortest_lock_dependencies(struct lock_list *leaf,
1300 struct lock_list *root)
1301{
1302 struct lock_list *entry = leaf;
1303 int depth;
1304
1305 /*compute depth from generated tree by BFS*/
1306 depth = get_lock_depth(leaf);
1307
1308 do {
1309 print_lock_class_header(entry->class, depth);
1310 printk("%*s ... acquired at:\n", depth, "");
1311 print_stack_trace(&entry->trace, 2);
1312 printk("\n");
1313
1314 if (depth == 0 && (entry != root)) {
1315 printk("lockdep:%s bad BFS generated tree\n", __func__);
1316 break;
1317 }
1318
1319 entry = get_lock_parent(entry);
1320 depth--;
1321 } while (entry && (depth >= 0));
1322
1323 return;
1182} 1324}
1183 1325
1184static int 1326static int
1185print_bad_irq_dependency(struct task_struct *curr, 1327print_bad_irq_dependency(struct task_struct *curr,
1328 struct lock_list *prev_root,
1329 struct lock_list *next_root,
1330 struct lock_list *backwards_entry,
1331 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1332 struct held_lock *prev,
1187 struct held_lock *next, 1333 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1334 enum lock_usage_bit bit1,
@@ -1215,26 +1361,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1361
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1362 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1363 irqclass);
1218 print_lock_name(backwards_match); 1364 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1365 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1366
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1367 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1368
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1369 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1370 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1371 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1372 printk("...");
1227 1373
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1374 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1375
1230 printk("\nother info that might help us debug this:\n\n"); 1376 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1377 lockdep_print_held_locks(curr);
1232 1378
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1379 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1380 printk(" and the holding lock:\n");
1381 if (!save_trace(&prev_root->trace))
1382 return 0;
1383 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1384
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1385 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1386 printk(" and %s-irq-unsafe lock:\n", irqclass);
1387 if (!save_trace(&next_root->trace))
1388 return 0;
1389 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1390
1239 printk("\nstack backtrace:\n"); 1391 printk("\nstack backtrace:\n");
1240 dump_stack(); 1392 dump_stack();
@@ -1248,19 +1400,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1400 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1401{
1250 int ret; 1402 int ret;
1403 struct lock_list this, that;
1404 struct lock_list *uninitialized_var(target_entry);
1405 struct lock_list *uninitialized_var(target_entry1);
1406
1407 this.parent = NULL;
1251 1408
1252 find_usage_bit = bit_backwards; 1409 this.class = hlock_class(prev);
1253 /* fills in <backwards_match> */ 1410 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1411 if (ret < 0)
1255 if (!ret || ret == 1) 1412 return print_bfs_bug(ret);
1413 if (ret == 1)
1256 return ret; 1414 return ret;
1257 1415
1258 find_usage_bit = bit_forwards; 1416 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1417 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1418 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1419 if (ret < 0)
1420 return print_bfs_bug(ret);
1421 if (ret == 1)
1261 return ret; 1422 return ret;
1262 /* ret == 2 */ 1423
1263 return print_bad_irq_dependency(curr, prev, next, 1424 return print_bad_irq_dependency(curr, &this, &that,
1425 target_entry, target_entry1,
1426 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1427 bit_backwards, bit_forwards, irqclass);
1265} 1428}
1266 1429
@@ -1472,6 +1635,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1635{
1473 struct lock_list *entry; 1636 struct lock_list *entry;
1474 int ret; 1637 int ret;
1638 struct lock_list this;
1639 struct lock_list *uninitialized_var(target_entry);
1475 1640
1476 /* 1641 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1642 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1647,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1647 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1648 * keep the stackframe size of the recursive functions low:
1484 */ 1649 */
1485 check_source = next; 1650 this.class = hlock_class(next);
1486 check_target = prev; 1651 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1652 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1653 if (unlikely(!ret))
1654 return print_circular_bug(&this, target_entry, next, prev);
1655 else if (unlikely(ret < 0))
1656 return print_bfs_bug(ret);
1489 1657
1490 if (!check_prev_add_irq(curr, prev, next)) 1658 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1659 return 0;
@@ -1884,7 +2052,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2052 * print irq inversion bug:
1885 */ 2053 */
1886static int 2054static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2055print_irq_inversion_bug(struct task_struct *curr,
2056 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2057 struct held_lock *this, int forwards,
1889 const char *irqclass) 2058 const char *irqclass)
1890{ 2059{
@@ -1902,17 +2071,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2071 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2072 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2073 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2074 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2075 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2076
1908 printk("\nother info that might help us debug this:\n"); 2077 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2078 lockdep_print_held_locks(curr);
1910 2079
1911 printk("\nthe first lock's dependencies:\n"); 2080 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2081 if (!save_trace(&root->trace))
1913 2082 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2083 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2084
1917 printk("\nstack backtrace:\n"); 2085 printk("\nstack backtrace:\n");
1918 dump_stack(); 2086 dump_stack();
@@ -1929,14 +2097,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2097 enum lock_usage_bit bit, const char *irqclass)
1930{ 2098{
1931 int ret; 2099 int ret;
1932 2100 struct lock_list root;
1933 find_usage_bit = bit; 2101 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2102
1935 ret = find_usage_forwards(hlock_class(this), 0); 2103 root.parent = NULL;
1936 if (!ret || ret == 1) 2104 root.class = hlock_class(this);
2105 ret = find_usage_forwards(&root, bit, &target_entry);
2106 if (ret < 0)
2107 return print_bfs_bug(ret);
2108 if (ret == 1)
1937 return ret; 2109 return ret;
1938 2110
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2111 return print_irq_inversion_bug(curr, &root, target_entry,
2112 this, 1, irqclass);
1940} 2113}
1941 2114
1942/* 2115/*
@@ -1948,14 +2121,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2121 enum lock_usage_bit bit, const char *irqclass)
1949{ 2122{
1950 int ret; 2123 int ret;
1951 2124 struct lock_list root;
1952 find_usage_bit = bit; 2125 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2126
1954 ret = find_usage_backwards(hlock_class(this), 0); 2127 root.parent = NULL;
1955 if (!ret || ret == 1) 2128 root.class = hlock_class(this);
2129 ret = find_usage_backwards(&root, bit, &target_entry);
2130 if (ret < 0)
2131 return print_bfs_bug(ret);
2132 if (ret == 1)
1956 return ret; 2133 return ret;
1957 2134
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2135 return print_irq_inversion_bug(curr, &root, target_entry,
2136 this, 1, irqclass);
1959} 2137}
1960 2138
1961void print_irqtrace_events(struct task_struct *curr) 2139void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2708,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2708 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2709static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2710 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2711 struct lockdep_map *nest_lock, unsigned long ip,
2712 int references)
2534{ 2713{
2535 struct task_struct *curr = current; 2714 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2715 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2716 struct held_lock *hlock;
2538 unsigned int depth, id; 2717 unsigned int depth, id;
2539 int chain_head = 0; 2718 int chain_head = 0;
2719 int class_idx;
2540 u64 chain_key; 2720 u64 chain_key;
2541 2721
2542 if (!prove_locking) 2722 if (!prove_locking)
@@ -2584,10 +2764,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2764 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2765 return 0;
2586 2766
2767 class_idx = class - lock_classes + 1;
2768
2769 if (depth) {
2770 hlock = curr->held_locks + depth - 1;
2771 if (hlock->class_idx == class_idx && nest_lock) {
2772 if (hlock->references)
2773 hlock->references++;
2774 else
2775 hlock->references = 2;
2776
2777 return 1;
2778 }
2779 }
2780
2587 hlock = curr->held_locks + depth; 2781 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2782 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2783 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2784 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2785 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2786 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2787 hlock->nest_lock = nest_lock;
@@ -2595,6 +2789,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2789 hlock->read = read;
2596 hlock->check = check; 2790 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2791 hlock->hardirqs_off = !!hardirqs_off;
2792 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2793#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2794 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2795 hlock->holdtime_stamp = sched_clock();
@@ -2703,6 +2898,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2898 return 1;
2704} 2899}
2705 2900
2901static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2902{
2903 if (hlock->instance == lock)
2904 return 1;
2905
2906 if (hlock->references) {
2907 struct lock_class *class = lock->class_cache;
2908
2909 if (!class)
2910 class = look_up_lock_class(lock, 0);
2911
2912 if (DEBUG_LOCKS_WARN_ON(!class))
2913 return 0;
2914
2915 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2916 return 0;
2917
2918 if (hlock->class_idx == class - lock_classes + 1)
2919 return 1;
2920 }
2921
2922 return 0;
2923}
2924
2706static int 2925static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2926__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2927 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2945,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2945 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2946 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2947 break;
2729 if (hlock->instance == lock) 2948 if (match_held_lock(hlock, lock))
2730 goto found_it; 2949 goto found_it;
2731 prev_hlock = hlock; 2950 prev_hlock = hlock;
2732 } 2951 }
@@ -2745,7 +2964,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2964 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2965 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2966 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2967 hlock->nest_lock, hlock->acquire_ip,
2968 hlock->references))
2749 return 0; 2969 return 0;
2750 } 2970 }
2751 2971
@@ -2784,20 +3004,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3004 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3005 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3006 break;
2787 if (hlock->instance == lock) 3007 if (match_held_lock(hlock, lock))
2788 goto found_it; 3008 goto found_it;
2789 prev_hlock = hlock; 3009 prev_hlock = hlock;
2790 } 3010 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3011 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3012
2793found_it: 3013found_it:
2794 lock_release_holdtime(hlock); 3014 if (hlock->instance == lock)
3015 lock_release_holdtime(hlock);
3016
3017 if (hlock->references) {
3018 hlock->references--;
3019 if (hlock->references) {
3020 /*
3021 * We had, and after removing one, still have
3022 * references, the current lock stack is still
3023 * valid. We're done!
3024 */
3025 return 1;
3026 }
3027 }
2795 3028
2796 /* 3029 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3030 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3031 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3032 * entries (if any), recalculating the hash along the way:
2800 */ 3033 */
3034
2801 curr->lockdep_depth = i; 3035 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3036 curr->curr_chain_key = hlock->prev_chain_key;
2803 3037
@@ -2806,7 +3040,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3040 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3041 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3042 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3043 hlock->nest_lock, hlock->acquire_ip,
3044 hlock->references))
2810 return 0; 3045 return 0;
2811 } 3046 }
2812 3047
@@ -2836,7 +3071,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3071 /*
2837 * Is the unlock non-nested: 3072 * Is the unlock non-nested:
2838 */ 3073 */
2839 if (hlock->instance != lock) 3074 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3075 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3076 curr->lockdep_depth--;
2842 3077
@@ -2881,6 +3116,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3116 check_chain_key(curr);
2882} 3117}
2883 3118
3119static int __lock_is_held(struct lockdep_map *lock)
3120{
3121 struct task_struct *curr = current;
3122 int i;
3123
3124 for (i = 0; i < curr->lockdep_depth; i++) {
3125 struct held_lock *hlock = curr->held_locks + i;
3126
3127 if (match_held_lock(hlock, lock))
3128 return 1;
3129 }
3130
3131 return 0;
3132}
3133
2884/* 3134/*
2885 * Check whether we follow the irq-flags state precisely: 3135 * Check whether we follow the irq-flags state precisely:
2886 */ 3136 */
@@ -2957,7 +3207,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3207
2958 current->lockdep_recursion = 1; 3208 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3209 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3210 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3211 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3212 raw_local_irq_restore(flags);
2963} 3213}
@@ -2982,6 +3232,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3232}
2983EXPORT_SYMBOL_GPL(lock_release); 3233EXPORT_SYMBOL_GPL(lock_release);
2984 3234
3235int lock_is_held(struct lockdep_map *lock)
3236{
3237 unsigned long flags;
3238 int ret = 0;
3239
3240 if (unlikely(current->lockdep_recursion))
3241 return ret;
3242
3243 raw_local_irq_save(flags);
3244 check_flags(flags);
3245
3246 current->lockdep_recursion = 1;
3247 ret = __lock_is_held(lock);
3248 current->lockdep_recursion = 0;
3249 raw_local_irq_restore(flags);
3250
3251 return ret;
3252}
3253EXPORT_SYMBOL_GPL(lock_is_held);
3254
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3255void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3256{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3257 current->lockdep_reclaim_gfp = gfp_mask;
@@ -3041,7 +3311,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3311 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3312 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3313 break;
3044 if (hlock->instance == lock) 3314 if (match_held_lock(hlock, lock))
3045 goto found_it; 3315 goto found_it;
3046 prev_hlock = hlock; 3316 prev_hlock = hlock;
3047 } 3317 }
@@ -3049,6 +3319,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3319 return;
3050 3320
3051found_it: 3321found_it:
3322 if (hlock->instance != lock)
3323 return;
3324
3052 hlock->waittime_stamp = sched_clock(); 3325 hlock->waittime_stamp = sched_clock();
3053 3326
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3327 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3361,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3361 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3362 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3363 break;
3091 if (hlock->instance == lock) 3364 if (match_held_lock(hlock, lock))
3092 goto found_it; 3365 goto found_it;
3093 prev_hlock = hlock; 3366 prev_hlock = hlock;
3094 } 3367 }
@@ -3096,6 +3369,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3369 return;
3097 3370
3098found_it: 3371found_it:
3372 if (hlock->instance != lock)
3373 return;
3374
3099 cpu = smp_processor_id(); 3375 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3376 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3377 now = sched_clock();
@@ -3326,7 +3602,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3602 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3603 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3604 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3605 sizeof(struct list_head) * CHAINHASH_SIZE
3606#ifdef CONFIG_PROVE_LOCKING
3607 + sizeof(struct circular_queue)
3608#endif
3609 ) / 1024
3610 );
3330 3611
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3612 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3613 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e94caa666dba..d4aba4f3584c 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -670,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v)
670 return 0; 594 return 0;
671} 595}
672 596
673static struct seq_operations lockstat_ops = { 597static const struct seq_operations lockstat_ops = {
674 .start = ls_start, 598 .start = ls_start,
675 .next = ls_next, 599 .next = ls_next,
676 .stop = ls_stop, 600 .stop = ls_stop,
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by module_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107 * rcu_read_lock_sched does two things : disabling preemption to make
108 * sure the teardown of the callbacks can be done correctly when they
109 * are in modules and they insure RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115 /* Must read the ptype before ptr. They are not data dependant,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120 * dependant, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137 * we must insure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171 /* Must read the ptype before ptr. They are not data dependant,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176 * dependant, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191 * we must insure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
426
427/*
428 * Remove the marker from the marker hash table. Must be called with mutex_lock
429 * held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509 * Sanity check :
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: only waiting an RCU period after setting elem->call to the empty
573 * function insures that the original callback is not used anymore. This insured
574 * by rcu_read_lock_sched around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * Internal callback only changed before the first probe is connected to it.
640 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomicity of the callback/private data updates valid.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Site effect : marker_set_format may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * private data must be a valid allocated memory address, or NULL.
670 * Returns 0 if ok, error value on error.
671 * The probe address must at least be aligned on the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
724
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns the private data given to marker_probe_register, or an ERR_PTR().
732 * We do not need to call a synchronize_sched to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which insures that every preempt disabled section
735 * have finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776 unsigned int i;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791 for (i = 0; closure[i].func; i++) {
792 if (closure[i].func == probe &&
793 closure[i].probe_private
794 == probe_private)
795 return entry;
796 }
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first marker found in hash table.
810 * Return 0 on success or error value.
811 * We do not need to call a synchronize_sched to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which insures that every preempt disabled section
814 * have finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) matching, or an
859 * ERR_PTR.
860 * Returns the private data pointer, or an ERR_PTR.
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
diff --git a/kernel/module.c b/kernel/module.c
index 46580edff0cb..e6bc4b28aa62 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
47#include <linux/rculist.h> 47#include <linux/rculist.h>
48#include <asm/uaccess.h> 48#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/mmu_context.h>
50#include <linux/license.h> 51#include <linux/license.h>
51#include <asm/sections.h> 52#include <asm/sections.h>
52#include <linux/tracepoint.h> 53#include <linux/tracepoint.h>
@@ -369,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module);
369 370
370#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
371 372
372#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
373 374
374static void *percpu_modalloc(unsigned long size, unsigned long align, 375static void *percpu_modalloc(unsigned long size, unsigned long align,
375 const char *name) 376 const char *name)
@@ -394,7 +395,7 @@ static void percpu_modfree(void *freeme)
394 free_percpu(freeme); 395 free_percpu(freeme);
395} 396}
396 397
397#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
398 399
399/* Number of blocks used and allocated. */ 400/* Number of blocks used and allocated. */
400static unsigned int pcpu_num_used, pcpu_num_allocated; 401static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -540,7 +541,7 @@ static int percpu_modinit(void)
540} 541}
541__initcall(percpu_modinit); 542__initcall(percpu_modinit);
542 543
543#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
544 545
545static unsigned int find_pcpusec(Elf_Ehdr *hdr, 546static unsigned int find_pcpusec(Elf_Ehdr *hdr,
546 Elf_Shdr *sechdrs, 547 Elf_Shdr *sechdrs,
@@ -1535,6 +1536,10 @@ static void free_module(struct module *mod)
1535 1536
1536 /* Finally, free the core (containing the module structure) */ 1537 /* Finally, free the core (containing the module structure) */
1537 module_free(mod, mod->module_core); 1538 module_free(mod, mod->module_core);
1539
1540#ifdef CONFIG_MPU
1541 update_protections(current->mm);
1542#endif
1538} 1543}
1539 1544
1540void *__symbol_get(const char *symbol) 1545void *__symbol_get(const char *symbol)
@@ -2237,10 +2242,6 @@ static noinline struct module *load_module(void __user *umod,
2237 sizeof(*mod->ctors), &mod->num_ctors); 2242 sizeof(*mod->ctors), &mod->num_ctors);
2238#endif 2243#endif
2239 2244
2240#ifdef CONFIG_MARKERS
2241 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2242 sizeof(*mod->markers), &mod->num_markers);
2243#endif
2244#ifdef CONFIG_TRACEPOINTS 2245#ifdef CONFIG_TRACEPOINTS
2245 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2246 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2246 "__tracepoints", 2247 "__tracepoints",
@@ -2958,20 +2959,6 @@ void module_layout(struct module *mod,
2958EXPORT_SYMBOL(module_layout); 2959EXPORT_SYMBOL(module_layout);
2959#endif 2960#endif
2960 2961
2961#ifdef CONFIG_MARKERS
2962void module_update_markers(void)
2963{
2964 struct module *mod;
2965
2966 mutex_lock(&module_mutex);
2967 list_for_each_entry(mod, &modules, list)
2968 if (!mod->taints)
2969 marker_update_probe_range(mod->markers,
2970 mod->markers + mod->num_markers);
2971 mutex_unlock(&module_mutex);
2972}
2973#endif
2974
2975#ifdef CONFIG_TRACEPOINTS 2962#ifdef CONFIG_TRACEPOINTS
2976void module_update_tracepoints(void) 2963void module_update_tracepoints(void)
2977{ 2964{
diff --git a/kernel/panic.c b/kernel/panic.c
index 512ab73b0ca3..bcdef26e3332 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,7 +177,7 @@ static const struct tnt tnts[] = {
177 * 'W' - Taint on warning. 177 * 'W' - Taint on warning.
178 * 'C' - modules from drivers/staging are loaded. 178 * 'C' - modules from drivers/staging are loaded.
179 * 179 *
180 * The string is overwritten by the next call to print_taint(). 180 * The string is overwritten by the next call to print_tainted().
181 */ 181 */
182const char *print_tainted(void) 182const char *print_tainted(void)
183{ 183{
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index d7cbc579fc80..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,4861 +0,0 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45static atomic_t nr_task_counters __read_mostly;
46
47/*
48 * perf counter paranoia level:
49 * 0 - not paranoid
50 * 1 - disallow cpu counters to unpriv
51 * 2 - disallow kernel profiling to unpriv
52 */
53int sysctl_perf_counter_paranoid __read_mostly = 1;
54
55static inline bool perf_paranoid_cpu(void)
56{
57 return sysctl_perf_counter_paranoid > 0;
58}
59
60static inline bool perf_paranoid_kernel(void)
61{
62 return sysctl_perf_counter_paranoid > 1;
63}
64
65int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
66
67/*
68 * max perf counter sample rate
69 */
70int sysctl_perf_counter_sample_rate __read_mostly = 100000;
71
72static atomic64_t perf_counter_id;
73
74/*
75 * Lock for (sysadmin-configurable) counter reservations:
76 */
77static DEFINE_SPINLOCK(perf_resource_lock);
78
79/*
80 * Architecture provided APIs - weak aliases:
81 */
82extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
83{
84 return NULL;
85}
86
87void __weak hw_perf_disable(void) { barrier(); }
88void __weak hw_perf_enable(void) { barrier(); }
89
90void __weak hw_perf_counter_setup(int cpu) { barrier(); }
91void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
92
93int __weak
94hw_perf_group_sched_in(struct perf_counter *group_leader,
95 struct perf_cpu_context *cpuctx,
96 struct perf_counter_context *ctx, int cpu)
97{
98 return 0;
99}
100
101void __weak perf_counter_print_debug(void) { }
102
103static DEFINE_PER_CPU(int, disable_count);
104
105void __perf_disable(void)
106{
107 __get_cpu_var(disable_count)++;
108}
109
110bool __perf_enable(void)
111{
112 return !--__get_cpu_var(disable_count);
113}
114
/* Bump the per-CPU disable depth, then call the hardware disable hook. */
void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

/*
 * Drop the per-CPU disable depth; the hardware enable hook is only called
 * once the outermost perf_disable() has been undone.
 */
void perf_enable(void)
{
	if (!__perf_enable())
		return;
	hw_perf_enable();
}
126
127static void get_ctx(struct perf_counter_context *ctx)
128{
129 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
130}
131
132static void free_ctx(struct rcu_head *head)
133{
134 struct perf_counter_context *ctx;
135
136 ctx = container_of(head, struct perf_counter_context, rcu_head);
137 kfree(ctx);
138}
139
140static void put_ctx(struct perf_counter_context *ctx)
141{
142 if (atomic_dec_and_test(&ctx->refcount)) {
143 if (ctx->parent_ctx)
144 put_ctx(ctx->parent_ctx);
145 if (ctx->task)
146 put_task_struct(ctx->task);
147 call_rcu(&ctx->rcu_head, free_ctx);
148 }
149}
150
151static void unclone_ctx(struct perf_counter_context *ctx)
152{
153 if (ctx->parent_ctx) {
154 put_ctx(ctx->parent_ctx);
155 ctx->parent_ctx = NULL;
156 }
157}
158
159/*
160 * If we inherit counters we want to return the parent counter id
161 * to userspace.
162 */
163static u64 primary_counter_id(struct perf_counter *counter)
164{
165 u64 id = counter->id;
166
167 if (counter->parent)
168 id = counter->parent->id;
169
170 return id;
171}
172
173/*
174 * Get the perf_counter_context for a task and lock it.
175 * This has to cope with with the fact that until it is locked,
176 * the context could get moved to another task.
177 */
178static struct perf_counter_context *
179perf_lock_task_context(struct task_struct *task, unsigned long *flags)
180{
181 struct perf_counter_context *ctx;
182
183 rcu_read_lock();
184 retry:
185 ctx = rcu_dereference(task->perf_counter_ctxp);
186 if (ctx) {
187 /*
188 * If this context is a clone of another, it might
189 * get swapped for another underneath us by
190 * perf_counter_task_sched_out, though the
191 * rcu_read_lock() protects us from any context
192 * getting freed. Lock the context and check if it
193 * got swapped before we could get the lock, and retry
194 * if so. If we locked the right context, then it
195 * can't get swapped on us any more.
196 */
197 spin_lock_irqsave(&ctx->lock, *flags);
198 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
199 spin_unlock_irqrestore(&ctx->lock, *flags);
200 goto retry;
201 }
202
203 if (!atomic_inc_not_zero(&ctx->refcount)) {
204 spin_unlock_irqrestore(&ctx->lock, *flags);
205 ctx = NULL;
206 }
207 }
208 rcu_read_unlock();
209 return ctx;
210}
211
212/*
213 * Get the context for a task and increment its pin_count so it
214 * can't get swapped to another task. This also increments its
215 * reference count so that the context can't get freed.
216 */
217static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
218{
219 struct perf_counter_context *ctx;
220 unsigned long flags;
221
222 ctx = perf_lock_task_context(task, &flags);
223 if (ctx) {
224 ++ctx->pin_count;
225 spin_unlock_irqrestore(&ctx->lock, flags);
226 }
227 return ctx;
228}
229
230static void perf_unpin_context(struct perf_counter_context *ctx)
231{
232 unsigned long flags;
233
234 spin_lock_irqsave(&ctx->lock, flags);
235 --ctx->pin_count;
236 spin_unlock_irqrestore(&ctx->lock, flags);
237 put_ctx(ctx);
238}
239
240/*
241 * Add a counter from the lists for its context.
242 * Must be called with ctx->mutex and ctx->lock held.
243 */
244static void
245list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
246{
247 struct perf_counter *group_leader = counter->group_leader;
248
249 /*
250 * Depending on whether it is a standalone or sibling counter,
251 * add it straight to the context's counter list, or to the group
252 * leader's sibling list:
253 */
254 if (group_leader == counter)
255 list_add_tail(&counter->list_entry, &ctx->counter_list);
256 else {
257 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
258 group_leader->nr_siblings++;
259 }
260
261 list_add_rcu(&counter->event_entry, &ctx->event_list);
262 ctx->nr_counters++;
263 if (counter->attr.inherit_stat)
264 ctx->nr_stat++;
265}
266
267/*
268 * Remove a counter from the lists for its context.
269 * Must be called with ctx->mutex and ctx->lock held.
270 */
271static void
272list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
273{
274 struct perf_counter *sibling, *tmp;
275
276 if (list_empty(&counter->list_entry))
277 return;
278 ctx->nr_counters--;
279 if (counter->attr.inherit_stat)
280 ctx->nr_stat--;
281
282 list_del_init(&counter->list_entry);
283 list_del_rcu(&counter->event_entry);
284
285 if (counter->group_leader != counter)
286 counter->group_leader->nr_siblings--;
287
288 /*
289 * If this was a group counter with sibling counters then
290 * upgrade the siblings to singleton counters by adding them
291 * to the context list directly:
292 */
293 list_for_each_entry_safe(sibling, tmp,
294 &counter->sibling_list, list_entry) {
295
296 list_move_tail(&sibling->list_entry, &ctx->counter_list);
297 sibling->group_leader = sibling;
298 }
299}
300
301static void
302counter_sched_out(struct perf_counter *counter,
303 struct perf_cpu_context *cpuctx,
304 struct perf_counter_context *ctx)
305{
306 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
307 return;
308
309 counter->state = PERF_COUNTER_STATE_INACTIVE;
310 if (counter->pending_disable) {
311 counter->pending_disable = 0;
312 counter->state = PERF_COUNTER_STATE_OFF;
313 }
314 counter->tstamp_stopped = ctx->time;
315 counter->pmu->disable(counter);
316 counter->oncpu = -1;
317
318 if (!is_software_counter(counter))
319 cpuctx->active_oncpu--;
320 ctx->nr_active--;
321 if (counter->attr.exclusive || !cpuctx->active_oncpu)
322 cpuctx->exclusive = 0;
323}
324
325static void
326group_sched_out(struct perf_counter *group_counter,
327 struct perf_cpu_context *cpuctx,
328 struct perf_counter_context *ctx)
329{
330 struct perf_counter *counter;
331
332 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
333 return;
334
335 counter_sched_out(group_counter, cpuctx, ctx);
336
337 /*
338 * Schedule out siblings (if any):
339 */
340 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
341 counter_sched_out(counter, cpuctx, ctx);
342
343 if (group_counter->attr.exclusive)
344 cpuctx->exclusive = 0;
345}
346
347/*
348 * Cross CPU call to remove a performance counter
349 *
350 * We disable the counter on the hardware level first. After that we
351 * remove it from the context list.
352 */
353static void __perf_counter_remove_from_context(void *info)
354{
355 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
356 struct perf_counter *counter = info;
357 struct perf_counter_context *ctx = counter->ctx;
358
359 /*
360 * If this is a task context, we need to check whether it is
361 * the current task context of this cpu. If not it has been
362 * scheduled out before the smp call arrived.
363 */
364 if (ctx->task && cpuctx->task_ctx != ctx)
365 return;
366
367 spin_lock(&ctx->lock);
368 /*
369 * Protect the list operation against NMI by disabling the
370 * counters on a global level.
371 */
372 perf_disable();
373
374 counter_sched_out(counter, cpuctx, ctx);
375
376 list_del_counter(counter, ctx);
377
378 if (!ctx->task) {
379 /*
380 * Allow more per task counters with respect to the
381 * reservation:
382 */
383 cpuctx->max_pertask =
384 min(perf_max_counters - ctx->nr_counters,
385 perf_max_counters - perf_reserved_percpu);
386 }
387
388 perf_enable();
389 spin_unlock(&ctx->lock);
390}
391
392
393/*
394 * Remove the counter from a task's (or a CPU's) list of counters.
395 *
396 * Must be called with ctx->mutex held.
397 *
398 * CPU counters are removed with a smp call. For task counters we only
399 * call when the task is on a CPU.
400 *
401 * If counter->ctx is a cloned context, callers must make sure that
402 * every task struct that counter->ctx->task could possibly point to
403 * remains valid. This is OK when called from perf_release since
404 * that only calls us on the top-level context, which can't be a clone.
405 * When called from perf_counter_exit_task, it's OK because the
406 * context has been detached from its task.
407 */
408static void perf_counter_remove_from_context(struct perf_counter *counter)
409{
410 struct perf_counter_context *ctx = counter->ctx;
411 struct task_struct *task = ctx->task;
412
413 if (!task) {
414 /*
415 * Per cpu counters are removed via an smp call and
416 * the removal is always sucessful.
417 */
418 smp_call_function_single(counter->cpu,
419 __perf_counter_remove_from_context,
420 counter, 1);
421 return;
422 }
423
424retry:
425 task_oncpu_function_call(task, __perf_counter_remove_from_context,
426 counter);
427
428 spin_lock_irq(&ctx->lock);
429 /*
430 * If the context is active we need to retry the smp call.
431 */
432 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
433 spin_unlock_irq(&ctx->lock);
434 goto retry;
435 }
436
437 /*
438 * The lock prevents that this context is scheduled in so we
439 * can remove the counter safely, if the call above did not
440 * succeed.
441 */
442 if (!list_empty(&counter->list_entry)) {
443 list_del_counter(counter, ctx);
444 }
445 spin_unlock_irq(&ctx->lock);
446}
447
448static inline u64 perf_clock(void)
449{
450 return cpu_clock(smp_processor_id());
451}
452
453/*
454 * Update the record of the current time in a context.
455 */
456static void update_context_time(struct perf_counter_context *ctx)
457{
458 u64 now = perf_clock();
459
460 ctx->time += now - ctx->timestamp;
461 ctx->timestamp = now;
462}
463
464/*
465 * Update the total_time_enabled and total_time_running fields for a counter.
466 */
467static void update_counter_times(struct perf_counter *counter)
468{
469 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end;
471
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
473 return;
474
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
476
477 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
478 run_end = counter->tstamp_stopped;
479 else
480 run_end = ctx->time;
481
482 counter->total_time_running = run_end - counter->tstamp_running;
483}
484
485/*
486 * Update total_time_enabled and total_time_running for all counters in a group.
487 */
488static void update_group_times(struct perf_counter *leader)
489{
490 struct perf_counter *counter;
491
492 update_counter_times(leader);
493 list_for_each_entry(counter, &leader->sibling_list, list_entry)
494 update_counter_times(counter);
495}
496
497/*
498 * Cross CPU call to disable a performance counter
499 */
500static void __perf_counter_disable(void *info)
501{
502 struct perf_counter *counter = info;
503 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
504 struct perf_counter_context *ctx = counter->ctx;
505
506 /*
507 * If this is a per-task counter, need to check whether this
508 * counter's task is the current task on this cpu.
509 */
510 if (ctx->task && cpuctx->task_ctx != ctx)
511 return;
512
513 spin_lock(&ctx->lock);
514
515 /*
516 * If the counter is on, turn it off.
517 * If it is in error state, leave it in error state.
518 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx);
521 update_counter_times(counter);
522 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx);
524 else
525 counter_sched_out(counter, cpuctx, ctx);
526 counter->state = PERF_COUNTER_STATE_OFF;
527 }
528
529 spin_unlock(&ctx->lock);
530}
531
532/*
533 * Disable a counter.
534 *
535 * If counter->ctx is a cloned context, callers must make sure that
536 * every task struct that counter->ctx->task could possibly point to
537 * remains valid. This condition is satisifed when called through
538 * perf_counter_for_each_child or perf_counter_for_each because they
539 * hold the top-level counter's child_mutex, so any descendant that
540 * goes to exit will block in sync_child_counter.
541 * When called from perf_pending_counter it's OK because counter->ctx
542 * is the current context on this CPU and preemption is disabled,
543 * hence we can't get into perf_counter_task_sched_out for this context.
544 */
545static void perf_counter_disable(struct perf_counter *counter)
546{
547 struct perf_counter_context *ctx = counter->ctx;
548 struct task_struct *task = ctx->task;
549
550 if (!task) {
551 /*
552 * Disable the counter on the cpu that it's on
553 */
554 smp_call_function_single(counter->cpu, __perf_counter_disable,
555 counter, 1);
556 return;
557 }
558
559 retry:
560 task_oncpu_function_call(task, __perf_counter_disable, counter);
561
562 spin_lock_irq(&ctx->lock);
563 /*
564 * If the counter is still active, we need to retry the cross-call.
565 */
566 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
567 spin_unlock_irq(&ctx->lock);
568 goto retry;
569 }
570
571 /*
572 * Since we have the lock this context can't be scheduled
573 * in, so we can change the state safely.
574 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF;
578 }
579
580 spin_unlock_irq(&ctx->lock);
581}
582
583static int
584counter_sched_in(struct perf_counter *counter,
585 struct perf_cpu_context *cpuctx,
586 struct perf_counter_context *ctx,
587 int cpu)
588{
589 if (counter->state <= PERF_COUNTER_STATE_OFF)
590 return 0;
591
592 counter->state = PERF_COUNTER_STATE_ACTIVE;
593 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
594 /*
595 * The new state must be visible before we turn it on in the hardware:
596 */
597 smp_wmb();
598
599 if (counter->pmu->enable(counter)) {
600 counter->state = PERF_COUNTER_STATE_INACTIVE;
601 counter->oncpu = -1;
602 return -EAGAIN;
603 }
604
605 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
606
607 if (!is_software_counter(counter))
608 cpuctx->active_oncpu++;
609 ctx->nr_active++;
610
611 if (counter->attr.exclusive)
612 cpuctx->exclusive = 1;
613
614 return 0;
615}
616
617static int
618group_sched_in(struct perf_counter *group_counter,
619 struct perf_cpu_context *cpuctx,
620 struct perf_counter_context *ctx,
621 int cpu)
622{
623 struct perf_counter *counter, *partial_group;
624 int ret;
625
626 if (group_counter->state == PERF_COUNTER_STATE_OFF)
627 return 0;
628
629 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
630 if (ret)
631 return ret < 0 ? ret : 0;
632
633 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
634 return -EAGAIN;
635
636 /*
637 * Schedule in siblings as one group (if any):
638 */
639 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
640 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
641 partial_group = counter;
642 goto group_error;
643 }
644 }
645
646 return 0;
647
648group_error:
649 /*
650 * Groups can be scheduled in as one unit only, so undo any
651 * partial group before returning:
652 */
653 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
654 if (counter == partial_group)
655 break;
656 counter_sched_out(counter, cpuctx, ctx);
657 }
658 counter_sched_out(group_counter, cpuctx, ctx);
659
660 return -EAGAIN;
661}
662
663/*
664 * Return 1 for a group consisting entirely of software counters,
665 * 0 if the group contains any hardware counters.
666 */
667static int is_software_only_group(struct perf_counter *leader)
668{
669 struct perf_counter *counter;
670
671 if (!is_software_counter(leader))
672 return 0;
673
674 list_for_each_entry(counter, &leader->sibling_list, list_entry)
675 if (!is_software_counter(counter))
676 return 0;
677
678 return 1;
679}
680
681/*
682 * Work out whether we can put this counter group on the CPU now.
683 */
684static int group_can_go_on(struct perf_counter *counter,
685 struct perf_cpu_context *cpuctx,
686 int can_add_hw)
687{
688 /*
689 * Groups consisting entirely of software counters can always go on.
690 */
691 if (is_software_only_group(counter))
692 return 1;
693 /*
694 * If an exclusive group is already on, no other hardware
695 * counters can go on.
696 */
697 if (cpuctx->exclusive)
698 return 0;
699 /*
700 * If this group is exclusive and there are already
701 * counters on the CPU, it can't go on.
702 */
703 if (counter->attr.exclusive && cpuctx->active_oncpu)
704 return 0;
705 /*
706 * Otherwise, try to add it if all previous groups were able
707 * to go on.
708 */
709 return can_add_hw;
710}
711
712static void add_counter_to_ctx(struct perf_counter *counter,
713 struct perf_counter_context *ctx)
714{
715 list_add_counter(counter, ctx);
716 counter->tstamp_enabled = ctx->time;
717 counter->tstamp_running = ctx->time;
718 counter->tstamp_stopped = ctx->time;
719}
720
721/*
722 * Cross CPU call to install and enable a performance counter
723 *
724 * Must be called with ctx->mutex held
725 */
726static void __perf_install_in_context(void *info)
727{
728 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
729 struct perf_counter *counter = info;
730 struct perf_counter_context *ctx = counter->ctx;
731 struct perf_counter *leader = counter->group_leader;
732 int cpu = smp_processor_id();
733 int err;
734
735 /*
736 * If this is a task context, we need to check whether it is
737 * the current task context of this cpu. If not it has been
738 * scheduled out before the smp call arrived.
739 * Or possibly this is the right context but it isn't
740 * on this cpu because it had no counters.
741 */
742 if (ctx->task && cpuctx->task_ctx != ctx) {
743 if (cpuctx->task_ctx || ctx->task != current)
744 return;
745 cpuctx->task_ctx = ctx;
746 }
747
748 spin_lock(&ctx->lock);
749 ctx->is_active = 1;
750 update_context_time(ctx);
751
752 /*
753 * Protect the list operation against NMI by disabling the
754 * counters on a global level. NOP for non NMI based counters.
755 */
756 perf_disable();
757
758 add_counter_to_ctx(counter, ctx);
759
760 /*
761 * Don't put the counter on if it is disabled or if
762 * it is in a group and the group isn't on.
763 */
764 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
765 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
766 goto unlock;
767
768 /*
769 * An exclusive counter can't go on if there are already active
770 * hardware counters, and no hardware counter can go on if there
771 * is already an exclusive counter on.
772 */
773 if (!group_can_go_on(counter, cpuctx, 1))
774 err = -EEXIST;
775 else
776 err = counter_sched_in(counter, cpuctx, ctx, cpu);
777
778 if (err) {
779 /*
780 * This counter couldn't go on. If it is in a group
781 * then we have to pull the whole group off.
782 * If the counter group is pinned then put it in error state.
783 */
784 if (leader != counter)
785 group_sched_out(leader, cpuctx, ctx);
786 if (leader->attr.pinned) {
787 update_group_times(leader);
788 leader->state = PERF_COUNTER_STATE_ERROR;
789 }
790 }
791
792 if (!err && !ctx->task && cpuctx->max_pertask)
793 cpuctx->max_pertask--;
794
795 unlock:
796 perf_enable();
797
798 spin_unlock(&ctx->lock);
799}
800
801/*
802 * Attach a performance counter to a context
803 *
804 * First we add the counter to the list with the hardware enable bit
805 * in counter->hw_config cleared.
806 *
807 * If the counter is attached to a task which is on a CPU we use a smp
808 * call to enable it in the task context. The task might have been
809 * scheduled away, but we check this in the smp call again.
810 *
811 * Must be called with ctx->mutex held.
812 */
813static void
814perf_install_in_context(struct perf_counter_context *ctx,
815 struct perf_counter *counter,
816 int cpu)
817{
818 struct task_struct *task = ctx->task;
819
820 if (!task) {
821 /*
822 * Per cpu counters are installed via an smp call and
823 * the install is always sucessful.
824 */
825 smp_call_function_single(cpu, __perf_install_in_context,
826 counter, 1);
827 return;
828 }
829
830retry:
831 task_oncpu_function_call(task, __perf_install_in_context,
832 counter);
833
834 spin_lock_irq(&ctx->lock);
835 /*
836 * we need to retry the smp call.
837 */
838 if (ctx->is_active && list_empty(&counter->list_entry)) {
839 spin_unlock_irq(&ctx->lock);
840 goto retry;
841 }
842
843 /*
844 * The lock prevents that this context is scheduled in so we
845 * can add the counter safely, if it the call above did not
846 * succeed.
847 */
848 if (list_empty(&counter->list_entry))
849 add_counter_to_ctx(counter, ctx);
850 spin_unlock_irq(&ctx->lock);
851}
852
853/*
854 * Cross CPU call to enable a performance counter
855 */
856static void __perf_counter_enable(void *info)
857{
858 struct perf_counter *counter = info;
859 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
860 struct perf_counter_context *ctx = counter->ctx;
861 struct perf_counter *leader = counter->group_leader;
862 int err;
863
864 /*
865 * If this is a per-task counter, need to check whether this
866 * counter's task is the current task on this cpu.
867 */
868 if (ctx->task && cpuctx->task_ctx != ctx) {
869 if (cpuctx->task_ctx || ctx->task != current)
870 return;
871 cpuctx->task_ctx = ctx;
872 }
873
874 spin_lock(&ctx->lock);
875 ctx->is_active = 1;
876 update_context_time(ctx);
877
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE;
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882
883 /*
884 * If the counter is in a group and isn't the group leader,
885 * then don't put it on unless the group is on.
886 */
887 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
888 goto unlock;
889
890 if (!group_can_go_on(counter, cpuctx, 1)) {
891 err = -EEXIST;
892 } else {
893 perf_disable();
894 if (counter == leader)
895 err = group_sched_in(counter, cpuctx, ctx,
896 smp_processor_id());
897 else
898 err = counter_sched_in(counter, cpuctx, ctx,
899 smp_processor_id());
900 perf_enable();
901 }
902
903 if (err) {
904 /*
905 * If this counter can't go on and it's part of a
906 * group, then the whole group has to come off.
907 */
908 if (leader != counter)
909 group_sched_out(leader, cpuctx, ctx);
910 if (leader->attr.pinned) {
911 update_group_times(leader);
912 leader->state = PERF_COUNTER_STATE_ERROR;
913 }
914 }
915
916 unlock:
917 spin_unlock(&ctx->lock);
918}
919
920/*
921 * Enable a counter.
922 *
923 * If counter->ctx is a cloned context, callers must make sure that
924 * every task struct that counter->ctx->task could possibly point to
925 * remains valid. This condition is satisfied when called through
926 * perf_counter_for_each_child or perf_counter_for_each as described
927 * for perf_counter_disable.
928 */
929static void perf_counter_enable(struct perf_counter *counter)
930{
931 struct perf_counter_context *ctx = counter->ctx;
932 struct task_struct *task = ctx->task;
933
934 if (!task) {
935 /*
936 * Enable the counter on the cpu that it's on
937 */
938 smp_call_function_single(counter->cpu, __perf_counter_enable,
939 counter, 1);
940 return;
941 }
942
943 spin_lock_irq(&ctx->lock);
944 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
945 goto out;
946
947 /*
948 * If the counter is in error state, clear that first.
949 * That way, if we see the counter in error state below, we
950 * know that it has gone back into error state, as distinct
951 * from the task having been scheduled away before the
952 * cross-call arrived.
953 */
954 if (counter->state == PERF_COUNTER_STATE_ERROR)
955 counter->state = PERF_COUNTER_STATE_OFF;
956
957 retry:
958 spin_unlock_irq(&ctx->lock);
959 task_oncpu_function_call(task, __perf_counter_enable, counter);
960
961 spin_lock_irq(&ctx->lock);
962
963 /*
964 * If the context is active and the counter is still off,
965 * we need to retry the cross-call.
966 */
967 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
968 goto retry;
969
970 /*
971 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely.
973 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) {
975 counter->state = PERF_COUNTER_STATE_INACTIVE;
976 counter->tstamp_enabled =
977 ctx->time - counter->total_time_enabled;
978 }
979 out:
980 spin_unlock_irq(&ctx->lock);
981}
982
983static int perf_counter_refresh(struct perf_counter *counter, int refresh)
984{
985 /*
986 * not supported on inherited counters
987 */
988 if (counter->attr.inherit)
989 return -EINVAL;
990
991 atomic_add(refresh, &counter->event_limit);
992 perf_counter_enable(counter);
993
994 return 0;
995}
996
997void __perf_counter_sched_out(struct perf_counter_context *ctx,
998 struct perf_cpu_context *cpuctx)
999{
1000 struct perf_counter *counter;
1001
1002 spin_lock(&ctx->lock);
1003 ctx->is_active = 0;
1004 if (likely(!ctx->nr_counters))
1005 goto out;
1006 update_context_time(ctx);
1007
1008 perf_disable();
1009 if (ctx->nr_active) {
1010 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1011 if (counter != counter->group_leader)
1012 counter_sched_out(counter, cpuctx, ctx);
1013 else
1014 group_sched_out(counter, cpuctx, ctx);
1015 }
1016 }
1017 perf_enable();
1018 out:
1019 spin_unlock(&ctx->lock);
1020}
1021
1022/*
1023 * Test whether two contexts are equivalent, i.e. whether they
1024 * have both been cloned from the same version of the same context
1025 * and they both have the same number of enabled counters.
1026 * If the number of enabled counters is the same, then the set
1027 * of enabled counters should be the same, because these are both
1028 * inherited contexts, therefore we can't access individual counters
1029 * in them directly with an fd; we can only enable/disable all
1030 * counters via prctl, or enable/disable all counters in a family
1031 * via ioctl, which will have the same effect on both contexts.
1032 */
1033static int context_equiv(struct perf_counter_context *ctx1,
1034 struct perf_counter_context *ctx2)
1035{
1036 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1037 && ctx1->parent_gen == ctx2->parent_gen
1038 && !ctx1->pin_count && !ctx2->pin_count;
1039}
1040
1041static void __perf_counter_read(void *counter);
1042
1043static void __perf_counter_sync_stat(struct perf_counter *counter,
1044 struct perf_counter *next_counter)
1045{
1046 u64 value;
1047
1048 if (!counter->attr.inherit_stat)
1049 return;
1050
1051 /*
1052 * Update the counter value, we cannot use perf_counter_read()
1053 * because we're in the middle of a context switch and have IRQs
1054 * disabled, which upsets smp_call_function_single(), however
1055 * we know the counter must be on the current CPU, therefore we
1056 * don't need to use it.
1057 */
1058 switch (counter->state) {
1059 case PERF_COUNTER_STATE_ACTIVE:
1060 __perf_counter_read(counter);
1061 break;
1062
1063 case PERF_COUNTER_STATE_INACTIVE:
1064 update_counter_times(counter);
1065 break;
1066
1067 default:
1068 break;
1069 }
1070
1071 /*
1072 * In order to keep per-task stats reliable we need to flip the counter
1073 * values when we flip the contexts.
1074 */
1075 value = atomic64_read(&next_counter->count);
1076 value = atomic64_xchg(&counter->count, value);
1077 atomic64_set(&next_counter->count, value);
1078
1079 swap(counter->total_time_enabled, next_counter->total_time_enabled);
1080 swap(counter->total_time_running, next_counter->total_time_running);
1081
1082 /*
1083 * Since we swizzled the values, update the user visible data too.
1084 */
1085 perf_counter_update_userpage(counter);
1086 perf_counter_update_userpage(next_counter);
1087}
1088
/*
 * list_next_entry - get the entry following @pos in its list
 * @pos:	pointer to the current entry (any pointer expression)
 * @member:	name of the struct list_head member inside the entry
 *
 * @pos is parenthesized in the expansion so that arguments such as
 * "&obj" or "p + 1" bind correctly to the "->" operator.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1091
1092static void perf_counter_sync_stat(struct perf_counter_context *ctx,
1093 struct perf_counter_context *next_ctx)
1094{
1095 struct perf_counter *counter, *next_counter;
1096
1097 if (!ctx->nr_stat)
1098 return;
1099
1100 counter = list_first_entry(&ctx->event_list,
1101 struct perf_counter, event_entry);
1102
1103 next_counter = list_first_entry(&next_ctx->event_list,
1104 struct perf_counter, event_entry);
1105
1106 while (&counter->event_entry != &ctx->event_list &&
1107 &next_counter->event_entry != &next_ctx->event_list) {
1108
1109 __perf_counter_sync_stat(counter, next_counter);
1110
1111 counter = list_next_entry(counter, event_entry);
1112 next_counter = list_next_entry(next_counter, event_entry);
1113 }
1114}
1115
1116/*
1117 * Called from scheduler to remove the counters of the current task,
1118 * with interrupts disabled.
1119 *
1120 * We stop each counter and update the counter value in counter->count.
1121 *
1122 * This does not protect us against NMI, but disable()
1123 * sets the disabled bit in the control field of counter _before_
1124 * accessing the counter control register. If a NMI hits, then it will
1125 * not restart the counter.
1126 */
1127void perf_counter_task_sched_out(struct task_struct *task,
1128 struct task_struct *next, int cpu)
1129{
1130 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1131 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1132 struct perf_counter_context *next_ctx;
1133 struct perf_counter_context *parent;
1134 struct pt_regs *regs;
1135 int do_switch = 1;
1136
1137 regs = task_pt_regs(task);
1138 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1139
1140 if (likely(!ctx || !cpuctx->task_ctx))
1141 return;
1142
1143 update_context_time(ctx);
1144
1145 rcu_read_lock();
1146 parent = rcu_dereference(ctx->parent_ctx);
1147 next_ctx = next->perf_counter_ctxp;
1148 if (parent && next_ctx &&
1149 rcu_dereference(next_ctx->parent_ctx) == parent) {
1150 /*
1151 * Looks like the two contexts are clones, so we might be
1152 * able to optimize the context switch. We lock both
1153 * contexts and check that they are clones under the
1154 * lock (including re-checking that neither has been
1155 * uncloned in the meantime). It doesn't matter which
1156 * order we take the locks because no other cpu could
1157 * be trying to lock both of these tasks.
1158 */
1159 spin_lock(&ctx->lock);
1160 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1161 if (context_equiv(ctx, next_ctx)) {
1162 /*
1163 * XXX do we need a memory barrier of sorts
1164 * wrt to rcu_dereference() of perf_counter_ctxp
1165 */
1166 task->perf_counter_ctxp = next_ctx;
1167 next->perf_counter_ctxp = ctx;
1168 ctx->task = next;
1169 next_ctx->task = task;
1170 do_switch = 0;
1171
1172 perf_counter_sync_stat(ctx, next_ctx);
1173 }
1174 spin_unlock(&next_ctx->lock);
1175 spin_unlock(&ctx->lock);
1176 }
1177 rcu_read_unlock();
1178
1179 if (do_switch) {
1180 __perf_counter_sched_out(ctx, cpuctx);
1181 cpuctx->task_ctx = NULL;
1182 }
1183}
1184
1185/*
1186 * Called with IRQs disabled
1187 */
1188static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1189{
1190 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1191
1192 if (!cpuctx->task_ctx)
1193 return;
1194
1195 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1196 return;
1197
1198 __perf_counter_sched_out(ctx, cpuctx);
1199 cpuctx->task_ctx = NULL;
1200}
1201
1202/*
1203 * Called with IRQs disabled
1204 */
1205static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1206{
1207 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1208}
1209
1210static void
1211__perf_counter_sched_in(struct perf_counter_context *ctx,
1212 struct perf_cpu_context *cpuctx, int cpu)
1213{
1214 struct perf_counter *counter;
1215 int can_add_hw = 1;
1216
1217 spin_lock(&ctx->lock);
1218 ctx->is_active = 1;
1219 if (likely(!ctx->nr_counters))
1220 goto out;
1221
1222 ctx->timestamp = perf_clock();
1223
1224 perf_disable();
1225
1226 /*
1227 * First go through the list and put on any pinned groups
1228 * in order to give them the best chance of going on.
1229 */
1230 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1231 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1232 !counter->attr.pinned)
1233 continue;
1234 if (counter->cpu != -1 && counter->cpu != cpu)
1235 continue;
1236
1237 if (counter != counter->group_leader)
1238 counter_sched_in(counter, cpuctx, ctx, cpu);
1239 else {
1240 if (group_can_go_on(counter, cpuctx, 1))
1241 group_sched_in(counter, cpuctx, ctx, cpu);
1242 }
1243
1244 /*
1245 * If this pinned group hasn't been scheduled,
1246 * put it in error state.
1247 */
1248 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1249 update_group_times(counter);
1250 counter->state = PERF_COUNTER_STATE_ERROR;
1251 }
1252 }
1253
1254 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1255 /*
1256 * Ignore counters in OFF or ERROR state, and
1257 * ignore pinned counters since we did them already.
1258 */
1259 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1260 counter->attr.pinned)
1261 continue;
1262
1263 /*
1264 * Listen to the 'cpu' scheduling filter constraint
1265 * of counters:
1266 */
1267 if (counter->cpu != -1 && counter->cpu != cpu)
1268 continue;
1269
1270 if (counter != counter->group_leader) {
1271 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1272 can_add_hw = 0;
1273 } else {
1274 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1275 if (group_sched_in(counter, cpuctx, ctx, cpu))
1276 can_add_hw = 0;
1277 }
1278 }
1279 }
1280 perf_enable();
1281 out:
1282 spin_unlock(&ctx->lock);
1283}
1284
1285/*
1286 * Called from scheduler to add the counters of the current task
1287 * with interrupts disabled.
1288 *
1289 * We restore the counter value and then enable it.
1290 *
1291 * This does not protect us against NMI, but enable()
1292 * sets the enabled bit in the control field of counter _before_
1293 * accessing the counter control register. If a NMI hits, then it will
1294 * keep the counter running.
1295 */
1296void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1297{
1298 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1299 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1300
1301 if (likely(!ctx))
1302 return;
1303 if (cpuctx->task_ctx == ctx)
1304 return;
1305 __perf_counter_sched_in(ctx, cpuctx, cpu);
1306 cpuctx->task_ctx = ctx;
1307}
1308
1309static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1310{
1311 struct perf_counter_context *ctx = &cpuctx->ctx;
1312
1313 __perf_counter_sched_in(ctx, cpuctx, cpu);
1314}
1315
1316#define MAX_INTERRUPTS (~0ULL)
1317
1318static void perf_log_throttle(struct perf_counter *counter, int enable);
1319
1320static void perf_adjust_period(struct perf_counter *counter, u64 events)
1321{
1322 struct hw_perf_counter *hwc = &counter->hw;
1323 u64 period, sample_period;
1324 s64 delta;
1325
1326 events *= hwc->sample_period;
1327 period = div64_u64(events, counter->attr.sample_freq);
1328
1329 delta = (s64)(period - hwc->sample_period);
1330 delta = (delta + 7) / 8; /* low pass filter */
1331
1332 sample_period = hwc->sample_period + delta;
1333
1334 if (!sample_period)
1335 sample_period = 1;
1336
1337 hwc->sample_period = sample_period;
1338}
1339
1340static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1341{
1342 struct perf_counter *counter;
1343 struct hw_perf_counter *hwc;
1344 u64 interrupts, freq;
1345
1346 spin_lock(&ctx->lock);
1347 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1348 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1349 continue;
1350
1351 hwc = &counter->hw;
1352
1353 interrupts = hwc->interrupts;
1354 hwc->interrupts = 0;
1355
1356 /*
1357 * unthrottle counters on the tick
1358 */
1359 if (interrupts == MAX_INTERRUPTS) {
1360 perf_log_throttle(counter, 1);
1361 counter->pmu->unthrottle(counter);
1362 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1363 }
1364
1365 if (!counter->attr.freq || !counter->attr.sample_freq)
1366 continue;
1367
1368 /*
1369 * if the specified freq < HZ then we need to skip ticks
1370 */
1371 if (counter->attr.sample_freq < HZ) {
1372 freq = counter->attr.sample_freq;
1373
1374 hwc->freq_count += freq;
1375 hwc->freq_interrupts += interrupts;
1376
1377 if (hwc->freq_count < HZ)
1378 continue;
1379
1380 interrupts = hwc->freq_interrupts;
1381 hwc->freq_interrupts = 0;
1382 hwc->freq_count -= HZ;
1383 } else
1384 freq = HZ;
1385
1386 perf_adjust_period(counter, freq * interrupts);
1387
1388 /*
1389 * In order to avoid being stalled by an (accidental) huge
1390 * sample period, force reset the sample period if we didn't
1391 * get any events in this freq period.
1392 */
1393 if (!interrupts) {
1394 perf_disable();
1395 counter->pmu->disable(counter);
1396 atomic64_set(&hwc->period_left, 0);
1397 counter->pmu->enable(counter);
1398 perf_enable();
1399 }
1400 }
1401 spin_unlock(&ctx->lock);
1402}
1403
1404/*
1405 * Round-robin a context's counters:
1406 */
1407static void rotate_ctx(struct perf_counter_context *ctx)
1408{
1409 struct perf_counter *counter;
1410
1411 if (!ctx->nr_counters)
1412 return;
1413
1414 spin_lock(&ctx->lock);
1415 /*
1416 * Rotate the first entry last (works just fine for group counters too):
1417 */
1418 perf_disable();
1419 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1420 list_move_tail(&counter->list_entry, &ctx->counter_list);
1421 break;
1422 }
1423 perf_enable();
1424
1425 spin_unlock(&ctx->lock);
1426}
1427
1428void perf_counter_task_tick(struct task_struct *curr, int cpu)
1429{
1430 struct perf_cpu_context *cpuctx;
1431 struct perf_counter_context *ctx;
1432
1433 if (!atomic_read(&nr_counters))
1434 return;
1435
1436 cpuctx = &per_cpu(perf_cpu_context, cpu);
1437 ctx = curr->perf_counter_ctxp;
1438
1439 perf_ctx_adjust_freq(&cpuctx->ctx);
1440 if (ctx)
1441 perf_ctx_adjust_freq(ctx);
1442
1443 perf_counter_cpu_sched_out(cpuctx);
1444 if (ctx)
1445 __perf_counter_task_sched_out(ctx);
1446
1447 rotate_ctx(&cpuctx->ctx);
1448 if (ctx)
1449 rotate_ctx(ctx);
1450
1451 perf_counter_cpu_sched_in(cpuctx, cpu);
1452 if (ctx)
1453 perf_counter_task_sched_in(curr, cpu);
1454}
1455
1456/*
1457 * Enable all of a task's counters that have been marked enable-on-exec.
1458 * This expects task == current.
1459 */
1460static void perf_counter_enable_on_exec(struct task_struct *task)
1461{
1462 struct perf_counter_context *ctx;
1463 struct perf_counter *counter;
1464 unsigned long flags;
1465 int enabled = 0;
1466
1467 local_irq_save(flags);
1468 ctx = task->perf_counter_ctxp;
1469 if (!ctx || !ctx->nr_counters)
1470 goto out;
1471
1472 __perf_counter_task_sched_out(ctx);
1473
1474 spin_lock(&ctx->lock);
1475
1476 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1477 if (!counter->attr.enable_on_exec)
1478 continue;
1479 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE;
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1;
1486 }
1487
1488 /*
1489 * Unclone this context if we enabled any counter.
1490 */
1491 if (enabled)
1492 unclone_ctx(ctx);
1493
1494 spin_unlock(&ctx->lock);
1495
1496 perf_counter_task_sched_in(task, smp_processor_id());
1497 out:
1498 local_irq_restore(flags);
1499}
1500
1501/*
1502 * Cross CPU call to read the hardware counter
1503 */
1504static void __perf_counter_read(void *info)
1505{
1506 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1507 struct perf_counter *counter = info;
1508 struct perf_counter_context *ctx = counter->ctx;
1509 unsigned long flags;
1510
1511 /*
1512 * If this is a task context, we need to check whether it is
1513 * the current task context of this cpu. If not it has been
1514 * scheduled out before the smp call arrived. In that case
1515 * counter->count would have been updated to a recent sample
1516 * when the counter was scheduled out.
1517 */
1518 if (ctx->task && cpuctx->task_ctx != ctx)
1519 return;
1520
1521 local_irq_save(flags);
1522 if (ctx->is_active)
1523 update_context_time(ctx);
1524 counter->pmu->read(counter);
1525 update_counter_times(counter);
1526 local_irq_restore(flags);
1527}
1528
1529static u64 perf_counter_read(struct perf_counter *counter)
1530{
1531 /*
1532 * If counter is enabled and currently active on a CPU, update the
1533 * value in the counter structure:
1534 */
1535 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1536 smp_call_function_single(counter->oncpu,
1537 __perf_counter_read, counter, 1);
1538 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1539 update_counter_times(counter);
1540 }
1541
1542 return atomic64_read(&counter->count);
1543}
1544
1545/*
1546 * Initialize the perf_counter context in a task_struct:
1547 */
1548static void
1549__perf_counter_init_context(struct perf_counter_context *ctx,
1550 struct task_struct *task)
1551{
1552 memset(ctx, 0, sizeof(*ctx));
1553 spin_lock_init(&ctx->lock);
1554 mutex_init(&ctx->mutex);
1555 INIT_LIST_HEAD(&ctx->counter_list);
1556 INIT_LIST_HEAD(&ctx->event_list);
1557 atomic_set(&ctx->refcount, 1);
1558 ctx->task = task;
1559}
1560
1561static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1562{
1563 struct perf_counter_context *ctx;
1564 struct perf_cpu_context *cpuctx;
1565 struct task_struct *task;
1566 unsigned long flags;
1567 int err;
1568
1569 /*
1570 * If cpu is not a wildcard then this is a percpu counter:
1571 */
1572 if (cpu != -1) {
1573 /* Must be root to operate on a CPU counter: */
1574 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1575 return ERR_PTR(-EACCES);
1576
1577 if (cpu < 0 || cpu > num_possible_cpus())
1578 return ERR_PTR(-EINVAL);
1579
1580 /*
1581 * We could be clever and allow to attach a counter to an
1582 * offline CPU and activate it when the CPU comes up, but
1583 * that's for later.
1584 */
1585 if (!cpu_isset(cpu, cpu_online_map))
1586 return ERR_PTR(-ENODEV);
1587
1588 cpuctx = &per_cpu(perf_cpu_context, cpu);
1589 ctx = &cpuctx->ctx;
1590 get_ctx(ctx);
1591
1592 return ctx;
1593 }
1594
1595 rcu_read_lock();
1596 if (!pid)
1597 task = current;
1598 else
1599 task = find_task_by_vpid(pid);
1600 if (task)
1601 get_task_struct(task);
1602 rcu_read_unlock();
1603
1604 if (!task)
1605 return ERR_PTR(-ESRCH);
1606
1607 /*
1608 * Can't attach counters to a dying task.
1609 */
1610 err = -ESRCH;
1611 if (task->flags & PF_EXITING)
1612 goto errout;
1613
1614 /* Reuse ptrace permission checks for now. */
1615 err = -EACCES;
1616 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1617 goto errout;
1618
1619 retry:
1620 ctx = perf_lock_task_context(task, &flags);
1621 if (ctx) {
1622 unclone_ctx(ctx);
1623 spin_unlock_irqrestore(&ctx->lock, flags);
1624 }
1625
1626 if (!ctx) {
1627 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1628 err = -ENOMEM;
1629 if (!ctx)
1630 goto errout;
1631 __perf_counter_init_context(ctx, task);
1632 get_ctx(ctx);
1633 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1634 /*
1635 * We raced with some other task; use
1636 * the context they set.
1637 */
1638 kfree(ctx);
1639 goto retry;
1640 }
1641 get_task_struct(task);
1642 }
1643
1644 put_task_struct(task);
1645 return ctx;
1646
1647 errout:
1648 put_task_struct(task);
1649 return ERR_PTR(err);
1650}
1651
1652static void free_counter_rcu(struct rcu_head *head)
1653{
1654 struct perf_counter *counter;
1655
1656 counter = container_of(head, struct perf_counter, rcu_head);
1657 if (counter->ns)
1658 put_pid_ns(counter->ns);
1659 kfree(counter);
1660}
1661
1662static void perf_pending_sync(struct perf_counter *counter);
1663
1664static void free_counter(struct perf_counter *counter)
1665{
1666 perf_pending_sync(counter);
1667
1668 if (!counter->parent) {
1669 atomic_dec(&nr_counters);
1670 if (counter->attr.mmap)
1671 atomic_dec(&nr_mmap_counters);
1672 if (counter->attr.comm)
1673 atomic_dec(&nr_comm_counters);
1674 if (counter->attr.task)
1675 atomic_dec(&nr_task_counters);
1676 }
1677
1678 if (counter->destroy)
1679 counter->destroy(counter);
1680
1681 put_ctx(counter->ctx);
1682 call_rcu(&counter->rcu_head, free_counter_rcu);
1683}
1684
1685/*
1686 * Called when the last reference to the file is gone.
1687 */
1688static int perf_release(struct inode *inode, struct file *file)
1689{
1690 struct perf_counter *counter = file->private_data;
1691 struct perf_counter_context *ctx = counter->ctx;
1692
1693 file->private_data = NULL;
1694
1695 WARN_ON_ONCE(ctx->parent_ctx);
1696 mutex_lock(&ctx->mutex);
1697 perf_counter_remove_from_context(counter);
1698 mutex_unlock(&ctx->mutex);
1699
1700 mutex_lock(&counter->owner->perf_counter_mutex);
1701 list_del_init(&counter->owner_entry);
1702 mutex_unlock(&counter->owner->perf_counter_mutex);
1703 put_task_struct(counter->owner);
1704
1705 free_counter(counter);
1706
1707 return 0;
1708}
1709
1710static int perf_counter_read_size(struct perf_counter *counter)
1711{
1712 int entry = sizeof(u64); /* value */
1713 int size = 0;
1714 int nr = 1;
1715
1716 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1717 size += sizeof(u64);
1718
1719 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1720 size += sizeof(u64);
1721
1722 if (counter->attr.read_format & PERF_FORMAT_ID)
1723 entry += sizeof(u64);
1724
1725 if (counter->attr.read_format & PERF_FORMAT_GROUP) {
1726 nr += counter->group_leader->nr_siblings;
1727 size += sizeof(u64);
1728 }
1729
1730 size += entry * nr;
1731
1732 return size;
1733}
1734
1735static u64 perf_counter_read_value(struct perf_counter *counter)
1736{
1737 struct perf_counter *child;
1738 u64 total = 0;
1739
1740 total += perf_counter_read(counter);
1741 list_for_each_entry(child, &counter->child_list, child_list)
1742 total += perf_counter_read(child);
1743
1744 return total;
1745}
1746
1747static int perf_counter_read_entry(struct perf_counter *counter,
1748 u64 read_format, char __user *buf)
1749{
1750 int n = 0, count = 0;
1751 u64 values[2];
1752
1753 values[n++] = perf_counter_read_value(counter);
1754 if (read_format & PERF_FORMAT_ID)
1755 values[n++] = primary_counter_id(counter);
1756
1757 count = n * sizeof(u64);
1758
1759 if (copy_to_user(buf, values, count))
1760 return -EFAULT;
1761
1762 return count;
1763}
1764
1765static int perf_counter_read_group(struct perf_counter *counter,
1766 u64 read_format, char __user *buf)
1767{
1768 struct perf_counter *leader = counter->group_leader, *sub;
1769 int n = 0, size = 0, err = -EFAULT;
1770 u64 values[3];
1771
1772 values[n++] = 1 + leader->nr_siblings;
1773 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1774 values[n++] = leader->total_time_enabled +
1775 atomic64_read(&leader->child_total_time_enabled);
1776 }
1777 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1778 values[n++] = leader->total_time_running +
1779 atomic64_read(&leader->child_total_time_running);
1780 }
1781
1782 size = n * sizeof(u64);
1783
1784 if (copy_to_user(buf, values, size))
1785 return -EFAULT;
1786
1787 err = perf_counter_read_entry(leader, read_format, buf + size);
1788 if (err < 0)
1789 return err;
1790
1791 size += err;
1792
1793 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1794 err = perf_counter_read_entry(sub, read_format,
1795 buf + size);
1796 if (err < 0)
1797 return err;
1798
1799 size += err;
1800 }
1801
1802 return size;
1803}
1804
1805static int perf_counter_read_one(struct perf_counter *counter,
1806 u64 read_format, char __user *buf)
1807{
1808 u64 values[4];
1809 int n = 0;
1810
1811 values[n++] = perf_counter_read_value(counter);
1812 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1813 values[n++] = counter->total_time_enabled +
1814 atomic64_read(&counter->child_total_time_enabled);
1815 }
1816 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1817 values[n++] = counter->total_time_running +
1818 atomic64_read(&counter->child_total_time_running);
1819 }
1820 if (read_format & PERF_FORMAT_ID)
1821 values[n++] = primary_counter_id(counter);
1822
1823 if (copy_to_user(buf, values, n * sizeof(u64)))
1824 return -EFAULT;
1825
1826 return n * sizeof(u64);
1827}
1828
1829/*
1830 * Read the performance counter - simple non blocking version for now
1831 */
1832static ssize_t
1833perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1834{
1835 u64 read_format = counter->attr.read_format;
1836 int ret;
1837
1838 /*
1839 * Return end-of-file for a read on a counter that is in
1840 * error state (i.e. because it was pinned but it couldn't be
1841 * scheduled on to the CPU at some point).
1842 */
1843 if (counter->state == PERF_COUNTER_STATE_ERROR)
1844 return 0;
1845
1846 if (count < perf_counter_read_size(counter))
1847 return -ENOSPC;
1848
1849 WARN_ON_ONCE(counter->ctx->parent_ctx);
1850 mutex_lock(&counter->child_mutex);
1851 if (read_format & PERF_FORMAT_GROUP)
1852 ret = perf_counter_read_group(counter, read_format, buf);
1853 else
1854 ret = perf_counter_read_one(counter, read_format, buf);
1855 mutex_unlock(&counter->child_mutex);
1856
1857 return ret;
1858}
1859
1860static ssize_t
1861perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1862{
1863 struct perf_counter *counter = file->private_data;
1864
1865 return perf_read_hw(counter, buf, count);
1866}
1867
1868static unsigned int perf_poll(struct file *file, poll_table *wait)
1869{
1870 struct perf_counter *counter = file->private_data;
1871 struct perf_mmap_data *data;
1872 unsigned int events = POLL_HUP;
1873
1874 rcu_read_lock();
1875 data = rcu_dereference(counter->data);
1876 if (data)
1877 events = atomic_xchg(&data->poll, 0);
1878 rcu_read_unlock();
1879
1880 poll_wait(file, &counter->waitq, wait);
1881
1882 return events;
1883}
1884
1885static void perf_counter_reset(struct perf_counter *counter)
1886{
1887 (void)perf_counter_read(counter);
1888 atomic64_set(&counter->count, 0);
1889 perf_counter_update_userpage(counter);
1890}
1891
1892/*
1893 * Holding the top-level counter's child_mutex means that any
1894 * descendant process that has inherited this counter will block
1895 * in sync_child_counter if it goes to exit, thus satisfying the
1896 * task existence requirements of perf_counter_enable/disable.
1897 */
1898static void perf_counter_for_each_child(struct perf_counter *counter,
1899 void (*func)(struct perf_counter *))
1900{
1901 struct perf_counter *child;
1902
1903 WARN_ON_ONCE(counter->ctx->parent_ctx);
1904 mutex_lock(&counter->child_mutex);
1905 func(counter);
1906 list_for_each_entry(child, &counter->child_list, child_list)
1907 func(child);
1908 mutex_unlock(&counter->child_mutex);
1909}
1910
1911static void perf_counter_for_each(struct perf_counter *counter,
1912 void (*func)(struct perf_counter *))
1913{
1914 struct perf_counter_context *ctx = counter->ctx;
1915 struct perf_counter *sibling;
1916
1917 WARN_ON_ONCE(ctx->parent_ctx);
1918 mutex_lock(&ctx->mutex);
1919 counter = counter->group_leader;
1920
1921 perf_counter_for_each_child(counter, func);
1922 func(counter);
1923 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1924 perf_counter_for_each_child(counter, func);
1925 mutex_unlock(&ctx->mutex);
1926}
1927
1928static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1929{
1930 struct perf_counter_context *ctx = counter->ctx;
1931 unsigned long size;
1932 int ret = 0;
1933 u64 value;
1934
1935 if (!counter->attr.sample_period)
1936 return -EINVAL;
1937
1938 size = copy_from_user(&value, arg, sizeof(value));
1939 if (size != sizeof(value))
1940 return -EFAULT;
1941
1942 if (!value)
1943 return -EINVAL;
1944
1945 spin_lock_irq(&ctx->lock);
1946 if (counter->attr.freq) {
1947 if (value > sysctl_perf_counter_sample_rate) {
1948 ret = -EINVAL;
1949 goto unlock;
1950 }
1951
1952 counter->attr.sample_freq = value;
1953 } else {
1954 counter->attr.sample_period = value;
1955 counter->hw.sample_period = value;
1956 }
1957unlock:
1958 spin_unlock_irq(&ctx->lock);
1959
1960 return ret;
1961}
1962
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{
1965 struct perf_counter *counter = file->private_data;
1966 void (*func)(struct perf_counter *);
1967 u32 flags = arg;
1968
1969 switch (cmd) {
1970 case PERF_COUNTER_IOC_ENABLE:
1971 func = perf_counter_enable;
1972 break;
1973 case PERF_COUNTER_IOC_DISABLE:
1974 func = perf_counter_disable;
1975 break;
1976 case PERF_COUNTER_IOC_RESET:
1977 func = perf_counter_reset;
1978 break;
1979
1980 case PERF_COUNTER_IOC_REFRESH:
1981 return perf_counter_refresh(counter, arg);
1982
1983 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg);
1985
1986 default:
1987 return -ENOTTY;
1988 }
1989
1990 if (flags & PERF_IOC_FLAG_GROUP)
1991 perf_counter_for_each(counter, func);
1992 else
1993 perf_counter_for_each_child(counter, func);
1994
1995 return 0;
1996}
1997
1998int perf_counter_task_enable(void)
1999{
2000 struct perf_counter *counter;
2001
2002 mutex_lock(&current->perf_counter_mutex);
2003 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2004 perf_counter_for_each_child(counter, perf_counter_enable);
2005 mutex_unlock(&current->perf_counter_mutex);
2006
2007 return 0;
2008}
2009
2010int perf_counter_task_disable(void)
2011{
2012 struct perf_counter *counter;
2013
2014 mutex_lock(&current->perf_counter_mutex);
2015 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
2016 perf_counter_for_each_child(counter, perf_counter_disable);
2017 mutex_unlock(&current->perf_counter_mutex);
2018
2019 return 0;
2020}
2021
2022#ifndef PERF_COUNTER_INDEX_OFFSET
2023# define PERF_COUNTER_INDEX_OFFSET 0
2024#endif
2025
2026static int perf_counter_index(struct perf_counter *counter)
2027{
2028 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2029 return 0;
2030
2031 return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET;
2032}
2033
2034/*
2035 * Callers need to ensure there can be no nesting of this function, otherwise
2036 * the seqlock logic goes bad. We can not serialize this because the arch
2037 * code calls this from NMI context.
2038 */
2039void perf_counter_update_userpage(struct perf_counter *counter)
2040{
2041 struct perf_counter_mmap_page *userpg;
2042 struct perf_mmap_data *data;
2043
2044 rcu_read_lock();
2045 data = rcu_dereference(counter->data);
2046 if (!data)
2047 goto unlock;
2048
2049 userpg = data->user_page;
2050
2051 /*
2052 * Disable preemption so as to not let the corresponding user-space
2053 * spin too long if we get preempted.
2054 */
2055 preempt_disable();
2056 ++userpg->lock;
2057 barrier();
2058 userpg->index = perf_counter_index(counter);
2059 userpg->offset = atomic64_read(&counter->count);
2060 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
2061 userpg->offset -= atomic64_read(&counter->hw.prev_count);
2062
2063 userpg->time_enabled = counter->total_time_enabled +
2064 atomic64_read(&counter->child_total_time_enabled);
2065
2066 userpg->time_running = counter->total_time_running +
2067 atomic64_read(&counter->child_total_time_running);
2068
2069 barrier();
2070 ++userpg->lock;
2071 preempt_enable();
2072unlock:
2073 rcu_read_unlock();
2074}
2075
2076static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2077{
2078 struct perf_counter *counter = vma->vm_file->private_data;
2079 struct perf_mmap_data *data;
2080 int ret = VM_FAULT_SIGBUS;
2081
2082 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2083 if (vmf->pgoff == 0)
2084 ret = 0;
2085 return ret;
2086 }
2087
2088 rcu_read_lock();
2089 data = rcu_dereference(counter->data);
2090 if (!data)
2091 goto unlock;
2092
2093 if (vmf->pgoff == 0) {
2094 vmf->page = virt_to_page(data->user_page);
2095 } else {
2096 int nr = vmf->pgoff - 1;
2097
2098 if ((unsigned)nr > data->nr_pages)
2099 goto unlock;
2100
2101 if (vmf->flags & FAULT_FLAG_WRITE)
2102 goto unlock;
2103
2104 vmf->page = virt_to_page(data->data_pages[nr]);
2105 }
2106
2107 get_page(vmf->page);
2108 vmf->page->mapping = vma->vm_file->f_mapping;
2109 vmf->page->index = vmf->pgoff;
2110
2111 ret = 0;
2112unlock:
2113 rcu_read_unlock();
2114
2115 return ret;
2116}
2117
2118static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
2119{
2120 struct perf_mmap_data *data;
2121 unsigned long size;
2122 int i;
2123
2124 WARN_ON(atomic_read(&counter->mmap_count));
2125
2126 size = sizeof(struct perf_mmap_data);
2127 size += nr_pages * sizeof(void *);
2128
2129 data = kzalloc(size, GFP_KERNEL);
2130 if (!data)
2131 goto fail;
2132
2133 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2134 if (!data->user_page)
2135 goto fail_user_page;
2136
2137 for (i = 0; i < nr_pages; i++) {
2138 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2139 if (!data->data_pages[i])
2140 goto fail_data_pages;
2141 }
2142
2143 data->nr_pages = nr_pages;
2144 atomic_set(&data->lock, -1);
2145
2146 rcu_assign_pointer(counter->data, data);
2147
2148 return 0;
2149
2150fail_data_pages:
2151 for (i--; i >= 0; i--)
2152 free_page((unsigned long)data->data_pages[i]);
2153
2154 free_page((unsigned long)data->user_page);
2155
2156fail_user_page:
2157 kfree(data);
2158
2159fail:
2160 return -ENOMEM;
2161}
2162
2163static void perf_mmap_free_page(unsigned long addr)
2164{
2165 struct page *page = virt_to_page((void *)addr);
2166
2167 page->mapping = NULL;
2168 __free_page(page);
2169}
2170
2171static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2172{
2173 struct perf_mmap_data *data;
2174 int i;
2175
2176 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2177
2178 perf_mmap_free_page((unsigned long)data->user_page);
2179 for (i = 0; i < data->nr_pages; i++)
2180 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2181
2182 kfree(data);
2183}
2184
2185static void perf_mmap_data_free(struct perf_counter *counter)
2186{
2187 struct perf_mmap_data *data = counter->data;
2188
2189 WARN_ON(atomic_read(&counter->mmap_count));
2190
2191 rcu_assign_pointer(counter->data, NULL);
2192 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2193}
2194
2195static void perf_mmap_open(struct vm_area_struct *vma)
2196{
2197 struct perf_counter *counter = vma->vm_file->private_data;
2198
2199 atomic_inc(&counter->mmap_count);
2200}
2201
2202static void perf_mmap_close(struct vm_area_struct *vma)
2203{
2204 struct perf_counter *counter = vma->vm_file->private_data;
2205
2206 WARN_ON_ONCE(counter->ctx->parent_ctx);
2207 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
2208 struct user_struct *user = current_user();
2209
2210 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
2211 vma->vm_mm->locked_vm -= counter->data->nr_locked;
2212 perf_mmap_data_free(counter);
2213 mutex_unlock(&counter->mmap_mutex);
2214 }
2215}
2216
2217static struct vm_operations_struct perf_mmap_vmops = {
2218 .open = perf_mmap_open,
2219 .close = perf_mmap_close,
2220 .fault = perf_mmap_fault,
2221 .page_mkwrite = perf_mmap_fault,
2222};
2223
2224static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2225{
2226 struct perf_counter *counter = file->private_data;
2227 unsigned long user_locked, user_lock_limit;
2228 struct user_struct *user = current_user();
2229 unsigned long locked, lock_limit;
2230 unsigned long vma_size;
2231 unsigned long nr_pages;
2232 long user_extra, extra;
2233 int ret = 0;
2234
2235 if (!(vma->vm_flags & VM_SHARED))
2236 return -EINVAL;
2237
2238 vma_size = vma->vm_end - vma->vm_start;
2239 nr_pages = (vma_size / PAGE_SIZE) - 1;
2240
2241 /*
2242 * If we have data pages ensure they're a power-of-two number, so we
2243 * can do bitmasks instead of modulo.
2244 */
2245 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2246 return -EINVAL;
2247
2248 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2249 return -EINVAL;
2250
2251 if (vma->vm_pgoff != 0)
2252 return -EINVAL;
2253
2254 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex);
2256 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL;
2259 goto unlock;
2260 }
2261
2262 user_extra = nr_pages + 1;
2263 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
2264
2265 /*
2266 * Increase the limit linearly with more CPUs:
2267 */
2268 user_lock_limit *= num_online_cpus();
2269
2270 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2271
2272 extra = 0;
2273 if (user_locked > user_lock_limit)
2274 extra = user_locked - user_lock_limit;
2275
2276 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2277 lock_limit >>= PAGE_SHIFT;
2278 locked = vma->vm_mm->locked_vm + extra;
2279
2280 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
2281 ret = -EPERM;
2282 goto unlock;
2283 }
2284
2285 WARN_ON(counter->data);
2286 ret = perf_mmap_data_alloc(counter, nr_pages);
2287 if (ret)
2288 goto unlock;
2289
2290 atomic_set(&counter->mmap_count, 1);
2291 atomic_long_add(user_extra, &user->locked_vm);
2292 vma->vm_mm->locked_vm += extra;
2293 counter->data->nr_locked = extra;
2294 if (vma->vm_flags & VM_WRITE)
2295 counter->data->writable = 1;
2296
2297unlock:
2298 mutex_unlock(&counter->mmap_mutex);
2299
2300 vma->vm_flags |= VM_RESERVED;
2301 vma->vm_ops = &perf_mmap_vmops;
2302
2303 return ret;
2304}
2305
2306static int perf_fasync(int fd, struct file *filp, int on)
2307{
2308 struct inode *inode = filp->f_path.dentry->d_inode;
2309 struct perf_counter *counter = filp->private_data;
2310 int retval;
2311
2312 mutex_lock(&inode->i_mutex);
2313 retval = fasync_helper(fd, filp, on, &counter->fasync);
2314 mutex_unlock(&inode->i_mutex);
2315
2316 if (retval < 0)
2317 return retval;
2318
2319 return 0;
2320}
2321
2322static const struct file_operations perf_fops = {
2323 .release = perf_release,
2324 .read = perf_read,
2325 .poll = perf_poll,
2326 .unlocked_ioctl = perf_ioctl,
2327 .compat_ioctl = perf_ioctl,
2328 .mmap = perf_mmap,
2329 .fasync = perf_fasync,
2330};
2331
2332/*
2333 * Perf counter wakeup
2334 *
2335 * If there's data, ensure we set the poll() state and publish everything
2336 * to user-space before waking everybody up.
2337 */
2338
2339void perf_counter_wakeup(struct perf_counter *counter)
2340{
2341 wake_up_all(&counter->waitq);
2342
2343 if (counter->pending_kill) {
2344 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2345 counter->pending_kill = 0;
2346 }
2347}
2348
2349/*
2350 * Pending wakeups
2351 *
2352 * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
2353 *
2354 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2355 * single linked list and use cmpxchg() to add entries lockless.
2356 */
2357
2358static void perf_pending_counter(struct perf_pending_entry *entry)
2359{
2360 struct perf_counter *counter = container_of(entry,
2361 struct perf_counter, pending);
2362
2363 if (counter->pending_disable) {
2364 counter->pending_disable = 0;
2365 __perf_counter_disable(counter);
2366 }
2367
2368 if (counter->pending_wakeup) {
2369 counter->pending_wakeup = 0;
2370 perf_counter_wakeup(counter);
2371 }
2372}
2373
2374#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2375
2376static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2377 PENDING_TAIL,
2378};
2379
2380static void perf_pending_queue(struct perf_pending_entry *entry,
2381 void (*func)(struct perf_pending_entry *))
2382{
2383 struct perf_pending_entry **head;
2384
2385 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2386 return;
2387
2388 entry->func = func;
2389
2390 head = &get_cpu_var(perf_pending_head);
2391
2392 do {
2393 entry->next = *head;
2394 } while (cmpxchg(head, entry->next, entry) != entry->next);
2395
2396 set_perf_counter_pending();
2397
2398 put_cpu_var(perf_pending_head);
2399}
2400
2401static int __perf_pending_run(void)
2402{
2403 struct perf_pending_entry *list;
2404 int nr = 0;
2405
2406 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2407 while (list != PENDING_TAIL) {
2408 void (*func)(struct perf_pending_entry *);
2409 struct perf_pending_entry *entry = list;
2410
2411 list = list->next;
2412
2413 func = entry->func;
2414 entry->next = NULL;
2415 /*
2416 * Ensure we observe the unqueue before we issue the wakeup,
2417 * so that we won't be waiting forever.
2418 * -- see perf_not_pending().
2419 */
2420 smp_wmb();
2421
2422 func(entry);
2423 nr++;
2424 }
2425
2426 return nr;
2427}
2428
2429static inline int perf_not_pending(struct perf_counter *counter)
2430{
2431 /*
2432 * If we flush on whatever cpu we run, there is a chance we don't
2433 * need to wait.
2434 */
2435 get_cpu();
2436 __perf_pending_run();
2437 put_cpu();
2438
2439 /*
2440 * Ensure we see the proper queue state before going to sleep
2441 * so that we do not miss the wakeup. -- see perf_pending_handle()
2442 */
2443 smp_rmb();
2444 return counter->pending.next == NULL;
2445}
2446
2447static void perf_pending_sync(struct perf_counter *counter)
2448{
2449 wait_event(counter->waitq, perf_not_pending(counter));
2450}
2451
/* Exported entry point: run the pending list of the current CPU. */
void perf_counter_do_pending(void)
{
	(void)__perf_pending_run();
}
2456
2457/*
2458 * Callchain support -- arch specific
2459 */
2460
2461__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2462{
2463 return NULL;
2464}
2465
2466/*
2467 * Output
2468 */
2469
/* State of one in-progress write into a counter's mmap buffer. */
struct perf_output_handle {
	struct perf_counter *counter;	/* counter the record belongs to */
	struct perf_mmap_data *data;	/* its mmap buffer */
	unsigned long head;		/* presumably the end of the reserved window - see perf_output_copy() */
	unsigned long offset;		/* current write position, advanced by perf_output_copy() */
	int nmi;			/* non-zero: wakeups are deferred via the pending queue */
	int sample;			/* NOTE(review): not used by the helpers above; confirm meaning in perf_output_begin() */
	int locked;			/* set by perf_output_lock() when this handle took data->lock */
	unsigned long flags;		/* IRQ flags saved by perf_output_lock() */
};
2480
2481static bool perf_output_space(struct perf_mmap_data *data,
2482 unsigned int offset, unsigned int head)
2483{
2484 unsigned long tail;
2485 unsigned long mask;
2486
2487 if (!data->writable)
2488 return true;
2489
2490 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2491 /*
2492 * Userspace could choose to issue a mb() before updating the tail
2493 * pointer. So that all reads will be completed before the write is
2494 * issued.
2495 */
2496 tail = ACCESS_ONCE(data->user_page->data_tail);
2497 smp_rmb();
2498
2499 offset = (offset - tail) & mask;
2500 head = (head - tail) & mask;
2501
2502 if ((int)(head - offset) < 0)
2503 return false;
2504
2505 return true;
2506}
2507
2508static void perf_output_wakeup(struct perf_output_handle *handle)
2509{
2510 atomic_set(&handle->data->poll, POLL_IN);
2511
2512 if (handle->nmi) {
2513 handle->counter->pending_wakeup = 1;
2514 perf_pending_queue(&handle->counter->pending,
2515 perf_pending_counter);
2516 } else
2517 perf_counter_wakeup(handle->counter);
2518}
2519
2520/*
2521 * Curious locking construct.
2522 *
2523 * We need to ensure a later event doesn't publish a head when a former
2524 * event isn't done writing. However since we need to deal with NMIs we
2525 * cannot fully serialize things.
2526 *
2527 * What we do is serialize between CPUs so we only have to deal with NMI
2528 * nesting on a single CPU.
2529 *
2530 * We only publish the head (and generate a wakeup) when the outer-most
2531 * event completes.
2532 */
2533static void perf_output_lock(struct perf_output_handle *handle)
2534{
2535 struct perf_mmap_data *data = handle->data;
2536 int cpu;
2537
2538 handle->locked = 0;
2539
2540 local_irq_save(handle->flags);
2541 cpu = smp_processor_id();
2542
2543 if (in_nmi() && atomic_read(&data->lock) == cpu)
2544 return;
2545
2546 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2547 cpu_relax();
2548
2549 handle->locked = 1;
2550}
2551
2552static void perf_output_unlock(struct perf_output_handle *handle)
2553{
2554 struct perf_mmap_data *data = handle->data;
2555 unsigned long head;
2556 int cpu;
2557
2558 data->done_head = data->head;
2559
2560 if (!handle->locked)
2561 goto out;
2562
2563again:
2564 /*
2565 * The xchg implies a full barrier that ensures all writes are done
2566 * before we publish the new head, matched by a rmb() in userspace when
2567 * reading this position.
2568 */
2569 while ((head = atomic_long_xchg(&data->done_head, 0)))
2570 data->user_page->data_head = head;
2571
2572 /*
2573 * NMI can happen here, which means we can miss a done_head update.
2574 */
2575
2576 cpu = atomic_xchg(&data->lock, -1);
2577 WARN_ON_ONCE(cpu != smp_processor_id());
2578
2579 /*
2580 * Therefore we have to validate we did not indeed do so.
2581 */
2582 if (unlikely(atomic_long_read(&data->done_head))) {
2583 /*
2584 * Since we had it locked, we can lock it again.
2585 */
2586 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2587 cpu_relax();
2588
2589 goto again;
2590 }
2591
2592 if (atomic_xchg(&data->wakeup, 0))
2593 perf_output_wakeup(handle);
2594out:
2595 local_irq_restore(handle->flags);
2596}
2597
2598static void perf_output_copy(struct perf_output_handle *handle,
2599 const void *buf, unsigned int len)
2600{
2601 unsigned int pages_mask;
2602 unsigned int offset;
2603 unsigned int size;
2604 void **pages;
2605
2606 offset = handle->offset;
2607 pages_mask = handle->data->nr_pages - 1;
2608 pages = handle->data->data_pages;
2609
2610 do {
2611 unsigned int page_offset;
2612 int nr;
2613
2614 nr = (offset >> PAGE_SHIFT) & pages_mask;
2615 page_offset = offset & (PAGE_SIZE - 1);
2616 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2617
2618 memcpy(pages[nr] + page_offset, buf, size);
2619
2620 len -= size;
2621 buf += size;
2622 offset += size;
2623 } while (len);
2624
2625 handle->offset = offset;
2626
2627 /*
2628 * Check we didn't copy past our reservation window, taking the
2629 * possible unsigned int wrap into account.
2630 */
2631 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2632}
2633
2634#define perf_output_put(handle, x) \
2635 perf_output_copy((handle), &(x), sizeof(x))
2636
2637static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample)
2640{
2641 struct perf_mmap_data *data;
2642 unsigned int offset, head;
2643 int have_lost;
2644 struct {
2645 struct perf_event_header header;
2646 u64 id;
2647 u64 lost;
2648 } lost_event;
2649
2650 /*
2651 * For inherited counters we send all the output towards the parent.
2652 */
2653 if (counter->parent)
2654 counter = counter->parent;
2655
2656 rcu_read_lock();
2657 data = rcu_dereference(counter->data);
2658 if (!data)
2659 goto out;
2660
2661 handle->data = data;
2662 handle->counter = counter;
2663 handle->nmi = nmi;
2664 handle->sample = sample;
2665
2666 if (!data->nr_pages)
2667 goto fail;
2668
2669 have_lost = atomic_read(&data->lost);
2670 if (have_lost)
2671 size += sizeof(lost_event);
2672
2673 perf_output_lock(handle);
2674
2675 do {
2676 offset = head = atomic_long_read(&data->head);
2677 head += size;
2678 if (unlikely(!perf_output_space(data, offset, head)))
2679 goto fail;
2680 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2681
2682 handle->offset = offset;
2683 handle->head = head;
2684
2685 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2686 atomic_set(&data->wakeup, 1);
2687
2688 if (have_lost) {
2689 lost_event.header.type = PERF_EVENT_LOST;
2690 lost_event.header.misc = 0;
2691 lost_event.header.size = sizeof(lost_event);
2692 lost_event.id = counter->id;
2693 lost_event.lost = atomic_xchg(&data->lost, 0);
2694
2695 perf_output_put(handle, lost_event);
2696 }
2697
2698 return 0;
2699
2700fail:
2701 atomic_inc(&data->lost);
2702 perf_output_unlock(handle);
2703out:
2704 rcu_read_unlock();
2705
2706 return -ENOSPC;
2707}
2708
2709static void perf_output_end(struct perf_output_handle *handle)
2710{
2711 struct perf_counter *counter = handle->counter;
2712 struct perf_mmap_data *data = handle->data;
2713
2714 int wakeup_events = counter->attr.wakeup_events;
2715
2716 if (handle->sample && wakeup_events) {
2717 int events = atomic_inc_return(&data->events);
2718 if (events >= wakeup_events) {
2719 atomic_sub(wakeup_events, &data->events);
2720 atomic_set(&data->wakeup, 1);
2721 }
2722 }
2723
2724 perf_output_unlock(handle);
2725 rcu_read_unlock();
2726}
2727
2728static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2729{
2730 /*
2731 * only top level counters have the pid namespace they were created in
2732 */
2733 if (counter->parent)
2734 counter = counter->parent;
2735
2736 return task_tgid_nr_ns(p, counter->ns);
2737}
2738
2739static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2740{
2741 /*
2742 * only top level counters have the pid namespace they were created in
2743 */
2744 if (counter->parent)
2745 counter = counter->parent;
2746
2747 return task_pid_nr_ns(p, counter->ns);
2748}
2749
2750static void perf_output_read_one(struct perf_output_handle *handle,
2751 struct perf_counter *counter)
2752{
2753 u64 read_format = counter->attr.read_format;
2754 u64 values[4];
2755 int n = 0;
2756
2757 values[n++] = atomic64_read(&counter->count);
2758 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2759 values[n++] = counter->total_time_enabled +
2760 atomic64_read(&counter->child_total_time_enabled);
2761 }
2762 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2763 values[n++] = counter->total_time_running +
2764 atomic64_read(&counter->child_total_time_running);
2765 }
2766 if (read_format & PERF_FORMAT_ID)
2767 values[n++] = primary_counter_id(counter);
2768
2769 perf_output_copy(handle, values, n * sizeof(u64));
2770}
2771
2772/*
2773 * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult.
2774 */
2775static void perf_output_read_group(struct perf_output_handle *handle,
2776 struct perf_counter *counter)
2777{
2778 struct perf_counter *leader = counter->group_leader, *sub;
2779 u64 read_format = counter->attr.read_format;
2780 u64 values[5];
2781 int n = 0;
2782
2783 values[n++] = 1 + leader->nr_siblings;
2784
2785 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2786 values[n++] = leader->total_time_enabled;
2787
2788 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2789 values[n++] = leader->total_time_running;
2790
2791 if (leader != counter)
2792 leader->pmu->read(leader);
2793
2794 values[n++] = atomic64_read(&leader->count);
2795 if (read_format & PERF_FORMAT_ID)
2796 values[n++] = primary_counter_id(leader);
2797
2798 perf_output_copy(handle, values, n * sizeof(u64));
2799
2800 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2801 n = 0;
2802
2803 if (sub != counter)
2804 sub->pmu->read(sub);
2805
2806 values[n++] = atomic64_read(&sub->count);
2807 if (read_format & PERF_FORMAT_ID)
2808 values[n++] = primary_counter_id(sub);
2809
2810 perf_output_copy(handle, values, n * sizeof(u64));
2811 }
2812}
2813
2814static void perf_output_read(struct perf_output_handle *handle,
2815 struct perf_counter *counter)
2816{
2817 if (counter->attr.read_format & PERF_FORMAT_GROUP)
2818 perf_output_read_group(handle, counter);
2819 else
2820 perf_output_read_one(handle, counter);
2821}
2822
2823void perf_counter_output(struct perf_counter *counter, int nmi,
2824 struct perf_sample_data *data)
2825{
2826 int ret;
2827 u64 sample_type = counter->attr.sample_type;
2828 struct perf_output_handle handle;
2829 struct perf_event_header header;
2830 u64 ip;
2831 struct {
2832 u32 pid, tid;
2833 } tid_entry;
2834 struct perf_callchain_entry *callchain = NULL;
2835 int callchain_size = 0;
2836 u64 time;
2837 struct {
2838 u32 cpu, reserved;
2839 } cpu_entry;
2840
2841 header.type = PERF_EVENT_SAMPLE;
2842 header.size = sizeof(header);
2843
2844 header.misc = 0;
2845 header.misc |= perf_misc_flags(data->regs);
2846
2847 if (sample_type & PERF_SAMPLE_IP) {
2848 ip = perf_instruction_pointer(data->regs);
2849 header.size += sizeof(ip);
2850 }
2851
2852 if (sample_type & PERF_SAMPLE_TID) {
2853 /* namespace issues */
2854 tid_entry.pid = perf_counter_pid(counter, current);
2855 tid_entry.tid = perf_counter_tid(counter, current);
2856
2857 header.size += sizeof(tid_entry);
2858 }
2859
2860 if (sample_type & PERF_SAMPLE_TIME) {
2861 /*
2862 * Maybe do better on x86 and provide cpu_clock_nmi()
2863 */
2864 time = sched_clock();
2865
2866 header.size += sizeof(u64);
2867 }
2868
2869 if (sample_type & PERF_SAMPLE_ADDR)
2870 header.size += sizeof(u64);
2871
2872 if (sample_type & PERF_SAMPLE_ID)
2873 header.size += sizeof(u64);
2874
2875 if (sample_type & PERF_SAMPLE_STREAM_ID)
2876 header.size += sizeof(u64);
2877
2878 if (sample_type & PERF_SAMPLE_CPU) {
2879 header.size += sizeof(cpu_entry);
2880
2881 cpu_entry.cpu = raw_smp_processor_id();
2882 cpu_entry.reserved = 0;
2883 }
2884
2885 if (sample_type & PERF_SAMPLE_PERIOD)
2886 header.size += sizeof(u64);
2887
2888 if (sample_type & PERF_SAMPLE_READ)
2889 header.size += perf_counter_read_size(counter);
2890
2891 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2892 callchain = perf_callchain(data->regs);
2893
2894 if (callchain) {
2895 callchain_size = (1 + callchain->nr) * sizeof(u64);
2896 header.size += callchain_size;
2897 } else
2898 header.size += sizeof(u64);
2899 }
2900
2901 if (sample_type & PERF_SAMPLE_RAW) {
2902 int size = sizeof(u32);
2903
2904 if (data->raw)
2905 size += data->raw->size;
2906 else
2907 size += sizeof(u32);
2908
2909 WARN_ON_ONCE(size & (sizeof(u64)-1));
2910 header.size += size;
2911 }
2912
2913 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2914 if (ret)
2915 return;
2916
2917 perf_output_put(&handle, header);
2918
2919 if (sample_type & PERF_SAMPLE_IP)
2920 perf_output_put(&handle, ip);
2921
2922 if (sample_type & PERF_SAMPLE_TID)
2923 perf_output_put(&handle, tid_entry);
2924
2925 if (sample_type & PERF_SAMPLE_TIME)
2926 perf_output_put(&handle, time);
2927
2928 if (sample_type & PERF_SAMPLE_ADDR)
2929 perf_output_put(&handle, data->addr);
2930
2931 if (sample_type & PERF_SAMPLE_ID) {
2932 u64 id = primary_counter_id(counter);
2933
2934 perf_output_put(&handle, id);
2935 }
2936
2937 if (sample_type & PERF_SAMPLE_STREAM_ID)
2938 perf_output_put(&handle, counter->id);
2939
2940 if (sample_type & PERF_SAMPLE_CPU)
2941 perf_output_put(&handle, cpu_entry);
2942
2943 if (sample_type & PERF_SAMPLE_PERIOD)
2944 perf_output_put(&handle, data->period);
2945
2946 if (sample_type & PERF_SAMPLE_READ)
2947 perf_output_read(&handle, counter);
2948
2949 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2950 if (callchain)
2951 perf_output_copy(&handle, callchain, callchain_size);
2952 else {
2953 u64 nr = 0;
2954 perf_output_put(&handle, nr);
2955 }
2956 }
2957
2958 if (sample_type & PERF_SAMPLE_RAW) {
2959 if (data->raw) {
2960 perf_output_put(&handle, data->raw->size);
2961 perf_output_copy(&handle, data->raw->data, data->raw->size);
2962 } else {
2963 struct {
2964 u32 size;
2965 u32 data;
2966 } raw = {
2967 .size = sizeof(u32),
2968 .data = 0,
2969 };
2970 perf_output_put(&handle, raw);
2971 }
2972 }
2973
2974 perf_output_end(&handle);
2975}
2976
2977/*
2978 * read event
2979 */
2980
2981struct perf_read_event {
2982 struct perf_event_header header;
2983
2984 u32 pid;
2985 u32 tid;
2986};
2987
2988static void
2989perf_counter_read_event(struct perf_counter *counter,
2990 struct task_struct *task)
2991{
2992 struct perf_output_handle handle;
2993 struct perf_read_event event = {
2994 .header = {
2995 .type = PERF_EVENT_READ,
2996 .misc = 0,
2997 .size = sizeof(event) + perf_counter_read_size(counter),
2998 },
2999 .pid = perf_counter_pid(counter, task),
3000 .tid = perf_counter_tid(counter, task),
3001 };
3002 int ret;
3003
3004 ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
3005 if (ret)
3006 return;
3007
3008 perf_output_put(&handle, event);
3009 perf_output_read(&handle, counter);
3010
3011 perf_output_end(&handle);
3012}
3013
3014/*
3015 * task tracking -- fork/exit
3016 *
3017 * enabled by: attr.comm | attr.mmap | attr.task
3018 */
3019
3020struct perf_task_event {
3021 struct task_struct *task;
3022 struct perf_counter_context *task_ctx;
3023
3024 struct {
3025 struct perf_event_header header;
3026
3027 u32 pid;
3028 u32 ppid;
3029 u32 tid;
3030 u32 ptid;
3031 } event;
3032};
3033
3034static void perf_counter_task_output(struct perf_counter *counter,
3035 struct perf_task_event *task_event)
3036{
3037 struct perf_output_handle handle;
3038 int size = task_event->event.header.size;
3039 struct task_struct *task = task_event->task;
3040 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3041
3042 if (ret)
3043 return;
3044
3045 task_event->event.pid = perf_counter_pid(counter, task);
3046 task_event->event.ppid = perf_counter_pid(counter, current);
3047
3048 task_event->event.tid = perf_counter_tid(counter, task);
3049 task_event->event.ptid = perf_counter_tid(counter, current);
3050
3051 perf_output_put(&handle, task_event->event);
3052 perf_output_end(&handle);
3053}
3054
3055static int perf_counter_task_match(struct perf_counter *counter)
3056{
3057 if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
3058 return 1;
3059
3060 return 0;
3061}
3062
3063static void perf_counter_task_ctx(struct perf_counter_context *ctx,
3064 struct perf_task_event *task_event)
3065{
3066 struct perf_counter *counter;
3067
3068 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3069 return;
3070
3071 rcu_read_lock();
3072 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3073 if (perf_counter_task_match(counter))
3074 perf_counter_task_output(counter, task_event);
3075 }
3076 rcu_read_unlock();
3077}
3078
3079static void perf_counter_task_event(struct perf_task_event *task_event)
3080{
3081 struct perf_cpu_context *cpuctx;
3082 struct perf_counter_context *ctx = task_event->task_ctx;
3083
3084 cpuctx = &get_cpu_var(perf_cpu_context);
3085 perf_counter_task_ctx(&cpuctx->ctx, task_event);
3086 put_cpu_var(perf_cpu_context);
3087
3088 rcu_read_lock();
3089 if (!ctx)
3090 ctx = rcu_dereference(task_event->task->perf_counter_ctxp);
3091 if (ctx)
3092 perf_counter_task_ctx(ctx, task_event);
3093 rcu_read_unlock();
3094}
3095
3096static void perf_counter_task(struct task_struct *task,
3097 struct perf_counter_context *task_ctx,
3098 int new)
3099{
3100 struct perf_task_event task_event;
3101
3102 if (!atomic_read(&nr_comm_counters) &&
3103 !atomic_read(&nr_mmap_counters) &&
3104 !atomic_read(&nr_task_counters))
3105 return;
3106
3107 task_event = (struct perf_task_event){
3108 .task = task,
3109 .task_ctx = task_ctx,
3110 .event = {
3111 .header = {
3112 .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
3113 .misc = 0,
3114 .size = sizeof(task_event.event),
3115 },
3116 /* .pid */
3117 /* .ppid */
3118 /* .tid */
3119 /* .ptid */
3120 },
3121 };
3122
3123 perf_counter_task_event(&task_event);
3124}
3125
3126void perf_counter_fork(struct task_struct *task)
3127{
3128 perf_counter_task(task, NULL, 1);
3129}
3130
3131/*
3132 * comm tracking
3133 */
3134
3135struct perf_comm_event {
3136 struct task_struct *task;
3137 char *comm;
3138 int comm_size;
3139
3140 struct {
3141 struct perf_event_header header;
3142
3143 u32 pid;
3144 u32 tid;
3145 } event;
3146};
3147
3148static void perf_counter_comm_output(struct perf_counter *counter,
3149 struct perf_comm_event *comm_event)
3150{
3151 struct perf_output_handle handle;
3152 int size = comm_event->event.header.size;
3153 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3154
3155 if (ret)
3156 return;
3157
3158 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
3159 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
3160
3161 perf_output_put(&handle, comm_event->event);
3162 perf_output_copy(&handle, comm_event->comm,
3163 comm_event->comm_size);
3164 perf_output_end(&handle);
3165}
3166
3167static int perf_counter_comm_match(struct perf_counter *counter)
3168{
3169 if (counter->attr.comm)
3170 return 1;
3171
3172 return 0;
3173}
3174
3175static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
3176 struct perf_comm_event *comm_event)
3177{
3178 struct perf_counter *counter;
3179
3180 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3181 return;
3182
3183 rcu_read_lock();
3184 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3185 if (perf_counter_comm_match(counter))
3186 perf_counter_comm_output(counter, comm_event);
3187 }
3188 rcu_read_unlock();
3189}
3190
3191static void perf_counter_comm_event(struct perf_comm_event *comm_event)
3192{
3193 struct perf_cpu_context *cpuctx;
3194 struct perf_counter_context *ctx;
3195 unsigned int size;
3196 char comm[TASK_COMM_LEN];
3197
3198 memset(comm, 0, sizeof(comm));
3199 strncpy(comm, comm_event->task->comm, sizeof(comm));
3200 size = ALIGN(strlen(comm)+1, sizeof(u64));
3201
3202 comm_event->comm = comm;
3203 comm_event->comm_size = size;
3204
3205 comm_event->event.header.size = sizeof(comm_event->event) + size;
3206
3207 cpuctx = &get_cpu_var(perf_cpu_context);
3208 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
3209 put_cpu_var(perf_cpu_context);
3210
3211 rcu_read_lock();
3212 /*
3213 * doesn't really matter which of the child contexts the
3214 * events ends up in.
3215 */
3216 ctx = rcu_dereference(current->perf_counter_ctxp);
3217 if (ctx)
3218 perf_counter_comm_ctx(ctx, comm_event);
3219 rcu_read_unlock();
3220}
3221
3222void perf_counter_comm(struct task_struct *task)
3223{
3224 struct perf_comm_event comm_event;
3225
3226 if (task->perf_counter_ctxp)
3227 perf_counter_enable_on_exec(task);
3228
3229 if (!atomic_read(&nr_comm_counters))
3230 return;
3231
3232 comm_event = (struct perf_comm_event){
3233 .task = task,
3234 /* .comm */
3235 /* .comm_size */
3236 .event = {
3237 .header = {
3238 .type = PERF_EVENT_COMM,
3239 .misc = 0,
3240 /* .size */
3241 },
3242 /* .pid */
3243 /* .tid */
3244 },
3245 };
3246
3247 perf_counter_comm_event(&comm_event);
3248}
3249
3250/*
3251 * mmap tracking
3252 */
3253
3254struct perf_mmap_event {
3255 struct vm_area_struct *vma;
3256
3257 const char *file_name;
3258 int file_size;
3259
3260 struct {
3261 struct perf_event_header header;
3262
3263 u32 pid;
3264 u32 tid;
3265 u64 start;
3266 u64 len;
3267 u64 pgoff;
3268 } event;
3269};
3270
3271static void perf_counter_mmap_output(struct perf_counter *counter,
3272 struct perf_mmap_event *mmap_event)
3273{
3274 struct perf_output_handle handle;
3275 int size = mmap_event->event.header.size;
3276 int ret = perf_output_begin(&handle, counter, size, 0, 0);
3277
3278 if (ret)
3279 return;
3280
3281 mmap_event->event.pid = perf_counter_pid(counter, current);
3282 mmap_event->event.tid = perf_counter_tid(counter, current);
3283
3284 perf_output_put(&handle, mmap_event->event);
3285 perf_output_copy(&handle, mmap_event->file_name,
3286 mmap_event->file_size);
3287 perf_output_end(&handle);
3288}
3289
3290static int perf_counter_mmap_match(struct perf_counter *counter,
3291 struct perf_mmap_event *mmap_event)
3292{
3293 if (counter->attr.mmap)
3294 return 1;
3295
3296 return 0;
3297}
3298
3299static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
3300 struct perf_mmap_event *mmap_event)
3301{
3302 struct perf_counter *counter;
3303
3304 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3305 return;
3306
3307 rcu_read_lock();
3308 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3309 if (perf_counter_mmap_match(counter, mmap_event))
3310 perf_counter_mmap_output(counter, mmap_event);
3311 }
3312 rcu_read_unlock();
3313}
3314
3315static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
3316{
3317 struct perf_cpu_context *cpuctx;
3318 struct perf_counter_context *ctx;
3319 struct vm_area_struct *vma = mmap_event->vma;
3320 struct file *file = vma->vm_file;
3321 unsigned int size;
3322 char tmp[16];
3323 char *buf = NULL;
3324 const char *name;
3325
3326 memset(tmp, 0, sizeof(tmp));
3327
3328 if (file) {
3329 /*
3330 * d_path works from the end of the buffer backwards, so we
3331 * need to add enough zero bytes after the string to handle
3332 * the 64bit alignment we do later.
3333 */
3334 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3335 if (!buf) {
3336 name = strncpy(tmp, "//enomem", sizeof(tmp));
3337 goto got_name;
3338 }
3339 name = d_path(&file->f_path, buf, PATH_MAX);
3340 if (IS_ERR(name)) {
3341 name = strncpy(tmp, "//toolong", sizeof(tmp));
3342 goto got_name;
3343 }
3344 } else {
3345 if (arch_vma_name(mmap_event->vma)) {
3346 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3347 sizeof(tmp));
3348 goto got_name;
3349 }
3350
3351 if (!vma->vm_mm) {
3352 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3353 goto got_name;
3354 }
3355
3356 name = strncpy(tmp, "//anon", sizeof(tmp));
3357 goto got_name;
3358 }
3359
3360got_name:
3361 size = ALIGN(strlen(name)+1, sizeof(u64));
3362
3363 mmap_event->file_name = name;
3364 mmap_event->file_size = size;
3365
3366 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
3367
3368 cpuctx = &get_cpu_var(perf_cpu_context);
3369 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
3370 put_cpu_var(perf_cpu_context);
3371
3372 rcu_read_lock();
3373 /*
3374 * doesn't really matter which of the child contexts the
3375 * events ends up in.
3376 */
3377 ctx = rcu_dereference(current->perf_counter_ctxp);
3378 if (ctx)
3379 perf_counter_mmap_ctx(ctx, mmap_event);
3380 rcu_read_unlock();
3381
3382 kfree(buf);
3383}
3384
3385void __perf_counter_mmap(struct vm_area_struct *vma)
3386{
3387 struct perf_mmap_event mmap_event;
3388
3389 if (!atomic_read(&nr_mmap_counters))
3390 return;
3391
3392 mmap_event = (struct perf_mmap_event){
3393 .vma = vma,
3394 /* .file_name */
3395 /* .file_size */
3396 .event = {
3397 .header = {
3398 .type = PERF_EVENT_MMAP,
3399 .misc = 0,
3400 /* .size */
3401 },
3402 /* .pid */
3403 /* .tid */
3404 .start = vma->vm_start,
3405 .len = vma->vm_end - vma->vm_start,
3406 .pgoff = vma->vm_pgoff,
3407 },
3408 };
3409
3410 perf_counter_mmap_event(&mmap_event);
3411}
3412
3413/*
3414 * IRQ throttle logging
3415 */
3416
3417static void perf_log_throttle(struct perf_counter *counter, int enable)
3418{
3419 struct perf_output_handle handle;
3420 int ret;
3421
3422 struct {
3423 struct perf_event_header header;
3424 u64 time;
3425 u64 id;
3426 u64 stream_id;
3427 } throttle_event = {
3428 .header = {
3429 .type = PERF_EVENT_THROTTLE,
3430 .misc = 0,
3431 .size = sizeof(throttle_event),
3432 },
3433 .time = sched_clock(),
3434 .id = primary_counter_id(counter),
3435 .stream_id = counter->id,
3436 };
3437
3438 if (enable)
3439 throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
3440
3441 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3442 if (ret)
3443 return;
3444
3445 perf_output_put(&handle, throttle_event);
3446 perf_output_end(&handle);
3447}
3448
3449/*
3450 * Generic counter overflow handling, sampling.
3451 */
3452
3453int perf_counter_overflow(struct perf_counter *counter, int nmi,
3454 struct perf_sample_data *data)
3455{
3456 int events = atomic_read(&counter->event_limit);
3457 int throttle = counter->pmu->unthrottle != NULL;
3458 struct hw_perf_counter *hwc = &counter->hw;
3459 int ret = 0;
3460
3461 if (!throttle) {
3462 hwc->interrupts++;
3463 } else {
3464 if (hwc->interrupts != MAX_INTERRUPTS) {
3465 hwc->interrupts++;
3466 if (HZ * hwc->interrupts >
3467 (u64)sysctl_perf_counter_sample_rate) {
3468 hwc->interrupts = MAX_INTERRUPTS;
3469 perf_log_throttle(counter, 0);
3470 ret = 1;
3471 }
3472 } else {
3473 /*
3474 * Keep re-disabling counters even though on the previous
3475 * pass we disabled it - just in case we raced with a
3476 * sched-in and the counter got enabled again:
3477 */
3478 ret = 1;
3479 }
3480 }
3481
3482 if (counter->attr.freq) {
3483 u64 now = sched_clock();
3484 s64 delta = now - hwc->freq_stamp;
3485
3486 hwc->freq_stamp = now;
3487
3488 if (delta > 0 && delta < TICK_NSEC)
3489 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3490 }
3491
3492 /*
3493 * XXX event_limit might not quite work as expected on inherited
3494 * counters
3495 */
3496
3497 counter->pending_kill = POLL_IN;
3498 if (events && atomic_dec_and_test(&counter->event_limit)) {
3499 ret = 1;
3500 counter->pending_kill = POLL_HUP;
3501 if (nmi) {
3502 counter->pending_disable = 1;
3503 perf_pending_queue(&counter->pending,
3504 perf_pending_counter);
3505 } else
3506 perf_counter_disable(counter);
3507 }
3508
3509 perf_counter_output(counter, nmi, data);
3510 return ret;
3511}
3512
3513/*
3514 * Generic software counter infrastructure
3515 */
3516
3517/*
3518 * We directly increment counter->count and keep a second value in
3519 * counter->hw.period_left to count intervals. This period counter
3520 * is kept in the range [-sample_period, 0] so that we can use the
3521 * sign as trigger.
3522 */
3523
3524static u64 perf_swcounter_set_period(struct perf_counter *counter)
3525{
3526 struct hw_perf_counter *hwc = &counter->hw;
3527 u64 period = hwc->last_period;
3528 u64 nr, offset;
3529 s64 old, val;
3530
3531 hwc->last_period = hwc->sample_period;
3532
3533again:
3534 old = val = atomic64_read(&hwc->period_left);
3535 if (val < 0)
3536 return 0;
3537
3538 nr = div64_u64(period + val, period);
3539 offset = nr * period;
3540 val -= offset;
3541 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3542 goto again;
3543
3544 return nr;
3545}
3546
3547static void perf_swcounter_overflow(struct perf_counter *counter,
3548 int nmi, struct perf_sample_data *data)
3549{
3550 struct hw_perf_counter *hwc = &counter->hw;
3551 u64 overflow;
3552
3553 data->period = counter->hw.last_period;
3554 overflow = perf_swcounter_set_period(counter);
3555
3556 if (hwc->interrupts == MAX_INTERRUPTS)
3557 return;
3558
3559 for (; overflow; overflow--) {
3560 if (perf_counter_overflow(counter, nmi, data)) {
3561 /*
3562 * We inhibit the overflow from happening when
3563 * hwc->interrupts == MAX_INTERRUPTS.
3564 */
3565 break;
3566 }
3567 }
3568}
3569
/*
 * pmu ->unthrottle() callback for generic software counters.
 * Intentionally empty: hwc->interrupts has already been reset.
 */
static void perf_swcounter_unthrottle(struct perf_counter *counter)
{
}
3576
3577static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3578 int nmi, struct perf_sample_data *data)
3579{
3580 struct hw_perf_counter *hwc = &counter->hw;
3581
3582 atomic64_add(nr, &counter->count);
3583
3584 if (!hwc->sample_period)
3585 return;
3586
3587 if (!data->regs)
3588 return;
3589
3590 if (!atomic64_add_negative(nr, &hwc->period_left))
3591 perf_swcounter_overflow(counter, nmi, data);
3592}
3593
3594static int perf_swcounter_is_counting(struct perf_counter *counter)
3595{
3596 /*
3597 * The counter is active, we're good!
3598 */
3599 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3600 return 1;
3601
3602 /*
3603 * The counter is off/error, not counting.
3604 */
3605 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3606 return 0;
3607
3608 /*
3609 * The counter is inactive, if the context is active
3610 * we're part of a group that didn't make it on the 'pmu',
3611 * not counting.
3612 */
3613 if (counter->ctx->is_active)
3614 return 0;
3615
3616 /*
3617 * We're inactive and the context is too, this means the
3618 * task is scheduled out, we're counting events that happen
3619 * to us, like migration events.
3620 */
3621 return 1;
3622}
3623
3624static int perf_swcounter_match(struct perf_counter *counter,
3625 enum perf_type_id type,
3626 u32 event, struct pt_regs *regs)
3627{
3628 if (!perf_swcounter_is_counting(counter))
3629 return 0;
3630
3631 if (counter->attr.type != type)
3632 return 0;
3633 if (counter->attr.config != event)
3634 return 0;
3635
3636 if (regs) {
3637 if (counter->attr.exclude_user && user_mode(regs))
3638 return 0;
3639
3640 if (counter->attr.exclude_kernel && !user_mode(regs))
3641 return 0;
3642 }
3643
3644 return 1;
3645}
3646
3647static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3648 enum perf_type_id type,
3649 u32 event, u64 nr, int nmi,
3650 struct perf_sample_data *data)
3651{
3652 struct perf_counter *counter;
3653
3654 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3655 return;
3656
3657 rcu_read_lock();
3658 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3659 if (perf_swcounter_match(counter, type, event, data->regs))
3660 perf_swcounter_add(counter, nr, nmi, data);
3661 }
3662 rcu_read_unlock();
3663}
3664
3665static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3666{
3667 if (in_nmi())
3668 return &cpuctx->recursion[3];
3669
3670 if (in_irq())
3671 return &cpuctx->recursion[2];
3672
3673 if (in_softirq())
3674 return &cpuctx->recursion[1];
3675
3676 return &cpuctx->recursion[0];
3677}
3678
3679static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3680 u64 nr, int nmi,
3681 struct perf_sample_data *data)
3682{
3683 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3684 int *recursion = perf_swcounter_recursion_context(cpuctx);
3685 struct perf_counter_context *ctx;
3686
3687 if (*recursion)
3688 goto out;
3689
3690 (*recursion)++;
3691 barrier();
3692
3693 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3694 nr, nmi, data);
3695 rcu_read_lock();
3696 /*
3697 * doesn't really matter which of the child contexts the
3698 * events ends up in.
3699 */
3700 ctx = rcu_dereference(current->perf_counter_ctxp);
3701 if (ctx)
3702 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3703 rcu_read_unlock();
3704
3705 barrier();
3706 (*recursion)--;
3707
3708out:
3709 put_cpu_var(perf_cpu_context);
3710}
3711
3712void __perf_swcounter_event(u32 event, u64 nr, int nmi,
3713 struct pt_regs *regs, u64 addr)
3714{
3715 struct perf_sample_data data = {
3716 .regs = regs,
3717 .addr = addr,
3718 };
3719
3720 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3721}
3722
/* pmu ->read() callback for generic software counters: nothing to do. */
static void perf_swcounter_read(struct perf_counter *counter)
{
}
3726
3727static int perf_swcounter_enable(struct perf_counter *counter)
3728{
3729 struct hw_perf_counter *hwc = &counter->hw;
3730
3731 if (hwc->sample_period) {
3732 hwc->last_period = hwc->sample_period;
3733 perf_swcounter_set_period(counter);
3734 }
3735 return 0;
3736}
3737
/* pmu ->disable() callback for generic software counters: nothing to do. */
static void perf_swcounter_disable(struct perf_counter *counter)
{
}
3741
3742static const struct pmu perf_ops_generic = {
3743 .enable = perf_swcounter_enable,
3744 .disable = perf_swcounter_disable,
3745 .read = perf_swcounter_read,
3746 .unthrottle = perf_swcounter_unthrottle,
3747};
3748
3749/*
3750 * hrtimer based swcounter callback
3751 */
3752
3753static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3754{
3755 enum hrtimer_restart ret = HRTIMER_RESTART;
3756 struct perf_sample_data data;
3757 struct perf_counter *counter;
3758 u64 period;
3759
3760 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3761 counter->pmu->read(counter);
3762
3763 data.addr = 0;
3764 data.regs = get_irq_regs();
3765 /*
3766 * In case we exclude kernel IPs or are somehow not in interrupt
3767 * context, provide the next best thing, the user IP.
3768 */
3769 if ((counter->attr.exclude_kernel || !data.regs) &&
3770 !counter->attr.exclude_user)
3771 data.regs = task_pt_regs(current);
3772
3773 if (data.regs) {
3774 if (perf_counter_overflow(counter, 0, &data))
3775 ret = HRTIMER_NORESTART;
3776 }
3777
3778 period = max_t(u64, 10000, counter->hw.sample_period);
3779 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3780
3781 return ret;
3782}
3783
3784/*
3785 * Software counter: cpu wall time clock
3786 */
3787
3788static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3789{
3790 int cpu = raw_smp_processor_id();
3791 s64 prev;
3792 u64 now;
3793
3794 now = cpu_clock(cpu);
3795 prev = atomic64_read(&counter->hw.prev_count);
3796 atomic64_set(&counter->hw.prev_count, now);
3797 atomic64_add(now - prev, &counter->count);
3798}
3799
3800static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3801{
3802 struct hw_perf_counter *hwc = &counter->hw;
3803 int cpu = raw_smp_processor_id();
3804
3805 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3806 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3807 hwc->hrtimer.function = perf_swcounter_hrtimer;
3808 if (hwc->sample_period) {
3809 u64 period = max_t(u64, 10000, hwc->sample_period);
3810 __hrtimer_start_range_ns(&hwc->hrtimer,
3811 ns_to_ktime(period), 0,
3812 HRTIMER_MODE_REL, 0);
3813 }
3814
3815 return 0;
3816}
3817
3818static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3819{
3820 if (counter->hw.sample_period)
3821 hrtimer_cancel(&counter->hw.hrtimer);
3822 cpu_clock_perf_counter_update(counter);
3823}
3824
/*
 * pmu->read() hook of the cpu-clock counter: bring counter->count up to
 * date by accounting the time elapsed since the last update.
 */
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
	cpu_clock_perf_counter_update(counter);
}
3829
3830static const struct pmu perf_ops_cpu_clock = {
3831 .enable = cpu_clock_perf_counter_enable,
3832 .disable = cpu_clock_perf_counter_disable,
3833 .read = cpu_clock_perf_counter_read,
3834};
3835
3836/*
3837 * Software counter: task time clock
3838 */
3839
3840static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3841{
3842 u64 prev;
3843 s64 delta;
3844
3845 prev = atomic64_xchg(&counter->hw.prev_count, now);
3846 delta = now - prev;
3847 atomic64_add(delta, &counter->count);
3848}
3849
3850static int task_clock_perf_counter_enable(struct perf_counter *counter)
3851{
3852 struct hw_perf_counter *hwc = &counter->hw;
3853 u64 now;
3854
3855 now = counter->ctx->time;
3856
3857 atomic64_set(&hwc->prev_count, now);
3858 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3859 hwc->hrtimer.function = perf_swcounter_hrtimer;
3860 if (hwc->sample_period) {
3861 u64 period = max_t(u64, 10000, hwc->sample_period);
3862 __hrtimer_start_range_ns(&hwc->hrtimer,
3863 ns_to_ktime(period), 0,
3864 HRTIMER_MODE_REL, 0);
3865 }
3866
3867 return 0;
3868}
3869
3870static void task_clock_perf_counter_disable(struct perf_counter *counter)
3871{
3872 if (counter->hw.sample_period)
3873 hrtimer_cancel(&counter->hw.hrtimer);
3874 task_clock_perf_counter_update(counter, counter->ctx->time);
3875
3876}
3877
3878static void task_clock_perf_counter_read(struct perf_counter *counter)
3879{
3880 u64 time;
3881
3882 if (!in_nmi()) {
3883 update_context_time(counter->ctx);
3884 time = counter->ctx->time;
3885 } else {
3886 u64 now = perf_clock();
3887 u64 delta = now - counter->ctx->timestamp;
3888 time = counter->ctx->time + delta;
3889 }
3890
3891 task_clock_perf_counter_update(counter, time);
3892}
3893
3894static const struct pmu perf_ops_task_clock = {
3895 .enable = task_clock_perf_counter_enable,
3896 .disable = task_clock_perf_counter_disable,
3897 .read = task_clock_perf_counter_read,
3898};
3899
3900#ifdef CONFIG_EVENT_PROFILE
3901void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
3902 int entry_size)
3903{
3904 struct perf_raw_record raw = {
3905 .size = entry_size,
3906 .data = record,
3907 };
3908
3909 struct perf_sample_data data = {
3910 .regs = get_irq_regs(),
3911 .addr = addr,
3912 .raw = &raw,
3913 };
3914
3915 if (!data.regs)
3916 data.regs = task_pt_regs(current);
3917
3918 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
3919}
3920EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3921
3922extern int ftrace_profile_enable(int);
3923extern void ftrace_profile_disable(int);
3924
3925static void tp_perf_counter_destroy(struct perf_counter *counter)
3926{
3927 ftrace_profile_disable(counter->attr.config);
3928}
3929
3930static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3931{
3932 /*
3933 * Raw tracepoint data is a severe data leak, only allow root to
3934 * have these.
3935 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3937 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM);
3939
3940 if (ftrace_profile_enable(counter->attr.config))
3941 return NULL;
3942
3943 counter->destroy = tp_perf_counter_destroy;
3944
3945 return &perf_ops_generic;
3946}
3947#else
3948static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3949{
3950 return NULL;
3951}
3952#endif
3953
/*
 * Usage count per software event id: sw_perf_counter_init() increments the
 * slot of a page-fault, context-switch or cpu-migration event for every
 * counter that has no parent, and sw_perf_counter_destroy() decrements it
 * again.
 */
3954atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX];
3955
3956static void sw_perf_counter_destroy(struct perf_counter *counter)
3957{
3958 u64 event = counter->attr.config;
3959
3960 WARN_ON(counter->parent);
3961
3962 atomic_dec(&perf_swcounter_enabled[event]);
3963}
3964
3965static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3966{
3967 const struct pmu *pmu = NULL;
3968 u64 event = counter->attr.config;
3969
3970 /*
3971 * Software counters (currently) can't in general distinguish
3972 * between user, kernel and hypervisor events.
3973 * However, context switches and cpu migrations are considered
3974 * to be kernel events, and page faults are never hypervisor
3975 * events.
3976 */
3977 switch (event) {
3978 case PERF_COUNT_SW_CPU_CLOCK:
3979 pmu = &perf_ops_cpu_clock;
3980
3981 break;
3982 case PERF_COUNT_SW_TASK_CLOCK:
3983 /*
3984 * If the user instantiates this as a per-cpu counter,
3985 * use the cpu_clock counter instead.
3986 */
3987 if (counter->ctx->task)
3988 pmu = &perf_ops_task_clock;
3989 else
3990 pmu = &perf_ops_cpu_clock;
3991
3992 break;
3993 case PERF_COUNT_SW_PAGE_FAULTS:
3994 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3995 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3996 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3997 case PERF_COUNT_SW_CPU_MIGRATIONS:
3998 if (!counter->parent) {
3999 atomic_inc(&perf_swcounter_enabled[event]);
4000 counter->destroy = sw_perf_counter_destroy;
4001 }
4002 pmu = &perf_ops_generic;
4003 break;
4004 }
4005
4006 return pmu;
4007}
4008
4009/*
4010 * Allocate and initialize a counter structure
4011 */
4012static struct perf_counter *
4013perf_counter_alloc(struct perf_counter_attr *attr,
4014 int cpu,
4015 struct perf_counter_context *ctx,
4016 struct perf_counter *group_leader,
4017 struct perf_counter *parent_counter,
4018 gfp_t gfpflags)
4019{
4020 const struct pmu *pmu;
4021 struct perf_counter *counter;
4022 struct hw_perf_counter *hwc;
4023 long err;
4024
4025 counter = kzalloc(sizeof(*counter), gfpflags);
4026 if (!counter)
4027 return ERR_PTR(-ENOMEM);
4028
4029 /*
4030 * Single counters are their own group leaders, with an
4031 * empty sibling list:
4032 */
4033 if (!group_leader)
4034 group_leader = counter;
4035
4036 mutex_init(&counter->child_mutex);
4037 INIT_LIST_HEAD(&counter->child_list);
4038
4039 INIT_LIST_HEAD(&counter->list_entry);
4040 INIT_LIST_HEAD(&counter->event_entry);
4041 INIT_LIST_HEAD(&counter->sibling_list);
4042 init_waitqueue_head(&counter->waitq);
4043
4044 mutex_init(&counter->mmap_mutex);
4045
4046 counter->cpu = cpu;
4047 counter->attr = *attr;
4048 counter->group_leader = group_leader;
4049 counter->pmu = NULL;
4050 counter->ctx = ctx;
4051 counter->oncpu = -1;
4052
4053 counter->parent = parent_counter;
4054
4055 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
4056 counter->id = atomic64_inc_return(&perf_counter_id);
4057
4058 counter->state = PERF_COUNTER_STATE_INACTIVE;
4059
4060 if (attr->disabled)
4061 counter->state = PERF_COUNTER_STATE_OFF;
4062
4063 pmu = NULL;
4064
4065 hwc = &counter->hw;
4066 hwc->sample_period = attr->sample_period;
4067 if (attr->freq && attr->sample_freq)
4068 hwc->sample_period = 1;
4069 hwc->last_period = hwc->sample_period;
4070
4071 atomic64_set(&hwc->period_left, hwc->sample_period);
4072
4073 /*
4074 * we currently do not support PERF_FORMAT_GROUP on inherited counters
4075 */
4076 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4077 goto done;
4078
4079 switch (attr->type) {
4080 case PERF_TYPE_RAW:
4081 case PERF_TYPE_HARDWARE:
4082 case PERF_TYPE_HW_CACHE:
4083 pmu = hw_perf_counter_init(counter);
4084 break;
4085
4086 case PERF_TYPE_SOFTWARE:
4087 pmu = sw_perf_counter_init(counter);
4088 break;
4089
4090 case PERF_TYPE_TRACEPOINT:
4091 pmu = tp_perf_counter_init(counter);
4092 break;
4093
4094 default:
4095 break;
4096 }
4097done:
4098 err = 0;
4099 if (!pmu)
4100 err = -EINVAL;
4101 else if (IS_ERR(pmu))
4102 err = PTR_ERR(pmu);
4103
4104 if (err) {
4105 if (counter->ns)
4106 put_pid_ns(counter->ns);
4107 kfree(counter);
4108 return ERR_PTR(err);
4109 }
4110
4111 counter->pmu = pmu;
4112
4113 if (!counter->parent) {
4114 atomic_inc(&nr_counters);
4115 if (counter->attr.mmap)
4116 atomic_inc(&nr_mmap_counters);
4117 if (counter->attr.comm)
4118 atomic_inc(&nr_comm_counters);
4119 if (counter->attr.task)
4120 atomic_inc(&nr_task_counters);
4121 }
4122
4123 return counter;
4124}
4125
4126static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4127 struct perf_counter_attr *attr)
4128{
4129 int ret;
4130 u32 size;
4131
4132 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4133 return -EFAULT;
4134
4135 /*
4136 * zero the full structure, so that a short copy will be nice.
4137 */
4138 memset(attr, 0, sizeof(*attr));
4139
4140 ret = get_user(size, &uattr->size);
4141 if (ret)
4142 return ret;
4143
4144 if (size > PAGE_SIZE) /* silly large */
4145 goto err_size;
4146
4147 if (!size) /* abi compat */
4148 size = PERF_ATTR_SIZE_VER0;
4149
4150 if (size < PERF_ATTR_SIZE_VER0)
4151 goto err_size;
4152
4153 /*
4154 * If we're handed a bigger struct than we know of,
4155 * ensure all the unknown bits are 0.
4156 */
4157 if (size > sizeof(*attr)) {
4158 unsigned long val;
4159 unsigned long __user *addr;
4160 unsigned long __user *end;
4161
4162 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
4163 sizeof(unsigned long));
4164 end = PTR_ALIGN((void __user *)uattr + size,
4165 sizeof(unsigned long));
4166
4167 for (; addr < end; addr += sizeof(unsigned long)) {
4168 ret = get_user(val, addr);
4169 if (ret)
4170 return ret;
4171 if (val)
4172 goto err_size;
4173 }
4174 }
4175
4176 ret = copy_from_user(attr, uattr, size);
4177 if (ret)
4178 return -EFAULT;
4179
4180 /*
4181 * If the type exists, the corresponding creation will verify
4182 * the attr->config.
4183 */
4184 if (attr->type >= PERF_TYPE_MAX)
4185 return -EINVAL;
4186
4187 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4188 return -EINVAL;
4189
4190 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4191 return -EINVAL;
4192
4193 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4194 return -EINVAL;
4195
4196out:
4197 return ret;
4198
4199err_size:
4200 put_user(sizeof(*attr), &uattr->size);
4201 ret = -E2BIG;
4202 goto out;
4203}
4204
4205/**
4206 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4207 *
4208 * @attr_uptr: event type attributes for monitoring/sampling
4209 * @pid: target pid
4210 * @cpu: target cpu
4211 * @group_fd: group leader counter fd
4212 */
4213SYSCALL_DEFINE5(perf_counter_open,
4214 struct perf_counter_attr __user *, attr_uptr,
4215 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4216{
4217 struct perf_counter *counter, *group_leader;
4218 struct perf_counter_attr attr;
4219 struct perf_counter_context *ctx;
4220 struct file *counter_file = NULL;
4221 struct file *group_file = NULL;
4222 int fput_needed = 0;
4223 int fput_needed2 = 0;
4224 int ret;
4225
4226 /* for future expandability... */
4227 if (flags)
4228 return -EINVAL;
4229
4230 ret = perf_copy_attr(attr_uptr, &attr);
4231 if (ret)
4232 return ret;
4233
4234 if (!attr.exclude_kernel) {
4235 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4236 return -EACCES;
4237 }
4238
4239 if (attr.freq) {
4240 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
4241 return -EINVAL;
4242 }
4243
4244 /*
4245 * Get the target context (task or percpu):
4246 */
4247 ctx = find_get_context(pid, cpu);
4248 if (IS_ERR(ctx))
4249 return PTR_ERR(ctx);
4250
4251 /*
4252 * Look up the group leader (we will attach this counter to it):
4253 */
4254 group_leader = NULL;
4255 if (group_fd != -1) {
4256 ret = -EINVAL;
4257 group_file = fget_light(group_fd, &fput_needed);
4258 if (!group_file)
4259 goto err_put_context;
4260 if (group_file->f_op != &perf_fops)
4261 goto err_put_context;
4262
4263 group_leader = group_file->private_data;
4264 /*
4265 * Do not allow a recursive hierarchy (this new sibling
4266 * becoming part of another group-sibling):
4267 */
4268 if (group_leader->group_leader != group_leader)
4269 goto err_put_context;
4270 /*
4271 * Do not allow to attach to a group in a different
4272 * task or CPU context:
4273 */
4274 if (group_leader->ctx != ctx)
4275 goto err_put_context;
4276 /*
4277 * Only a group leader can be exclusive or pinned
4278 */
4279 if (attr.exclusive || attr.pinned)
4280 goto err_put_context;
4281 }
4282
4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4284 NULL, GFP_KERNEL);
4285 ret = PTR_ERR(counter);
4286 if (IS_ERR(counter))
4287 goto err_put_context;
4288
4289 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4290 if (ret < 0)
4291 goto err_free_put_context;
4292
4293 counter_file = fget_light(ret, &fput_needed2);
4294 if (!counter_file)
4295 goto err_free_put_context;
4296
4297 counter->filp = counter_file;
4298 WARN_ON_ONCE(ctx->parent_ctx);
4299 mutex_lock(&ctx->mutex);
4300 perf_install_in_context(ctx, counter, cpu);
4301 ++ctx->generation;
4302 mutex_unlock(&ctx->mutex);
4303
4304 counter->owner = current;
4305 get_task_struct(current);
4306 mutex_lock(&current->perf_counter_mutex);
4307 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4308 mutex_unlock(&current->perf_counter_mutex);
4309
4310 fput_light(counter_file, fput_needed2);
4311
4312out_fput:
4313 fput_light(group_file, fput_needed);
4314
4315 return ret;
4316
4317err_free_put_context:
4318 kfree(counter);
4319
4320err_put_context:
4321 put_ctx(ctx);
4322
4323 goto out_fput;
4324}
4325
4326/*
4327 * inherit a counter from parent task to child task:
4328 */
4329static struct perf_counter *
4330inherit_counter(struct perf_counter *parent_counter,
4331 struct task_struct *parent,
4332 struct perf_counter_context *parent_ctx,
4333 struct task_struct *child,
4334 struct perf_counter *group_leader,
4335 struct perf_counter_context *child_ctx)
4336{
4337 struct perf_counter *child_counter;
4338
4339 /*
4340 * Instead of creating recursive hierarchies of counters,
4341 * we link inherited counters back to the original parent,
4342 * which has a filp for sure, which we use as the reference
4343 * count:
4344 */
4345 if (parent_counter->parent)
4346 parent_counter = parent_counter->parent;
4347
4348 child_counter = perf_counter_alloc(&parent_counter->attr,
4349 parent_counter->cpu, child_ctx,
4350 group_leader, parent_counter,
4351 GFP_KERNEL);
4352 if (IS_ERR(child_counter))
4353 return child_counter;
4354 get_ctx(child_ctx);
4355
4356 /*
4357 * Make the child state follow the state of the parent counter,
4358 * not its attr.disabled bit. We hold the parent's mutex,
4359 * so we won't race with perf_counter_{en, dis}able_family.
4360 */
4361 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
4362 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
4363 else
4364 child_counter->state = PERF_COUNTER_STATE_OFF;
4365
4366 if (parent_counter->attr.freq)
4367 child_counter->hw.sample_period = parent_counter->hw.sample_period;
4368
4369 /*
4370 * Link it up in the child's context:
4371 */
4372 add_counter_to_ctx(child_counter, child_ctx);
4373
4374 /*
4375 * Get a reference to the parent filp - we will fput it
4376 * when the child counter exits. This is safe to do because
4377 * we are in the parent and we know that the filp still
4378 * exists and has a nonzero count:
4379 */
4380 atomic_long_inc(&parent_counter->filp->f_count);
4381
4382 /*
4383 * Link this into the parent counter's child list
4384 */
4385 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4386 mutex_lock(&parent_counter->child_mutex);
4387 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
4388 mutex_unlock(&parent_counter->child_mutex);
4389
4390 return child_counter;
4391}
4392
4393static int inherit_group(struct perf_counter *parent_counter,
4394 struct task_struct *parent,
4395 struct perf_counter_context *parent_ctx,
4396 struct task_struct *child,
4397 struct perf_counter_context *child_ctx)
4398{
4399 struct perf_counter *leader;
4400 struct perf_counter *sub;
4401 struct perf_counter *child_ctr;
4402
4403 leader = inherit_counter(parent_counter, parent, parent_ctx,
4404 child, NULL, child_ctx);
4405 if (IS_ERR(leader))
4406 return PTR_ERR(leader);
4407 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
4408 child_ctr = inherit_counter(sub, parent, parent_ctx,
4409 child, leader, child_ctx);
4410 if (IS_ERR(child_ctr))
4411 return PTR_ERR(child_ctr);
4412 }
4413 return 0;
4414}
4415
4416static void sync_child_counter(struct perf_counter *child_counter,
4417 struct task_struct *child)
4418{
4419 struct perf_counter *parent_counter = child_counter->parent;
4420 u64 child_val;
4421
4422 if (child_counter->attr.inherit_stat)
4423 perf_counter_read_event(child_counter, child);
4424
4425 child_val = atomic64_read(&child_counter->count);
4426
4427 /*
4428 * Add back the child's count to the parent's count:
4429 */
4430 atomic64_add(child_val, &parent_counter->count);
4431 atomic64_add(child_counter->total_time_enabled,
4432 &parent_counter->child_total_time_enabled);
4433 atomic64_add(child_counter->total_time_running,
4434 &parent_counter->child_total_time_running);
4435
4436 /*
4437 * Remove this counter from the parent's list
4438 */
4439 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
4440 mutex_lock(&parent_counter->child_mutex);
4441 list_del_init(&child_counter->child_list);
4442 mutex_unlock(&parent_counter->child_mutex);
4443
4444 /*
4445 * Release the parent counter, if this was the last
4446 * reference to it.
4447 */
4448 fput(parent_counter->filp);
4449}
4450
4451static void
4452__perf_counter_exit_task(struct perf_counter *child_counter,
4453 struct perf_counter_context *child_ctx,
4454 struct task_struct *child)
4455{
4456 struct perf_counter *parent_counter;
4457
4458 update_counter_times(child_counter);
4459 perf_counter_remove_from_context(child_counter);
4460
4461 parent_counter = child_counter->parent;
4462 /*
4463 * It can happen that parent exits first, and has counters
4464 * that are still around due to the child reference. These
4465 * counters need to be zapped - but otherwise linger.
4466 */
4467 if (parent_counter) {
4468 sync_child_counter(child_counter, child);
4469 free_counter(child_counter);
4470 }
4471}
4472
4473/*
4474 * When a child task exits, feed back counter values to parent counters.
4475 */
4476void perf_counter_exit_task(struct task_struct *child)
4477{
4478 struct perf_counter *child_counter, *tmp;
4479 struct perf_counter_context *child_ctx;
4480 unsigned long flags;
4481
4482 if (likely(!child->perf_counter_ctxp)) {
4483 perf_counter_task(child, NULL, 0);
4484 return;
4485 }
4486
4487 local_irq_save(flags);
4488 /*
4489 * We can't reschedule here because interrupts are disabled,
4490 * and either child is current or it is a task that can't be
4491 * scheduled, so we are now safe from rescheduling changing
4492 * our context.
4493 */
4494 child_ctx = child->perf_counter_ctxp;
4495 __perf_counter_task_sched_out(child_ctx);
4496
4497 /*
4498 * Take the context lock here so that if find_get_context is
4499 * reading child->perf_counter_ctxp, we wait until it has
4500 * incremented the context's refcount before we do put_ctx below.
4501 */
4502 spin_lock(&child_ctx->lock);
4503 child->perf_counter_ctxp = NULL;
4504 /*
4505 * If this context is a clone; unclone it so it can't get
4506 * swapped to another process while we're removing all
4507 * the counters from it.
4508 */
4509 unclone_ctx(child_ctx);
4510 spin_unlock_irqrestore(&child_ctx->lock, flags);
4511
4512 /*
4513 * Report the task dead after unscheduling the counters so that we
4514 * won't get any samples after PERF_EVENT_EXIT. We can however still
4515 * get a few PERF_EVENT_READ events.
4516 */
4517 perf_counter_task(child, child_ctx, 0);
4518
4519 /*
4520 * We can recurse on the same lock type through:
4521 *
4522 * __perf_counter_exit_task()
4523 * sync_child_counter()
4524 * fput(parent_counter->filp)
4525 * perf_release()
4526 * mutex_lock(&ctx->mutex)
4527 *
4528 * But since its the parent context it won't be the same instance.
4529 */
4530 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4531
4532again:
4533 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4534 list_entry)
4535 __perf_counter_exit_task(child_counter, child_ctx, child);
4536
4537 /*
4538 * If the last counter was a group counter, it will have appended all
4539 * its siblings to the list, but we obtained 'tmp' before that which
4540 * will still point to the list head terminating the iteration.
4541 */
4542 if (!list_empty(&child_ctx->counter_list))
4543 goto again;
4544
4545 mutex_unlock(&child_ctx->mutex);
4546
4547 put_ctx(child_ctx);
4548}
4549
4550/*
4551 * free an unexposed, unused context as created by inheritance by
4552 * init_task below, used by fork() in case of fail.
4553 */
4554void perf_counter_free_task(struct task_struct *task)
4555{
4556 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4557 struct perf_counter *counter, *tmp;
4558
4559 if (!ctx)
4560 return;
4561
4562 mutex_lock(&ctx->mutex);
4563again:
4564 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4565 struct perf_counter *parent = counter->parent;
4566
4567 if (WARN_ON_ONCE(!parent))
4568 continue;
4569
4570 mutex_lock(&parent->child_mutex);
4571 list_del_init(&counter->child_list);
4572 mutex_unlock(&parent->child_mutex);
4573
4574 fput(parent->filp);
4575
4576 list_del_counter(counter, ctx);
4577 free_counter(counter);
4578 }
4579
4580 if (!list_empty(&ctx->counter_list))
4581 goto again;
4582
4583 mutex_unlock(&ctx->mutex);
4584
4585 put_ctx(ctx);
4586}
4587
4588/*
4589 * Initialize the perf_counter context in task_struct
4590 */
4591int perf_counter_init_task(struct task_struct *child)
4592{
4593 struct perf_counter_context *child_ctx, *parent_ctx;
4594 struct perf_counter_context *cloned_ctx;
4595 struct perf_counter *counter;
4596 struct task_struct *parent = current;
4597 int inherited_all = 1;
4598 int ret = 0;
4599
4600 child->perf_counter_ctxp = NULL;
4601
4602 mutex_init(&child->perf_counter_mutex);
4603 INIT_LIST_HEAD(&child->perf_counter_list);
4604
4605 if (likely(!parent->perf_counter_ctxp))
4606 return 0;
4607
4608 /*
4609 * This is executed from the parent task context, so inherit
4610 * counters that have been marked for cloning.
4611 * First allocate and initialize a context for the child.
4612 */
4613
4614 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4615 if (!child_ctx)
4616 return -ENOMEM;
4617
4618 __perf_counter_init_context(child_ctx, child);
4619 child->perf_counter_ctxp = child_ctx;
4620 get_task_struct(child);
4621
4622 /*
4623 * If the parent's context is a clone, pin it so it won't get
4624 * swapped under us.
4625 */
4626 parent_ctx = perf_pin_task_context(parent);
4627
4628 /*
4629 * No need to check if parent_ctx != NULL here; since we saw
4630 * it non-NULL earlier, the only reason for it to become NULL
4631 * is if we exit, and since we're currently in the middle of
4632 * a fork we can't be exiting at the same time.
4633 */
4634
4635 /*
4636 * Lock the parent list. No need to lock the child - not PID
4637 * hashed yet and not running, so nobody can access it.
4638 */
4639 mutex_lock(&parent_ctx->mutex);
4640
4641 /*
4642 * We dont have to disable NMIs - we are only looking at
4643 * the list, not manipulating it:
4644 */
4645 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4646 if (counter != counter->group_leader)
4647 continue;
4648
4649 if (!counter->attr.inherit) {
4650 inherited_all = 0;
4651 continue;
4652 }
4653
4654 ret = inherit_group(counter, parent, parent_ctx,
4655 child, child_ctx);
4656 if (ret) {
4657 inherited_all = 0;
4658 break;
4659 }
4660 }
4661
4662 if (inherited_all) {
4663 /*
4664 * Mark the child context as a clone of the parent
4665 * context, or of whatever the parent is a clone of.
4666 * Note that if the parent is a clone, it could get
4667 * uncloned at any point, but that doesn't matter
4668 * because the list of counters and the generation
4669 * count can't have changed since we took the mutex.
4670 */
4671 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4672 if (cloned_ctx) {
4673 child_ctx->parent_ctx = cloned_ctx;
4674 child_ctx->parent_gen = parent_ctx->parent_gen;
4675 } else {
4676 child_ctx->parent_ctx = parent_ctx;
4677 child_ctx->parent_gen = parent_ctx->generation;
4678 }
4679 get_ctx(child_ctx->parent_ctx);
4680 }
4681
4682 mutex_unlock(&parent_ctx->mutex);
4683
4684 perf_unpin_context(parent_ctx);
4685
4686 return ret;
4687}
4688
4689static void __cpuinit perf_counter_init_cpu(int cpu)
4690{
4691 struct perf_cpu_context *cpuctx;
4692
4693 cpuctx = &per_cpu(perf_cpu_context, cpu);
4694 __perf_counter_init_context(&cpuctx->ctx, NULL);
4695
4696 spin_lock(&perf_resource_lock);
4697 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4698 spin_unlock(&perf_resource_lock);
4699
4700 hw_perf_counter_setup(cpu);
4701}
4702
4703#ifdef CONFIG_HOTPLUG_CPU
4704static void __perf_counter_exit_cpu(void *info)
4705{
4706 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4707 struct perf_counter_context *ctx = &cpuctx->ctx;
4708 struct perf_counter *counter, *tmp;
4709
4710 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4711 __perf_counter_remove_from_context(counter);
4712}
4713static void perf_counter_exit_cpu(int cpu)
4714{
4715 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4716 struct perf_counter_context *ctx = &cpuctx->ctx;
4717
4718 mutex_lock(&ctx->mutex);
4719 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4720 mutex_unlock(&ctx->mutex);
4721}
4722#else
/* Without CPU hotplug a CPU never goes away: nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
4724#endif
4725
4726static int __cpuinit
4727perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4728{
4729 unsigned int cpu = (long)hcpu;
4730
4731 switch (action) {
4732
4733 case CPU_UP_PREPARE:
4734 case CPU_UP_PREPARE_FROZEN:
4735 perf_counter_init_cpu(cpu);
4736 break;
4737
4738 case CPU_ONLINE:
4739 case CPU_ONLINE_FROZEN:
4740 hw_perf_counter_setup_online(cpu);
4741 break;
4742
4743 case CPU_DOWN_PREPARE:
4744 case CPU_DOWN_PREPARE_FROZEN:
4745 perf_counter_exit_cpu(cpu);
4746 break;
4747
4748 default:
4749 break;
4750 }
4751
4752 return NOTIFY_OK;
4753}
4754
4755/*
4756 * This has to have a higher priority than migration_notifier in sched.c.
4757 */
4758static struct notifier_block __cpuinitdata perf_cpu_nb = {
4759 .notifier_call = perf_cpu_notify,
4760 .priority = 20,
4761};
4762
4763void __init perf_counter_init(void)
4764{
4765 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4766 (void *)(long)smp_processor_id());
4767 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4768 (void *)(long)smp_processor_id());
4769 register_cpu_notifier(&perf_cpu_nb);
4770}
4771
4772static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4773{
4774 return sprintf(buf, "%d\n", perf_reserved_percpu);
4775}
4776
4777static ssize_t
4778perf_set_reserve_percpu(struct sysdev_class *class,
4779 const char *buf,
4780 size_t count)
4781{
4782 struct perf_cpu_context *cpuctx;
4783 unsigned long val;
4784 int err, cpu, mpt;
4785
4786 err = strict_strtoul(buf, 10, &val);
4787 if (err)
4788 return err;
4789 if (val > perf_max_counters)
4790 return -EINVAL;
4791
4792 spin_lock(&perf_resource_lock);
4793 perf_reserved_percpu = val;
4794 for_each_online_cpu(cpu) {
4795 cpuctx = &per_cpu(perf_cpu_context, cpu);
4796 spin_lock_irq(&cpuctx->ctx.lock);
4797 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4798 perf_max_counters - perf_reserved_percpu);
4799 cpuctx->max_pertask = mpt;
4800 spin_unlock_irq(&cpuctx->ctx.lock);
4801 }
4802 spin_unlock(&perf_resource_lock);
4803
4804 return count;
4805}
4806
4807static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4808{
4809 return sprintf(buf, "%d\n", perf_overcommit);
4810}
4811
4812static ssize_t
4813perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4814{
4815 unsigned long val;
4816 int err;
4817
4818 err = strict_strtoul(buf, 10, &val);
4819 if (err)
4820 return err;
4821 if (val > 1)
4822 return -EINVAL;
4823
4824 spin_lock(&perf_resource_lock);
4825 perf_overcommit = val;
4826 spin_unlock(&perf_resource_lock);
4827
4828 return count;
4829}
4830
4831static SYSDEV_CLASS_ATTR(
4832 reserve_percpu,
4833 0644,
4834 perf_show_reserve_percpu,
4835 perf_set_reserve_percpu
4836 );
4837
4838static SYSDEV_CLASS_ATTR(
4839 overcommit,
4840 0644,
4841 perf_show_overcommit,
4842 perf_set_overcommit
4843 );
4844
4845static struct attribute *perfclass_attrs[] = {
4846 &attr_reserve_percpu.attr,
4847 &attr_overcommit.attr,
4848 NULL
4849};
4850
4851static struct attribute_group perfclass_attr_group = {
4852 .attrs = perfclass_attrs,
4853 .name = "perf_counters",
4854};
4855
4856static int __init perf_counter_sysfs_init(void)
4857{
4858 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4859 &perfclass_attr_group);
4860}
4861device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..76ac4db405e9
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_event.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU events:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_events __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_events __read_mostly;
43static atomic_t nr_mmap_events __read_mostly;
44static atomic_t nr_comm_events __read_mostly;
45static atomic_t nr_task_events __read_mostly;
46
47/*
48 * perf event paranoia level:
49 * -1 - not paranoid at all
50 * 0 - disallow raw tracepoint access for unpriv
51 * 1 - disallow cpu events for unpriv
52 * 2 - disallow kernel profiling for unpriv
53 */
54int sysctl_perf_event_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_event_paranoid > -1;
59}
60
61static inline bool perf_paranoid_cpu(void)
62{
63 return sysctl_perf_event_paranoid > 0;
64}
65
66static inline bool perf_paranoid_kernel(void)
67{
68 return sysctl_perf_event_paranoid > 1;
69}
70
71int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
72
73/*
74 * max perf event sample rate
75 */
76int sysctl_perf_event_sample_rate __read_mostly = 100000;
77
78static atomic64_t perf_event_id;
79
80/*
81 * Lock for (sysadmin-configurable) event reservations:
82 */
83static DEFINE_SPINLOCK(perf_resource_lock);
84
85/*
86 * Architecture provided APIs - weak aliases:
87 */
88extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
89{
90 return NULL;
91}
92
93void __weak hw_perf_disable(void) { barrier(); }
94void __weak hw_perf_enable(void) { barrier(); }
95
96void __weak hw_perf_event_setup(int cpu) { barrier(); }
97void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
98
99int __weak
100hw_perf_group_sched_in(struct perf_event *group_leader,
101 struct perf_cpu_context *cpuctx,
102 struct perf_event_context *ctx, int cpu)
103{
104 return 0;
105}
106
107void __weak perf_event_print_debug(void) { }
108
109static DEFINE_PER_CPU(int, perf_disable_count);
110
111void __perf_disable(void)
112{
113 __get_cpu_var(perf_disable_count)++;
114}
115
116bool __perf_enable(void)
117{
118 return !--__get_cpu_var(perf_disable_count);
119}
120
/*
 * Record one more level of disabling on this CPU, then call the
 * architecture hook hw_perf_disable().
 */
void perf_disable(void)
{
	__perf_disable();

	hw_perf_disable();
}
126
/*
 * Undo one level of perf_disable().  hw_perf_enable() is only called once
 * the per-CPU nesting depth has dropped back to zero.
 */
void perf_enable(void)
{
	if (!__perf_enable())
		return;

	hw_perf_enable();
}
132
133static void get_ctx(struct perf_event_context *ctx)
134{
135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
136}
137
138static void free_ctx(struct rcu_head *head)
139{
140 struct perf_event_context *ctx;
141
142 ctx = container_of(head, struct perf_event_context, rcu_head);
143 kfree(ctx);
144}
145
146static void put_ctx(struct perf_event_context *ctx)
147{
148 if (atomic_dec_and_test(&ctx->refcount)) {
149 if (ctx->parent_ctx)
150 put_ctx(ctx->parent_ctx);
151 if (ctx->task)
152 put_task_struct(ctx->task);
153 call_rcu(&ctx->rcu_head, free_ctx);
154 }
155}
156
157static void unclone_ctx(struct perf_event_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit events we want to return the parent event id
167 * to userspace.
168 */
169static u64 primary_event_id(struct perf_event *event)
170{
171 u64 id = event->id;
172
173 if (event->parent)
174 id = event->parent->id;
175
176 return id;
177}
178
179/*
180 * Get the perf_event_context for a task and lock it.
181 * This has to cope with with the fact that until it is locked,
182 * the context could get moved to another task.
183 */
184static struct perf_event_context *
185perf_lock_task_context(struct task_struct *task, unsigned long *flags)
186{
187 struct perf_event_context *ctx;
188
189 rcu_read_lock();
190 retry:
191 ctx = rcu_dereference(task->perf_event_ctxp);
192 if (ctx) {
193 /*
194 * If this context is a clone of another, it might
195 * get swapped for another underneath us by
196 * perf_event_task_sched_out, though the
197 * rcu_read_lock() protects us from any context
198 * getting freed. Lock the context and check if it
199 * got swapped before we could get the lock, and retry
200 * if so. If we locked the right context, then it
201 * can't get swapped on us any more.
202 */
203 spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry;
207 }
208
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
213 }
214 rcu_read_unlock();
215 return ctx;
216}
217
218/*
219 * Get the context for a task and increment its pin_count so it
220 * can't get swapped to another task. This also increments its
221 * reference count so that the context can't get freed.
222 */
223static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
224{
225 struct perf_event_context *ctx;
226 unsigned long flags;
227
228 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) {
230 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags);
232 }
233 return ctx;
234}
235
236static void perf_unpin_context(struct perf_event_context *ctx)
237{
238 unsigned long flags;
239
240 spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx);
244}
245
246/*
247 * Add a event from the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held.
249 */
250static void
251list_add_event(struct perf_event *event, struct perf_event_context *ctx)
252{
253 struct perf_event *group_leader = event->group_leader;
254
255 /*
256 * Depending on whether it is a standalone or sibling event,
257 * add it straight to the context's event list, or to the group
258 * leader's sibling list:
259 */
260 if (group_leader == event)
261 list_add_tail(&event->group_entry, &ctx->group_list);
262 else {
263 list_add_tail(&event->group_entry, &group_leader->sibling_list);
264 group_leader->nr_siblings++;
265 }
266
267 list_add_rcu(&event->event_entry, &ctx->event_list);
268 ctx->nr_events++;
269 if (event->attr.inherit_stat)
270 ctx->nr_stat++;
271}
272
273/*
274 * Remove a event from the lists for its context.
275 * Must be called with ctx->mutex and ctx->lock held.
276 */
277static void
278list_del_event(struct perf_event *event, struct perf_event_context *ctx)
279{
280 struct perf_event *sibling, *tmp;
281
282 if (list_empty(&event->group_entry))
283 return;
284 ctx->nr_events--;
285 if (event->attr.inherit_stat)
286 ctx->nr_stat--;
287
288 list_del_init(&event->group_entry);
289 list_del_rcu(&event->event_entry);
290
291 if (event->group_leader != event)
292 event->group_leader->nr_siblings--;
293
294 /*
295 * If this was a group event with sibling events then
296 * upgrade the siblings to singleton events by adding them
297 * to the context list directly:
298 */
299 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
300
301 list_move_tail(&sibling->group_entry, &ctx->group_list);
302 sibling->group_leader = sibling;
303 }
304}
305
306static void
307event_sched_out(struct perf_event *event,
308 struct perf_cpu_context *cpuctx,
309 struct perf_event_context *ctx)
310{
311 if (event->state != PERF_EVENT_STATE_ACTIVE)
312 return;
313
314 event->state = PERF_EVENT_STATE_INACTIVE;
315 if (event->pending_disable) {
316 event->pending_disable = 0;
317 event->state = PERF_EVENT_STATE_OFF;
318 }
319 event->tstamp_stopped = ctx->time;
320 event->pmu->disable(event);
321 event->oncpu = -1;
322
323 if (!is_software_event(event))
324 cpuctx->active_oncpu--;
325 ctx->nr_active--;
326 if (event->attr.exclusive || !cpuctx->active_oncpu)
327 cpuctx->exclusive = 0;
328}
329
330static void
331group_sched_out(struct perf_event *group_event,
332 struct perf_cpu_context *cpuctx,
333 struct perf_event_context *ctx)
334{
335 struct perf_event *event;
336
337 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
338 return;
339
340 event_sched_out(group_event, cpuctx, ctx);
341
342 /*
343 * Schedule out siblings (if any):
344 */
345 list_for_each_entry(event, &group_event->sibling_list, group_entry)
346 event_sched_out(event, cpuctx, ctx);
347
348 if (group_event->attr.exclusive)
349 cpuctx->exclusive = 0;
350}
351
352/*
353 * Cross CPU call to remove a performance event
354 *
355 * We disable the event on the hardware level first. After that we
356 * remove it from the context list.
357 */
358static void __perf_event_remove_from_context(void *info)
359{
360 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
361 struct perf_event *event = info;
362 struct perf_event_context *ctx = event->ctx;
363
364 /*
365 * If this is a task context, we need to check whether it is
366 * the current task context of this cpu. If not it has been
367 * scheduled out before the smp call arrived.
368 */
369 if (ctx->task && cpuctx->task_ctx != ctx)
370 return;
371
372 spin_lock(&ctx->lock);
373 /*
374 * Protect the list operation against NMI by disabling the
375 * events on a global level.
376 */
377 perf_disable();
378
379 event_sched_out(event, cpuctx, ctx);
380
381 list_del_event(event, ctx);
382
383 if (!ctx->task) {
384 /*
385 * Allow more per task events with respect to the
386 * reservation:
387 */
388 cpuctx->max_pertask =
389 min(perf_max_events - ctx->nr_events,
390 perf_max_events - perf_reserved_percpu);
391 }
392
393 perf_enable();
394 spin_unlock(&ctx->lock);
395}
396
397
398/*
399 * Remove the event from a task's (or a CPU's) list of events.
400 *
401 * Must be called with ctx->mutex held.
402 *
403 * CPU events are removed with a smp call. For task events we only
404 * call when the task is on a CPU.
405 *
406 * If event->ctx is a cloned context, callers must make sure that
407 * every task struct that event->ctx->task could possibly point to
408 * remains valid. This is OK when called from perf_release since
409 * that only calls us on the top-level context, which can't be a clone.
410 * When called from perf_event_exit_task, it's OK because the
411 * context has been detached from its task.
412 */
413static void perf_event_remove_from_context(struct perf_event *event)
414{
415 struct perf_event_context *ctx = event->ctx;
416 struct task_struct *task = ctx->task;
417
418 if (!task) {
419 /*
420 * Per cpu events are removed via an smp call and
421 * the removal is always sucessful.
422 */
423 smp_call_function_single(event->cpu,
424 __perf_event_remove_from_context,
425 event, 1);
426 return;
427 }
428
429retry:
430 task_oncpu_function_call(task, __perf_event_remove_from_context,
431 event);
432
433 spin_lock_irq(&ctx->lock);
434 /*
435 * If the context is active we need to retry the smp call.
436 */
437 if (ctx->nr_active && !list_empty(&event->group_entry)) {
438 spin_unlock_irq(&ctx->lock);
439 goto retry;
440 }
441
442 /*
443 * The lock prevents that this context is scheduled in so we
444 * can remove the event safely, if the call above did not
445 * succeed.
446 */
447 if (!list_empty(&event->group_entry)) {
448 list_del_event(event, ctx);
449 }
450 spin_unlock_irq(&ctx->lock);
451}
452
453static inline u64 perf_clock(void)
454{
455 return cpu_clock(smp_processor_id());
456}
457
458/*
459 * Update the record of the current time in a context.
460 */
461static void update_context_time(struct perf_event_context *ctx)
462{
463 u64 now = perf_clock();
464
465 ctx->time += now - ctx->timestamp;
466 ctx->timestamp = now;
467}
468
469/*
470 * Update the total_time_enabled and total_time_running fields for a event.
471 */
472static void update_event_times(struct perf_event *event)
473{
474 struct perf_event_context *ctx = event->ctx;
475 u64 run_end;
476
477 if (event->state < PERF_EVENT_STATE_INACTIVE ||
478 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
479 return;
480
481 event->total_time_enabled = ctx->time - event->tstamp_enabled;
482
483 if (event->state == PERF_EVENT_STATE_INACTIVE)
484 run_end = event->tstamp_stopped;
485 else
486 run_end = ctx->time;
487
488 event->total_time_running = run_end - event->tstamp_running;
489}
490
491/*
492 * Update total_time_enabled and total_time_running for all events in a group.
493 */
494static void update_group_times(struct perf_event *leader)
495{
496 struct perf_event *event;
497
498 update_event_times(leader);
499 list_for_each_entry(event, &leader->sibling_list, group_entry)
500 update_event_times(event);
501}
502
503/*
504 * Cross CPU call to disable a performance event
505 */
506static void __perf_event_disable(void *info)
507{
508 struct perf_event *event = info;
509 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
510 struct perf_event_context *ctx = event->ctx;
511
512 /*
513 * If this is a per-task event, need to check whether this
514 * event's task is the current task on this cpu.
515 */
516 if (ctx->task && cpuctx->task_ctx != ctx)
517 return;
518
519 spin_lock(&ctx->lock);
520
521 /*
522 * If the event is on, turn it off.
523 * If it is in error state, leave it in error state.
524 */
525 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
526 update_context_time(ctx);
527 update_group_times(event);
528 if (event == event->group_leader)
529 group_sched_out(event, cpuctx, ctx);
530 else
531 event_sched_out(event, cpuctx, ctx);
532 event->state = PERF_EVENT_STATE_OFF;
533 }
534
535 spin_unlock(&ctx->lock);
536}
537
538/*
539 * Disable a event.
540 *
541 * If event->ctx is a cloned context, callers must make sure that
542 * every task struct that event->ctx->task could possibly point to
543 * remains valid. This condition is satisifed when called through
544 * perf_event_for_each_child or perf_event_for_each because they
545 * hold the top-level event's child_mutex, so any descendant that
546 * goes to exit will block in sync_child_event.
547 * When called from perf_pending_event it's OK because event->ctx
548 * is the current context on this CPU and preemption is disabled,
549 * hence we can't get into perf_event_task_sched_out for this context.
550 */
551static void perf_event_disable(struct perf_event *event)
552{
553 struct perf_event_context *ctx = event->ctx;
554 struct task_struct *task = ctx->task;
555
556 if (!task) {
557 /*
558 * Disable the event on the cpu that it's on
559 */
560 smp_call_function_single(event->cpu, __perf_event_disable,
561 event, 1);
562 return;
563 }
564
565 retry:
566 task_oncpu_function_call(task, __perf_event_disable, event);
567
568 spin_lock_irq(&ctx->lock);
569 /*
570 * If the event is still active, we need to retry the cross-call.
571 */
572 if (event->state == PERF_EVENT_STATE_ACTIVE) {
573 spin_unlock_irq(&ctx->lock);
574 goto retry;
575 }
576
577 /*
578 * Since we have the lock this context can't be scheduled
579 * in, so we can change the state safely.
580 */
581 if (event->state == PERF_EVENT_STATE_INACTIVE) {
582 update_group_times(event);
583 event->state = PERF_EVENT_STATE_OFF;
584 }
585
586 spin_unlock_irq(&ctx->lock);
587}
588
589static int
590event_sched_in(struct perf_event *event,
591 struct perf_cpu_context *cpuctx,
592 struct perf_event_context *ctx,
593 int cpu)
594{
595 if (event->state <= PERF_EVENT_STATE_OFF)
596 return 0;
597
598 event->state = PERF_EVENT_STATE_ACTIVE;
599 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
600 /*
601 * The new state must be visible before we turn it on in the hardware:
602 */
603 smp_wmb();
604
605 if (event->pmu->enable(event)) {
606 event->state = PERF_EVENT_STATE_INACTIVE;
607 event->oncpu = -1;
608 return -EAGAIN;
609 }
610
611 event->tstamp_running += ctx->time - event->tstamp_stopped;
612
613 if (!is_software_event(event))
614 cpuctx->active_oncpu++;
615 ctx->nr_active++;
616
617 if (event->attr.exclusive)
618 cpuctx->exclusive = 1;
619
620 return 0;
621}
622
623static int
624group_sched_in(struct perf_event *group_event,
625 struct perf_cpu_context *cpuctx,
626 struct perf_event_context *ctx,
627 int cpu)
628{
629 struct perf_event *event, *partial_group;
630 int ret;
631
632 if (group_event->state == PERF_EVENT_STATE_OFF)
633 return 0;
634
635 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
636 if (ret)
637 return ret < 0 ? ret : 0;
638
639 if (event_sched_in(group_event, cpuctx, ctx, cpu))
640 return -EAGAIN;
641
642 /*
643 * Schedule in siblings as one group (if any):
644 */
645 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
646 if (event_sched_in(event, cpuctx, ctx, cpu)) {
647 partial_group = event;
648 goto group_error;
649 }
650 }
651
652 return 0;
653
654group_error:
655 /*
656 * Groups can be scheduled in as one unit only, so undo any
657 * partial group before returning:
658 */
659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
660 if (event == partial_group)
661 break;
662 event_sched_out(event, cpuctx, ctx);
663 }
664 event_sched_out(group_event, cpuctx, ctx);
665
666 return -EAGAIN;
667}
668
669/*
670 * Return 1 for a group consisting entirely of software events,
671 * 0 if the group contains any hardware events.
672 */
673static int is_software_only_group(struct perf_event *leader)
674{
675 struct perf_event *event;
676
677 if (!is_software_event(leader))
678 return 0;
679
680 list_for_each_entry(event, &leader->sibling_list, group_entry)
681 if (!is_software_event(event))
682 return 0;
683
684 return 1;
685}
686
687/*
688 * Work out whether we can put this event group on the CPU now.
689 */
690static int group_can_go_on(struct perf_event *event,
691 struct perf_cpu_context *cpuctx,
692 int can_add_hw)
693{
694 /*
695 * Groups consisting entirely of software events can always go on.
696 */
697 if (is_software_only_group(event))
698 return 1;
699 /*
700 * If an exclusive group is already on, no other hardware
701 * events can go on.
702 */
703 if (cpuctx->exclusive)
704 return 0;
705 /*
706 * If this group is exclusive and there are already
707 * events on the CPU, it can't go on.
708 */
709 if (event->attr.exclusive && cpuctx->active_oncpu)
710 return 0;
711 /*
712 * Otherwise, try to add it if all previous groups were able
713 * to go on.
714 */
715 return can_add_hw;
716}
717
718static void add_event_to_ctx(struct perf_event *event,
719 struct perf_event_context *ctx)
720{
721 list_add_event(event, ctx);
722 event->tstamp_enabled = ctx->time;
723 event->tstamp_running = ctx->time;
724 event->tstamp_stopped = ctx->time;
725}
726
727/*
728 * Cross CPU call to install and enable a performance event
729 *
730 * Must be called with ctx->mutex held
731 */
732static void __perf_install_in_context(void *info)
733{
734 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
735 struct perf_event *event = info;
736 struct perf_event_context *ctx = event->ctx;
737 struct perf_event *leader = event->group_leader;
738 int cpu = smp_processor_id();
739 int err;
740
741 /*
742 * If this is a task context, we need to check whether it is
743 * the current task context of this cpu. If not it has been
744 * scheduled out before the smp call arrived.
745 * Or possibly this is the right context but it isn't
746 * on this cpu because it had no events.
747 */
748 if (ctx->task && cpuctx->task_ctx != ctx) {
749 if (cpuctx->task_ctx || ctx->task != current)
750 return;
751 cpuctx->task_ctx = ctx;
752 }
753
754 spin_lock(&ctx->lock);
755 ctx->is_active = 1;
756 update_context_time(ctx);
757
758 /*
759 * Protect the list operation against NMI by disabling the
760 * events on a global level. NOP for non NMI based events.
761 */
762 perf_disable();
763
764 add_event_to_ctx(event, ctx);
765
766 /*
767 * Don't put the event on if it is disabled or if
768 * it is in a group and the group isn't on.
769 */
770 if (event->state != PERF_EVENT_STATE_INACTIVE ||
771 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
772 goto unlock;
773
774 /*
775 * An exclusive event can't go on if there are already active
776 * hardware events, and no hardware event can go on if there
777 * is already an exclusive event on.
778 */
779 if (!group_can_go_on(event, cpuctx, 1))
780 err = -EEXIST;
781 else
782 err = event_sched_in(event, cpuctx, ctx, cpu);
783
784 if (err) {
785 /*
786 * This event couldn't go on. If it is in a group
787 * then we have to pull the whole group off.
788 * If the event group is pinned then put it in error state.
789 */
790 if (leader != event)
791 group_sched_out(leader, cpuctx, ctx);
792 if (leader->attr.pinned) {
793 update_group_times(leader);
794 leader->state = PERF_EVENT_STATE_ERROR;
795 }
796 }
797
798 if (!err && !ctx->task && cpuctx->max_pertask)
799 cpuctx->max_pertask--;
800
801 unlock:
802 perf_enable();
803
804 spin_unlock(&ctx->lock);
805}
806
807/*
808 * Attach a performance event to a context
809 *
810 * First we add the event to the list with the hardware enable bit
811 * in event->hw_config cleared.
812 *
813 * If the event is attached to a task which is on a CPU we use a smp
814 * call to enable it in the task context. The task might have been
815 * scheduled away, but we check this in the smp call again.
816 *
817 * Must be called with ctx->mutex held.
818 */
819static void
820perf_install_in_context(struct perf_event_context *ctx,
821 struct perf_event *event,
822 int cpu)
823{
824 struct task_struct *task = ctx->task;
825
826 if (!task) {
827 /*
828 * Per cpu events are installed via an smp call and
829 * the install is always sucessful.
830 */
831 smp_call_function_single(cpu, __perf_install_in_context,
832 event, 1);
833 return;
834 }
835
836retry:
837 task_oncpu_function_call(task, __perf_install_in_context,
838 event);
839
840 spin_lock_irq(&ctx->lock);
841 /*
842 * we need to retry the smp call.
843 */
844 if (ctx->is_active && list_empty(&event->group_entry)) {
845 spin_unlock_irq(&ctx->lock);
846 goto retry;
847 }
848
849 /*
850 * The lock prevents that this context is scheduled in so we
851 * can add the event safely, if it the call above did not
852 * succeed.
853 */
854 if (list_empty(&event->group_entry))
855 add_event_to_ctx(event, ctx);
856 spin_unlock_irq(&ctx->lock);
857}
858
859/*
860 * Put a event into inactive state and update time fields.
861 * Enabling the leader of a group effectively enables all
862 * the group members that aren't explicitly disabled, so we
863 * have to update their ->tstamp_enabled also.
864 * Note: this works for group members as well as group leaders
865 * since the non-leader members' sibling_lists will be empty.
866 */
867static void __perf_event_mark_enabled(struct perf_event *event,
868 struct perf_event_context *ctx)
869{
870 struct perf_event *sub;
871
872 event->state = PERF_EVENT_STATE_INACTIVE;
873 event->tstamp_enabled = ctx->time - event->total_time_enabled;
874 list_for_each_entry(sub, &event->sibling_list, group_entry)
875 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
876 sub->tstamp_enabled =
877 ctx->time - sub->total_time_enabled;
878}
879
880/*
881 * Cross CPU call to enable a performance event
882 */
883static void __perf_event_enable(void *info)
884{
885 struct perf_event *event = info;
886 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
887 struct perf_event_context *ctx = event->ctx;
888 struct perf_event *leader = event->group_leader;
889 int err;
890
891 /*
892 * If this is a per-task event, need to check whether this
893 * event's task is the current task on this cpu.
894 */
895 if (ctx->task && cpuctx->task_ctx != ctx) {
896 if (cpuctx->task_ctx || ctx->task != current)
897 return;
898 cpuctx->task_ctx = ctx;
899 }
900
901 spin_lock(&ctx->lock);
902 ctx->is_active = 1;
903 update_context_time(ctx);
904
905 if (event->state >= PERF_EVENT_STATE_INACTIVE)
906 goto unlock;
907 __perf_event_mark_enabled(event, ctx);
908
909 /*
910 * If the event is in a group and isn't the group leader,
911 * then don't put it on unless the group is on.
912 */
913 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
914 goto unlock;
915
916 if (!group_can_go_on(event, cpuctx, 1)) {
917 err = -EEXIST;
918 } else {
919 perf_disable();
920 if (event == leader)
921 err = group_sched_in(event, cpuctx, ctx,
922 smp_processor_id());
923 else
924 err = event_sched_in(event, cpuctx, ctx,
925 smp_processor_id());
926 perf_enable();
927 }
928
929 if (err) {
930 /*
931 * If this event can't go on and it's part of a
932 * group, then the whole group has to come off.
933 */
934 if (leader != event)
935 group_sched_out(leader, cpuctx, ctx);
936 if (leader->attr.pinned) {
937 update_group_times(leader);
938 leader->state = PERF_EVENT_STATE_ERROR;
939 }
940 }
941
942 unlock:
943 spin_unlock(&ctx->lock);
944}
945
946/*
947 * Enable a event.
948 *
949 * If event->ctx is a cloned context, callers must make sure that
950 * every task struct that event->ctx->task could possibly point to
951 * remains valid. This condition is satisfied when called through
952 * perf_event_for_each_child or perf_event_for_each as described
953 * for perf_event_disable.
954 */
955static void perf_event_enable(struct perf_event *event)
956{
957 struct perf_event_context *ctx = event->ctx;
958 struct task_struct *task = ctx->task;
959
960 if (!task) {
961 /*
962 * Enable the event on the cpu that it's on
963 */
964 smp_call_function_single(event->cpu, __perf_event_enable,
965 event, 1);
966 return;
967 }
968
969 spin_lock_irq(&ctx->lock);
970 if (event->state >= PERF_EVENT_STATE_INACTIVE)
971 goto out;
972
973 /*
974 * If the event is in error state, clear that first.
975 * That way, if we see the event in error state below, we
976 * know that it has gone back into error state, as distinct
977 * from the task having been scheduled away before the
978 * cross-call arrived.
979 */
980 if (event->state == PERF_EVENT_STATE_ERROR)
981 event->state = PERF_EVENT_STATE_OFF;
982
983 retry:
984 spin_unlock_irq(&ctx->lock);
985 task_oncpu_function_call(task, __perf_event_enable, event);
986
987 spin_lock_irq(&ctx->lock);
988
989 /*
990 * If the context is active and the event is still off,
991 * we need to retry the cross-call.
992 */
993 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
994 goto retry;
995
996 /*
997 * Since we have the lock this context can't be scheduled
998 * in, so we can change the state safely.
999 */
1000 if (event->state == PERF_EVENT_STATE_OFF)
1001 __perf_event_mark_enabled(event, ctx);
1002
1003 out:
1004 spin_unlock_irq(&ctx->lock);
1005}
1006
1007static int perf_event_refresh(struct perf_event *event, int refresh)
1008{
1009 /*
1010 * not supported on inherited events
1011 */
1012 if (event->attr.inherit)
1013 return -EINVAL;
1014
1015 atomic_add(refresh, &event->event_limit);
1016 perf_event_enable(event);
1017
1018 return 0;
1019}
1020
1021void __perf_event_sched_out(struct perf_event_context *ctx,
1022 struct perf_cpu_context *cpuctx)
1023{
1024 struct perf_event *event;
1025
1026 spin_lock(&ctx->lock);
1027 ctx->is_active = 0;
1028 if (likely(!ctx->nr_events))
1029 goto out;
1030 update_context_time(ctx);
1031
1032 perf_disable();
1033 if (ctx->nr_active) {
1034 list_for_each_entry(event, &ctx->group_list, group_entry) {
1035 if (event != event->group_leader)
1036 event_sched_out(event, cpuctx, ctx);
1037 else
1038 group_sched_out(event, cpuctx, ctx);
1039 }
1040 }
1041 perf_enable();
1042 out:
1043 spin_unlock(&ctx->lock);
1044}
1045
1046/*
1047 * Test whether two contexts are equivalent, i.e. whether they
1048 * have both been cloned from the same version of the same context
1049 * and they both have the same number of enabled events.
1050 * If the number of enabled events is the same, then the set
1051 * of enabled events should be the same, because these are both
1052 * inherited contexts, therefore we can't access individual events
1053 * in them directly with an fd; we can only enable/disable all
1054 * events via prctl, or enable/disable all events in a family
1055 * via ioctl, which will have the same effect on both contexts.
1056 */
1057static int context_equiv(struct perf_event_context *ctx1,
1058 struct perf_event_context *ctx2)
1059{
1060 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1061 && ctx1->parent_gen == ctx2->parent_gen
1062 && !ctx1->pin_count && !ctx2->pin_count;
1063}
1064
1065static void __perf_event_read(void *event);
1066
1067static void __perf_event_sync_stat(struct perf_event *event,
1068 struct perf_event *next_event)
1069{
1070 u64 value;
1071
1072 if (!event->attr.inherit_stat)
1073 return;
1074
1075 /*
1076 * Update the event value, we cannot use perf_event_read()
1077 * because we're in the middle of a context switch and have IRQs
1078 * disabled, which upsets smp_call_function_single(), however
1079 * we know the event must be on the current CPU, therefore we
1080 * don't need to use it.
1081 */
1082 switch (event->state) {
1083 case PERF_EVENT_STATE_ACTIVE:
1084 __perf_event_read(event);
1085 break;
1086
1087 case PERF_EVENT_STATE_INACTIVE:
1088 update_event_times(event);
1089 break;
1090
1091 default:
1092 break;
1093 }
1094
1095 /*
1096 * In order to keep per-task stats reliable we need to flip the event
1097 * values when we flip the contexts.
1098 */
1099 value = atomic64_read(&next_event->count);
1100 value = atomic64_xchg(&event->count, value);
1101 atomic64_set(&next_event->count, value);
1102
1103 swap(event->total_time_enabled, next_event->total_time_enabled);
1104 swap(event->total_time_running, next_event->total_time_running);
1105
1106 /*
1107 * Since we swizzled the values, update the user visible data too.
1108 */
1109 perf_event_update_userpage(event);
1110 perf_event_update_userpage(next_event);
1111}
1112
/*
 * list_next_entry - get the entry that follows @pos in its list
 * @pos:	pointer to the current entry; may be any pointer-valued
 *		expression (it is parenthesized in the expansion)
 * @member:	name of the struct list_head field inside the entry
 *
 * No end-of-list check is done: when @pos is the last entry the result is
 * the container computed from the list head, which the caller must detect
 * by comparing the member address against the head.
 */
#define list_next_entry(pos, member) \
	list_entry((pos)->member.next, typeof(*(pos)), member)
1115
1116static void perf_event_sync_stat(struct perf_event_context *ctx,
1117 struct perf_event_context *next_ctx)
1118{
1119 struct perf_event *event, *next_event;
1120
1121 if (!ctx->nr_stat)
1122 return;
1123
1124 event = list_first_entry(&ctx->event_list,
1125 struct perf_event, event_entry);
1126
1127 next_event = list_first_entry(&next_ctx->event_list,
1128 struct perf_event, event_entry);
1129
1130 while (&event->event_entry != &ctx->event_list &&
1131 &next_event->event_entry != &next_ctx->event_list) {
1132
1133 __perf_event_sync_stat(event, next_event);
1134
1135 event = list_next_entry(event, event_entry);
1136 next_event = list_next_entry(next_event, event_entry);
1137 }
1138}
1139
/*
 * Called from scheduler to remove the events of the current task,
 * with interrupts disabled.
 *
 * We stop each event and update the event value in event->count.
 *
 * This does not protect us against NMI, but disable()
 * sets the disabled bit in the control field of event _before_
 * accessing the event control register. If a NMI hits, then it will
 * not restart the event.
 *
 * @task: the task whose context is being taken off the CPU
 * @next: the task that runs next; its context may be swapped with @task's
 * @cpu:  selects the per-cpu perf_cpu_context to operate on
 */
void perf_event_task_sched_out(struct task_struct *task,
				 struct task_struct *next, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_event_context *ctx = task->perf_event_ctxp;
	struct perf_event_context *next_ctx;
	struct perf_event_context *parent;
	struct pt_regs *regs;
	/* Stays 1 unless the two contexts get swapped below. */
	int do_switch = 1;

	/* The context-switch software event is emitted unconditionally. */
	regs = task_pt_regs(task);
	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);

	/* Nothing to remove if the task has no context or none is installed. */
	if (likely(!ctx || !cpuctx->task_ctx))
		return;

	update_context_time(ctx);

	rcu_read_lock();
	parent = rcu_dereference(ctx->parent_ctx);
	next_ctx = next->perf_event_ctxp;
	if (parent && next_ctx &&
	    rcu_dereference(next_ctx->parent_ctx) == parent) {
		/*
		 * Looks like the two contexts are clones, so we might be
		 * able to optimize the context switch.  We lock both
		 * contexts and check that they are clones under the
		 * lock (including re-checking that neither has been
		 * uncloned in the meantime).  It doesn't matter which
		 * order we take the locks because no other cpu could
		 * be trying to lock both of these tasks.
		 */
		spin_lock(&ctx->lock);
		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
		if (context_equiv(ctx, next_ctx)) {
			/*
			 * XXX do we need a memory barrier of sorts
			 * wrt to rcu_dereference() of perf_event_ctxp
			 */
			/* Exchange the contexts between the two tasks. */
			task->perf_event_ctxp = next_ctx;
			next->perf_event_ctxp = ctx;
			ctx->task = next;
			next_ctx->task = task;
			do_switch = 0;

			perf_event_sync_stat(ctx, next_ctx);
		}
		spin_unlock(&next_ctx->lock);
		spin_unlock(&ctx->lock);
	}
	rcu_read_unlock();

	/* No swap happened: schedule the context out the ordinary way. */
	if (do_switch) {
		__perf_event_sched_out(ctx, cpuctx);
		cpuctx->task_ctx = NULL;
	}
}
1208
1209/*
1210 * Called with IRQs disabled
1211 */
1212static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1213{
1214 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1215
1216 if (!cpuctx->task_ctx)
1217 return;
1218
1219 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1220 return;
1221
1222 __perf_event_sched_out(ctx, cpuctx);
1223 cpuctx->task_ctx = NULL;
1224}
1225
1226/*
1227 * Called with IRQs disabled
1228 */
1229static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1230{
1231 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1232}
1233
1234static void
1235__perf_event_sched_in(struct perf_event_context *ctx,
1236 struct perf_cpu_context *cpuctx, int cpu)
1237{
1238 struct perf_event *event;
1239 int can_add_hw = 1;
1240
1241 spin_lock(&ctx->lock);
1242 ctx->is_active = 1;
1243 if (likely(!ctx->nr_events))
1244 goto out;
1245
1246 ctx->timestamp = perf_clock();
1247
1248 perf_disable();
1249
1250 /*
1251 * First go through the list and put on any pinned groups
1252 * in order to give them the best chance of going on.
1253 */
1254 list_for_each_entry(event, &ctx->group_list, group_entry) {
1255 if (event->state <= PERF_EVENT_STATE_OFF ||
1256 !event->attr.pinned)
1257 continue;
1258 if (event->cpu != -1 && event->cpu != cpu)
1259 continue;
1260
1261 if (event != event->group_leader)
1262 event_sched_in(event, cpuctx, ctx, cpu);
1263 else {
1264 if (group_can_go_on(event, cpuctx, 1))
1265 group_sched_in(event, cpuctx, ctx, cpu);
1266 }
1267
1268 /*
1269 * If this pinned group hasn't been scheduled,
1270 * put it in error state.
1271 */
1272 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1273 update_group_times(event);
1274 event->state = PERF_EVENT_STATE_ERROR;
1275 }
1276 }
1277
1278 list_for_each_entry(event, &ctx->group_list, group_entry) {
1279 /*
1280 * Ignore events in OFF or ERROR state, and
1281 * ignore pinned events since we did them already.
1282 */
1283 if (event->state <= PERF_EVENT_STATE_OFF ||
1284 event->attr.pinned)
1285 continue;
1286
1287 /*
1288 * Listen to the 'cpu' scheduling filter constraint
1289 * of events:
1290 */
1291 if (event->cpu != -1 && event->cpu != cpu)
1292 continue;
1293
1294 if (event != event->group_leader) {
1295 if (event_sched_in(event, cpuctx, ctx, cpu))
1296 can_add_hw = 0;
1297 } else {
1298 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1299 if (group_sched_in(event, cpuctx, ctx, cpu))
1300 can_add_hw = 0;
1301 }
1302 }
1303 }
1304 perf_enable();
1305 out:
1306 spin_unlock(&ctx->lock);
1307}
1308
1309/*
1310 * Called from scheduler to add the events of the current task
1311 * with interrupts disabled.
1312 *
1313 * We restore the event value and then enable it.
1314 *
1315 * This does not protect us against NMI, but enable()
1316 * sets the enabled bit in the control field of event _before_
1317 * accessing the event control register. If a NMI hits, then it will
1318 * keep the event running.
1319 */
1320void perf_event_task_sched_in(struct task_struct *task, int cpu)
1321{
1322 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1323 struct perf_event_context *ctx = task->perf_event_ctxp;
1324
1325 if (likely(!ctx))
1326 return;
1327 if (cpuctx->task_ctx == ctx)
1328 return;
1329 __perf_event_sched_in(ctx, cpuctx, cpu);
1330 cpuctx->task_ctx = ctx;
1331}
1332
1333static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1334{
1335 struct perf_event_context *ctx = &cpuctx->ctx;
1336
1337 __perf_event_sched_in(ctx, cpuctx, cpu);
1338}
1339
1340#define MAX_INTERRUPTS (~0ULL)
1341
1342static void perf_log_throttle(struct perf_event *event, int enable);
1343
1344static void perf_adjust_period(struct perf_event *event, u64 events)
1345{
1346 struct hw_perf_event *hwc = &event->hw;
1347 u64 period, sample_period;
1348 s64 delta;
1349
1350 events *= hwc->sample_period;
1351 period = div64_u64(events, event->attr.sample_freq);
1352
1353 delta = (s64)(period - hwc->sample_period);
1354 delta = (delta + 7) / 8; /* low pass filter */
1355
1356 sample_period = hwc->sample_period + delta;
1357
1358 if (!sample_period)
1359 sample_period = 1;
1360
1361 hwc->sample_period = sample_period;
1362}
1363
1364static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1365{
1366 struct perf_event *event;
1367 struct hw_perf_event *hwc;
1368 u64 interrupts, freq;
1369
1370 spin_lock(&ctx->lock);
1371 list_for_each_entry(event, &ctx->group_list, group_entry) {
1372 if (event->state != PERF_EVENT_STATE_ACTIVE)
1373 continue;
1374
1375 hwc = &event->hw;
1376
1377 interrupts = hwc->interrupts;
1378 hwc->interrupts = 0;
1379
1380 /*
1381 * unthrottle events on the tick
1382 */
1383 if (interrupts == MAX_INTERRUPTS) {
1384 perf_log_throttle(event, 1);
1385 event->pmu->unthrottle(event);
1386 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1387 }
1388
1389 if (!event->attr.freq || !event->attr.sample_freq)
1390 continue;
1391
1392 /*
1393 * if the specified freq < HZ then we need to skip ticks
1394 */
1395 if (event->attr.sample_freq < HZ) {
1396 freq = event->attr.sample_freq;
1397
1398 hwc->freq_count += freq;
1399 hwc->freq_interrupts += interrupts;
1400
1401 if (hwc->freq_count < HZ)
1402 continue;
1403
1404 interrupts = hwc->freq_interrupts;
1405 hwc->freq_interrupts = 0;
1406 hwc->freq_count -= HZ;
1407 } else
1408 freq = HZ;
1409
1410 perf_adjust_period(event, freq * interrupts);
1411
1412 /*
1413 * In order to avoid being stalled by an (accidental) huge
1414 * sample period, force reset the sample period if we didn't
1415 * get any events in this freq period.
1416 */
1417 if (!interrupts) {
1418 perf_disable();
1419 event->pmu->disable(event);
1420 atomic64_set(&hwc->period_left, 0);
1421 event->pmu->enable(event);
1422 perf_enable();
1423 }
1424 }
1425 spin_unlock(&ctx->lock);
1426}
1427
1428/*
1429 * Round-robin a context's events:
1430 */
1431static void rotate_ctx(struct perf_event_context *ctx)
1432{
1433 struct perf_event *event;
1434
1435 if (!ctx->nr_events)
1436 return;
1437
1438 spin_lock(&ctx->lock);
1439 /*
1440 * Rotate the first entry last (works just fine for group events too):
1441 */
1442 perf_disable();
1443 list_for_each_entry(event, &ctx->group_list, group_entry) {
1444 list_move_tail(&event->group_entry, &ctx->group_list);
1445 break;
1446 }
1447 perf_enable();
1448
1449 spin_unlock(&ctx->lock);
1450}
1451
1452void perf_event_task_tick(struct task_struct *curr, int cpu)
1453{
1454 struct perf_cpu_context *cpuctx;
1455 struct perf_event_context *ctx;
1456
1457 if (!atomic_read(&nr_events))
1458 return;
1459
1460 cpuctx = &per_cpu(perf_cpu_context, cpu);
1461 ctx = curr->perf_event_ctxp;
1462
1463 perf_ctx_adjust_freq(&cpuctx->ctx);
1464 if (ctx)
1465 perf_ctx_adjust_freq(ctx);
1466
1467 perf_event_cpu_sched_out(cpuctx);
1468 if (ctx)
1469 __perf_event_task_sched_out(ctx);
1470
1471 rotate_ctx(&cpuctx->ctx);
1472 if (ctx)
1473 rotate_ctx(ctx);
1474
1475 perf_event_cpu_sched_in(cpuctx, cpu);
1476 if (ctx)
1477 perf_event_task_sched_in(curr, cpu);
1478}
1479
1480/*
1481 * Enable all of a task's events that have been marked enable-on-exec.
1482 * This expects task == current.
1483 */
1484static void perf_event_enable_on_exec(struct task_struct *task)
1485{
1486 struct perf_event_context *ctx;
1487 struct perf_event *event;
1488 unsigned long flags;
1489 int enabled = 0;
1490
1491 local_irq_save(flags);
1492 ctx = task->perf_event_ctxp;
1493 if (!ctx || !ctx->nr_events)
1494 goto out;
1495
1496 __perf_event_task_sched_out(ctx);
1497
1498 spin_lock(&ctx->lock);
1499
1500 list_for_each_entry(event, &ctx->group_list, group_entry) {
1501 if (!event->attr.enable_on_exec)
1502 continue;
1503 event->attr.enable_on_exec = 0;
1504 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1505 continue;
1506 __perf_event_mark_enabled(event, ctx);
1507 enabled = 1;
1508 }
1509
1510 /*
1511 * Unclone this context if we enabled any event.
1512 */
1513 if (enabled)
1514 unclone_ctx(ctx);
1515
1516 spin_unlock(&ctx->lock);
1517
1518 perf_event_task_sched_in(task, smp_processor_id());
1519 out:
1520 local_irq_restore(flags);
1521}
1522
1523/*
1524 * Cross CPU call to read the hardware event
1525 */
1526static void __perf_event_read(void *info)
1527{
1528 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1529 struct perf_event *event = info;
1530 struct perf_event_context *ctx = event->ctx;
1531 unsigned long flags;
1532
1533 /*
1534 * If this is a task context, we need to check whether it is
1535 * the current task context of this cpu. If not it has been
1536 * scheduled out before the smp call arrived. In that case
1537 * event->count would have been updated to a recent sample
1538 * when the event was scheduled out.
1539 */
1540 if (ctx->task && cpuctx->task_ctx != ctx)
1541 return;
1542
1543 local_irq_save(flags);
1544 if (ctx->is_active)
1545 update_context_time(ctx);
1546 event->pmu->read(event);
1547 update_event_times(event);
1548 local_irq_restore(flags);
1549}
1550
1551static u64 perf_event_read(struct perf_event *event)
1552{
1553 /*
1554 * If event is enabled and currently active on a CPU, update the
1555 * value in the event structure:
1556 */
1557 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1558 smp_call_function_single(event->oncpu,
1559 __perf_event_read, event, 1);
1560 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1561 update_event_times(event);
1562 }
1563
1564 return atomic64_read(&event->count);
1565}
1566
1567/*
1568 * Initialize the perf_event context in a task_struct:
1569 */
1570static void
1571__perf_event_init_context(struct perf_event_context *ctx,
1572 struct task_struct *task)
1573{
1574 memset(ctx, 0, sizeof(*ctx));
1575 spin_lock_init(&ctx->lock);
1576 mutex_init(&ctx->mutex);
1577 INIT_LIST_HEAD(&ctx->group_list);
1578 INIT_LIST_HEAD(&ctx->event_list);
1579 atomic_set(&ctx->refcount, 1);
1580 ctx->task = task;
1581}
1582
1583static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1584{
1585 struct perf_event_context *ctx;
1586 struct perf_cpu_context *cpuctx;
1587 struct task_struct *task;
1588 unsigned long flags;
1589 int err;
1590
1591 /*
1592 * If cpu is not a wildcard then this is a percpu event:
1593 */
1594 if (cpu != -1) {
1595 /* Must be root to operate on a CPU event: */
1596 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1597 return ERR_PTR(-EACCES);
1598
1599 if (cpu < 0 || cpu > num_possible_cpus())
1600 return ERR_PTR(-EINVAL);
1601
1602 /*
1603 * We could be clever and allow to attach a event to an
1604 * offline CPU and activate it when the CPU comes up, but
1605 * that's for later.
1606 */
1607 if (!cpu_isset(cpu, cpu_online_map))
1608 return ERR_PTR(-ENODEV);
1609
1610 cpuctx = &per_cpu(perf_cpu_context, cpu);
1611 ctx = &cpuctx->ctx;
1612 get_ctx(ctx);
1613
1614 return ctx;
1615 }
1616
1617 rcu_read_lock();
1618 if (!pid)
1619 task = current;
1620 else
1621 task = find_task_by_vpid(pid);
1622 if (task)
1623 get_task_struct(task);
1624 rcu_read_unlock();
1625
1626 if (!task)
1627 return ERR_PTR(-ESRCH);
1628
1629 /*
1630 * Can't attach events to a dying task.
1631 */
1632 err = -ESRCH;
1633 if (task->flags & PF_EXITING)
1634 goto errout;
1635
1636 /* Reuse ptrace permission checks for now. */
1637 err = -EACCES;
1638 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1639 goto errout;
1640
1641 retry:
1642 ctx = perf_lock_task_context(task, &flags);
1643 if (ctx) {
1644 unclone_ctx(ctx);
1645 spin_unlock_irqrestore(&ctx->lock, flags);
1646 }
1647
1648 if (!ctx) {
1649 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1650 err = -ENOMEM;
1651 if (!ctx)
1652 goto errout;
1653 __perf_event_init_context(ctx, task);
1654 get_ctx(ctx);
1655 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1656 /*
1657 * We raced with some other task; use
1658 * the context they set.
1659 */
1660 kfree(ctx);
1661 goto retry;
1662 }
1663 get_task_struct(task);
1664 }
1665
1666 put_task_struct(task);
1667 return ctx;
1668
1669 errout:
1670 put_task_struct(task);
1671 return ERR_PTR(err);
1672}
1673
1674static void free_event_rcu(struct rcu_head *head)
1675{
1676 struct perf_event *event;
1677
1678 event = container_of(head, struct perf_event, rcu_head);
1679 if (event->ns)
1680 put_pid_ns(event->ns);
1681 kfree(event);
1682}
1683
1684static void perf_pending_sync(struct perf_event *event);
1685
1686static void free_event(struct perf_event *event)
1687{
1688 perf_pending_sync(event);
1689
1690 if (!event->parent) {
1691 atomic_dec(&nr_events);
1692 if (event->attr.mmap)
1693 atomic_dec(&nr_mmap_events);
1694 if (event->attr.comm)
1695 atomic_dec(&nr_comm_events);
1696 if (event->attr.task)
1697 atomic_dec(&nr_task_events);
1698 }
1699
1700 if (event->output) {
1701 fput(event->output->filp);
1702 event->output = NULL;
1703 }
1704
1705 if (event->destroy)
1706 event->destroy(event);
1707
1708 put_ctx(event->ctx);
1709 call_rcu(&event->rcu_head, free_event_rcu);
1710}
1711
1712/*
1713 * Called when the last reference to the file is gone.
1714 */
1715static int perf_release(struct inode *inode, struct file *file)
1716{
1717 struct perf_event *event = file->private_data;
1718 struct perf_event_context *ctx = event->ctx;
1719
1720 file->private_data = NULL;
1721
1722 WARN_ON_ONCE(ctx->parent_ctx);
1723 mutex_lock(&ctx->mutex);
1724 perf_event_remove_from_context(event);
1725 mutex_unlock(&ctx->mutex);
1726
1727 mutex_lock(&event->owner->perf_event_mutex);
1728 list_del_init(&event->owner_entry);
1729 mutex_unlock(&event->owner->perf_event_mutex);
1730 put_task_struct(event->owner);
1731
1732 free_event(event);
1733
1734 return 0;
1735}
1736
1737static int perf_event_read_size(struct perf_event *event)
1738{
1739 int entry = sizeof(u64); /* value */
1740 int size = 0;
1741 int nr = 1;
1742
1743 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1744 size += sizeof(u64);
1745
1746 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1747 size += sizeof(u64);
1748
1749 if (event->attr.read_format & PERF_FORMAT_ID)
1750 entry += sizeof(u64);
1751
1752 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1753 nr += event->group_leader->nr_siblings;
1754 size += sizeof(u64);
1755 }
1756
1757 size += entry * nr;
1758
1759 return size;
1760}
1761
1762static u64 perf_event_read_value(struct perf_event *event)
1763{
1764 struct perf_event *child;
1765 u64 total = 0;
1766
1767 total += perf_event_read(event);
1768 list_for_each_entry(child, &event->child_list, child_list)
1769 total += perf_event_read(child);
1770
1771 return total;
1772}
1773
1774static int perf_event_read_entry(struct perf_event *event,
1775 u64 read_format, char __user *buf)
1776{
1777 int n = 0, count = 0;
1778 u64 values[2];
1779
1780 values[n++] = perf_event_read_value(event);
1781 if (read_format & PERF_FORMAT_ID)
1782 values[n++] = primary_event_id(event);
1783
1784 count = n * sizeof(u64);
1785
1786 if (copy_to_user(buf, values, count))
1787 return -EFAULT;
1788
1789 return count;
1790}
1791
1792static int perf_event_read_group(struct perf_event *event,
1793 u64 read_format, char __user *buf)
1794{
1795 struct perf_event *leader = event->group_leader, *sub;
1796 int n = 0, size = 0, err = -EFAULT;
1797 u64 values[3];
1798
1799 values[n++] = 1 + leader->nr_siblings;
1800 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1801 values[n++] = leader->total_time_enabled +
1802 atomic64_read(&leader->child_total_time_enabled);
1803 }
1804 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1805 values[n++] = leader->total_time_running +
1806 atomic64_read(&leader->child_total_time_running);
1807 }
1808
1809 size = n * sizeof(u64);
1810
1811 if (copy_to_user(buf, values, size))
1812 return -EFAULT;
1813
1814 err = perf_event_read_entry(leader, read_format, buf + size);
1815 if (err < 0)
1816 return err;
1817
1818 size += err;
1819
1820 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1821 err = perf_event_read_entry(sub, read_format,
1822 buf + size);
1823 if (err < 0)
1824 return err;
1825
1826 size += err;
1827 }
1828
1829 return size;
1830}
1831
1832static int perf_event_read_one(struct perf_event *event,
1833 u64 read_format, char __user *buf)
1834{
1835 u64 values[4];
1836 int n = 0;
1837
1838 values[n++] = perf_event_read_value(event);
1839 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1840 values[n++] = event->total_time_enabled +
1841 atomic64_read(&event->child_total_time_enabled);
1842 }
1843 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1844 values[n++] = event->total_time_running +
1845 atomic64_read(&event->child_total_time_running);
1846 }
1847 if (read_format & PERF_FORMAT_ID)
1848 values[n++] = primary_event_id(event);
1849
1850 if (copy_to_user(buf, values, n * sizeof(u64)))
1851 return -EFAULT;
1852
1853 return n * sizeof(u64);
1854}
1855
1856/*
1857 * Read the performance event - simple non blocking version for now
1858 */
1859static ssize_t
1860perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861{
1862 u64 read_format = event->attr.read_format;
1863 int ret;
1864
1865 /*
1866 * Return end-of-file for a read on a event that is in
1867 * error state (i.e. because it was pinned but it couldn't be
1868 * scheduled on to the CPU at some point).
1869 */
1870 if (event->state == PERF_EVENT_STATE_ERROR)
1871 return 0;
1872
1873 if (count < perf_event_read_size(event))
1874 return -ENOSPC;
1875
1876 WARN_ON_ONCE(event->ctx->parent_ctx);
1877 mutex_lock(&event->child_mutex);
1878 if (read_format & PERF_FORMAT_GROUP)
1879 ret = perf_event_read_group(event, read_format, buf);
1880 else
1881 ret = perf_event_read_one(event, read_format, buf);
1882 mutex_unlock(&event->child_mutex);
1883
1884 return ret;
1885}
1886
1887static ssize_t
1888perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1889{
1890 struct perf_event *event = file->private_data;
1891
1892 return perf_read_hw(event, buf, count);
1893}
1894
1895static unsigned int perf_poll(struct file *file, poll_table *wait)
1896{
1897 struct perf_event *event = file->private_data;
1898 struct perf_mmap_data *data;
1899 unsigned int events = POLL_HUP;
1900
1901 rcu_read_lock();
1902 data = rcu_dereference(event->data);
1903 if (data)
1904 events = atomic_xchg(&data->poll, 0);
1905 rcu_read_unlock();
1906
1907 poll_wait(file, &event->waitq, wait);
1908
1909 return events;
1910}
1911
1912static void perf_event_reset(struct perf_event *event)
1913{
1914 (void)perf_event_read(event);
1915 atomic64_set(&event->count, 0);
1916 perf_event_update_userpage(event);
1917}
1918
1919/*
1920 * Holding the top-level event's child_mutex means that any
1921 * descendant process that has inherited this event will block
1922 * in sync_child_event if it goes to exit, thus satisfying the
1923 * task existence requirements of perf_event_enable/disable.
1924 */
1925static void perf_event_for_each_child(struct perf_event *event,
1926 void (*func)(struct perf_event *))
1927{
1928 struct perf_event *child;
1929
1930 WARN_ON_ONCE(event->ctx->parent_ctx);
1931 mutex_lock(&event->child_mutex);
1932 func(event);
1933 list_for_each_entry(child, &event->child_list, child_list)
1934 func(child);
1935 mutex_unlock(&event->child_mutex);
1936}
1937
1938static void perf_event_for_each(struct perf_event *event,
1939 void (*func)(struct perf_event *))
1940{
1941 struct perf_event_context *ctx = event->ctx;
1942 struct perf_event *sibling;
1943
1944 WARN_ON_ONCE(ctx->parent_ctx);
1945 mutex_lock(&ctx->mutex);
1946 event = event->group_leader;
1947
1948 perf_event_for_each_child(event, func);
1949 func(event);
1950 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1951 perf_event_for_each_child(event, func);
1952 mutex_unlock(&ctx->mutex);
1953}
1954
1955static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956{
1957 struct perf_event_context *ctx = event->ctx;
1958 unsigned long size;
1959 int ret = 0;
1960 u64 value;
1961
1962 if (!event->attr.sample_period)
1963 return -EINVAL;
1964
1965 size = copy_from_user(&value, arg, sizeof(value));
1966 if (size != sizeof(value))
1967 return -EFAULT;
1968
1969 if (!value)
1970 return -EINVAL;
1971
1972 spin_lock_irq(&ctx->lock);
1973 if (event->attr.freq) {
1974 if (value > sysctl_perf_event_sample_rate) {
1975 ret = -EINVAL;
1976 goto unlock;
1977 }
1978
1979 event->attr.sample_freq = value;
1980 } else {
1981 event->attr.sample_period = value;
1982 event->hw.sample_period = value;
1983 }
1984unlock:
1985 spin_unlock_irq(&ctx->lock);
1986
1987 return ret;
1988}
1989
1990int perf_event_set_output(struct perf_event *event, int output_fd);
1991
1992static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1993{
1994 struct perf_event *event = file->private_data;
1995 void (*func)(struct perf_event *);
1996 u32 flags = arg;
1997
1998 switch (cmd) {
1999 case PERF_EVENT_IOC_ENABLE:
2000 func = perf_event_enable;
2001 break;
2002 case PERF_EVENT_IOC_DISABLE:
2003 func = perf_event_disable;
2004 break;
2005 case PERF_EVENT_IOC_RESET:
2006 func = perf_event_reset;
2007 break;
2008
2009 case PERF_EVENT_IOC_REFRESH:
2010 return perf_event_refresh(event, arg);
2011
2012 case PERF_EVENT_IOC_PERIOD:
2013 return perf_event_period(event, (u64 __user *)arg);
2014
2015 case PERF_EVENT_IOC_SET_OUTPUT:
2016 return perf_event_set_output(event, arg);
2017
2018 default:
2019 return -ENOTTY;
2020 }
2021
2022 if (flags & PERF_IOC_FLAG_GROUP)
2023 perf_event_for_each(event, func);
2024 else
2025 perf_event_for_each_child(event, func);
2026
2027 return 0;
2028}
2029
2030int perf_event_task_enable(void)
2031{
2032 struct perf_event *event;
2033
2034 mutex_lock(&current->perf_event_mutex);
2035 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2036 perf_event_for_each_child(event, perf_event_enable);
2037 mutex_unlock(&current->perf_event_mutex);
2038
2039 return 0;
2040}
2041
2042int perf_event_task_disable(void)
2043{
2044 struct perf_event *event;
2045
2046 mutex_lock(&current->perf_event_mutex);
2047 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2048 perf_event_for_each_child(event, perf_event_disable);
2049 mutex_unlock(&current->perf_event_mutex);
2050
2051 return 0;
2052}
2053
2054#ifndef PERF_EVENT_INDEX_OFFSET
2055# define PERF_EVENT_INDEX_OFFSET 0
2056#endif
2057
2058static int perf_event_index(struct perf_event *event)
2059{
2060 if (event->state != PERF_EVENT_STATE_ACTIVE)
2061 return 0;
2062
2063 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2064}
2065
2066/*
2067 * Callers need to ensure there can be no nesting of this function, otherwise
2068 * the seqlock logic goes bad. We can not serialize this because the arch
2069 * code calls this from NMI context.
2070 */
2071void perf_event_update_userpage(struct perf_event *event)
2072{
2073 struct perf_event_mmap_page *userpg;
2074 struct perf_mmap_data *data;
2075
2076 rcu_read_lock();
2077 data = rcu_dereference(event->data);
2078 if (!data)
2079 goto unlock;
2080
2081 userpg = data->user_page;
2082
2083 /*
2084 * Disable preemption so as to not let the corresponding user-space
2085 * spin too long if we get preempted.
2086 */
2087 preempt_disable();
2088 ++userpg->lock;
2089 barrier();
2090 userpg->index = perf_event_index(event);
2091 userpg->offset = atomic64_read(&event->count);
2092 if (event->state == PERF_EVENT_STATE_ACTIVE)
2093 userpg->offset -= atomic64_read(&event->hw.prev_count);
2094
2095 userpg->time_enabled = event->total_time_enabled +
2096 atomic64_read(&event->child_total_time_enabled);
2097
2098 userpg->time_running = event->total_time_running +
2099 atomic64_read(&event->child_total_time_running);
2100
2101 barrier();
2102 ++userpg->lock;
2103 preempt_enable();
2104unlock:
2105 rcu_read_unlock();
2106}
2107
2108static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2109{
2110 struct perf_event *event = vma->vm_file->private_data;
2111 struct perf_mmap_data *data;
2112 int ret = VM_FAULT_SIGBUS;
2113
2114 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2115 if (vmf->pgoff == 0)
2116 ret = 0;
2117 return ret;
2118 }
2119
2120 rcu_read_lock();
2121 data = rcu_dereference(event->data);
2122 if (!data)
2123 goto unlock;
2124
2125 if (vmf->pgoff == 0) {
2126 vmf->page = virt_to_page(data->user_page);
2127 } else {
2128 int nr = vmf->pgoff - 1;
2129
2130 if ((unsigned)nr > data->nr_pages)
2131 goto unlock;
2132
2133 if (vmf->flags & FAULT_FLAG_WRITE)
2134 goto unlock;
2135
2136 vmf->page = virt_to_page(data->data_pages[nr]);
2137 }
2138
2139 get_page(vmf->page);
2140 vmf->page->mapping = vma->vm_file->f_mapping;
2141 vmf->page->index = vmf->pgoff;
2142
2143 ret = 0;
2144unlock:
2145 rcu_read_unlock();
2146
2147 return ret;
2148}
2149
2150static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2151{
2152 struct perf_mmap_data *data;
2153 unsigned long size;
2154 int i;
2155
2156 WARN_ON(atomic_read(&event->mmap_count));
2157
2158 size = sizeof(struct perf_mmap_data);
2159 size += nr_pages * sizeof(void *);
2160
2161 data = kzalloc(size, GFP_KERNEL);
2162 if (!data)
2163 goto fail;
2164
2165 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2166 if (!data->user_page)
2167 goto fail_user_page;
2168
2169 for (i = 0; i < nr_pages; i++) {
2170 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2171 if (!data->data_pages[i])
2172 goto fail_data_pages;
2173 }
2174
2175 data->nr_pages = nr_pages;
2176 atomic_set(&data->lock, -1);
2177
2178 if (event->attr.watermark) {
2179 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2180 event->attr.wakeup_watermark);
2181 }
2182 if (!data->watermark)
2183 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2184
2185 rcu_assign_pointer(event->data, data);
2186
2187 return 0;
2188
2189fail_data_pages:
2190 for (i--; i >= 0; i--)
2191 free_page((unsigned long)data->data_pages[i]);
2192
2193 free_page((unsigned long)data->user_page);
2194
2195fail_user_page:
2196 kfree(data);
2197
2198fail:
2199 return -ENOMEM;
2200}
2201
2202static void perf_mmap_free_page(unsigned long addr)
2203{
2204 struct page *page = virt_to_page((void *)addr);
2205
2206 page->mapping = NULL;
2207 __free_page(page);
2208}
2209
2210static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2211{
2212 struct perf_mmap_data *data;
2213 int i;
2214
2215 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2216
2217 perf_mmap_free_page((unsigned long)data->user_page);
2218 for (i = 0; i < data->nr_pages; i++)
2219 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2220
2221 kfree(data);
2222}
2223
2224static void perf_mmap_data_free(struct perf_event *event)
2225{
2226 struct perf_mmap_data *data = event->data;
2227
2228 WARN_ON(atomic_read(&event->mmap_count));
2229
2230 rcu_assign_pointer(event->data, NULL);
2231 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2232}
2233
2234static void perf_mmap_open(struct vm_area_struct *vma)
2235{
2236 struct perf_event *event = vma->vm_file->private_data;
2237
2238 atomic_inc(&event->mmap_count);
2239}
2240
2241static void perf_mmap_close(struct vm_area_struct *vma)
2242{
2243 struct perf_event *event = vma->vm_file->private_data;
2244
2245 WARN_ON_ONCE(event->ctx->parent_ctx);
2246 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2247 struct user_struct *user = current_user();
2248
2249 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
2250 vma->vm_mm->locked_vm -= event->data->nr_locked;
2251 perf_mmap_data_free(event);
2252 mutex_unlock(&event->mmap_mutex);
2253 }
2254}
2255
2256static struct vm_operations_struct perf_mmap_vmops = {
2257 .open = perf_mmap_open,
2258 .close = perf_mmap_close,
2259 .fault = perf_mmap_fault,
2260 .page_mkwrite = perf_mmap_fault,
2261};
2262
2263static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2264{
2265 struct perf_event *event = file->private_data;
2266 unsigned long user_locked, user_lock_limit;
2267 struct user_struct *user = current_user();
2268 unsigned long locked, lock_limit;
2269 unsigned long vma_size;
2270 unsigned long nr_pages;
2271 long user_extra, extra;
2272 int ret = 0;
2273
2274 if (!(vma->vm_flags & VM_SHARED))
2275 return -EINVAL;
2276
2277 vma_size = vma->vm_end - vma->vm_start;
2278 nr_pages = (vma_size / PAGE_SIZE) - 1;
2279
2280 /*
2281 * If we have data pages ensure they're a power-of-two number, so we
2282 * can do bitmasks instead of modulo.
2283 */
2284 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2285 return -EINVAL;
2286
2287 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2288 return -EINVAL;
2289
2290 if (vma->vm_pgoff != 0)
2291 return -EINVAL;
2292
2293 WARN_ON_ONCE(event->ctx->parent_ctx);
2294 mutex_lock(&event->mmap_mutex);
2295 if (event->output) {
2296 ret = -EINVAL;
2297 goto unlock;
2298 }
2299
2300 if (atomic_inc_not_zero(&event->mmap_count)) {
2301 if (nr_pages != event->data->nr_pages)
2302 ret = -EINVAL;
2303 goto unlock;
2304 }
2305
2306 user_extra = nr_pages + 1;
2307 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2308
2309 /*
2310 * Increase the limit linearly with more CPUs:
2311 */
2312 user_lock_limit *= num_online_cpus();
2313
2314 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2315
2316 extra = 0;
2317 if (user_locked > user_lock_limit)
2318 extra = user_locked - user_lock_limit;
2319
2320 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2321 lock_limit >>= PAGE_SHIFT;
2322 locked = vma->vm_mm->locked_vm + extra;
2323
2324 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2325 !capable(CAP_IPC_LOCK)) {
2326 ret = -EPERM;
2327 goto unlock;
2328 }
2329
2330 WARN_ON(event->data);
2331 ret = perf_mmap_data_alloc(event, nr_pages);
2332 if (ret)
2333 goto unlock;
2334
2335 atomic_set(&event->mmap_count, 1);
2336 atomic_long_add(user_extra, &user->locked_vm);
2337 vma->vm_mm->locked_vm += extra;
2338 event->data->nr_locked = extra;
2339 if (vma->vm_flags & VM_WRITE)
2340 event->data->writable = 1;
2341
2342unlock:
2343 mutex_unlock(&event->mmap_mutex);
2344
2345 vma->vm_flags |= VM_RESERVED;
2346 vma->vm_ops = &perf_mmap_vmops;
2347
2348 return ret;
2349}
2350
2351static int perf_fasync(int fd, struct file *filp, int on)
2352{
2353 struct inode *inode = filp->f_path.dentry->d_inode;
2354 struct perf_event *event = filp->private_data;
2355 int retval;
2356
2357 mutex_lock(&inode->i_mutex);
2358 retval = fasync_helper(fd, filp, on, &event->fasync);
2359 mutex_unlock(&inode->i_mutex);
2360
2361 if (retval < 0)
2362 return retval;
2363
2364 return 0;
2365}
2366
/*
 * File operations for perf event file descriptors.  The same perf_ioctl
 * handler is used for both the unlocked and the compat ioctl entry points.
 */
2367static const struct file_operations perf_fops = {
2368 .release = perf_release,
2369 .read = perf_read,
2370 .poll = perf_poll,
2371 .unlocked_ioctl = perf_ioctl,
2372 .compat_ioctl = perf_ioctl,
2373 .mmap = perf_mmap,
2374 .fasync = perf_fasync,
2375};
2376
2377/*
2378 * Perf event wakeup
2379 *
2380 * If there's data, ensure we set the poll() state and publish everything
2381 * to user-space before waking everybody up.
2382 */
2383
2384void perf_event_wakeup(struct perf_event *event)
2385{
2386 wake_up_all(&event->waitq);
2387
2388 if (event->pending_kill) {
2389 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2390 event->pending_kill = 0;
2391 }
2392}
2393
2394/*
2395 * Pending wakeups
2396 *
2397 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2398 *
2399 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2400 * single linked list and use cmpxchg() to add entries lockless.
2401 */
2402
2403static void perf_pending_event(struct perf_pending_entry *entry)
2404{
2405 struct perf_event *event = container_of(entry,
2406 struct perf_event, pending);
2407
2408 if (event->pending_disable) {
2409 event->pending_disable = 0;
2410 __perf_event_disable(event);
2411 }
2412
2413 if (event->pending_wakeup) {
2414 event->pending_wakeup = 0;
2415 perf_event_wakeup(event);
2416 }
2417}
2418
/*
 * End-of-list sentinel for the pending list.  A NULL ->next is reserved to
 * mean "entry is not queued" (see perf_pending_queue() and
 * perf_not_pending()), so the list terminator must be a distinct value.
 */
2419#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2420
/* Per-CPU head of the singly linked pending list; starts out empty. */
2421static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2422 PENDING_TAIL,
2423};
2424
2425static void perf_pending_queue(struct perf_pending_entry *entry,
2426 void (*func)(struct perf_pending_entry *))
2427{
2428 struct perf_pending_entry **head;
2429
2430 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2431 return;
2432
2433 entry->func = func;
2434
2435 head = &get_cpu_var(perf_pending_head);
2436
2437 do {
2438 entry->next = *head;
2439 } while (cmpxchg(head, entry->next, entry) != entry->next);
2440
2441 set_perf_event_pending();
2442
2443 put_cpu_var(perf_pending_head);
2444}
2445
2446static int __perf_pending_run(void)
2447{
2448 struct perf_pending_entry *list;
2449 int nr = 0;
2450
2451 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2452 while (list != PENDING_TAIL) {
2453 void (*func)(struct perf_pending_entry *);
2454 struct perf_pending_entry *entry = list;
2455
2456 list = list->next;
2457
2458 func = entry->func;
2459 entry->next = NULL;
2460 /*
2461 * Ensure we observe the unqueue before we issue the wakeup,
2462 * so that we won't be waiting forever.
2463 * -- see perf_not_pending().
2464 */
2465 smp_wmb();
2466
2467 func(entry);
2468 nr++;
2469 }
2470
2471 return nr;
2472}
2473
2474static inline int perf_not_pending(struct perf_event *event)
2475{
2476 /*
2477 * If we flush on whatever cpu we run, there is a chance we don't
2478 * need to wait.
2479 */
2480 get_cpu();
2481 __perf_pending_run();
2482 put_cpu();
2483
2484 /*
2485 * Ensure we see the proper queue state before going to sleep
2486 * so that we do not miss the wakeup. -- see perf_pending_handle()
2487 */
2488 smp_rmb();
2489 return event->pending.next == NULL;
2490}
2491
/*
 * Sleep on the event's wait queue until perf_not_pending() reports that
 * the event's pending entry is no longer queued.
 */
2492static void perf_pending_sync(struct perf_event *event)
2493{
2494 wait_event(event->waitq, perf_not_pending(event));
2495}
2496
/*
 * Externally visible entry point that runs the current CPU's pending list
 * via __perf_pending_run(); the processed-entry count is discarded.
 */
2497void perf_event_do_pending(void)
2498{
2499 __perf_pending_run();
2500}
2501
2502/*
2503 * Callchain support -- arch specific
2504 */
2505
/*
 * Weak default implementation: provides no callchain and returns NULL.
 * Callers in this file treat a NULL callchain as "zero entries".
 */
2506__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2507{
2508 return NULL;
2509}
2510
2511/*
2512 * Output
2513 */
2514static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2515 unsigned long offset, unsigned long head)
2516{
2517 unsigned long mask;
2518
2519 if (!data->writable)
2520 return true;
2521
2522 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2523
2524 offset = (offset - tail) & mask;
2525 head = (head - tail) & mask;
2526
2527 if ((int)(head - offset) < 0)
2528 return false;
2529
2530 return true;
2531}
2532
2533static void perf_output_wakeup(struct perf_output_handle *handle)
2534{
2535 atomic_set(&handle->data->poll, POLL_IN);
2536
2537 if (handle->nmi) {
2538 handle->event->pending_wakeup = 1;
2539 perf_pending_queue(&handle->event->pending,
2540 perf_pending_event);
2541 } else
2542 perf_event_wakeup(handle->event);
2543}
2544
2545/*
2546 * Curious locking construct.
2547 *
2548 * We need to ensure a later event_id doesn't publish a head when a former
2549 * event_id isn't done writing. However since we need to deal with NMIs we
2550 * cannot fully serialize things.
2551 *
2552 * What we do is serialize between CPUs so we only have to deal with NMI
2553 * nesting on a single CPU.
2554 *
2555 * We only publish the head (and generate a wakeup) when the outer-most
2556 * event_id completes.
2557 */
2558static void perf_output_lock(struct perf_output_handle *handle)
2559{
2560 struct perf_mmap_data *data = handle->data;
2561 int cpu;
2562
2563 handle->locked = 0;
2564
2565 local_irq_save(handle->flags);
2566 cpu = smp_processor_id();
2567
2568 if (in_nmi() && atomic_read(&data->lock) == cpu)
2569 return;
2570
2571 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2572 cpu_relax();
2573
2574 handle->locked = 1;
2575}
2576
2577static void perf_output_unlock(struct perf_output_handle *handle)
2578{
2579 struct perf_mmap_data *data = handle->data;
2580 unsigned long head;
2581 int cpu;
2582
2583 data->done_head = data->head;
2584
2585 if (!handle->locked)
2586 goto out;
2587
2588again:
2589 /*
2590 * The xchg implies a full barrier that ensures all writes are done
2591 * before we publish the new head, matched by a rmb() in userspace when
2592 * reading this position.
2593 */
2594 while ((head = atomic_long_xchg(&data->done_head, 0)))
2595 data->user_page->data_head = head;
2596
2597 /*
2598 * NMI can happen here, which means we can miss a done_head update.
2599 */
2600
2601 cpu = atomic_xchg(&data->lock, -1);
2602 WARN_ON_ONCE(cpu != smp_processor_id());
2603
2604 /*
2605 * Therefore we have to validate we did not indeed do so.
2606 */
2607 if (unlikely(atomic_long_read(&data->done_head))) {
2608 /*
2609 * Since we had it locked, we can lock it again.
2610 */
2611 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2612 cpu_relax();
2613
2614 goto again;
2615 }
2616
2617 if (atomic_xchg(&data->wakeup, 0))
2618 perf_output_wakeup(handle);
2619out:
2620 local_irq_restore(handle->flags);
2621}
2622
2623void perf_output_copy(struct perf_output_handle *handle,
2624 const void *buf, unsigned int len)
2625{
2626 unsigned int pages_mask;
2627 unsigned int offset;
2628 unsigned int size;
2629 void **pages;
2630
2631 offset = handle->offset;
2632 pages_mask = handle->data->nr_pages - 1;
2633 pages = handle->data->data_pages;
2634
2635 do {
2636 unsigned int page_offset;
2637 int nr;
2638
2639 nr = (offset >> PAGE_SHIFT) & pages_mask;
2640 page_offset = offset & (PAGE_SIZE - 1);
2641 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2642
2643 memcpy(pages[nr] + page_offset, buf, size);
2644
2645 len -= size;
2646 buf += size;
2647 offset += size;
2648 } while (len);
2649
2650 handle->offset = offset;
2651
2652 /*
2653 * Check we didn't copy past our reservation window, taking the
2654 * possible unsigned int wrap into account.
2655 */
2656 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2657}
2658
2659int perf_output_begin(struct perf_output_handle *handle,
2660 struct perf_event *event, unsigned int size,
2661 int nmi, int sample)
2662{
2663 struct perf_event *output_event;
2664 struct perf_mmap_data *data;
2665 unsigned long tail, offset, head;
2666 int have_lost;
2667 struct {
2668 struct perf_event_header header;
2669 u64 id;
2670 u64 lost;
2671 } lost_event;
2672
2673 rcu_read_lock();
2674 /*
2675 * For inherited events we send all the output towards the parent.
2676 */
2677 if (event->parent)
2678 event = event->parent;
2679
2680 output_event = rcu_dereference(event->output);
2681 if (output_event)
2682 event = output_event;
2683
2684 data = rcu_dereference(event->data);
2685 if (!data)
2686 goto out;
2687
2688 handle->data = data;
2689 handle->event = event;
2690 handle->nmi = nmi;
2691 handle->sample = sample;
2692
2693 if (!data->nr_pages)
2694 goto fail;
2695
2696 have_lost = atomic_read(&data->lost);
2697 if (have_lost)
2698 size += sizeof(lost_event);
2699
2700 perf_output_lock(handle);
2701
2702 do {
2703 /*
2704 * Userspace could choose to issue a mb() before updating the
2705 * tail pointer. So that all reads will be completed before the
2706 * write is issued.
2707 */
2708 tail = ACCESS_ONCE(data->user_page->data_tail);
2709 smp_rmb();
2710 offset = head = atomic_long_read(&data->head);
2711 head += size;
2712 if (unlikely(!perf_output_space(data, tail, offset, head)))
2713 goto fail;
2714 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2715
2716 handle->offset = offset;
2717 handle->head = head;
2718
2719 if (head - tail > data->watermark)
2720 atomic_set(&data->wakeup, 1);
2721
2722 if (have_lost) {
2723 lost_event.header.type = PERF_RECORD_LOST;
2724 lost_event.header.misc = 0;
2725 lost_event.header.size = sizeof(lost_event);
2726 lost_event.id = event->id;
2727 lost_event.lost = atomic_xchg(&data->lost, 0);
2728
2729 perf_output_put(handle, lost_event);
2730 }
2731
2732 return 0;
2733
2734fail:
2735 atomic_inc(&data->lost);
2736 perf_output_unlock(handle);
2737out:
2738 rcu_read_unlock();
2739
2740 return -ENOSPC;
2741}
2742
2743void perf_output_end(struct perf_output_handle *handle)
2744{
2745 struct perf_event *event = handle->event;
2746 struct perf_mmap_data *data = handle->data;
2747
2748 int wakeup_events = event->attr.wakeup_events;
2749
2750 if (handle->sample && wakeup_events) {
2751 int events = atomic_inc_return(&data->events);
2752 if (events >= wakeup_events) {
2753 atomic_sub(wakeup_events, &data->events);
2754 atomic_set(&data->wakeup, 1);
2755 }
2756 }
2757
2758 perf_output_unlock(handle);
2759 rcu_read_unlock();
2760}
2761
2762static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2763{
2764 /*
2765 * only top level events have the pid namespace they were created in
2766 */
2767 if (event->parent)
2768 event = event->parent;
2769
2770 return task_tgid_nr_ns(p, event->ns);
2771}
2772
2773static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2774{
2775 /*
2776 * only top level events have the pid namespace they were created in
2777 */
2778 if (event->parent)
2779 event = event->parent;
2780
2781 return task_pid_nr_ns(p, event->ns);
2782}
2783
2784static void perf_output_read_one(struct perf_output_handle *handle,
2785 struct perf_event *event)
2786{
2787 u64 read_format = event->attr.read_format;
2788 u64 values[4];
2789 int n = 0;
2790
2791 values[n++] = atomic64_read(&event->count);
2792 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2793 values[n++] = event->total_time_enabled +
2794 atomic64_read(&event->child_total_time_enabled);
2795 }
2796 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2797 values[n++] = event->total_time_running +
2798 atomic64_read(&event->child_total_time_running);
2799 }
2800 if (read_format & PERF_FORMAT_ID)
2801 values[n++] = primary_event_id(event);
2802
2803 perf_output_copy(handle, values, n * sizeof(u64));
2804}
2805
2806/*
2807 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2808 */
2809static void perf_output_read_group(struct perf_output_handle *handle,
2810 struct perf_event *event)
2811{
2812 struct perf_event *leader = event->group_leader, *sub;
2813 u64 read_format = event->attr.read_format;
2814 u64 values[5];
2815 int n = 0;
2816
2817 values[n++] = 1 + leader->nr_siblings;
2818
2819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2820 values[n++] = leader->total_time_enabled;
2821
2822 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2823 values[n++] = leader->total_time_running;
2824
2825 if (leader != event)
2826 leader->pmu->read(leader);
2827
2828 values[n++] = atomic64_read(&leader->count);
2829 if (read_format & PERF_FORMAT_ID)
2830 values[n++] = primary_event_id(leader);
2831
2832 perf_output_copy(handle, values, n * sizeof(u64));
2833
2834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2835 n = 0;
2836
2837 if (sub != event)
2838 sub->pmu->read(sub);
2839
2840 values[n++] = atomic64_read(&sub->count);
2841 if (read_format & PERF_FORMAT_ID)
2842 values[n++] = primary_event_id(sub);
2843
2844 perf_output_copy(handle, values, n * sizeof(u64));
2845 }
2846}
2847
2848static void perf_output_read(struct perf_output_handle *handle,
2849 struct perf_event *event)
2850{
2851 if (event->attr.read_format & PERF_FORMAT_GROUP)
2852 perf_output_read_group(handle, event);
2853 else
2854 perf_output_read_one(handle, event);
2855}
2856
2857void perf_output_sample(struct perf_output_handle *handle,
2858 struct perf_event_header *header,
2859 struct perf_sample_data *data,
2860 struct perf_event *event)
2861{
2862 u64 sample_type = data->type;
2863
2864 perf_output_put(handle, *header);
2865
2866 if (sample_type & PERF_SAMPLE_IP)
2867 perf_output_put(handle, data->ip);
2868
2869 if (sample_type & PERF_SAMPLE_TID)
2870 perf_output_put(handle, data->tid_entry);
2871
2872 if (sample_type & PERF_SAMPLE_TIME)
2873 perf_output_put(handle, data->time);
2874
2875 if (sample_type & PERF_SAMPLE_ADDR)
2876 perf_output_put(handle, data->addr);
2877
2878 if (sample_type & PERF_SAMPLE_ID)
2879 perf_output_put(handle, data->id);
2880
2881 if (sample_type & PERF_SAMPLE_STREAM_ID)
2882 perf_output_put(handle, data->stream_id);
2883
2884 if (sample_type & PERF_SAMPLE_CPU)
2885 perf_output_put(handle, data->cpu_entry);
2886
2887 if (sample_type & PERF_SAMPLE_PERIOD)
2888 perf_output_put(handle, data->period);
2889
2890 if (sample_type & PERF_SAMPLE_READ)
2891 perf_output_read(handle, event);
2892
2893 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2894 if (data->callchain) {
2895 int size = 1;
2896
2897 if (data->callchain)
2898 size += data->callchain->nr;
2899
2900 size *= sizeof(u64);
2901
2902 perf_output_copy(handle, data->callchain, size);
2903 } else {
2904 u64 nr = 0;
2905 perf_output_put(handle, nr);
2906 }
2907 }
2908
2909 if (sample_type & PERF_SAMPLE_RAW) {
2910 if (data->raw) {
2911 perf_output_put(handle, data->raw->size);
2912 perf_output_copy(handle, data->raw->data,
2913 data->raw->size);
2914 } else {
2915 struct {
2916 u32 size;
2917 u32 data;
2918 } raw = {
2919 .size = sizeof(u32),
2920 .data = 0,
2921 };
2922 perf_output_put(handle, raw);
2923 }
2924 }
2925}
2926
2927void perf_prepare_sample(struct perf_event_header *header,
2928 struct perf_sample_data *data,
2929 struct perf_event *event,
2930 struct pt_regs *regs)
2931{
2932 u64 sample_type = event->attr.sample_type;
2933
2934 data->type = sample_type;
2935
2936 header->type = PERF_RECORD_SAMPLE;
2937 header->size = sizeof(*header);
2938
2939 header->misc = 0;
2940 header->misc |= perf_misc_flags(regs);
2941
2942 if (sample_type & PERF_SAMPLE_IP) {
2943 data->ip = perf_instruction_pointer(regs);
2944
2945 header->size += sizeof(data->ip);
2946 }
2947
2948 if (sample_type & PERF_SAMPLE_TID) {
2949 /* namespace issues */
2950 data->tid_entry.pid = perf_event_pid(event, current);
2951 data->tid_entry.tid = perf_event_tid(event, current);
2952
2953 header->size += sizeof(data->tid_entry);
2954 }
2955
2956 if (sample_type & PERF_SAMPLE_TIME) {
2957 data->time = perf_clock();
2958
2959 header->size += sizeof(data->time);
2960 }
2961
2962 if (sample_type & PERF_SAMPLE_ADDR)
2963 header->size += sizeof(data->addr);
2964
2965 if (sample_type & PERF_SAMPLE_ID) {
2966 data->id = primary_event_id(event);
2967
2968 header->size += sizeof(data->id);
2969 }
2970
2971 if (sample_type & PERF_SAMPLE_STREAM_ID) {
2972 data->stream_id = event->id;
2973
2974 header->size += sizeof(data->stream_id);
2975 }
2976
2977 if (sample_type & PERF_SAMPLE_CPU) {
2978 data->cpu_entry.cpu = raw_smp_processor_id();
2979 data->cpu_entry.reserved = 0;
2980
2981 header->size += sizeof(data->cpu_entry);
2982 }
2983
2984 if (sample_type & PERF_SAMPLE_PERIOD)
2985 header->size += sizeof(data->period);
2986
2987 if (sample_type & PERF_SAMPLE_READ)
2988 header->size += perf_event_read_size(event);
2989
2990 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2991 int size = 1;
2992
2993 data->callchain = perf_callchain(regs);
2994
2995 if (data->callchain)
2996 size += data->callchain->nr;
2997
2998 header->size += size * sizeof(u64);
2999 }
3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 int size = sizeof(u32);
3003
3004 if (data->raw)
3005 size += data->raw->size;
3006 else
3007 size += sizeof(u32);
3008
3009 WARN_ON_ONCE(size & (sizeof(u64)-1));
3010 header->size += size;
3011 }
3012}
3013
3014static void perf_event_output(struct perf_event *event, int nmi,
3015 struct perf_sample_data *data,
3016 struct pt_regs *regs)
3017{
3018 struct perf_output_handle handle;
3019 struct perf_event_header header;
3020
3021 perf_prepare_sample(&header, data, event, regs);
3022
3023 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3024 return;
3025
3026 perf_output_sample(&handle, &header, data, event);
3027
3028 perf_output_end(&handle);
3029}
3030
3031/*
3032 * read event_id
3033 */
3034
/*
 * Fixed leading part of a PERF_RECORD_READ record as written by
 * perf_event_read_event(); the event's read values follow it in the
 * output buffer.
 */
3035struct perf_read_event {
3036 struct perf_event_header header;
3037
3038 u32 pid;
3039 u32 tid;
3040};
3041
3042static void
3043perf_event_read_event(struct perf_event *event,
3044 struct task_struct *task)
3045{
3046 struct perf_output_handle handle;
3047 struct perf_read_event read_event = {
3048 .header = {
3049 .type = PERF_RECORD_READ,
3050 .misc = 0,
3051 .size = sizeof(read_event) + perf_event_read_size(event),
3052 },
3053 .pid = perf_event_pid(event, task),
3054 .tid = perf_event_tid(event, task),
3055 };
3056 int ret;
3057
3058 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3059 if (ret)
3060 return;
3061
3062 perf_output_put(&handle, read_event);
3063 perf_output_read(&handle, event);
3064
3065 perf_output_end(&handle);
3066}
3067
3068/*
3069 * task tracking -- fork/exit
3070 *
3071 * enabled by: attr.comm | attr.mmap | attr.task
3072 */
3073
/*
 * Working state for emitting a fork/exit record.
 *
 * task and task_ctx are bookkeeping only; event_id is the record that is
 * copied to the output buffer.  Its pid/ppid/tid/ptid/time fields are
 * filled in per receiving event by perf_event_task_output().
 */
3074struct perf_task_event {
3075 struct task_struct *task;
3076 struct perf_event_context *task_ctx;
3077
3078 struct {
3079 struct perf_event_header header;
3080
3081 u32 pid;
3082 u32 ppid;
3083 u32 tid;
3084 u32 ptid;
3085 u64 time;
3086 } event_id;
3087};
3088
3089static void perf_event_task_output(struct perf_event *event,
3090 struct perf_task_event *task_event)
3091{
3092 struct perf_output_handle handle;
3093 int size;
3094 struct task_struct *task = task_event->task;
3095 int ret;
3096
3097 size = task_event->event_id.header.size;
3098 ret = perf_output_begin(&handle, event, size, 0, 0);
3099
3100 if (ret)
3101 return;
3102
3103 task_event->event_id.pid = perf_event_pid(event, task);
3104 task_event->event_id.ppid = perf_event_pid(event, current);
3105
3106 task_event->event_id.tid = perf_event_tid(event, task);
3107 task_event->event_id.ptid = perf_event_tid(event, current);
3108
3109 task_event->event_id.time = perf_clock();
3110
3111 perf_output_put(&handle, task_event->event_id);
3112
3113 perf_output_end(&handle);
3114}
3115
3116static int perf_event_task_match(struct perf_event *event)
3117{
3118 if (event->attr.comm || event->attr.mmap || event->attr.task)
3119 return 1;
3120
3121 return 0;
3122}
3123
3124static void perf_event_task_ctx(struct perf_event_context *ctx,
3125 struct perf_task_event *task_event)
3126{
3127 struct perf_event *event;
3128
3129 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3130 return;
3131
3132 rcu_read_lock();
3133 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3134 if (perf_event_task_match(event))
3135 perf_event_task_output(event, task_event);
3136 }
3137 rcu_read_unlock();
3138}
3139
3140static void perf_event_task_event(struct perf_task_event *task_event)
3141{
3142 struct perf_cpu_context *cpuctx;
3143 struct perf_event_context *ctx = task_event->task_ctx;
3144
3145 cpuctx = &get_cpu_var(perf_cpu_context);
3146 perf_event_task_ctx(&cpuctx->ctx, task_event);
3147 put_cpu_var(perf_cpu_context);
3148
3149 rcu_read_lock();
3150 if (!ctx)
3151 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3152 if (ctx)
3153 perf_event_task_ctx(ctx, task_event);
3154 rcu_read_unlock();
3155}
3156
3157static void perf_event_task(struct task_struct *task,
3158 struct perf_event_context *task_ctx,
3159 int new)
3160{
3161 struct perf_task_event task_event;
3162
3163 if (!atomic_read(&nr_comm_events) &&
3164 !atomic_read(&nr_mmap_events) &&
3165 !atomic_read(&nr_task_events))
3166 return;
3167
3168 task_event = (struct perf_task_event){
3169 .task = task,
3170 .task_ctx = task_ctx,
3171 .event_id = {
3172 .header = {
3173 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3174 .misc = 0,
3175 .size = sizeof(task_event.event_id),
3176 },
3177 /* .pid */
3178 /* .ppid */
3179 /* .tid */
3180 /* .ptid */
3181 },
3182 };
3183
3184 perf_event_task_event(&task_event);
3185}
3186
/*
 * Report @task as newly forked: emits a PERF_RECORD_FORK record through
 * perf_event_task() with no explicit task context.
 */
3187void perf_event_fork(struct task_struct *task)
3188{
3189 perf_event_task(task, NULL, 1);
3190}
3191
3192/*
3193 * comm tracking
3194 */
3195
/*
 * Working state for emitting a PERF_RECORD_COMM record.
 *
 * comm/comm_size describe the padded command name that is written after
 * event_id; they are set up by perf_event_comm_event().  event_id is the
 * fixed part of the record; pid/tid are filled in per receiving event.
 */
3196struct perf_comm_event {
3197 struct task_struct *task;
3198 char *comm;
3199 int comm_size;
3200
3201 struct {
3202 struct perf_event_header header;
3203
3204 u32 pid;
3205 u32 tid;
3206 } event_id;
3207};
3208
3209static void perf_event_comm_output(struct perf_event *event,
3210 struct perf_comm_event *comm_event)
3211{
3212 struct perf_output_handle handle;
3213 int size = comm_event->event_id.header.size;
3214 int ret = perf_output_begin(&handle, event, size, 0, 0);
3215
3216 if (ret)
3217 return;
3218
3219 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3220 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3221
3222 perf_output_put(&handle, comm_event->event_id);
3223 perf_output_copy(&handle, comm_event->comm,
3224 comm_event->comm_size);
3225 perf_output_end(&handle);
3226}
3227
3228static int perf_event_comm_match(struct perf_event *event)
3229{
3230 if (event->attr.comm)
3231 return 1;
3232
3233 return 0;
3234}
3235
3236static void perf_event_comm_ctx(struct perf_event_context *ctx,
3237 struct perf_comm_event *comm_event)
3238{
3239 struct perf_event *event;
3240
3241 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3242 return;
3243
3244 rcu_read_lock();
3245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3246 if (perf_event_comm_match(event))
3247 perf_event_comm_output(event, comm_event);
3248 }
3249 rcu_read_unlock();
3250}
3251
3252static void perf_event_comm_event(struct perf_comm_event *comm_event)
3253{
3254 struct perf_cpu_context *cpuctx;
3255 struct perf_event_context *ctx;
3256 unsigned int size;
3257 char comm[TASK_COMM_LEN];
3258
3259 memset(comm, 0, sizeof(comm));
3260 strncpy(comm, comm_event->task->comm, sizeof(comm));
3261 size = ALIGN(strlen(comm)+1, sizeof(u64));
3262
3263 comm_event->comm = comm;
3264 comm_event->comm_size = size;
3265
3266 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3267
3268 cpuctx = &get_cpu_var(perf_cpu_context);
3269 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3270 put_cpu_var(perf_cpu_context);
3271
3272 rcu_read_lock();
3273 /*
3274 * doesn't really matter which of the child contexts the
3275 * events ends up in.
3276 */
3277 ctx = rcu_dereference(current->perf_event_ctxp);
3278 if (ctx)
3279 perf_event_comm_ctx(ctx, comm_event);
3280 rcu_read_unlock();
3281}
3282
3283void perf_event_comm(struct task_struct *task)
3284{
3285 struct perf_comm_event comm_event;
3286
3287 if (task->perf_event_ctxp)
3288 perf_event_enable_on_exec(task);
3289
3290 if (!atomic_read(&nr_comm_events))
3291 return;
3292
3293 comm_event = (struct perf_comm_event){
3294 .task = task,
3295 /* .comm */
3296 /* .comm_size */
3297 .event_id = {
3298 .header = {
3299 .type = PERF_RECORD_COMM,
3300 .misc = 0,
3301 /* .size */
3302 },
3303 /* .pid */
3304 /* .tid */
3305 },
3306 };
3307
3308 perf_event_comm_event(&comm_event);
3309}
3310
3311/*
3312 * mmap tracking
3313 */
3314
/*
 * Working state for emitting a PERF_RECORD_MMAP record.
 *
 * file_name/file_size describe the padded mapping name that is written
 * after event_id; they are set up by perf_event_mmap_event().  event_id is
 * the fixed part of the record; pid/tid are filled in per receiving event.
 */
3315struct perf_mmap_event {
3316 struct vm_area_struct *vma;
3317
3318 const char *file_name;
3319 int file_size;
3320
3321 struct {
3322 struct perf_event_header header;
3323
3324 u32 pid;
3325 u32 tid;
3326 u64 start;
3327 u64 len;
3328 u64 pgoff;
3329 } event_id;
3330};
3331
3332static void perf_event_mmap_output(struct perf_event *event,
3333 struct perf_mmap_event *mmap_event)
3334{
3335 struct perf_output_handle handle;
3336 int size = mmap_event->event_id.header.size;
3337 int ret = perf_output_begin(&handle, event, size, 0, 0);
3338
3339 if (ret)
3340 return;
3341
3342 mmap_event->event_id.pid = perf_event_pid(event, current);
3343 mmap_event->event_id.tid = perf_event_tid(event, current);
3344
3345 perf_output_put(&handle, mmap_event->event_id);
3346 perf_output_copy(&handle, mmap_event->file_name,
3347 mmap_event->file_size);
3348 perf_output_end(&handle);
3349}
3350
3351static int perf_event_mmap_match(struct perf_event *event,
3352 struct perf_mmap_event *mmap_event)
3353{
3354 if (event->attr.mmap)
3355 return 1;
3356
3357 return 0;
3358}
3359
3360static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3361 struct perf_mmap_event *mmap_event)
3362{
3363 struct perf_event *event;
3364
3365 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3366 return;
3367
3368 rcu_read_lock();
3369 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3370 if (perf_event_mmap_match(event, mmap_event))
3371 perf_event_mmap_output(event, mmap_event);
3372 }
3373 rcu_read_unlock();
3374}
3375
3376static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3377{
3378 struct perf_cpu_context *cpuctx;
3379 struct perf_event_context *ctx;
3380 struct vm_area_struct *vma = mmap_event->vma;
3381 struct file *file = vma->vm_file;
3382 unsigned int size;
3383 char tmp[16];
3384 char *buf = NULL;
3385 const char *name;
3386
3387 memset(tmp, 0, sizeof(tmp));
3388
3389 if (file) {
3390 /*
3391 * d_path works from the end of the buffer backwards, so we
3392 * need to add enough zero bytes after the string to handle
3393 * the 64bit alignment we do later.
3394 */
3395 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3396 if (!buf) {
3397 name = strncpy(tmp, "//enomem", sizeof(tmp));
3398 goto got_name;
3399 }
3400 name = d_path(&file->f_path, buf, PATH_MAX);
3401 if (IS_ERR(name)) {
3402 name = strncpy(tmp, "//toolong", sizeof(tmp));
3403 goto got_name;
3404 }
3405 } else {
3406 if (arch_vma_name(mmap_event->vma)) {
3407 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3408 sizeof(tmp));
3409 goto got_name;
3410 }
3411
3412 if (!vma->vm_mm) {
3413 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3414 goto got_name;
3415 }
3416
3417 name = strncpy(tmp, "//anon", sizeof(tmp));
3418 goto got_name;
3419 }
3420
3421got_name:
3422 size = ALIGN(strlen(name)+1, sizeof(u64));
3423
3424 mmap_event->file_name = name;
3425 mmap_event->file_size = size;
3426
3427 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3428
3429 cpuctx = &get_cpu_var(perf_cpu_context);
3430 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3431 put_cpu_var(perf_cpu_context);
3432
3433 rcu_read_lock();
3434 /*
3435 * doesn't really matter which of the child contexts the
3436 * events ends up in.
3437 */
3438 ctx = rcu_dereference(current->perf_event_ctxp);
3439 if (ctx)
3440 perf_event_mmap_ctx(ctx, mmap_event);
3441 rcu_read_unlock();
3442
3443 kfree(buf);
3444}
3445
3446void __perf_event_mmap(struct vm_area_struct *vma)
3447{
3448 struct perf_mmap_event mmap_event;
3449
3450 if (!atomic_read(&nr_mmap_events))
3451 return;
3452
3453 mmap_event = (struct perf_mmap_event){
3454 .vma = vma,
3455 /* .file_name */
3456 /* .file_size */
3457 .event_id = {
3458 .header = {
3459 .type = PERF_RECORD_MMAP,
3460 .misc = 0,
3461 /* .size */
3462 },
3463 /* .pid */
3464 /* .tid */
3465 .start = vma->vm_start,
3466 .len = vma->vm_end - vma->vm_start,
3467 .pgoff = vma->vm_pgoff,
3468 },
3469 };
3470
3471 perf_event_mmap_event(&mmap_event);
3472}
3473
3474/*
3475 * IRQ throttle logging
3476 */
3477
3478static void perf_log_throttle(struct perf_event *event, int enable)
3479{
3480 struct perf_output_handle handle;
3481 int ret;
3482
3483 struct {
3484 struct perf_event_header header;
3485 u64 time;
3486 u64 id;
3487 u64 stream_id;
3488 } throttle_event = {
3489 .header = {
3490 .type = PERF_RECORD_THROTTLE,
3491 .misc = 0,
3492 .size = sizeof(throttle_event),
3493 },
3494 .time = perf_clock(),
3495 .id = primary_event_id(event),
3496 .stream_id = event->id,
3497 };
3498
3499 if (enable)
3500 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3501
3502 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3503 if (ret)
3504 return;
3505
3506 perf_output_put(&handle, throttle_event);
3507 perf_output_end(&handle);
3508}
3509
3510/*
3511 * Generic event overflow handling, sampling.
3512 */
3513
3514static int __perf_event_overflow(struct perf_event *event, int nmi,
3515 int throttle, struct perf_sample_data *data,
3516 struct pt_regs *regs)
3517{
3518 int events = atomic_read(&event->event_limit);
3519 struct hw_perf_event *hwc = &event->hw;
3520 int ret = 0;
3521
3522 throttle = (throttle && event->pmu->unthrottle != NULL);
3523
3524 if (!throttle) {
3525 hwc->interrupts++;
3526 } else {
3527 if (hwc->interrupts != MAX_INTERRUPTS) {
3528 hwc->interrupts++;
3529 if (HZ * hwc->interrupts >
3530 (u64)sysctl_perf_event_sample_rate) {
3531 hwc->interrupts = MAX_INTERRUPTS;
3532 perf_log_throttle(event, 0);
3533 ret = 1;
3534 }
3535 } else {
3536 /*
3537 * Keep re-disabling events even though on the previous
3538 * pass we disabled it - just in case we raced with a
3539 * sched-in and the event got enabled again:
3540 */
3541 ret = 1;
3542 }
3543 }
3544
3545 if (event->attr.freq) {
3546 u64 now = perf_clock();
3547 s64 delta = now - hwc->freq_stamp;
3548
3549 hwc->freq_stamp = now;
3550
3551 if (delta > 0 && delta < TICK_NSEC)
3552 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3553 }
3554
3555 /*
3556 * XXX event_limit might not quite work as expected on inherited
3557 * events
3558 */
3559
3560 event->pending_kill = POLL_IN;
3561 if (events && atomic_dec_and_test(&event->event_limit)) {
3562 ret = 1;
3563 event->pending_kill = POLL_HUP;
3564 if (nmi) {
3565 event->pending_disable = 1;
3566 perf_pending_queue(&event->pending,
3567 perf_pending_event);
3568 } else
3569 perf_event_disable(event);
3570 }
3571
3572 perf_event_output(event, nmi, data, regs);
3573 return ret;
3574}
3575
/*
 * Overflow entry point with throttling requested (throttle argument 1).
 * Returns the result of __perf_event_overflow(): 1 when the event should
 * be stopped, 0 otherwise.
 */
3576int perf_event_overflow(struct perf_event *event, int nmi,
3577 struct perf_sample_data *data,
3578 struct pt_regs *regs)
3579{
3580 return __perf_event_overflow(event, nmi, 1, data, regs);
3581}
3582
3583/*
3584 * Generic software event infrastructure
3585 */
3586
3587/*
3588 * We directly increment event->count and keep a second value in
3589 * event->hw.period_left to count intervals. This period event
3590 * is kept in the range [-sample_period, 0] so that we can use the
3591 * sign as trigger.
3592 */
3593
3594static u64 perf_swevent_set_period(struct perf_event *event)
3595{
3596 struct hw_perf_event *hwc = &event->hw;
3597 u64 period = hwc->last_period;
3598 u64 nr, offset;
3599 s64 old, val;
3600
3601 hwc->last_period = hwc->sample_period;
3602
3603again:
3604 old = val = atomic64_read(&hwc->period_left);
3605 if (val < 0)
3606 return 0;
3607
3608 nr = div64_u64(period + val, period);
3609 offset = nr * period;
3610 val -= offset;
3611 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3612 goto again;
3613
3614 return nr;
3615}
3616
3617static void perf_swevent_overflow(struct perf_event *event,
3618 int nmi, struct perf_sample_data *data,
3619 struct pt_regs *regs)
3620{
3621 struct hw_perf_event *hwc = &event->hw;
3622 int throttle = 0;
3623 u64 overflow;
3624
3625 data->period = event->hw.last_period;
3626 overflow = perf_swevent_set_period(event);
3627
3628 if (hwc->interrupts == MAX_INTERRUPTS)
3629 return;
3630
3631 for (; overflow; overflow--) {
3632 if (__perf_event_overflow(event, nmi, throttle,
3633 data, regs)) {
3634 /*
3635 * We inhibit the overflow from happening when
3636 * hwc->interrupts == MAX_INTERRUPTS.
3637 */
3638 break;
3639 }
3640 throttle = 1;
3641 }
3642}
3643
/*
 * ->unthrottle hook of perf_ops_generic.  Intentionally empty.
 */
static void perf_swevent_unthrottle(struct perf_event *event)
{
	/*
	 * Nothing to do, we already reset hwc->interrupts.
	 */
}
3650
3651static void perf_swevent_add(struct perf_event *event, u64 nr,
3652 int nmi, struct perf_sample_data *data,
3653 struct pt_regs *regs)
3654{
3655 struct hw_perf_event *hwc = &event->hw;
3656
3657 atomic64_add(nr, &event->count);
3658
3659 if (!hwc->sample_period)
3660 return;
3661
3662 if (!regs)
3663 return;
3664
3665 if (!atomic64_add_negative(nr, &hwc->period_left))
3666 perf_swevent_overflow(event, nmi, data, regs);
3667}
3668
3669static int perf_swevent_is_counting(struct perf_event *event)
3670{
3671 /*
3672 * The event is active, we're good!
3673 */
3674 if (event->state == PERF_EVENT_STATE_ACTIVE)
3675 return 1;
3676
3677 /*
3678 * The event is off/error, not counting.
3679 */
3680 if (event->state != PERF_EVENT_STATE_INACTIVE)
3681 return 0;
3682
3683 /*
3684 * The event is inactive, if the context is active
3685 * we're part of a group that didn't make it on the 'pmu',
3686 * not counting.
3687 */
3688 if (event->ctx->is_active)
3689 return 0;
3690
3691 /*
3692 * We're inactive and the context is too, this means the
3693 * task is scheduled out, we're counting events that happen
3694 * to us, like migration events.
3695 */
3696 return 1;
3697}
3698
3699static int perf_swevent_match(struct perf_event *event,
3700 enum perf_type_id type,
3701 u32 event_id, struct pt_regs *regs)
3702{
3703 if (!perf_swevent_is_counting(event))
3704 return 0;
3705
3706 if (event->attr.type != type)
3707 return 0;
3708 if (event->attr.config != event_id)
3709 return 0;
3710
3711 if (regs) {
3712 if (event->attr.exclude_user && user_mode(regs))
3713 return 0;
3714
3715 if (event->attr.exclude_kernel && !user_mode(regs))
3716 return 0;
3717 }
3718
3719 return 1;
3720}
3721
3722static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3723 enum perf_type_id type,
3724 u32 event_id, u64 nr, int nmi,
3725 struct perf_sample_data *data,
3726 struct pt_regs *regs)
3727{
3728 struct perf_event *event;
3729
3730 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3731 return;
3732
3733 rcu_read_lock();
3734 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3735 if (perf_swevent_match(event, type, event_id, regs))
3736 perf_swevent_add(event, nr, nmi, data, regs);
3737 }
3738 rcu_read_unlock();
3739}
3740
3741static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3742{
3743 if (in_nmi())
3744 return &cpuctx->recursion[3];
3745
3746 if (in_irq())
3747 return &cpuctx->recursion[2];
3748
3749 if (in_softirq())
3750 return &cpuctx->recursion[1];
3751
3752 return &cpuctx->recursion[0];
3753}
3754
3755static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3756 u64 nr, int nmi,
3757 struct perf_sample_data *data,
3758 struct pt_regs *regs)
3759{
3760 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3761 int *recursion = perf_swevent_recursion_context(cpuctx);
3762 struct perf_event_context *ctx;
3763
3764 if (*recursion)
3765 goto out;
3766
3767 (*recursion)++;
3768 barrier();
3769
3770 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3771 nr, nmi, data, regs);
3772 rcu_read_lock();
3773 /*
3774 * doesn't really matter which of the child contexts the
3775 * events ends up in.
3776 */
3777 ctx = rcu_dereference(current->perf_event_ctxp);
3778 if (ctx)
3779 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3780 rcu_read_unlock();
3781
3782 barrier();
3783 (*recursion)--;
3784
3785out:
3786 put_cpu_var(perf_cpu_context);
3787}
3788
3789void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3790 struct pt_regs *regs, u64 addr)
3791{
3792 struct perf_sample_data data = {
3793 .addr = addr,
3794 };
3795
3796 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3797 &data, regs);
3798}
3799
/*
 * ->read hook of perf_ops_generic.  Intentionally empty:
 * perf_swevent_add() adds to event->count directly.
 */
static void perf_swevent_read(struct perf_event *event)
{
}
3803
3804static int perf_swevent_enable(struct perf_event *event)
3805{
3806 struct hw_perf_event *hwc = &event->hw;
3807
3808 if (hwc->sample_period) {
3809 hwc->last_period = hwc->sample_period;
3810 perf_swevent_set_period(event);
3811 }
3812 return 0;
3813}
3814
/*
 * ->disable hook of perf_ops_generic.  Intentionally empty.
 */
static void perf_swevent_disable(struct perf_event *event)
{
}
3818
/*
 * pmu callbacks for the generic software events; also returned by
 * tp_perf_event_init() for tracepoint events.
 */
static const struct pmu perf_ops_generic = {
	.enable		= perf_swevent_enable,
	.disable	= perf_swevent_disable,
	.read		= perf_swevent_read,
	.unthrottle	= perf_swevent_unthrottle,
};
3825
3826/*
3827 * hrtimer based swevent callback
3828 */
3829
3830static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3831{
3832 enum hrtimer_restart ret = HRTIMER_RESTART;
3833 struct perf_sample_data data;
3834 struct pt_regs *regs;
3835 struct perf_event *event;
3836 u64 period;
3837
3838 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3839 event->pmu->read(event);
3840
3841 data.addr = 0;
3842 regs = get_irq_regs();
3843 /*
3844 * In case we exclude kernel IPs or are somehow not in interrupt
3845 * context, provide the next best thing, the user IP.
3846 */
3847 if ((event->attr.exclude_kernel || !regs) &&
3848 !event->attr.exclude_user)
3849 regs = task_pt_regs(current);
3850
3851 if (regs) {
3852 if (perf_event_overflow(event, 0, &data, regs))
3853 ret = HRTIMER_NORESTART;
3854 }
3855
3856 period = max_t(u64, 10000, event->hw.sample_period);
3857 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3858
3859 return ret;
3860}
3861
3862/*
3863 * Software event: cpu wall time clock
3864 */
3865
3866static void cpu_clock_perf_event_update(struct perf_event *event)
3867{
3868 int cpu = raw_smp_processor_id();
3869 s64 prev;
3870 u64 now;
3871
3872 now = cpu_clock(cpu);
3873 prev = atomic64_read(&event->hw.prev_count);
3874 atomic64_set(&event->hw.prev_count, now);
3875 atomic64_add(now - prev, &event->count);
3876}
3877
3878static int cpu_clock_perf_event_enable(struct perf_event *event)
3879{
3880 struct hw_perf_event *hwc = &event->hw;
3881 int cpu = raw_smp_processor_id();
3882
3883 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3884 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3885 hwc->hrtimer.function = perf_swevent_hrtimer;
3886 if (hwc->sample_period) {
3887 u64 period = max_t(u64, 10000, hwc->sample_period);
3888 __hrtimer_start_range_ns(&hwc->hrtimer,
3889 ns_to_ktime(period), 0,
3890 HRTIMER_MODE_REL, 0);
3891 }
3892
3893 return 0;
3894}
3895
3896static void cpu_clock_perf_event_disable(struct perf_event *event)
3897{
3898 if (event->hw.sample_period)
3899 hrtimer_cancel(&event->hw.hrtimer);
3900 cpu_clock_perf_event_update(event);
3901}
3902
/*
 * ->read hook of the cpu-clock event: bring event->count up to date.
 */
static void cpu_clock_perf_event_read(struct perf_event *event)
{
	cpu_clock_perf_event_update(event);
}
3907
/*
 * pmu callbacks for the cpu-clock software event.  No ->unthrottle
 * hook is provided.
 */
static const struct pmu perf_ops_cpu_clock = {
	.enable		= cpu_clock_perf_event_enable,
	.disable	= cpu_clock_perf_event_disable,
	.read		= cpu_clock_perf_event_read,
};
3913
3914/*
3915 * Software event: task time clock
3916 */
3917
3918static void task_clock_perf_event_update(struct perf_event *event, u64 now)
3919{
3920 u64 prev;
3921 s64 delta;
3922
3923 prev = atomic64_xchg(&event->hw.prev_count, now);
3924 delta = now - prev;
3925 atomic64_add(delta, &event->count);
3926}
3927
3928static int task_clock_perf_event_enable(struct perf_event *event)
3929{
3930 struct hw_perf_event *hwc = &event->hw;
3931 u64 now;
3932
3933 now = event->ctx->time;
3934
3935 atomic64_set(&hwc->prev_count, now);
3936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3937 hwc->hrtimer.function = perf_swevent_hrtimer;
3938 if (hwc->sample_period) {
3939 u64 period = max_t(u64, 10000, hwc->sample_period);
3940 __hrtimer_start_range_ns(&hwc->hrtimer,
3941 ns_to_ktime(period), 0,
3942 HRTIMER_MODE_REL, 0);
3943 }
3944
3945 return 0;
3946}
3947
3948static void task_clock_perf_event_disable(struct perf_event *event)
3949{
3950 if (event->hw.sample_period)
3951 hrtimer_cancel(&event->hw.hrtimer);
3952 task_clock_perf_event_update(event, event->ctx->time);
3953
3954}
3955
3956static void task_clock_perf_event_read(struct perf_event *event)
3957{
3958 u64 time;
3959
3960 if (!in_nmi()) {
3961 update_context_time(event->ctx);
3962 time = event->ctx->time;
3963 } else {
3964 u64 now = perf_clock();
3965 u64 delta = now - event->ctx->timestamp;
3966 time = event->ctx->time + delta;
3967 }
3968
3969 task_clock_perf_event_update(event, time);
3970}
3971
/*
 * pmu callbacks for the task-clock software event.  No ->unthrottle
 * hook is provided.
 */
static const struct pmu perf_ops_task_clock = {
	.enable		= task_clock_perf_event_enable,
	.disable	= task_clock_perf_event_disable,
	.read		= task_clock_perf_event_read,
};
3977
3978#ifdef CONFIG_EVENT_PROFILE
3979void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3980 int entry_size)
3981{
3982 struct perf_raw_record raw = {
3983 .size = entry_size,
3984 .data = record,
3985 };
3986
3987 struct perf_sample_data data = {
3988 .addr = addr,
3989 .raw = &raw,
3990 };
3991
3992 struct pt_regs *regs = get_irq_regs();
3993
3994 if (!regs)
3995 regs = task_pt_regs(current);
3996
3997 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3998 &data, regs);
3999}
4000EXPORT_SYMBOL_GPL(perf_tp_event);
4001
4002extern int ftrace_profile_enable(int);
4003extern void ftrace_profile_disable(int);
4004
/*
 * ->destroy callback installed by tp_perf_event_init(): undo the
 * ftrace_profile_enable() done at init time for this tracepoint id.
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
	ftrace_profile_disable(event->attr.config);
}
4009
4010static const struct pmu *tp_perf_event_init(struct perf_event *event)
4011{
4012 /*
4013 * Raw tracepoint data is a severe data leak, only allow root to
4014 * have these.
4015 */
4016 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4017 perf_paranoid_tracepoint_raw() &&
4018 !capable(CAP_SYS_ADMIN))
4019 return ERR_PTR(-EPERM);
4020
4021 if (ftrace_profile_enable(event->attr.config))
4022 return NULL;
4023
4024 event->destroy = tp_perf_event_destroy;
4025
4026 return &perf_ops_generic;
4027}
4028#else
/*
 * Stub used when CONFIG_EVENT_PROFILE is not set: returns NULL, which
 * perf_event_alloc() turns into -EINVAL.
 */
static const struct pmu *tp_perf_event_init(struct perf_event *event)
{
	return NULL;
}
4033#endif
4034
/*
 * Per software-event-id counters, indexed by attr.config.  They are
 * incremented in sw_perf_event_init() for non-inherited generic
 * events and decremented again in sw_perf_event_destroy().
 */
atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4036
4037static void sw_perf_event_destroy(struct perf_event *event)
4038{
4039 u64 event_id = event->attr.config;
4040
4041 WARN_ON(event->parent);
4042
4043 atomic_dec(&perf_swevent_enabled[event_id]);
4044}
4045
4046static const struct pmu *sw_perf_event_init(struct perf_event *event)
4047{
4048 const struct pmu *pmu = NULL;
4049 u64 event_id = event->attr.config;
4050
4051 /*
4052 * Software events (currently) can't in general distinguish
4053 * between user, kernel and hypervisor events.
4054 * However, context switches and cpu migrations are considered
4055 * to be kernel events, and page faults are never hypervisor
4056 * events.
4057 */
4058 switch (event_id) {
4059 case PERF_COUNT_SW_CPU_CLOCK:
4060 pmu = &perf_ops_cpu_clock;
4061
4062 break;
4063 case PERF_COUNT_SW_TASK_CLOCK:
4064 /*
4065 * If the user instantiates this as a per-cpu event,
4066 * use the cpu_clock event instead.
4067 */
4068 if (event->ctx->task)
4069 pmu = &perf_ops_task_clock;
4070 else
4071 pmu = &perf_ops_cpu_clock;
4072
4073 break;
4074 case PERF_COUNT_SW_PAGE_FAULTS:
4075 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4076 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4077 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4078 case PERF_COUNT_SW_CPU_MIGRATIONS:
4079 if (!event->parent) {
4080 atomic_inc(&perf_swevent_enabled[event_id]);
4081 event->destroy = sw_perf_event_destroy;
4082 }
4083 pmu = &perf_ops_generic;
4084 break;
4085 }
4086
4087 return pmu;
4088}
4089
/*
 * Allocate and initialize an event structure
 *
 * @attr:         attributes, copied into the new event
 * @cpu:          stored in event->cpu
 * @ctx:          stored in event->ctx (no reference is taken here)
 * @group_leader: leader to attach to, or NULL to make the event its
 *                own leader
 * @parent_event: stored in event->parent; NULL for a top-level event
 * @gfpflags:     allocation flags for kzalloc()
 *
 * Returns the new event or an ERR_PTR(): -ENOMEM when the allocation
 * fails, -EINVAL when no pmu accepts the event (including the
 * inherit + PERF_FORMAT_GROUP combination), or the error encoded in
 * the pmu init function's return value.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr,
		   int cpu,
		   struct perf_event_context *ctx,
		   struct perf_event *group_leader,
		   struct perf_event *parent_event,
		   gfp_t gfpflags)
{
	const struct pmu *pmu;
	struct perf_event *event;
	struct hw_perf_event *hwc;
	long err;

	event = kzalloc(sizeof(*event), gfpflags);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/*
	 * Single events are their own group leaders, with an
	 * empty sibling list:
	 */
	if (!group_leader)
		group_leader = event;

	mutex_init(&event->child_mutex);
	INIT_LIST_HEAD(&event->child_list);

	INIT_LIST_HEAD(&event->group_entry);
	INIT_LIST_HEAD(&event->event_entry);
	INIT_LIST_HEAD(&event->sibling_list);
	init_waitqueue_head(&event->waitq);

	mutex_init(&event->mmap_mutex);

	event->cpu		= cpu;
	event->attr		= *attr;
	event->group_leader	= group_leader;
	event->pmu		= NULL;
	event->ctx		= ctx;
	event->oncpu		= -1;

	event->parent		= parent_event;

	/* Reference dropped again on the error path below. */
	event->ns		= get_pid_ns(current->nsproxy->pid_ns);
	event->id		= atomic64_inc_return(&perf_event_id);

	event->state		= PERF_EVENT_STATE_INACTIVE;

	if (attr->disabled)
		event->state = PERF_EVENT_STATE_OFF;

	pmu = NULL;

	hwc = &event->hw;
	hwc->sample_period = attr->sample_period;
	/* Frequency mode starts out with a period of 1. */
	if (attr->freq && attr->sample_freq)
		hwc->sample_period = 1;
	hwc->last_period = hwc->sample_period;

	atomic64_set(&hwc->period_left, hwc->sample_period);

	/*
	 * we currently do not support PERF_FORMAT_GROUP on inherited events
	 */
	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
		goto done;	/* pmu is still NULL, so this yields -EINVAL */

	switch (attr->type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		pmu = hw_perf_event_init(event);
		break;

	case PERF_TYPE_SOFTWARE:
		pmu = sw_perf_event_init(event);
		break;

	case PERF_TYPE_TRACEPOINT:
		pmu = tp_perf_event_init(event);
		break;

	default:
		break;
	}
done:
	/* A NULL pmu means "not supported"; an ERR_PTR carries its own code. */
	err = 0;
	if (!pmu)
		err = -EINVAL;
	else if (IS_ERR(pmu))
		err = PTR_ERR(pmu);

	if (err) {
		if (event->ns)
			put_pid_ns(event->ns);
		kfree(event);
		return ERR_PTR(err);
	}

	event->pmu = pmu;

	/* Global counters only track events without a parent. */
	if (!event->parent) {
		atomic_inc(&nr_events);
		if (event->attr.mmap)
			atomic_inc(&nr_mmap_events);
		if (event->attr.comm)
			atomic_inc(&nr_comm_events);
		if (event->attr.task)
			atomic_inc(&nr_task_events);
	}

	return event;
}
4206
/*
 * Copy a perf_event_attr from user space and validate it.
 *
 * @uattr: user pointer; its ->size field selects how much is copied
 * @attr:  kernel copy, zeroed first so a short copy leaves defaults
 *
 * Returns 0 on success, -EFAULT (or the get_user() error) on a failed
 * user access, -EINVAL for an out-of-range type, non-zero reserved
 * fields or unknown sample_type/read_format bits, and -E2BIG for an
 * unacceptable size.  In the -E2BIG case the kernel's
 * sizeof(*attr) is written back to uattr->size.
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
			  struct perf_event_attr *attr)
{
	u32 size;
	int ret;

	/* VERIFY_WRITE because err_size below writes uattr->size. */
	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
		return -EFAULT;

	/*
	 * zero the full structure, so that a short copy will be nice.
	 */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	if (size > PAGE_SIZE)	/* silly large */
		goto err_size;

	if (!size)		/* abi compat */
		size = PERF_ATTR_SIZE_VER0;

	if (size < PERF_ATTR_SIZE_VER0)
		goto err_size;

	/*
	 * If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we dont know about yet.
	 */
	if (size > sizeof(*attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(*attr);
		end  = (void __user *)uattr + size;

		/* Byte-wise scan of the tail the kernel does not know. */
		for (; addr < end; addr++) {
			ret = get_user(val, addr);
			if (ret)
				return ret;
			if (val)
				goto err_size;
		}
		/* Only copy the part that fits the kernel's structure. */
		size = sizeof(*attr);
	}

	ret = copy_from_user(attr, uattr, size);
	if (ret)
		return -EFAULT;

	/*
	 * If the type exists, the corresponding creation will verify
	 * the attr->config.
	 */
	if (attr->type >= PERF_TYPE_MAX)
		return -EINVAL;

	if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
		return -EINVAL;

	if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
		return -EINVAL;

	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
		return -EINVAL;

out:
	/* Reached with ret == 0 on success, or -E2BIG from err_size. */
	return ret;

err_size:
	/* Tell user space which size this kernel expects. */
	put_user(sizeof(*attr), &uattr->size);
	ret = -E2BIG;
	goto out;
}
4286
/*
 * Redirect @event's output to the perf event behind @output_fd.
 *
 * An @output_fd of 0 clears the redirection.  Otherwise the fd must
 * refer to a perf event file whose event does not itself redirect its
 * output, and @event must not already have a buffer (event->data).
 *
 * Returns 0 on success, -EBADF when @output_fd is not an open file,
 * and -EINVAL for any of the rejected cases above.
 */
int perf_event_set_output(struct perf_event *event, int output_fd)
{
	struct perf_event *output_event = NULL;
	struct file *output_file = NULL;
	struct perf_event *old_output;
	int fput_needed = 0;
	int ret = -EINVAL;

	/* fd 0: install a NULL output, i.e. remove the redirection. */
	if (!output_fd)
		goto set;

	output_file = fget_light(output_fd, &fput_needed);
	if (!output_file)
		return -EBADF;

	if (output_file->f_op != &perf_fops)
		goto out;

	output_event = output_file->private_data;

	/* Don't chain output fds */
	if (output_event->output)
		goto out;

	/* Don't set an output fd when we already have an output channel */
	if (event->data)
		goto out;

	/*
	 * Long-lived reference held on behalf of event->output; the
	 * fget_light() reference is dropped separately at 'out'.
	 */
	atomic_long_inc(&output_file->f_count);

set:
	mutex_lock(&event->mmap_mutex);
	old_output = event->output;
	rcu_assign_pointer(event->output, output_event);
	mutex_unlock(&event->mmap_mutex);

	if (old_output) {
		/*
		 * we need to make sure no existing perf_output_*()
		 * is still referencing this event.
		 */
		synchronize_rcu();
		fput(old_output->filp);
	}

	ret = 0;
out:
	/* output_file is NULL with fput_needed == 0 on the fd-0 path. */
	fput_light(output_file, fput_needed);
	return ret;
}
4337
/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:	event_id type attributes for monitoring/sampling
 * @pid:		target pid
 * @cpu:		target cpu
 * @group_fd:		group leader event fd
 * @flags:		PERF_FLAG_FD_NO_GROUP and/or PERF_FLAG_FD_OUTPUT;
 *			any other bit is rejected with -EINVAL
 *
 * Returns the new event's file descriptor on success or a negative
 * errno.
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event, *group_leader;
	struct perf_event_attr attr;
	struct perf_event_context *ctx;
	struct file *event_file = NULL;
	struct file *group_file = NULL;
	int fput_needed = 0;
	int fput_needed2 = 0;
	int err;

	/* for future expandability... */
	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
		return -EINVAL;

	err = perf_copy_attr(attr_uptr, &attr);
	if (err)
		return err;

	/* Kernel-level profiling may be restricted to CAP_SYS_ADMIN. */
	if (!attr.exclude_kernel) {
		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/* Frequency mode: enforce the sysctl-configured upper bound. */
	if (attr.freq) {
		if (attr.sample_freq > sysctl_perf_event_sample_rate)
			return -EINVAL;
	}

	/*
	 * Get the target context (task or percpu):
	 */
	ctx = find_get_context(pid, cpu);
	if (IS_ERR(ctx))
		return PTR_ERR(ctx);

	/*
	 * Look up the group leader (we will attach this event to it):
	 */
	group_leader = NULL;
	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
		err = -EINVAL;
		group_file = fget_light(group_fd, &fput_needed);
		if (!group_file)
			goto err_put_context;
		if (group_file->f_op != &perf_fops)
			goto err_put_context;

		group_leader = group_file->private_data;
		/*
		 * Do not allow a recursive hierarchy (this new sibling
		 * becoming part of another group-sibling):
		 */
		if (group_leader->group_leader != group_leader)
			goto err_put_context;
		/*
		 * Do not allow to attach to a group in a different
		 * task or CPU context:
		 */
		if (group_leader->ctx != ctx)
			goto err_put_context;
		/*
		 * Only a group leader can be exclusive or pinned
		 */
		if (attr.exclusive || attr.pinned)
			goto err_put_context;
	}

	event = perf_event_alloc(&attr, cpu, ctx, group_leader,
				     NULL, GFP_KERNEL);
	err = PTR_ERR(event);
	if (IS_ERR(event))
		goto err_put_context;

	/* From here on 'err' holds the new fd (>= 0) unless it fails. */
	err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
	if (err < 0)
		goto err_free_put_context;

	/*
	 * NOTE(review): if fget_light() fails here, err still holds the
	 * non-negative fd, so the unwind below frees nothing and the fd
	 * is returned as if the call had succeeded - confirm intent.
	 */
	event_file = fget_light(err, &fput_needed2);
	if (!event_file)
		goto err_free_put_context;

	if (flags & PERF_FLAG_FD_OUTPUT) {
		err = perf_event_set_output(event, group_fd);
		if (err)
			goto err_fput_free_put_context;
	}

	event->filp = event_file;
	WARN_ON_ONCE(ctx->parent_ctx);
	mutex_lock(&ctx->mutex);
	perf_install_in_context(ctx, event, cpu);
	++ctx->generation;
	mutex_unlock(&ctx->mutex);

	/* Track the event on the creating task's list. */
	event->owner = current;
	get_task_struct(current);
	mutex_lock(&current->perf_event_mutex);
	list_add_tail(&event->owner_entry, &current->perf_event_list);
	mutex_unlock(&current->perf_event_mutex);

	/* The success path falls through; the labels only act if err < 0. */
err_fput_free_put_context:
	fput_light(event_file, fput_needed2);

err_free_put_context:
	/*
	 * NOTE(review): perf_event_alloc() also took a pid-namespace
	 * reference and bumped nr_events; a plain kfree() undoes neither.
	 * When perf_event_set_output() failed, the fd created above is
	 * presumably still installed and pointing at this event - verify
	 * whether this path needs a fuller teardown.
	 */
	if (err < 0)
		kfree(event);

err_put_context:
	if (err < 0)
		put_ctx(ctx);

	fput_light(group_file, fput_needed);

	return err;
}
4464
4465/*
4466 * inherit a event from parent task to child task:
4467 */
4468static struct perf_event *
4469inherit_event(struct perf_event *parent_event,
4470 struct task_struct *parent,
4471 struct perf_event_context *parent_ctx,
4472 struct task_struct *child,
4473 struct perf_event *group_leader,
4474 struct perf_event_context *child_ctx)
4475{
4476 struct perf_event *child_event;
4477
4478 /*
4479 * Instead of creating recursive hierarchies of events,
4480 * we link inherited events back to the original parent,
4481 * which has a filp for sure, which we use as the reference
4482 * count:
4483 */
4484 if (parent_event->parent)
4485 parent_event = parent_event->parent;
4486
4487 child_event = perf_event_alloc(&parent_event->attr,
4488 parent_event->cpu, child_ctx,
4489 group_leader, parent_event,
4490 GFP_KERNEL);
4491 if (IS_ERR(child_event))
4492 return child_event;
4493 get_ctx(child_ctx);
4494
4495 /*
4496 * Make the child state follow the state of the parent event,
4497 * not its attr.disabled bit. We hold the parent's mutex,
4498 * so we won't race with perf_event_{en, dis}able_family.
4499 */
4500 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4501 child_event->state = PERF_EVENT_STATE_INACTIVE;
4502 else
4503 child_event->state = PERF_EVENT_STATE_OFF;
4504
4505 if (parent_event->attr.freq)
4506 child_event->hw.sample_period = parent_event->hw.sample_period;
4507
4508 /*
4509 * Link it up in the child's context:
4510 */
4511 add_event_to_ctx(child_event, child_ctx);
4512
4513 /*
4514 * Get a reference to the parent filp - we will fput it
4515 * when the child event exits. This is safe to do because
4516 * we are in the parent and we know that the filp still
4517 * exists and has a nonzero count:
4518 */
4519 atomic_long_inc(&parent_event->filp->f_count);
4520
4521 /*
4522 * Link this into the parent event's child list
4523 */
4524 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4525 mutex_lock(&parent_event->child_mutex);
4526 list_add_tail(&child_event->child_list, &parent_event->child_list);
4527 mutex_unlock(&parent_event->child_mutex);
4528
4529 return child_event;
4530}
4531
4532static int inherit_group(struct perf_event *parent_event,
4533 struct task_struct *parent,
4534 struct perf_event_context *parent_ctx,
4535 struct task_struct *child,
4536 struct perf_event_context *child_ctx)
4537{
4538 struct perf_event *leader;
4539 struct perf_event *sub;
4540 struct perf_event *child_ctr;
4541
4542 leader = inherit_event(parent_event, parent, parent_ctx,
4543 child, NULL, child_ctx);
4544 if (IS_ERR(leader))
4545 return PTR_ERR(leader);
4546 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4547 child_ctr = inherit_event(sub, parent, parent_ctx,
4548 child, leader, child_ctx);
4549 if (IS_ERR(child_ctr))
4550 return PTR_ERR(child_ctr);
4551 }
4552 return 0;
4553}
4554
4555static void sync_child_event(struct perf_event *child_event,
4556 struct task_struct *child)
4557{
4558 struct perf_event *parent_event = child_event->parent;
4559 u64 child_val;
4560
4561 if (child_event->attr.inherit_stat)
4562 perf_event_read_event(child_event, child);
4563
4564 child_val = atomic64_read(&child_event->count);
4565
4566 /*
4567 * Add back the child's count to the parent's count:
4568 */
4569 atomic64_add(child_val, &parent_event->count);
4570 atomic64_add(child_event->total_time_enabled,
4571 &parent_event->child_total_time_enabled);
4572 atomic64_add(child_event->total_time_running,
4573 &parent_event->child_total_time_running);
4574
4575 /*
4576 * Remove this event from the parent's list
4577 */
4578 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4579 mutex_lock(&parent_event->child_mutex);
4580 list_del_init(&child_event->child_list);
4581 mutex_unlock(&parent_event->child_mutex);
4582
4583 /*
4584 * Release the parent event, if this was the last
4585 * reference to it.
4586 */
4587 fput(parent_event->filp);
4588}
4589
4590static void
4591__perf_event_exit_task(struct perf_event *child_event,
4592 struct perf_event_context *child_ctx,
4593 struct task_struct *child)
4594{
4595 struct perf_event *parent_event;
4596
4597 update_event_times(child_event);
4598 perf_event_remove_from_context(child_event);
4599
4600 parent_event = child_event->parent;
4601 /*
4602 * It can happen that parent exits first, and has events
4603 * that are still around due to the child reference. These
4604 * events need to be zapped - but otherwise linger.
4605 */
4606 if (parent_event) {
4607 sync_child_event(child_event, child);
4608 free_event(child_event);
4609 }
4610}
4611
/*
 * When a child task exits, feed back event values to parent events.
 *
 * Detaches the task's perf context, schedules its events out, emits
 * the task-exit record and then tears down every event group on the
 * context before dropping the context reference.  When the task has
 * no context only the task record is emitted.
 */
void perf_event_exit_task(struct task_struct *child)
{
	struct perf_event *child_event, *tmp;
	struct perf_event_context *child_ctx;
	unsigned long flags;

	if (likely(!child->perf_event_ctxp)) {
		perf_event_task(child, NULL, 0);
		return;
	}

	local_irq_save(flags);
	/*
	 * We can't reschedule here because interrupts are disabled,
	 * and either child is current or it is a task that can't be
	 * scheduled, so we are now safe from rescheduling changing
	 * our context.
	 */
	child_ctx = child->perf_event_ctxp;
	__perf_event_task_sched_out(child_ctx);

	/*
	 * Take the context lock here so that if find_get_context is
	 * reading child->perf_event_ctxp, we wait until it has
	 * incremented the context's refcount before we do put_ctx below.
	 */
	spin_lock(&child_ctx->lock);
	child->perf_event_ctxp = NULL;
	/*
	 * If this context is a clone; unclone it so it can't get
	 * swapped to another process while we're removing all
	 * the events from it.
	 */
	unclone_ctx(child_ctx);
	/* Pairs with both spin_lock() and local_irq_save() above. */
	spin_unlock_irqrestore(&child_ctx->lock, flags);

	/*
	 * Report the task dead after unscheduling the events so that we
	 * won't get any samples after PERF_RECORD_EXIT. We can however still
	 * get a few PERF_RECORD_READ events.
	 */
	perf_event_task(child, child_ctx, 0);

	/*
	 * We can recurse on the same lock type through:
	 *
	 *   __perf_event_exit_task()
	 *     sync_child_event()
	 *       fput(parent_event->filp)
	 *         perf_release()
	 *           mutex_lock(&ctx->mutex)
	 *
	 * But since its the parent context it won't be the same instance.
	 */
	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);

again:
	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
				 group_entry)
		__perf_event_exit_task(child_event, child_ctx, child);

	/*
	 * If the last event was a group event, it will have appended all
	 * its siblings to the list, but we obtained 'tmp' before that which
	 * will still point to the list head terminating the iteration.
	 */
	if (!list_empty(&child_ctx->group_list))
		goto again;

	mutex_unlock(&child_ctx->mutex);

	put_ctx(child_ctx);
}
4688
4689/*
4690 * free an unexposed, unused context as created by inheritance by
4691 * init_task below, used by fork() in case of fail.
4692 */
4693void perf_event_free_task(struct task_struct *task)
4694{
4695 struct perf_event_context *ctx = task->perf_event_ctxp;
4696 struct perf_event *event, *tmp;
4697
4698 if (!ctx)
4699 return;
4700
4701 mutex_lock(&ctx->mutex);
4702again:
4703 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4704 struct perf_event *parent = event->parent;
4705
4706 if (WARN_ON_ONCE(!parent))
4707 continue;
4708
4709 mutex_lock(&parent->child_mutex);
4710 list_del_init(&event->child_list);
4711 mutex_unlock(&parent->child_mutex);
4712
4713 fput(parent->filp);
4714
4715 list_del_event(event, ctx);
4716 free_event(event);
4717 }
4718
4719 if (!list_empty(&ctx->group_list))
4720 goto again;
4721
4722 mutex_unlock(&ctx->mutex);
4723
4724 put_ctx(ctx);
4725}
4726
/*
 * Initialize the perf_event context in task_struct
 *
 * Called for @child with the parent being 'current'.  When the parent
 * has a perf context, a fresh context is allocated for the child and
 * every group whose leader has attr.inherit set is inherited into it.
 *
 * Returns 0 on success (including the common no-context case),
 * -ENOMEM when the child context cannot be allocated, or the error
 * from inherit_group().  On an inherit_group() failure the partially
 * populated child context stays attached to @child.
 */
int perf_event_init_task(struct task_struct *child)
{
	struct perf_event_context *child_ctx, *parent_ctx;
	struct perf_event_context *cloned_ctx;
	struct perf_event *event;
	struct task_struct *parent = current;
	int inherited_all = 1;
	int ret = 0;

	child->perf_event_ctxp = NULL;

	mutex_init(&child->perf_event_mutex);
	INIT_LIST_HEAD(&child->perf_event_list);

	if (likely(!parent->perf_event_ctxp))
		return 0;

	/*
	 * This is executed from the parent task context, so inherit
	 * events that have been marked for cloning.
	 * First allocate and initialize a context for the child.
	 */

	child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
	if (!child_ctx)
		return -ENOMEM;

	__perf_event_init_context(child_ctx, child);
	child->perf_event_ctxp = child_ctx;
	get_task_struct(child);

	/*
	 * If the parent's context is a clone, pin it so it won't get
	 * swapped under us.
	 */
	parent_ctx = perf_pin_task_context(parent);

	/*
	 * No need to check if parent_ctx != NULL here; since we saw
	 * it non-NULL earlier, the only reason for it to become NULL
	 * is if we exit, and since we're currently in the middle of
	 * a fork we can't be exiting at the same time.
	 */

	/*
	 * Lock the parent list. No need to lock the child - not PID
	 * hashed yet and not running, so nobody can access it.
	 */
	mutex_lock(&parent_ctx->mutex);

	/*
	 * We dont have to disable NMIs - we are only looking at
	 * the list, not manipulating it:
	 */
	list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
		/* Siblings are handled by inherit_group() via their leader. */
		if (event != event->group_leader)
			continue;

		if (!event->attr.inherit) {
			inherited_all = 0;
			continue;
		}

		ret = inherit_group(event, parent, parent_ctx,
					     child, child_ctx);
		if (ret) {
			inherited_all = 0;
			break;
		}
	}

	if (inherited_all) {
		/*
		 * Mark the child context as a clone of the parent
		 * context, or of whatever the parent is a clone of.
		 * Note that if the parent is a clone, it could get
		 * uncloned at any point, but that doesn't matter
		 * because the list of events and the generation
		 * count can't have changed since we took the mutex.
		 */
		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
		if (cloned_ctx) {
			child_ctx->parent_ctx = cloned_ctx;
			child_ctx->parent_gen = parent_ctx->parent_gen;
		} else {
			child_ctx->parent_ctx = parent_ctx;
			child_ctx->parent_gen = parent_ctx->generation;
		}
		get_ctx(child_ctx->parent_ctx);
	}

	mutex_unlock(&parent_ctx->mutex);

	perf_unpin_context(parent_ctx);

	return ret;
}
4827
4828static void __cpuinit perf_event_init_cpu(int cpu)
4829{
4830 struct perf_cpu_context *cpuctx;
4831
4832 cpuctx = &per_cpu(perf_cpu_context, cpu);
4833 __perf_event_init_context(&cpuctx->ctx, NULL);
4834
4835 spin_lock(&perf_resource_lock);
4836 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4837 spin_unlock(&perf_resource_lock);
4838
4839 hw_perf_event_setup(cpu);
4840}
4841
4842#ifdef CONFIG_HOTPLUG_CPU
4843static void __perf_event_exit_cpu(void *info)
4844{
4845 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4846 struct perf_event_context *ctx = &cpuctx->ctx;
4847 struct perf_event *event, *tmp;
4848
4849 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
4850 __perf_event_remove_from_context(event);
4851}
4852static void perf_event_exit_cpu(int cpu)
4853{
4854 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4855 struct perf_event_context *ctx = &cpuctx->ctx;
4856
4857 mutex_lock(&ctx->mutex);
4858 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
4859 mutex_unlock(&ctx->mutex);
4860}
4861#else
4862static inline void perf_event_exit_cpu(int cpu) { }
4863#endif
4864
4865static int __cpuinit
4866perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4867{
4868 unsigned int cpu = (long)hcpu;
4869
4870 switch (action) {
4871
4872 case CPU_UP_PREPARE:
4873 case CPU_UP_PREPARE_FROZEN:
4874 perf_event_init_cpu(cpu);
4875 break;
4876
4877 case CPU_ONLINE:
4878 case CPU_ONLINE_FROZEN:
4879 hw_perf_event_setup_online(cpu);
4880 break;
4881
4882 case CPU_DOWN_PREPARE:
4883 case CPU_DOWN_PREPARE_FROZEN:
4884 perf_event_exit_cpu(cpu);
4885 break;
4886
4887 default:
4888 break;
4889 }
4890
4891 return NOTIFY_OK;
4892}
4893
4894/*
4895 * This has to have a higher priority than migration_notifier in sched.c.
4896 */
4897static struct notifier_block __cpuinitdata perf_cpu_nb = {
4898 .notifier_call = perf_cpu_notify,
4899 .priority = 20,
4900};
4901
4902void __init perf_event_init(void)
4903{
4904 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4905 (void *)(long)smp_processor_id());
4906 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4907 (void *)(long)smp_processor_id());
4908 register_cpu_notifier(&perf_cpu_nb);
4909}
4910
4911static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4912{
4913 return sprintf(buf, "%d\n", perf_reserved_percpu);
4914}
4915
4916static ssize_t
4917perf_set_reserve_percpu(struct sysdev_class *class,
4918 const char *buf,
4919 size_t count)
4920{
4921 struct perf_cpu_context *cpuctx;
4922 unsigned long val;
4923 int err, cpu, mpt;
4924
4925 err = strict_strtoul(buf, 10, &val);
4926 if (err)
4927 return err;
4928 if (val > perf_max_events)
4929 return -EINVAL;
4930
4931 spin_lock(&perf_resource_lock);
4932 perf_reserved_percpu = val;
4933 for_each_online_cpu(cpu) {
4934 cpuctx = &per_cpu(perf_cpu_context, cpu);
4935 spin_lock_irq(&cpuctx->ctx.lock);
4936 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
4937 perf_max_events - perf_reserved_percpu);
4938 cpuctx->max_pertask = mpt;
4939 spin_unlock_irq(&cpuctx->ctx.lock);
4940 }
4941 spin_unlock(&perf_resource_lock);
4942
4943 return count;
4944}
4945
4946static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4947{
4948 return sprintf(buf, "%d\n", perf_overcommit);
4949}
4950
4951static ssize_t
4952perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4953{
4954 unsigned long val;
4955 int err;
4956
4957 err = strict_strtoul(buf, 10, &val);
4958 if (err)
4959 return err;
4960 if (val > 1)
4961 return -EINVAL;
4962
4963 spin_lock(&perf_resource_lock);
4964 perf_overcommit = val;
4965 spin_unlock(&perf_resource_lock);
4966
4967 return count;
4968}
4969
4970static SYSDEV_CLASS_ATTR(
4971 reserve_percpu,
4972 0644,
4973 perf_show_reserve_percpu,
4974 perf_set_reserve_percpu
4975 );
4976
4977static SYSDEV_CLASS_ATTR(
4978 overcommit,
4979 0644,
4980 perf_show_overcommit,
4981 perf_set_overcommit
4982 );
4983
4984static struct attribute *perfclass_attrs[] = {
4985 &attr_reserve_percpu.attr,
4986 &attr_overcommit.attr,
4987 NULL
4988};
4989
4990static struct attribute_group perfclass_attr_group = {
4991 .attrs = perfclass_attrs,
4992 .name = "perf_events",
4993};
4994
4995static int __init perf_event_sysfs_init(void)
4996{
4997 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4998 &perfclass_attr_group);
4999}
5000device_initcall(perf_event_sysfs_init);
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
40#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
42static struct hlist_head *pid_hash; 42static struct hlist_head *pid_hash;
43static int pidhash_shift; 43static unsigned int pidhash_shift = 4;
44struct pid init_struct_pid = INIT_STRUCT_PID; 44struct pid init_struct_pid = INIT_STRUCT_PID;
45 45
46int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
499void __init pidhash_init(void) 499void __init pidhash_init(void)
500{ 500{
501 int i, pidhash_size; 501 int i, pidhash_size;
502 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
503 502
504 pidhash_shift = max(4, fls(megabytes * 4)); 503 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
505 pidhash_shift = min(12, pidhash_shift); 504 HASH_EARLY | HASH_SMALL,
505 &pidhash_shift, NULL, 4096);
506 pidhash_size = 1 << pidhash_shift; 506 pidhash_size = 1 << pidhash_shift;
507 507
508 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
509 pidhash_size, pidhash_shift,
510 pidhash_size * sizeof(struct hlist_head));
511
512 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
513 if (!pid_hash)
514 panic("Could not alloc pidhash!\n");
515 for (i = 0; i < pidhash_size; i++) 508 for (i = 0; i < pidhash_size; i++)
516 INIT_HLIST_HEAD(&pid_hash[i]); 509 INIT_HLIST_HEAD(&pid_hash[i]);
517} 510}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e33a21cb9407..5c9dc228747b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -8,17 +8,18 @@
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h>
11 12
12/* 13/*
13 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
14 */ 15 */
15void update_rlimit_cpu(unsigned long rlim_new) 16void update_rlimit_cpu(unsigned long rlim_new)
16{ 17{
17 cputime_t cputime; 18 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
18 20
19 cputime = secs_to_cputime(rlim_new); 21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) ||
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) {
21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 23 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
@@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
542 now); 543 now);
543} 544}
544 545
546static inline int expires_gt(cputime_t expires, cputime_t new_exp)
547{
548 return cputime_eq(expires, cputime_zero) ||
549 cputime_gt(expires, new_exp);
550}
551
552static inline int expires_le(cputime_t expires, cputime_t new_exp)
553{
554 return !cputime_eq(expires, cputime_zero) &&
555 cputime_le(expires, new_exp);
556}
545/* 557/*
546 * Insert the timer on the appropriate list before any timers that 558 * Insert the timer on the appropriate list before any timers that
547 * expire later. This must be called with the tasklist_lock held 559 * expire later. This must be called with the tasklist_lock held
@@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
586 */ 598 */
587 599
588 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 600 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
601 union cpu_time_count *exp = &nt->expires;
602
589 switch (CPUCLOCK_WHICH(timer->it_clock)) { 603 switch (CPUCLOCK_WHICH(timer->it_clock)) {
590 default: 604 default:
591 BUG(); 605 BUG();
592 case CPUCLOCK_PROF: 606 case CPUCLOCK_PROF:
593 if (cputime_eq(p->cputime_expires.prof_exp, 607 if (expires_gt(p->cputime_expires.prof_exp,
594 cputime_zero) || 608 exp->cpu))
595 cputime_gt(p->cputime_expires.prof_exp, 609 p->cputime_expires.prof_exp = exp->cpu;
596 nt->expires.cpu))
597 p->cputime_expires.prof_exp =
598 nt->expires.cpu;
599 break; 610 break;
600 case CPUCLOCK_VIRT: 611 case CPUCLOCK_VIRT:
601 if (cputime_eq(p->cputime_expires.virt_exp, 612 if (expires_gt(p->cputime_expires.virt_exp,
602 cputime_zero) || 613 exp->cpu))
603 cputime_gt(p->cputime_expires.virt_exp, 614 p->cputime_expires.virt_exp = exp->cpu;
604 nt->expires.cpu))
605 p->cputime_expires.virt_exp =
606 nt->expires.cpu;
607 break; 615 break;
608 case CPUCLOCK_SCHED: 616 case CPUCLOCK_SCHED:
609 if (p->cputime_expires.sched_exp == 0 || 617 if (p->cputime_expires.sched_exp == 0 ||
610 p->cputime_expires.sched_exp > 618 p->cputime_expires.sched_exp > exp->sched)
611 nt->expires.sched)
612 p->cputime_expires.sched_exp = 619 p->cputime_expires.sched_exp =
613 nt->expires.sched; 620 exp->sched;
614 break; 621 break;
615 } 622 }
616 } else { 623 } else {
624 struct signal_struct *const sig = p->signal;
625 union cpu_time_count *exp = &timer->it.cpu.expires;
626
617 /* 627 /*
618 * For a process timer, set the cached expiration time. 628 * For a process timer, set the cached expiration time.
619 */ 629 */
@@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
621 default: 631 default:
622 BUG(); 632 BUG();
623 case CPUCLOCK_VIRT: 633 case CPUCLOCK_VIRT:
624 if (!cputime_eq(p->signal->it_virt_expires, 634 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
625 cputime_zero) && 635 exp->cpu))
626 cputime_lt(p->signal->it_virt_expires,
627 timer->it.cpu.expires.cpu))
628 break; 636 break;
629 p->signal->cputime_expires.virt_exp = 637 sig->cputime_expires.virt_exp = exp->cpu;
630 timer->it.cpu.expires.cpu;
631 break; 638 break;
632 case CPUCLOCK_PROF: 639 case CPUCLOCK_PROF:
633 if (!cputime_eq(p->signal->it_prof_expires, 640 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
634 cputime_zero) && 641 exp->cpu))
635 cputime_lt(p->signal->it_prof_expires,
636 timer->it.cpu.expires.cpu))
637 break; 642 break;
638 i = p->signal->rlim[RLIMIT_CPU].rlim_cur; 643 i = sig->rlim[RLIMIT_CPU].rlim_cur;
639 if (i != RLIM_INFINITY && 644 if (i != RLIM_INFINITY &&
640 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 645 i <= cputime_to_secs(exp->cpu))
641 break; 646 break;
642 p->signal->cputime_expires.prof_exp = 647 sig->cputime_expires.prof_exp = exp->cpu;
643 timer->it.cpu.expires.cpu;
644 break; 648 break;
645 case CPUCLOCK_SCHED: 649 case CPUCLOCK_SCHED:
646 p->signal->cputime_expires.sched_exp = 650 sig->cputime_expires.sched_exp = exp->sched;
647 timer->it.cpu.expires.sched;
648 break; 651 break;
649 } 652 }
650 } 653 }
@@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk)
1071 spin_unlock_irqrestore(&cputimer->lock, flags); 1074 spin_unlock_irqrestore(&cputimer->lock, flags);
1072} 1075}
1073 1076
1077static u32 onecputick;
1078
1079static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1080 cputime_t *expires, cputime_t cur_time, int signo)
1081{
1082 if (cputime_eq(it->expires, cputime_zero))
1083 return;
1084
1085 if (cputime_ge(cur_time, it->expires)) {
1086 if (!cputime_eq(it->incr, cputime_zero)) {
1087 it->expires = cputime_add(it->expires, it->incr);
1088 it->error += it->incr_error;
1089 if (it->error >= onecputick) {
1090 it->expires = cputime_sub(it->expires,
1091 cputime_one_jiffy);
1092 it->error -= onecputick;
1093 }
1094 } else {
1095 it->expires = cputime_zero;
1096 }
1097
1098 trace_itimer_expire(signo == SIGPROF ?
1099 ITIMER_PROF : ITIMER_VIRTUAL,
1100 tsk->signal->leader_pid, cur_time);
1101 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1102 }
1103
1104 if (!cputime_eq(it->expires, cputime_zero) &&
1105 (cputime_eq(*expires, cputime_zero) ||
1106 cputime_lt(it->expires, *expires))) {
1107 *expires = it->expires;
1108 }
1109}
1110
1074/* 1111/*
1075 * Check for any per-thread CPU timers that have fired and move them 1112 * Check for any per-thread CPU timers that have fired and move them
1076 * off the tsk->*_timers list onto the firing list. Per-thread timers 1113 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk,
1090 * Don't sample the current process CPU clocks if there are no timers. 1127 * Don't sample the current process CPU clocks if there are no timers.
1091 */ 1128 */
1092 if (list_empty(&timers[CPUCLOCK_PROF]) && 1129 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1093 cputime_eq(sig->it_prof_expires, cputime_zero) && 1130 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1094 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && 1131 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1095 list_empty(&timers[CPUCLOCK_VIRT]) && 1132 list_empty(&timers[CPUCLOCK_VIRT]) &&
1096 cputime_eq(sig->it_virt_expires, cputime_zero) && 1133 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1097 list_empty(&timers[CPUCLOCK_SCHED])) { 1134 list_empty(&timers[CPUCLOCK_SCHED])) {
1098 stop_process_timers(tsk); 1135 stop_process_timers(tsk);
1099 return; 1136 return;
@@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk,
1153 /* 1190 /*
1154 * Check for the special case process timers. 1191 * Check for the special case process timers.
1155 */ 1192 */
1156 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1193 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime,
1157 if (cputime_ge(ptime, sig->it_prof_expires)) { 1194 SIGPROF);
1158 /* ITIMER_PROF fires and reloads. */ 1195 check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime,
1159 sig->it_prof_expires = sig->it_prof_incr; 1196 SIGVTALRM);
1160 if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { 1197
1161 sig->it_prof_expires = cputime_add(
1162 sig->it_prof_expires, ptime);
1163 }
1164 __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk);
1165 }
1166 if (!cputime_eq(sig->it_prof_expires, cputime_zero) &&
1167 (cputime_eq(prof_expires, cputime_zero) ||
1168 cputime_lt(sig->it_prof_expires, prof_expires))) {
1169 prof_expires = sig->it_prof_expires;
1170 }
1171 }
1172 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1173 if (cputime_ge(utime, sig->it_virt_expires)) {
1174 /* ITIMER_VIRTUAL fires and reloads. */
1175 sig->it_virt_expires = sig->it_virt_incr;
1176 if (!cputime_eq(sig->it_virt_expires, cputime_zero)) {
1177 sig->it_virt_expires = cputime_add(
1178 sig->it_virt_expires, utime);
1179 }
1180 __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk);
1181 }
1182 if (!cputime_eq(sig->it_virt_expires, cputime_zero) &&
1183 (cputime_eq(virt_expires, cputime_zero) ||
1184 cputime_lt(sig->it_virt_expires, virt_expires))) {
1185 virt_expires = sig->it_virt_expires;
1186 }
1187 }
1188 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 1198 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
1189 unsigned long psecs = cputime_to_secs(ptime); 1199 unsigned long psecs = cputime_to_secs(ptime);
1190 cputime_t x; 1200 cputime_t x;
@@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1457 if (!cputime_eq(*oldval, cputime_zero)) { 1467 if (!cputime_eq(*oldval, cputime_zero)) {
1458 if (cputime_le(*oldval, now.cpu)) { 1468 if (cputime_le(*oldval, now.cpu)) {
1459 /* Just about to fire. */ 1469 /* Just about to fire. */
1460 *oldval = jiffies_to_cputime(1); 1470 *oldval = cputime_one_jiffy;
1461 } else { 1471 } else {
1462 *oldval = cputime_sub(*oldval, now.cpu); 1472 *oldval = cputime_sub(*oldval, now.cpu);
1463 } 1473 }
@@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void)
1703 .nsleep = thread_cpu_nsleep, 1713 .nsleep = thread_cpu_nsleep,
1704 .nsleep_restart = thread_cpu_nsleep_restart, 1714 .nsleep_restart = thread_cpu_nsleep_restart,
1705 }; 1715 };
1716 struct timespec ts;
1706 1717
1707 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1718 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
1708 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); 1719 register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
1709 1720
1721 cputime_to_timespec(cputime_one_jiffy, &ts);
1722 onecputick = ts.tv_nsec;
1723 WARN_ON(ts.tv_sec != 0);
1724
1710 return 0; 1725 return 0;
1711} 1726}
1712__initcall(init_posix_cpu_timers); 1727__initcall(init_posix_cpu_timers);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
242 return 0; 242 return 0;
243} 243}
244 244
245
246static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
247{
248 *tp = current_kernel_time();
249 return 0;
250}
251
252static int posix_get_monotonic_coarse(clockid_t which_clock,
253 struct timespec *tp)
254{
255 *tp = get_monotonic_coarse();
256 return 0;
257}
258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0;
263}
245/* 264/*
246 * Initialize everything, well, just everything in Posix clocks/timers ;) 265 * Initialize everything, well, just everything in Posix clocks/timers ;)
247 */ 266 */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
262 .timer_create = no_timer_create, 281 .timer_create = no_timer_create,
263 .nsleep = no_nsleep, 282 .nsleep = no_nsleep,
264 }; 283 };
284 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 };
291 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime,
295 .timer_create = no_timer_create,
296 .nsleep = no_nsleep,
297 };
265 298
266 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 299 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
267 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
268 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
269 304
270 posix_timers_cache = kmem_cache_create("posix_timers_cache", 305 posix_timers_cache = kmem_cache_create("posix_timers_cache",
271 sizeof (struct k_itimer), 0, SLAB_PANIC, 306 sizeof (struct k_itimer), 0, SLAB_PANIC,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04b3a83d686f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
9#undef DEBUG 9#undef DEBUG
10 10
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/oom.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..36cb168e4330 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
619 BUG_ON(!region); 619 BUG_ON(!region);
620 } else 620 } else
621 /* This allocation cannot fail */ 621 /* This allocation cannot fail */
622 region = alloc_bootmem_low(sizeof(struct nosave_region)); 622 region = alloc_bootmem(sizeof(struct nosave_region));
623 region->start_pfn = start_pfn; 623 region->start_pfn = start_pfn;
624 region->end_pfn = end_pfn; 624 region->end_pfn = end_pfn;
625 list_add_tail(&region->list, &nosave_regions); 625 list_add_tail(&region->list, &nosave_regions);
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid attempting to free too much memory too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
1219 * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..f38b07f78a4e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
40 * for_each_console() allows you to iterate on each console
41 */
/*
 * Walk the console_drivers list, assigning each entry in turn to @con.
 * @con is parenthesized in the expansion so that any lvalue expression
 * (e.g. "*pp") can be used as the cursor, not just a plain identifier.
 */
#define for_each_console(con) \
	for ((con) = console_drivers; (con) != NULL; (con) = (con)->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -198,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup);
198#ifdef CONFIG_BOOT_PRINTK_DELAY 206#ifdef CONFIG_BOOT_PRINTK_DELAY
199 207
200static unsigned int boot_delay; /* msecs delay after each printk during bootup */ 208static unsigned int boot_delay; /* msecs delay after each printk during bootup */
201static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ 209static unsigned long long loops_per_msec; /* based on boot_delay */
202 210
203static int __init boot_delay_setup(char *str) 211static int __init boot_delay_setup(char *str)
204{ 212{
205 unsigned long lpj; 213 unsigned long lpj;
206 unsigned long long loops_per_msec;
207 214
208 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ 215 lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */
209 loops_per_msec = (unsigned long long)lpj / 1000 * HZ; 216 loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
@@ -212,10 +219,9 @@ static int __init boot_delay_setup(char *str)
212 if (boot_delay > 10 * 1000) 219 if (boot_delay > 10 * 1000)
213 boot_delay = 0; 220 boot_delay = 0;
214 221
215 printk_delay_msec = loops_per_msec; 222 pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
216 printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " 223 "HZ: %d, loops_per_msec: %llu\n",
217 "HZ: %d, printk_delay_msec: %llu\n", 224 boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
218 boot_delay, preset_lpj, lpj, HZ, printk_delay_msec);
219 return 1; 225 return 1;
220} 226}
221__setup("boot_delay=", boot_delay_setup); 227__setup("boot_delay=", boot_delay_setup);
@@ -228,7 +234,7 @@ static void boot_delay_msec(void)
228 if (boot_delay == 0 || system_state != SYSTEM_BOOTING) 234 if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
229 return; 235 return;
230 236
231 k = (unsigned long long)printk_delay_msec * boot_delay; 237 k = (unsigned long long)loops_per_msec * boot_delay;
232 238
233 timeout = jiffies + msecs_to_jiffies(boot_delay); 239 timeout = jiffies + msecs_to_jiffies(boot_delay);
234 while (k) { 240 while (k) {
@@ -372,10 +378,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 378 logged_chars = 0;
373 break; 379 break;
374 case 6: /* Disable logging to console */ 380 case 6: /* Disable logging to console */
381 if (saved_console_loglevel == -1)
382 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 383 console_loglevel = minimum_console_loglevel;
376 break; 384 break;
377 case 7: /* Enable logging to console */ 385 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 386 if (saved_console_loglevel != -1) {
387 console_loglevel = saved_console_loglevel;
388 saved_console_loglevel = -1;
389 }
379 break; 390 break;
380 case 8: /* Set level of messages printed to console */ 391 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 392 error = -EINVAL;
@@ -384,6 +395,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 395 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 396 len = minimum_console_loglevel;
386 console_loglevel = len; 397 console_loglevel = len;
398 /* Implicitly re-enable logging to console */
399 saved_console_loglevel = -1;
387 error = 0; 400 error = 0;
388 break; 401 break;
389 case 9: /* Number of chars in the log buffer */ 402 case 9: /* Number of chars in the log buffer */
@@ -412,7 +425,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 425{
413 struct console *con; 426 struct console *con;
414 427
415 for (con = console_drivers; con; con = con->next) { 428 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 429 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 430 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 431 (con->flags & CON_ANYTIME)))
@@ -544,7 +557,7 @@ static int have_callable_console(void)
544{ 557{
545 struct console *con; 558 struct console *con;
546 559
547 for (con = console_drivers; con; con = con->next) 560 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 561 if (con->flags & CON_ANYTIME)
549 return 1; 562 return 1;
550 563
@@ -640,6 +653,20 @@ static int recursion_bug;
640static int new_text_line = 1; 653static int new_text_line = 1;
641static char printk_buf[1024]; 654static char printk_buf[1024];
642 655
656int printk_delay_msec __read_mostly;
657
658static inline void printk_delay(void)
659{
660 if (unlikely(printk_delay_msec)) {
661 int m = printk_delay_msec;
662
663 while (m--) {
664 mdelay(1);
665 touch_nmi_watchdog();
666 }
667 }
668}
669
643asmlinkage int vprintk(const char *fmt, va_list args) 670asmlinkage int vprintk(const char *fmt, va_list args)
644{ 671{
645 int printed_len = 0; 672 int printed_len = 0;
@@ -649,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
649 char *p; 676 char *p;
650 677
651 boot_delay_msec(); 678 boot_delay_msec();
679 printk_delay();
652 680
653 preempt_disable(); 681 preempt_disable();
654 /* This stops the holder of console_sem just where we want him */ 682 /* This stops the holder of console_sem just where we want him */
@@ -1060,12 +1088,6 @@ void __sched console_conditional_schedule(void)
1060} 1088}
1061EXPORT_SYMBOL(console_conditional_schedule); 1089EXPORT_SYMBOL(console_conditional_schedule);
1062 1090
1063void console_print(const char *s)
1064{
1065 printk(KERN_EMERG "%s", s);
1066}
1067EXPORT_SYMBOL(console_print);
1068
1069void console_unblank(void) 1091void console_unblank(void)
1070{ 1092{
1071 struct console *c; 1093 struct console *c;
@@ -1082,7 +1104,7 @@ void console_unblank(void)
1082 1104
1083 console_locked = 1; 1105 console_locked = 1;
1084 console_may_schedule = 0; 1106 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1107 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1108 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1109 c->unblank();
1088 release_console_sem(); 1110 release_console_sem();
@@ -1097,7 +1119,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1119 struct tty_driver *driver = NULL;
1098 1120
1099 acquire_console_sem(); 1121 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1122 for_each_console(c) {
1101 if (!c->device) 1123 if (!c->device)
1102 continue; 1124 continue;
1103 driver = c->device(c, index); 1125 driver = c->device(c, index);
@@ -1134,25 +1156,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1156 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1157 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1158 * console driver was initialized.
1159 *
1160 * This can happen pretty early during the boot process (because of
1161 * early_printk) - sometimes before setup_arch() completes - be careful
1162 * of what kernel features are used - they may not be initialised yet.
1163 *
1164 * There are two types of consoles - bootconsoles (early_printk) and
1165 * "real" consoles (everything which is not a bootconsole) which are
1166 * handled differently.
1167 * - Any number of bootconsoles can be registered at any time.
1168 * - As soon as a "real" console is registered, all bootconsoles
1169 * will be unregistered automatically.
1170 * - Once a "real" console is registered, any attempt to register a
1171 * bootconsole will be rejected
1137 */ 1172 */
1138void register_console(struct console *console) 1173void register_console(struct console *newcon)
1139{ 1174{
1140 int i; 1175 int i;
1141 unsigned long flags; 1176 unsigned long flags;
1142 struct console *bootconsole = NULL; 1177 struct console *bcon = NULL;
1143 1178
1144 if (console_drivers) { 1179 /*
1145 if (console->flags & CON_BOOT) 1180 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1181 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1182 */
1148 bootconsole = console_drivers; 1183 if (console_drivers && newcon->flags & CON_BOOT) {
1184 /* find the last or real console */
1185 for_each_console(bcon) {
1186 if (!(bcon->flags & CON_BOOT)) {
1187 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1188 newcon->name, newcon->index);
1189 return;
1190 }
1191 }
1149 } 1192 }
1150 1193
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1194 if (console_drivers && console_drivers->flags & CON_BOOT)
1195 bcon = console_drivers;
1196
1197 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1198 preferred_console = selected_console;
1153 1199
1154 if (console->early_setup) 1200 if (newcon->early_setup)
1155 console->early_setup(); 1201 newcon->early_setup();
1156 1202
1157 /* 1203 /*
1158 * See if we want to use this console driver. If we 1204 * See if we want to use this console driver. If we
@@ -1160,13 +1206,13 @@ void register_console(struct console *console)
1160 * that registers here. 1206 * that registers here.
1161 */ 1207 */
1162 if (preferred_console < 0) { 1208 if (preferred_console < 0) {
1163 if (console->index < 0) 1209 if (newcon->index < 0)
1164 console->index = 0; 1210 newcon->index = 0;
1165 if (console->setup == NULL || 1211 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1212 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1213 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1214 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1215 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1216 preferred_console = 0;
1171 } 1217 }
1172 } 1218 }
@@ -1178,64 +1224,62 @@ void register_console(struct console *console)
1178 */ 1224 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1225 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1226 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1227 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1228 continue;
1183 if (console->index >= 0 && 1229 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1230 newcon->index != console_cmdline[i].index)
1185 continue; 1231 continue;
1186 if (console->index < 0) 1232 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1233 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1234#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1235 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1236 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1237 braille_register_console(newcon,
1192 console_cmdline[i].index, 1238 console_cmdline[i].index,
1193 console_cmdline[i].options, 1239 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1240 console_cmdline[i].brl_options);
1195 return; 1241 return;
1196 } 1242 }
1197#endif 1243#endif
1198 if (console->setup && 1244 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1245 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1246 break;
1201 console->flags |= CON_ENABLED; 1247 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1248 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1249 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1250 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1251 preferred_console = selected_console;
1206 } 1252 }
1207 break; 1253 break;
1208 } 1254 }
1209 1255
1210 if (!(console->flags & CON_ENABLED)) 1256 if (!(newcon->flags & CON_ENABLED))
1211 return; 1257 return;
1212 1258
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1259 /*
1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1260 * If we have a bootconsole, and are switching to a real console,
1215 bootconsole->name, bootconsole->index, 1261 * don't print everything out again, since when the boot console, and
1216 console->name, console->index); 1262 * the real console are the same physical device, it's annoying to
1217 unregister_console(bootconsole); 1263 * see the beginning boot messages twice
1218 console->flags &= ~CON_PRINTBUFFER; 1264 */
1219 } else { 1265 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1266 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1267
1224 /* 1268 /*
1225 * Put this console in the list - keep the 1269 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1270 * preferred driver at the head of the list.
1227 */ 1271 */
1228 acquire_console_sem(); 1272 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1273 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1274 newcon->next = console_drivers;
1231 console_drivers = console; 1275 console_drivers = newcon;
1232 if (console->next) 1276 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1277 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1278 } else {
1235 console->next = console_drivers->next; 1279 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1280 console_drivers->next = newcon;
1237 } 1281 }
1238 if (console->flags & CON_PRINTBUFFER) { 1282 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1283 /*
1240 * release_console_sem() will print out the buffered messages 1284 * release_console_sem() will print out the buffered messages
1241 * for us. 1285 * for us.
@@ -1245,6 +1289,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1289 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1290 }
1247 release_console_sem(); 1291 release_console_sem();
1292
1293 /*
1294 * By unregistering the bootconsoles after we enable the real console
1295 * we get the "console xxx enabled" message on all the consoles -
1296 * boot consoles, real consoles, etc - this is to ensure that end
1297 * users know there might be something in the kernel's log buffer that
1298 * went to the bootconsole (that they do not see on the real console)
1299 */
1300 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1301 /* we need to iterate through twice, to make sure we print
1302 * everything out, before we unregister the console(s)
1303 */
1304 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1305 newcon->name, newcon->index);
1306 for_each_console(bcon)
1307 if (bcon->flags & CON_BOOT)
1308 unregister_console(bcon);
1309 } else {
1310 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1311 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1312 newcon->name, newcon->index);
1313 }
1248} 1314}
1249EXPORT_SYMBOL(register_console); 1315EXPORT_SYMBOL(register_console);
1250 1316
@@ -1287,11 +1353,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1353
1288static int __init disable_boot_consoles(void) 1354static int __init disable_boot_consoles(void)
1289{ 1355{
1290 if (console_drivers != NULL) { 1356 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1357
1358 for_each_console(con) {
1359 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1360 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1361 con->name, con->index);
1294 return unregister_console(console_drivers); 1362 unregister_console(con);
1295 } 1363 }
1296 } 1364 }
1297 return 0; 1365 return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 419250ebec4d..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -442,48 +442,51 @@ void profile_tick(int type)
442 442
443#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
444#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
445#include <asm/uaccess.h> 446#include <asm/uaccess.h>
446 447
447static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
448 int count, int *eof, void *data)
449{ 449{
450 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
451 if (count - len < 2) 451 seq_putc(m, '\n');
452 return -EINVAL; 452 return 0;
453 len += sprintf(page + len, "\n");
454 return len;
455} 453}
456 454
457static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
458 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
459{ 462{
460 struct cpumask *mask = data;
461 unsigned long full_count = count, err;
462 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
463 465
464 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
465 return -ENOMEM; 467 return -ENOMEM;
466 468
467 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
468 if (!err) { 470 if (!err) {
469 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
470 err = full_count; 472 err = count;
471 } 473 }
472 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
473 return err; 475 return err;
474} 476}
475 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
476void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
477{ 487{
478 struct proc_dir_entry *entry;
479
480 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
481 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
482 if (!entry)
483 return;
484 entry->data = prof_cpu_mask;
485 entry->read_proc = prof_cpu_mask_read_proc;
486 entry->write_proc = prof_cpu_mask_write_proc;
487} 490}
488 491
489/* 492/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082c320e4dbf..307c285af59e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
152 if (!dumpable && !capable(CAP_SYS_PTRACE)) 152 if (!dumpable && !capable(CAP_SYS_PTRACE))
153 return -EPERM; 153 return -EPERM;
154 154
155 return security_ptrace_may_access(task, mode); 155 return security_ptrace_access_check(task, mode);
156} 156}
157 157
158bool ptrace_may_access(struct task_struct *task, unsigned int mode) 158bool ptrace_may_access(struct task_struct *task, unsigned int mode)
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in irqs disabled
119 * section, __stop_machine() is not exectuting, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * local variable "batch" and emits codes like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * that batch# = rdp->batch, see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
290
291/**
292 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
304 * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists out of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcasted to
383 * all cpus, they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcasted, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
411 * Barrier Otherwise it can cause tickless idle CPUs to be
412 * included in rcp->cpumask, which will extend graceperiods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
440 * switch). If so and if it already hasn't done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
458 * cacheline trashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
486 * locking requirements, the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
570 * move these entries to donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
609 * executed by the interrupted code must be see before any RCU
610 * grace-period manupulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
620 * executed by the interrupted code must be see after any RCU
621 * grace-period manupulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for then hasn't been started nor scheduled start
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes rcu mechanism. Assumed to be called early.
790 * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..37ac45483082 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * 22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers: 25 * Papers:
@@ -27,7 +27,7 @@
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 * 28 *
29 * For detailed explanation of Read-Copy Update mechanism see - 29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html 30 * http://lse.sourceforge.net/locking/rcupdate.html
31 * 31 *
32 */ 32 */
33#include <linux/types.h> 33#include <linux/types.h>
@@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head)
74 complete(&rcu->completion); 74 complete(&rcu->completion);
75} 75}
76 76
77#ifdef CONFIG_TREE_PREEMPT_RCU
78
77/** 79/**
78 * synchronize_rcu - wait until a grace period has elapsed. 80 * synchronize_rcu - wait until a grace period has elapsed.
79 * 81 *
@@ -87,7 +89,7 @@ void synchronize_rcu(void)
87{ 89{
88 struct rcu_synchronize rcu; 90 struct rcu_synchronize rcu;
89 91
90 if (rcu_blocking_is_gp()) 92 if (!rcu_scheduler_active)
91 return; 93 return;
92 94
93 init_completion(&rcu.completion); 95 init_completion(&rcu.completion);
@@ -98,6 +100,70 @@ void synchronize_rcu(void)
98} 100}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 101EXPORT_SYMBOL_GPL(synchronize_rcu);
100 102
103#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
104
105/**
106 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
107 *
108 * Control will return to the caller some time after a full rcu-sched
109 * grace period has elapsed, in other words after all currently executing
110 * rcu-sched read-side critical sections have completed. These read-side
111 * critical sections are delimited by rcu_read_lock_sched() and
112 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
113 * local_irq_disable(), and so on may be used in place of
114 * rcu_read_lock_sched().
115 *
116 * This means that all preempt_disable code sequences, including NMI and
117 * hardware-interrupt handlers, in progress on entry will have completed
118 * before this primitive returns. However, this does not guarantee that
119 * softirq handlers will have completed, since in some kernels, these
120 * handlers can run in process context, and can block.
121 *
122 * This primitive provides the guarantees made by the (now removed)
123 * synchronize_kernel() API. In contrast, synchronize_rcu() only
124 * guarantees that rcu_read_lock() sections will have completed.
125 * In "classic RCU", these two guarantees happen to be one and
126 * the same, but can differ in realtime RCU implementations.
127 */
128void synchronize_sched(void)
129{
130 struct rcu_synchronize rcu;
131
 /*
  * NOTE(review): rcu_blocking_is_gp() is defined elsewhere; presumably
  * it is true when blocking alone already implies a grace period, in
  * which case there is nothing to wait for.
  */
132 if (rcu_blocking_is_gp())
133 return;
134
135 init_completion(&rcu.completion);
136 /* wakeme_after_rcu() completes rcu.completion once it is invoked. */
137 call_rcu_sched(&rcu.head, wakeme_after_rcu);
138 /* Sleep until then; rcu is on this stack and must outlive the callback. */
139 wait_for_completion(&rcu.completion);
140}
141EXPORT_SYMBOL_GPL(synchronize_sched);
142
143/**
144 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
145 *
146 * Control will return to the caller some time after a full rcu_bh grace
147 * period has elapsed, in other words after all currently executing rcu_bh
148 * read-side critical sections have completed. RCU read-side critical
149 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
150 * and may be nested.
151 */
152void synchronize_rcu_bh(void)
153{
154 struct rcu_synchronize rcu;
155
 /*
  * NOTE(review): rcu_blocking_is_gp() is defined elsewhere; presumably
  * it is true when blocking alone already implies a grace period, in
  * which case there is nothing to wait for.
  */
156 if (rcu_blocking_is_gp())
157 return;
158
159 init_completion(&rcu.completion);
160 /* wakeme_after_rcu() completes rcu.completion once it is invoked. */
161 call_rcu_bh(&rcu.head, wakeme_after_rcu);
162 /* Sleep until then; rcu is on this stack and must outlive the callback. */
163 wait_for_completion(&rcu.completion);
164}
165EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
166
101static void rcu_barrier_callback(struct rcu_head *notused) 167static void rcu_barrier_callback(struct rcu_head *notused)
102{ 168{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 169 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +195,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 195static inline void wait_migrated_callbacks(void)
130{ 196{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 197 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
198 smp_mb(); /* In case we didn't sleep. */
132} 199}
133 200
134/* 201/*
@@ -192,9 +259,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 259 wake_up(&rcu_migrate_wq);
193} 260}
194 261
262extern int rcu_cpu_notify(struct notifier_block *self,
263 unsigned long action, void *hcpu);
264
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 265static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 266 unsigned long action, void *hcpu)
197{ 267{
268 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 269 if (action == CPU_DYING) {
199 /* 270 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 271 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +280,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 280 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 281 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 282 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 283 } else if (action == CPU_DOWN_PREPARE) {
284 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 285 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 286 wait_migrated_callbacks();
215 } 287 }
@@ -219,8 +291,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 291
220void __init rcu_init(void) 292void __init rcu_init(void)
221{ 293{
294 int i;
295
222 __rcu_init(); 296 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 297 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
298
299 /*
300 * We don't need protection against CPU-hotplug here because
301 * this is called early in boot, before either interrupts
302 * or the scheduler are operational.
303 */
304 for_each_online_cpu(i)
305 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 306}
225 307
226void rcu_scheduler_starting(void) 308void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
/*
 * Per-CPU callback lists and read-side counters (instantiated once per
 * CPU as rcu_data).  __rcu_advance_callbacks() moves callbacks along
 * nextlist -> waitlist[0] -> ... -> waitlist[GP_STAGES - 1] -> donelist.
 * Each "tail" member points at the ->next slot where the matching list
 * is extended.
 */
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount; /* Non-empty wait stages after the last advance. */
77 struct rcu_head *nextlist; /* Callbacks not yet in any wait stage. */
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES]; /* One list per wait stage. */
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2]; /* Reader counts, indexed by (completed & 0x1). */
 /*
  * NOTE(review): the *sched* members below are not used in the code
  * visible here; presumably they are the rcu-sched counterparts of
  * nextlist/waitlist.
  */
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
/*
 * Global grace-period state: the flip counter, the flip state machine
 * and the rcu_sched sleep state.  A single instance, rcu_ctrlblk, is
 * defined in this file.
 */
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
/*
 * Per-CPU dynticks and rcu-sched quiescent-state bookkeeping.
 */
150struct rcu_dyntick_sched {
151 int dynticks; /* Even while in stopped-ticks mode; starts at 1. */
152 int dynticks_snap; /* Set by dyntick_save_progress_counter(). */
153 int sched_qs; /* Bumped by rcu_qsctr_inc(). */
154 int sched_qs_snap; /* Set by save_qsctr_sched(). */
155 int sched_dynticks_snap; /* Set by dyntick_save_progress_counter_sched(). */
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
/*
 * Record one more rcu-sched quiescent state for @cpu by bumping its
 * sched_qs counter.  rcu_qsctr_inc_needed() later compares that counter
 * against the snapshot taken by save_qsctr_sched().
 */
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
/*
 * Note that this CPU is entering nohz mode: bump its dynticks counter,
 * which must then be even (a rate-limited warning fires otherwise).
 */
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
 /* An odd value here means enter/exit calls were unbalanced. */
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
/*
 * Note that this CPU is leaving nohz mode: bump its dynticks counter,
 * which must then be odd (a rate-limited warning fires otherwise).
 * The barrier follows the increment here, mirroring rcu_enter_nohz().
 */
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 /* An even value here means enter/exit calls were unbalanced. */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
 /* Plain read of the global flip counter; no lock is taken. */
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
/*
 * Enter an RCU read-side critical section for the current task.
 *
 * A nested call only bumps current->rcu_read_lock_nesting.  The
 * outermost call also increments this CPU's rcu_flipctr[] element
 * selected by the low bit of rcu_ctrlblk.completed, and records that
 * index in current->rcu_flipctr_idx for the matching unlock.
 */
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
298 * - If we get scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
303 * we pick the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
334 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
/*
 * Leave an RCU read-side critical section for the current task.
 *
 * A nested call only decrements current->rcu_read_lock_nesting.  The
 * outermost call also decrements this CPU's rcu_flipctr[] element at
 * the index saved in current->rcu_flipctr_idx by __rcu_read_lock().
 * That is the local CPU at unlock time, not necessarily the CPU that
 * did the increment; the grace-period code only checks the sum.
 */
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * The nesting count has now been decremented.
403 * NMIs that occur after this statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0; /* Counts the wait stages left non-empty. */
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
 /* Oldest wait stage: splice it onto the tail of donelist. */
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
 /* Age the remaining stages by one, highest index first. */
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
 /* Empty list: the tail points back at its own head. */
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
 /* Callbacks on nextlist enter wait stage 0. */
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
 /* Remember which flip these lists have been advanced for. */
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
 /*
  * A non-zero rcu_update_flag means an outer handler already
  * interrupted the idle state; just count this nesting level so
  * that rcu_irq_exit() can tell which exit is the outermost one.
  */
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI which is the most important
506 * part.
507 *
508 * The only thing is that we would bring back the counter
509 * to a position that it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513 * On return from the IRQ, the counter may have the zero
514 * bit be 0 and the counter the same as the return from
515 * the NMI/SMI. If the state machine was so unlucky to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
546
547/**
548 * rcu_irq_exit - Called from exiting Hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563 * because a NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
 /* Still nested inside the handler that left idle: not our job. */
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
 /* Back in stopped-ticks mode, so the counter must be even again. */
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
/* NMI entry hook: same dynticks bookkeeping as a hard irq entry. */
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
/* NMI exit hook: same dynticks bookkeeping as a hard irq exit. */
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
/*
 * Snapshot @cpu's dynticks counter into dynticks_snap, for later
 * comparison by rcu_try_flip_waitack_needed() and
 * rcu_try_flip_waitmb_needed().
 */
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
/*
 * Decide whether @cpu must explicitly acknowledge the counter flip.
 * Compares the CPU's current dynticks value with the snapshot taken by
 * dyntick_save_progress_counter().
 *
 * Returns 1 if an explicit acknowledgement is still required, 0 if the
 * dynticks state lets us treat the CPU as having acknowledged.
 */
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
/*
 * Decide whether @cpu must still execute the end-of-grace-period
 * memory barrier.  Compares the CPU's current dynticks value with the
 * snapshot taken by dyntick_save_progress_counter().
 *
 * Returns 1 if the barrier is still required, 0 otherwise.
 */
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
683
/*
 * Snapshot @cpu's dynticks counter into sched_dynticks_snap, for later
 * comparison by rcu_qsctr_inc_needed_dyntick().
 */
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
/*
 * Decide, from dynticks state alone, whether we must still wait for
 * @cpu to pass through a quiescent state.  Compares the CPU's current
 * dynticks value with the snapshot taken by
 * dyntick_save_progress_counter_sched().
 *
 * Returns 1 if we must still wait for the CPU, 0 otherwise.
 */
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
 /*
  * Unlike rcu_try_flip_waitack_needed(), the parity test here is on
  * snap: an even snapshot means the CPU was in stopped-ticks mode
  * when the snapshot was taken.
  */
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
/*
 * Snapshot @cpu's sched_qs counter into sched_qs_snap, for later
 * comparison by rcu_qsctr_inc_needed().
 */
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
/*
 * Check whether @cpu has recorded a quiescent state (sched_qs changed)
 * since save_qsctr_sched() took its snapshot.
 *
 * Returns 1 if we must still wait for the CPU, 0 otherwise.
 */
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
 *
 * Returns 1 after flipping the counter and requesting an
 * acknowledgement from every CPU in rcu_cpu_online_map, or 0 if
 * rcu_pending() reports nothing for this CPU.
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
 /* Snapshot used later by rcu_try_flip_waitack_needed(). */
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
 *
 * Returns 0 while some CPU in rcu_cpu_online_map still needs to
 * acknowledge and has not yet set its rcu_flip_flag to rcu_flip_seen,
 * or 1 once no such CPU remains.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
 *
 * Returns 0 while the sum of the ``last'' counters is non-zero, or 1
 * once it is zero and the memory barriers have been requested.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
 /* Index opposite to the one __rcu_read_lock() currently selects. */
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
 /* Snapshot used later by rcu_try_flip_waitmb_needed(). */
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return 0 once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969 * this CPU has to have exited all prior preept-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
/*
 * Append the whole list (srclist, srctail) to the list ending at
 * dsttail and leave the source list empty.  All four arguments must be
 * lvalues: dsttail, srclist and srctail are assigned to.
 */
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail)	\
	do {								\
		*(dsttail) = (srclist);					\
		if ((srclist) != NULL) {				\
			(dsttail) = (srctail);				\
			(srclist) = NULL;				\
			(srctail) = &(srclist);				\
		}							\
	} while (0)
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042 * Otherwise rcu_barrier() will fail
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
/* Without CONFIG_HOTPLUG_CPU there is nothing to do here. */
void rcu_offline_cpu(int cpu)
{
}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The current CPU might have already gone
1300 * offline (between the for_each_offline_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's mask in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
#ifdef CONFIG_RCU_TRACE
/* Return the address of the first of @cpu's two flip counters. */
long *rcupreempt_flipctr(int cpu)
{
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	return &rdp->rcu_flipctr[0];
}
EXPORT_SYMBOL_GPL(rcupreempt_flipctr);

/* Return the current value of @cpu's rcu_flip_flag. */
int rcupreempt_flip_flag(int cpu)
{
	int flag = per_cpu(rcu_flip_flag, cpu);

	return flag;
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);

/* Return the current value of @cpu's rcu_mb_flag. */
int rcupreempt_mb_flag(int cpu)
{
	int flag = per_cpu(rcu_mb_flag, cpu);

	return flag;
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);

/* Return the printable name of the current flip state-machine state. */
char *rcupreempt_try_flip_state_name(void)
{
	char *name = rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];

	return name;
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);

/* Return the address of @cpu's tracing counters. */
struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
{
	return &RCU_DATA_CPU(cpu)->trace;
}
EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);

#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..233768f21f97 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -50,7 +50,7 @@
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
110}; 110};
111 111
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version;
115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116static DEFINE_SPINLOCK(rcu_torture_lock); 116static DEFINE_SPINLOCK(rcu_torture_lock);
117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
124static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
125static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask; 129static cpumask_var_t shuffle_tmp_mask;
130 130
131static int stutter_pause_test = 0; 131static int stutter_pause_test;
132 132
133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134#define RCUTORTURE_RUNNABLE_INIT 1 134#define RCUTORTURE_RUNNABLE_INIT 1
@@ -257,17 +257,18 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270
271static struct rcu_torture_ops *cur_ops;
271 272
272/* 273/*
273 * Definitions for rcu torture testing. 274 * Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
281 282
282static void rcu_read_delay(struct rcu_random_state *rrsp) 283static void rcu_read_delay(struct rcu_random_state *rrsp)
283{ 284{
284 long delay; 285 const unsigned long shortdelay_us = 200;
285 const long longdelay = 200; 286 const unsigned long longdelay_ms = 50;
286 287
287 /* We want there to be long-running readers, but not all the time. */ 288 /* We want a short delay sometimes to make a reader delay the grace
289 * period, and we want a long delay occasionally to trigger
290 * force_quiescent_state. */
288 291
289 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); 292 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
290 if (!delay) 293 mdelay(longdelay_ms);
291 udelay(longdelay); 294 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
295 udelay(shortdelay_us);
292} 296}
293 297
294static void rcu_torture_read_unlock(int idx) __releases(RCU) 298static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -320,7 +324,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 324 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 325 rcu_torture_free(rp);
322 } else 326 } else
323 cur_ops->deferredfree(rp); 327 cur_ops->deferred_free(rp);
324} 328}
325 329
326static void rcu_torture_deferred_free(struct rcu_torture *p) 330static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +333,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 333}
330 334
331static struct rcu_torture_ops rcu_ops = { 335static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 336 .init = NULL,
333 .cleanup = NULL, 337 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 338 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 339 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 340 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 341 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 342 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 343 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 344 .cb_barrier = rcu_barrier,
341 .stats = NULL, 345 .stats = NULL,
342 .irqcapable = 1, 346 .irq_capable = 1,
343 .name = "rcu" 347 .name = "rcu"
344}; 348};
345 349
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 350static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +374,18 @@ static void rcu_sync_torture_init(void)
370} 374}
371 375
372static struct rcu_torture_ops rcu_sync_ops = { 376static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 377 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 378 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 379 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 380 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 381 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 382 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 383 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 384 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 385 .cb_barrier = NULL,
382 .stats = NULL, 386 .stats = NULL,
383 .irqcapable = 1, 387 .irq_capable = 1,
384 .name = "rcu_sync" 388 .name = "rcu_sync"
385}; 389};
386 390
387/* 391/*
@@ -432,33 +436,33 @@ static void rcu_bh_torture_synchronize(void)
432} 436}
433 437
434static struct rcu_torture_ops rcu_bh_ops = { 438static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 439 .init = NULL,
436 .cleanup = NULL, 440 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 441 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 442 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 443 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 444 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 445 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 446 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 447 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 448 .stats = NULL,
445 .irqcapable = 1, 449 .irq_capable = 1,
446 .name = "rcu_bh" 450 .name = "rcu_bh"
447}; 451};
448 452
449static struct rcu_torture_ops rcu_bh_sync_ops = { 453static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 455 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 456 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 457 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 458 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 459 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 460 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 461 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 462 .cb_barrier = NULL,
459 .stats = NULL, 463 .stats = NULL,
460 .irqcapable = 1, 464 .irq_capable = 1,
461 .name = "rcu_bh_sync" 465 .name = "rcu_bh_sync"
462}; 466};
463 467
464/* 468/*
@@ -530,17 +534,17 @@ static int srcu_torture_stats(char *page)
530} 534}
531 535
532static struct rcu_torture_ops srcu_ops = { 536static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 537 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 538 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 539 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 540 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 541 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 542 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 543 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 544 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 545 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 546 .stats = srcu_torture_stats,
543 .name = "srcu" 547 .name = "srcu"
544}; 548};
545 549
546/* 550/*
@@ -574,32 +578,49 @@ static void sched_torture_synchronize(void)
574} 578}
575 579
576static struct rcu_torture_ops sched_ops = { 580static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 581 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 582 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 583 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 585 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 586 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 587 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 588 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 589 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 590 .stats = NULL,
587 .irqcapable = 1, 591 .irq_capable = 1,
588 .name = "sched" 592 .name = "sched"
589}; 593};
590 594
591static struct rcu_torture_ops sched_ops_sync = { 595static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 596 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 597 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 598 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 600 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 601 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 602 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 603 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 604 .cb_barrier = NULL,
601 .stats = NULL, 605 .stats = NULL,
602 .name = "sched_sync" 606 .name = "sched_sync"
607};
608
609extern int rcu_expedited_torture_stats(char *page);
610
611static struct rcu_torture_ops sched_expedited_ops = {
612 .init = rcu_sync_torture_init,
613 .cleanup = NULL,
614 .readlock = sched_torture_read_lock,
615 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
616 .readunlock = sched_torture_read_unlock,
617 .completed = sched_torture_completed,
618 .deferred_free = rcu_sync_torture_deferred_free,
619 .sync = synchronize_sched_expedited,
620 .cb_barrier = NULL,
621 .stats = rcu_expedited_torture_stats,
622 .irq_capable = 1,
623 .name = "sched_expedited"
603}; 624};
604 625
605/* 626/*
@@ -621,7 +642,8 @@ rcu_torture_writer(void *arg)
621 642
622 do { 643 do {
623 schedule_timeout_uninterruptible(1); 644 schedule_timeout_uninterruptible(1);
624 if ((rp = rcu_torture_alloc()) == NULL) 645 rp = rcu_torture_alloc();
646 if (rp == NULL)
625 continue; 647 continue;
626 rp->rtort_pipe_count = 0; 648 rp->rtort_pipe_count = 0;
627 udelay(rcu_random(&rand) & 0x3ff); 649 udelay(rcu_random(&rand) & 0x3ff);
@@ -635,7 +657,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 657 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 658 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 659 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 660 cur_ops->deferred_free(old_rp);
639 } 661 }
640 rcu_torture_current_version++; 662 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 663 oldbatch = cur_ops->completed();
@@ -700,7 +722,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 722 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 723 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 724 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 725 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 726 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 727 spin_unlock(&rand_lock);
706 preempt_disable(); 728 preempt_disable();
@@ -738,11 +760,11 @@ rcu_torture_reader(void *arg)
738 760
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 761 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 762 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 763 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 764 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 765
744 do { 766 do {
745 if (irqreader && cur_ops->irqcapable) { 767 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 768 if (!timer_pending(&t))
747 mod_timer(&t, 1); 769 mod_timer(&t, 1);
748 } 770 }
@@ -757,7 +779,7 @@ rcu_torture_reader(void *arg)
757 } 779 }
758 if (p->rtort_mbtest == 0) 780 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 781 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 782 cur_ops->read_delay(&rand);
761 preempt_disable(); 783 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 784 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 785 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +800,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 800 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 801 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 802 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 803 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 804 del_timer_sync(&t);
783 while (!kthread_should_stop()) 805 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 806 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1100,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1100 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1101 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1102 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1103 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1104 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1105
1083 mutex_lock(&fullstop_mutex); 1106 mutex_lock(&fullstop_mutex);
@@ -1092,7 +1115,7 @@ rcu_torture_init(void)
1092 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1115 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1093 torture_type); 1116 torture_type);
1094 mutex_unlock(&fullstop_mutex); 1117 mutex_unlock(&fullstop_mutex);
1095 return (-EINVAL); 1118 return -EINVAL;
1096 } 1119 }
1097 if (cur_ops->init) 1120 if (cur_ops->init)
1098 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1121 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1143,7 +1166,7 @@ rcu_torture_init(void)
1143 goto unwind; 1166 goto unwind;
1144 } 1167 }
1145 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1168 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1146 GFP_KERNEL); 1169 GFP_KERNEL);
1147 if (fakewriter_tasks == NULL) { 1170 if (fakewriter_tasks == NULL) {
1148 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1171 VERBOSE_PRINTK_ERRSTRING("out of memory");
1149 firsterr = -ENOMEM; 1172 firsterr = -ENOMEM;
@@ -1152,7 +1175,7 @@ rcu_torture_init(void)
1152 for (i = 0; i < nfakewriters; i++) { 1175 for (i = 0; i < nfakewriters; i++) {
1153 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1176 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1154 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1177 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1155 "rcu_torture_fakewriter"); 1178 "rcu_torture_fakewriter");
1156 if (IS_ERR(fakewriter_tasks[i])) { 1179 if (IS_ERR(fakewriter_tasks[i])) {
1157 firsterr = PTR_ERR(fakewriter_tasks[i]); 1180 firsterr = PTR_ERR(fakewriter_tasks[i]);
1158 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1181 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..52b06f6e158c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,55 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 struct rcu_data *rdp;
90 rdp->passed_quiesc = 1; 111
112 rdp = &per_cpu(rcu_sched_data, cpu);
91 rdp->passed_quiesc_completed = rdp->completed; 113 rdp->passed_quiesc_completed = rdp->completed;
114 barrier();
115 rdp->passed_quiesc = 1;
116 rcu_preempt_note_context_switch(cpu);
92} 117}
93 118
94void rcu_bh_qsctr_inc(int cpu) 119void rcu_bh_qs(int cpu)
95{ 120{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 121 struct rcu_data *rdp;
97 rdp->passed_quiesc = 1; 122
123 rdp = &per_cpu(rcu_bh_data, cpu);
98 rdp->passed_quiesc_completed = rdp->completed; 124 rdp->passed_quiesc_completed = rdp->completed;
125 barrier();
126 rdp->passed_quiesc = 1;
99} 127}
100 128
101#ifdef CONFIG_NO_HZ 129#ifdef CONFIG_NO_HZ
@@ -110,15 +138,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 138static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 139
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 140static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
141static int rcu_pending(int cpu);
113 142
114/* 143/*
115 * Return the number of RCU batches processed thus far for debug & stats. 144 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 145 */
117long rcu_batches_completed(void) 146long rcu_batches_completed_sched(void)
118{ 147{
119 return rcu_state.completed; 148 return rcu_sched_state.completed;
120} 149}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 150EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 151
123/* 152/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 153 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +210,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 210 return 1;
182 } 211 }
183 212
213 /* If preemptable RCU, no point in sending reschedule IPI. */
214 if (rdp->preemptable)
215 return 0;
216
184 /* The CPU is online, so send it a reschedule IPI. */ 217 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 218 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 219 smp_send_reschedule(rdp->cpu);
@@ -193,7 +226,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 226#endif /* #ifdef CONFIG_SMP */
194 227
195#ifdef CONFIG_NO_HZ 228#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 229
198/** 230/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 231 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +245,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 245 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 246 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 247 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 248 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 249 local_irq_restore(flags);
218} 250}
219 251
@@ -232,7 +264,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 264 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 265 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 266 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 267 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 268 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 269 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 270}
@@ -251,7 +283,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 283 if (rdtp->dynticks & 0x1)
252 return; 284 return;
253 rdtp->dynticks_nmi++; 285 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 286 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 287 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 288}
257 289
@@ -270,7 +302,7 @@ void rcu_nmi_exit(void)
270 return; 302 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 303 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 304 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 305 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 306}
275 307
276/** 308/**
@@ -286,7 +318,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 318 if (rdtp->dynticks_nesting++)
287 return; 319 return;
288 rdtp->dynticks++; 320 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 321 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 322 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 323}
292 324
@@ -305,10 +337,10 @@ void rcu_irq_exit(void)
305 return; 337 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 338 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 339 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 340 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 341
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 342 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 343 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 344 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 345 set_need_resched();
314} 346}
@@ -461,6 +493,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 493
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 494 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 495 for (; rnp_cur < rnp_end; rnp_cur++) {
496 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 497 if (rnp_cur->qsmask == 0)
465 continue; 498 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 499 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +502,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 502 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 503 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 504 smp_processor_id(), (long)(jiffies - rsp->gp_start));
505 trigger_all_cpu_backtrace();
506
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 507 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 508}
474 509
@@ -479,12 +514,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 514
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 516 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 517 trigger_all_cpu_backtrace();
518
483 spin_lock_irqsave(&rnp->lock, flags); 519 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 520 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 521 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 522 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 523 spin_unlock_irqrestore(&rnp->lock, flags);
524
488 set_need_resched(); /* kick ourselves to get things going. */ 525 set_need_resched(); /* kick ourselves to get things going. */
489} 526}
490 527
@@ -564,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
564{ 601{
565 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 602 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
566 struct rcu_node *rnp = rcu_get_root(rsp); 603 struct rcu_node *rnp = rcu_get_root(rsp);
567 struct rcu_node *rnp_cur;
568 struct rcu_node *rnp_end;
569 604
570 if (!cpu_needs_another_gp(rsp, rdp)) { 605 if (!cpu_needs_another_gp(rsp, rdp)) {
571 spin_unlock_irqrestore(&rnp->lock, flags); 606 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -574,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
574 609
575 /* Advance to a new grace period and initialize state. */ 610 /* Advance to a new grace period and initialize state. */
576 rsp->gpnum++; 611 rsp->gpnum++;
612 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
577 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 613 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
578 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 614 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
579 record_gp_stall_check_time(rsp); 615 record_gp_stall_check_time(rsp);
@@ -590,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
590 626
591 /* Special-case the common single-level case. */ 627 /* Special-case the common single-level case. */
592 if (NUM_RCU_NODES == 1) { 628 if (NUM_RCU_NODES == 1) {
629 rcu_preempt_check_blocked_tasks(rnp);
593 rnp->qsmask = rnp->qsmaskinit; 630 rnp->qsmask = rnp->qsmaskinit;
631 rnp->gpnum = rsp->gpnum;
594 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 632 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
595 spin_unlock_irqrestore(&rnp->lock, flags); 633 spin_unlock_irqrestore(&rnp->lock, flags);
596 return; 634 return;
@@ -603,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
603 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 641 spin_lock(&rsp->onofflock); /* irqs already disabled. */
604 642
605 /* 643 /*
606 * Set the quiescent-state-needed bits in all the non-leaf RCU 644 * Set the quiescent-state-needed bits in all the rcu_node
607 * nodes for all currently online CPUs. This operation relies 645 * structures for all currently online CPUs in breadth-first
608 * on the layout of the hierarchy within the rsp->node[] array. 646 * order, starting from the root rcu_node structure. This
609 * Note that other CPUs will access only the leaves of the 647 * operation relies on the layout of the hierarchy within the
610 * hierarchy, which still indicate that no grace period is in 648 * rsp->node[] array. Note that other CPUs will access only
611 * progress. In addition, we have excluded CPU-hotplug operations. 649 * the leaves of the hierarchy, which still indicate that no
612 * 650 * grace period is in progress, at least until the corresponding
613 * We therefore do not need to hold any locks. Any required 651 * leaf node has been initialized. In addition, we have excluded
614 * memory barriers will be supplied by the locks guarding the 652 * CPU-hotplug operations.
615 * leaf rcu_nodes in the hierarchy.
616 */
617
618 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
619 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621
622 /*
623 * Now set up the leaf nodes. Here we must be careful. First,
624 * we need to hold the lock in order to exclude other CPUs, which
625 * might be contending for the leaf nodes' locks. Second, as
626 * soon as we initialize a given leaf node, its CPUs might run
627 * up the rest of the hierarchy. We must therefore acquire locks
628 * for each node that we touch during this stage. (But we still
629 * are excluding CPU-hotplug operations.)
630 * 653 *
631 * Note that the grace period cannot complete until we finish 654 * Note that the grace period cannot complete until we finish
632 * the initialization process, as there will be at least one 655 * the initialization process, as there will be at least one
633 * qsmask bit set in the root node until that time, namely the 656 * qsmask bit set in the root node until that time, namely the
634 * one corresponding to this CPU. 657 * one corresponding to this CPU, due to the fact that we have
658 * irqs disabled.
635 */ 659 */
636 rnp_end = &rsp->node[NUM_RCU_NODES]; 660 for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
637 rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 661 spin_lock(&rnp->lock); /* irqs already disabled. */
638 for (; rnp_cur < rnp_end; rnp_cur++) { 662 rcu_preempt_check_blocked_tasks(rnp);
639 spin_lock(&rnp_cur->lock); /* irqs already disabled. */ 663 rnp->qsmask = rnp->qsmaskinit;
640 rnp_cur->qsmask = rnp_cur->qsmaskinit; 664 rnp->gpnum = rsp->gpnum;
641 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ 665 spin_unlock(&rnp->lock); /* irqs already disabled. */
642 } 666 }
643 667
644 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 668 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
@@ -674,6 +698,20 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 698}
675 699
676/* 700/*
701 * Clean up after the prior grace period and let rcu_start_gp() start up
702 * the next grace period if one is needed. Note that the caller must
703 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
704 */
705static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
706 __releases(rnp->lock)
707{
708 WARN_ON_ONCE(rsp->completed == rsp->gpnum);
709 rsp->completed = rsp->gpnum;
710 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
711 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
712}
713
714/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 715 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 716 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 717 * group must be represented by the same leaf rcu_node structure.
@@ -685,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
685 unsigned long flags) 723 unsigned long flags)
686 __releases(rnp->lock) 724 __releases(rnp->lock)
687{ 725{
726 struct rcu_node *rnp_c;
727
688 /* Walk up the rcu_node hierarchy. */ 728 /* Walk up the rcu_node hierarchy. */
689 for (;;) { 729 for (;;) {
690 if (!(rnp->qsmask & mask)) { 730 if (!(rnp->qsmask & mask)) {
@@ -694,7 +734,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 734 return;
695 } 735 }
696 rnp->qsmask &= ~mask; 736 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 737 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 738
699 /* Other bits still set at this level, so done. */ 739 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 740 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -708,28 +748,26 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
708 break; 748 break;
709 } 749 }
710 spin_unlock_irqrestore(&rnp->lock, flags); 750 spin_unlock_irqrestore(&rnp->lock, flags);
751 rnp_c = rnp;
711 rnp = rnp->parent; 752 rnp = rnp->parent;
712 spin_lock_irqsave(&rnp->lock, flags); 753 spin_lock_irqsave(&rnp->lock, flags);
754 WARN_ON_ONCE(rnp_c->qsmask);
713 } 755 }
714 756
715 /* 757 /*
716 * Get here if we are the last CPU to pass through a quiescent 758 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 759 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 760 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 761 */
722 rsp->completed = rsp->gpnum; 762 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 763}
726 764
727/* 765/*
728 * Record a quiescent state for the specified CPU, which must either be 766 * Record a quiescent state for the specified CPU, which must either be
729 * the current CPU or an offline CPU. The lastcomp argument is used to 767 * the current CPU. The lastcomp argument is used to make sure we are
730 * make sure we are still in the grace period of interest. We don't want 768 * still in the grace period of interest. We don't want to end the current
731 * to end the current grace period based on quiescent states detected in 769 * grace period based on quiescent states detected in an earlier grace
732 * an earlier grace period! 770 * period!
733 */ 771 */
734static void 772static void
735cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 773cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -764,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
764 * This GP can't end until cpu checks in, so all of our 802 * This GP can't end until cpu checks in, so all of our
765 * callbacks can be processed during the next GP. 803 * callbacks can be processed during the next GP.
766 */ 804 */
767 rdp = rsp->rda[smp_processor_id()];
768 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 805 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
769 806
770 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 807 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -822,30 +859,28 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
822 spin_lock_irqsave(&rsp->onofflock, flags); 859 spin_lock_irqsave(&rsp->onofflock, flags);
823 860
824 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 861 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
825 rnp = rdp->mynode; 862 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
826 mask = rdp->grpmask; /* rnp->grplo is constant. */ 863 mask = rdp->grpmask; /* rnp->grplo is constant. */
827 do { 864 do {
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 865 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 866 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 867 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 868 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 869 break;
833 } 870 }
871 rcu_preempt_offline_tasks(rsp, rnp, rdp);
834 mask = rnp->grpmask; 872 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 873 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 874 rnp = rnp->parent;
837 } while (rnp != NULL); 875 } while (rnp != NULL);
838 lastcomp = rsp->completed; 876 lastcomp = rsp->completed;
839 877
840 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 878 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
841 879
842 /* Being offline is a quiescent state, so go record it. */
843 cpu_quiet(cpu, rsp, rdp, lastcomp);
844
845 /* 880 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 881 * Move callbacks from the outgoing CPU to the running CPU.
846 * Note that the outgoing CPU is now quiescent, so it is now 882 * Note that the outgoing CPU is now quiescent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 883 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 884 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 885 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 886 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +911,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 911 */
877static void rcu_offline_cpu(int cpu) 912static void rcu_offline_cpu(int cpu)
878{ 913{
879 __rcu_offline_cpu(cpu, &rcu_state); 914 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 915 __rcu_offline_cpu(cpu, &rcu_bh_state);
916 rcu_preempt_offline_cpu(cpu);
881} 917}
882 918
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 919#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +999,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 999 */
964void rcu_check_callbacks(int cpu, int user) 1000void rcu_check_callbacks(int cpu, int user)
965{ 1001{
1002 if (!rcu_pending(cpu))
1003 return; /* if nothing for RCU to do. */
966 if (user || 1004 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1005 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1006 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1009,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1009 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1010 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1011 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1012 * a quiescent state, so note it.
975 * 1013 *
976 * No memory barrier is required here because both 1014 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1015 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1016 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1017 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1018 */
982 1019
983 rcu_qsctr_inc(cpu); 1020 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1021 rcu_bh_qs(cpu);
985 1022
986 } else if (!in_softirq()) { 1023 } else if (!in_softirq()) {
987 1024
@@ -989,11 +1026,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1026 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1027 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1028 * a rcu_bh read-side critical section. This is an _bh
992 * quiescent state, so count it. 1029 * quiescent state, so note it.
993 */ 1030 */
994 1031
995 rcu_bh_qsctr_inc(cpu); 1032 rcu_bh_qs(cpu);
996 } 1033 }
1034 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1035 raise_softirq(RCU_SOFTIRQ);
998} 1036}
999 1037
@@ -1132,6 +1170,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1170{
1133 unsigned long flags; 1171 unsigned long flags;
1134 1172
1173 WARN_ON_ONCE(rdp->beenonline == 0);
1174
1135 /* 1175 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1176 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1177 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1210,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1210 */
1171 smp_mb(); /* See above block comment. */ 1211 smp_mb(); /* See above block comment. */
1172 1212
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1213 __rcu_process_callbacks(&rcu_sched_state,
1214 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1215 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1216 rcu_preempt_process_callbacks();
1175 1217
1176 /* 1218 /*
1177 * Memory references from any later RCU read-side critical sections 1219 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1269,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1269}
1228 1270
1229/* 1271/*
1230 * Queue an RCU callback for invocation after a grace period. 1272 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1273 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1274void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1275{
1234 __call_rcu(head, func, &rcu_state); 1276 __call_rcu(head, func, &rcu_sched_state);
1235} 1277}
1236EXPORT_SYMBOL_GPL(call_rcu); 1278EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1279
1238/* 1280/*
1239 * Queue an RCU callback for invocation after a quicker grace period. 1281 * Queue an RCU callback for invocation after a quicker grace period.
@@ -1305,10 +1347,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1347 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1348 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1349 */
1308int rcu_pending(int cpu) 1350static int rcu_pending(int cpu)
1309{ 1351{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1352 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1353 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1354 rcu_preempt_pending(cpu);
1312} 1355}
1313 1356
1314/* 1357/*
@@ -1320,27 +1363,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1363int rcu_needs_cpu(int cpu)
1321{ 1364{
1322 /* RCU callbacks either ready or pending? */ 1365 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1366 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1367 per_cpu(rcu_bh_data, cpu).nxtlist ||
1368 rcu_preempt_needs_cpu(cpu);
1325} 1369}
1326 1370
1327/* 1371/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1372 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1373 */
1339static void __cpuinit 1374static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1375rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1376{
1342 unsigned long flags; 1377 unsigned long flags;
1343 int i; 1378 int i;
1379 struct rcu_data *rdp = rsp->rda[cpu];
1380 struct rcu_node *rnp = rcu_get_root(rsp);
1381
1382 /* Set up local state, ensuring consistent view of global state. */
1383 spin_lock_irqsave(&rnp->lock, flags);
1384 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1385 rdp->nxtlist = NULL;
1386 for (i = 0; i < RCU_NEXT_SIZE; i++)
1387 rdp->nxttail[i] = &rdp->nxtlist;
1388 rdp->qlen = 0;
1389#ifdef CONFIG_NO_HZ
1390 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1391#endif /* #ifdef CONFIG_NO_HZ */
1392 rdp->cpu = cpu;
1393 spin_unlock_irqrestore(&rnp->lock, flags);
1394}
1395
1396/*
1397 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1398 * offline event can be happening at a given time. Note also that we
1399 * can accept some slop in the rsp->completed access due to the fact
1400 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1401 */
1402static void __cpuinit
1403rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1404{
1405 unsigned long flags;
1344 long lastcomp; 1406 long lastcomp;
1345 unsigned long mask; 1407 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1408 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1416,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1416 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1417 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1418 rdp->beenonline = 1; /* We have now been online. */
1419 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1420 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1421 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1422 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1423
1370 /* 1424 /*
@@ -1387,34 +1441,21 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1387 rnp = rnp->parent; 1441 rnp = rnp->parent;
1388 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1442 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1389 1443
1390 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1444 spin_unlock_irqrestore(&rsp->onofflock, flags);
1391
1392 /*
1393 * A new grace period might start here. If so, we will be part of
1394 * it, and its gpnum will be greater than ours, so we will
1395 * participate. It is also possible for the gpnum to have been
1396 * incremented before this function was called, and the bitmasks
1397 * to not be filled out until now, in which case we will also
1398 * participate due to our gpnum being behind.
1399 */
1400
1401 /* Since it is coming online, the CPU is in a quiescent state. */
1402 cpu_quiet(cpu, rsp, rdp, lastcomp);
1403 local_irq_restore(flags);
1404} 1445}
1405 1446
1406static void __cpuinit rcu_online_cpu(int cpu) 1447static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1448{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1449 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1450 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1451 rcu_preempt_init_percpu_data(cpu);
1411} 1452}
1412 1453
1413/* 1454/*
1414 * Handle CPU online/offline notifcation events. 1455 * Handle CPU online/offline notification events.
1415 */ 1456 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1457int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1458 unsigned long action, void *hcpu)
1418{ 1459{
1419 long cpu = (long)hcpu; 1460 long cpu = (long)hcpu;
1420 1461
@@ -1486,6 +1527,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1527 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1528 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1529 spin_lock_init(&rnp->lock);
1530 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1531 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1532 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1533 rnp->grplo = j * cpustride;
@@ -1503,16 +1545,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1545 j / rsp->levelspread[i - 1];
1504 } 1546 }
1505 rnp->level = i; 1547 rnp->level = i;
1548 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1549 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1550 }
1507 } 1551 }
1508} 1552}
1509 1553
1510/* 1554/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1555 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1556 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1557 * structure.
1513 */ 1558 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1559#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1560do { \
1561 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1562 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1563 j = 0; \
1518 for_each_possible_cpu(i) { \ 1564 for_each_possible_cpu(i) { \
@@ -1520,32 +1566,43 @@ do { \
1520 j++; \ 1566 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1567 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1568 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1569 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1570 } \
1524} while (0) 1571} while (0)
1525 1572
1526static struct notifier_block __cpuinitdata rcu_nb = { 1573#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1574
1528}; 1575void __init __rcu_init_preempt(void)
1576{
1577 int i; /* All used by RCU_INIT_FLAVOR(). */
1578 int j;
1579 struct rcu_node *rnp;
1580
1581 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1582}
1583
1584#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1585
1586void __init __rcu_init_preempt(void)
1587{
1588}
1589
1590#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1591
1530void __init __rcu_init(void) 1592void __init __rcu_init(void)
1531{ 1593{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1594 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1595 int j;
1534 struct rcu_node *rnp; 1596 struct rcu_node *rnp;
1535 1597
1536 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 1598 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1599#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1600 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1601#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1602 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1603 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1604 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1605 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549} 1606}
1550 1607
1551module_param(blimit, int, 0); 1608module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..8e8287a983c2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries whose batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries whose batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
174
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..1cee04f627eb
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,566 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed;
71 barrier();
72 rdp->passed_quiesc = 1;
73}
74
75/*
76 * We have entered the scheduler, and the current task might soon be
77 * context-switched away from. If this task is in an RCU read-side
78 * critical section, we will no longer be able to rely on the CPU to
79 * record that fact, so we enqueue the task on the appropriate entry
80 * of the blocked_tasks[] array. The task will dequeue itself when
81 * it exits the outermost enclosing RCU read-side critical section.
82 * Therefore, the current grace period cannot be permitted to complete
83 * until the blocked_tasks[] entry indexed by the low-order bit of
84 * rnp->gpnum empties.
85 *
86 * Caller must disable preemption.
87 */
88static void rcu_preempt_note_context_switch(int cpu)
89{
90 struct task_struct *t = current;
91 unsigned long flags;
92 int phase;
93 struct rcu_data *rdp;
94 struct rcu_node *rnp;
95
96 if (t->rcu_read_lock_nesting &&
97 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
98
99 /* Possibly blocking in an RCU read-side critical section. */
100 rdp = rcu_preempt_state.rda[cpu];
101 rnp = rdp->mynode;
102 spin_lock_irqsave(&rnp->lock, flags);
103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
104 t->rcu_blocked_node = rnp;
105
106 /*
107 * If this CPU has already checked in, then this task
108 * will hold up the next grace period rather than the
109 * current grace period. Queue the task accordingly.
110 * If the task is queued for the current grace period
111 * (i.e., this CPU has not yet passed through a quiescent
112 * state for the current grace period), then as long
113 * as that task remains queued, the current grace period
114 * cannot end.
115 *
116 * But first, note that the current CPU must still be
117 * on line!
118 */
119 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
120 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
121 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
123 spin_unlock_irqrestore(&rnp->lock, flags);
124 }
125
126 /*
127 * Either we were not in an RCU read-side critical section to
128 * begin with, or we have now recorded that critical section
129 * globally. Either way, we can now note a quiescent state
130 * for this CPU. Again, if we were in an RCU read-side critical
131 * section, and if that critical section was blocking the current
132 * grace period, then the fact that the task has been enqueued
133 * means that we continue to block the current grace period.
134 */
135 rcu_preempt_qs(cpu);
136 local_irq_save(flags);
137 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
138 local_irq_restore(flags);
139}
140
141/*
142 * Tree-preemptable RCU implementation for rcu_read_lock().
143 * Just increment ->rcu_read_lock_nesting, shared state will be updated
144 * if we block.
145 */
146void __rcu_read_lock(void)
147{
148 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
149 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
150}
151EXPORT_SYMBOL_GPL(__rcu_read_lock);
152
153static void rcu_read_unlock_special(struct task_struct *t)
154{
155 int empty;
156 unsigned long flags;
157 unsigned long mask;
158 struct rcu_node *rnp;
159 int special;
160
161 /* NMI handlers cannot block and cannot safely manipulate state. */
162 if (in_nmi())
163 return;
164
165 local_irq_save(flags);
166
167 /*
168 * If RCU core is waiting for this CPU to exit critical section,
169 * let it know that we have done so.
170 */
171 special = t->rcu_read_unlock_special;
172 if (special & RCU_READ_UNLOCK_NEED_QS) {
173 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
174 rcu_preempt_qs(smp_processor_id());
175 }
176
177 /* Hardware IRQ handlers cannot block. */
178 if (in_irq()) {
179 local_irq_restore(flags);
180 return;
181 }
182
183 /* Clean up if blocked during RCU read-side critical section. */
184 if (special & RCU_READ_UNLOCK_BLOCKED) {
185 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
186
187 /*
188 * Remove this task from the list it blocked on. The
189 * task can migrate while we acquire the lock, but at
190 * most one time. So at most two passes through loop.
191 */
192 for (;;) {
193 rnp = t->rcu_blocked_node;
194 spin_lock(&rnp->lock); /* irqs already disabled. */
195 if (rnp == t->rcu_blocked_node)
196 break;
197 spin_unlock(&rnp->lock); /* irqs remain disabled. */
198 }
199 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
200 list_del_init(&t->rcu_node_entry);
201 t->rcu_blocked_node = NULL;
202
203 /*
204 * If this was the last task on the current list, and if
205 * we aren't waiting on any CPUs, report the quiescent state.
206 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
207 * drop rnp->lock and restore irq.
208 */
209 if (!empty && rnp->qsmask == 0 &&
210 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
211 struct rcu_node *rnp_p;
212
213 if (rnp->parent == NULL) {
214 /* Only one rcu_node in the tree. */
215 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
216 return;
217 }
218 /* Report up the rest of the hierarchy. */
219 mask = rnp->grpmask;
220 spin_unlock_irqrestore(&rnp->lock, flags);
221 rnp_p = rnp->parent;
222 spin_lock_irqsave(&rnp_p->lock, flags);
223 WARN_ON_ONCE(rnp->qsmask);
224 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
225 return;
226 }
227 spin_unlock(&rnp->lock);
228 }
229 local_irq_restore(flags);
230}
231
232/*
233 * Tree-preemptable RCU implementation for rcu_read_unlock().
234 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
235 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
236 * invoke rcu_read_unlock_special() to clean up after a context switch
237 * in an RCU read-side critical section and other special cases.
238 */
239void __rcu_read_unlock(void)
240{
241 struct task_struct *t = current;
242
243 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
244 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
245 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
246 rcu_read_unlock_special(t);
247}
248EXPORT_SYMBOL_GPL(__rcu_read_unlock);
249
250#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
251
252/*
253 * Scan the current list of tasks blocked within RCU read-side critical
254 * sections, printing out the tid of each.
255 */
256static void rcu_print_task_stall(struct rcu_node *rnp)
257{
258 unsigned long flags;
259 struct list_head *lp;
260 int phase = rnp->gpnum & 0x1;
261 struct task_struct *t;
262
263 if (!list_empty(&rnp->blocked_tasks[phase])) {
264 spin_lock_irqsave(&rnp->lock, flags);
265 phase = rnp->gpnum & 0x1; /* re-read under lock. */
266 lp = &rnp->blocked_tasks[phase];
267 list_for_each_entry(t, lp, rcu_node_entry)
268 printk(" P%d", t->pid);
269 spin_unlock_irqrestore(&rnp->lock, flags);
270 }
271}
272
273#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
274
275/*
276 * Check that the list of blocked tasks for the newly completed grace
277 * period is in fact empty. It is a serious bug to complete a grace
278 * period that still has RCU readers blocked! This function must be
279 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
280 * must be held by the caller.
281 */
282static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
283{
284 WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
285 WARN_ON_ONCE(rnp->qsmask);
286}
287
288/*
289 * Check for preempted RCU readers for the specified rcu_node structure.
290 * If the caller needs a reliable answer, it must hold the rcu_node's
291 * ->lock.
292 */
293static int rcu_preempted_readers(struct rcu_node *rnp)
294{
295 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
296}
297
298#ifdef CONFIG_HOTPLUG_CPU
299
300/*
301 * Handle tasklist migration for case in which all CPUs covered by the
302 * specified rcu_node have gone offline. Move them up to the root
303 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock.
306 *
307 * The caller must hold rnp->lock with irqs disabled.
308 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp,
311 struct rcu_data *rdp)
312{
313 int i;
314 struct list_head *lp;
315 struct list_head *lp_root;
316 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp;
318
319 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */
322 }
323 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1])));
326
327 /*
328 * Move tasks up to root rcu_node. Rely on the fact that the
329 * root rcu_node can be at most one ahead of the rest of the
330 * rcu_nodes in terms of gpnum value. This fact allows us to
331 * move the blocked_tasks[] array directly, element by element.
332 */
333 for (i = 0; i < 2; i++) {
334 lp = &rnp->blocked_tasks[i];
335 lp_root = &rnp_root->blocked_tasks[i];
336 while (!list_empty(lp)) {
337 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
338 spin_lock(&rnp_root->lock); /* irqs already disabled */
339 list_del(&tp->rcu_node_entry);
340 tp->rcu_blocked_node = rnp_root;
341 list_add(&tp->rcu_node_entry, lp_root);
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 }
344 }
345}
346
347/*
348 * Do CPU-offline processing for preemptable RCU.
349 */
350static void rcu_preempt_offline_cpu(int cpu)
351{
352 __rcu_offline_cpu(cpu, &rcu_preempt_state);
353}
354
355#endif /* #ifdef CONFIG_HOTPLUG_CPU */
356
357/*
358 * Check for a quiescent state from the current CPU. When a task blocks,
359 * the task is recorded in the corresponding CPU's rcu_node structure,
360 * which is checked elsewhere.
361 *
362 * Caller must disable hard irqs.
363 */
364static void rcu_preempt_check_callbacks(int cpu)
365{
366 struct task_struct *t = current;
367
368 if (t->rcu_read_lock_nesting == 0) {
369 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
370 rcu_preempt_qs(cpu);
371 return;
372 }
373 if (per_cpu(rcu_preempt_data, cpu).qs_pending)
374 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
375}
376
377/*
378 * Process callbacks for preemptable RCU.
379 */
380static void rcu_preempt_process_callbacks(void)
381{
382 __rcu_process_callbacks(&rcu_preempt_state,
383 &__get_cpu_var(rcu_preempt_data));
384}
385
386/*
387 * Queue a preemptable-RCU callback for invocation after a grace period.
388 */
389void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
390{
391 __call_rcu(head, func, &rcu_preempt_state);
392}
393EXPORT_SYMBOL_GPL(call_rcu);
394
395/*
396 * Check to see if there is any immediate preemptable-RCU-related work
397 * to be done.
398 */
399static int rcu_preempt_pending(int cpu)
400{
401 return __rcu_pending(&rcu_preempt_state,
402 &per_cpu(rcu_preempt_data, cpu));
403}
404
405/*
406 * Does preemptable RCU need the CPU to stay out of dynticks mode?
407 */
408static int rcu_preempt_needs_cpu(int cpu)
409{
410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
411}
412
413/*
414 * Initialize preemptable RCU's per-CPU data.
415 */
416static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
417{
418 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
419}
420
421/*
422 * Check for a task exiting while in a preemptable-RCU read-side
423 * critical section, clean up if so. No need to issue warnings,
424 * as debug_check_no_locks_held() already does this if lockdep
425 * is enabled.
426 */
427void exit_rcu(void)
428{
429 struct task_struct *t = current;
430
431 if (t->rcu_read_lock_nesting == 0)
432 return;
433 t->rcu_read_lock_nesting = 1;
434 rcu_read_unlock();
435}
436
437#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
438
439/*
440 * Tell them what RCU they are running.
441 */
442static inline void rcu_bootup_announce(void)
443{
444 printk(KERN_INFO "Hierarchical RCU implementation.\n");
445}
446
447/*
448 * Return the number of RCU batches processed thus far for debug & stats.
449 */
450long rcu_batches_completed(void)
451{
452 return rcu_batches_completed_sched();
453}
454EXPORT_SYMBOL_GPL(rcu_batches_completed);
455
456/*
457 * Because preemptable RCU does not exist, we never have to check for
458 * CPUs being in quiescent states.
459 */
460static void rcu_preempt_note_context_switch(int cpu)
461{
462}
463
464#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
465
466/*
467 * Because preemptable RCU does not exist, we never have to check for
468 * tasks blocked within RCU read-side critical sections.
469 */
470static void rcu_print_task_stall(struct rcu_node *rnp)
471{
472}
473
474#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
475
476/*
477 * Because there is no preemptable RCU, there can be no readers blocked,
478 * so there is no need to check for blocked tasks. So check only for
479 * bogus qsmask values.
480 */
481static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
482{
483 WARN_ON_ONCE(rnp->qsmask);
484}
485
486/*
487 * Because preemptable RCU does not exist, there are never any preempted
488 * RCU readers.
489 */
490static int rcu_preempted_readers(struct rcu_node *rnp)
491{
492 return 0;
493}
494
495#ifdef CONFIG_HOTPLUG_CPU
496
497/*
498 * Because preemptable RCU does not exist, it never needs to migrate
499 * tasks that were blocked within RCU read-side critical sections.
500 */
501static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
502 struct rcu_node *rnp,
503 struct rcu_data *rdp)
504{
505}
506
507/*
508 * Because preemptable RCU does not exist, it never needs CPU-offline
509 * processing.
510 */
511static void rcu_preempt_offline_cpu(int cpu)
512{
513}
514
515#endif /* #ifdef CONFIG_HOTPLUG_CPU */
516
517/*
518 * Because preemptable RCU does not exist, it never has any callbacks
519 * to check.
520 */
521void rcu_preempt_check_callbacks(int cpu)
522{
523}
524
525/*
526 * Because preemptable RCU does not exist, it never has any callbacks
527 * to process.
528 */
529void rcu_preempt_process_callbacks(void)
530{
531}
532
533/*
534 * In classic RCU, call_rcu() is just call_rcu_sched().
535 */
536void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
537{
538 call_rcu_sched(head, func);
539}
540EXPORT_SYMBOL_GPL(call_rcu);
541
542/*
543 * Because preemptable RCU does not exist, it never has any work to do.
544 */
545static int rcu_preempt_pending(int cpu)
546{
547 return 0;
548}
549
550/*
551 * Because preemptable RCU does not exist, it never needs any CPU.
552 */
553static int rcu_preempt_needs_cpu(int cpu)
554{
555 return 0;
556}
557
558/*
559 * Because preemptable RCU does not exist, there is no per-CPU
560 * data to initialize.
561 */
562static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
563{
564}
565
566#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..c89f5e9fd173 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 21 *
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 * 24 *
25 */ 25 */
26#include <linux/types.h> 26#include <linux/types.h>
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
diff --git a/kernel/resource.c b/kernel/resource.c
index 78b087221c15..fb11a58b9594 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -223,13 +223,13 @@ int release_resource(struct resource *old)
223 223
224EXPORT_SYMBOL(release_resource); 224EXPORT_SYMBOL(release_resource);
225 225
226#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) 226#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
227/* 227/*
228 * Finds the lowest memory resource that exists within [res->start, res->end) 228 * Finds the lowest memory resource that exists within [res->start, res->end)
229 * the caller must specify res->start, res->end, res->flags. 229 * the caller must specify res->start, res->end, res->flags and "name".
230 * If found, returns 0, res is overwritten, if not found, returns -1. 230 * If found, returns 0, res is overwritten, if not found, returns -1.
231 */ 231 */
232static int find_next_system_ram(struct resource *res) 232static int find_next_system_ram(struct resource *res, char *name)
233{ 233{
234 resource_size_t start, end; 234 resource_size_t start, end;
235 struct resource *p; 235 struct resource *p;
@@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res)
245 /* system ram is just marked as IORESOURCE_MEM */ 245 /* system ram is just marked as IORESOURCE_MEM */
246 if (p->flags != res->flags) 246 if (p->flags != res->flags)
247 continue; 247 continue;
248 if (name && strcmp(p->name, name))
249 continue;
248 if (p->start > end) { 250 if (p->start > end) {
249 p = NULL; 251 p = NULL;
250 break; 252 break;
@@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res)
262 res->end = p->end; 264 res->end = p->end;
263 return 0; 265 return 0;
264} 266}
265int 267
266walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, 268/*
267 int (*func)(unsigned long, unsigned long, void *)) 269 * This function calls callback against all memory range of "System RAM"
270 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
271 * Now, this function is only for "System RAM".
272 */
273int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
274 void *arg, int (*func)(unsigned long, unsigned long, void *))
268{ 275{
269 struct resource res; 276 struct resource res;
270 unsigned long pfn, len; 277 unsigned long pfn, len;
271 u64 orig_end; 278 u64 orig_end;
272 int ret = -1; 279 int ret = -1;
280
273 res.start = (u64) start_pfn << PAGE_SHIFT; 281 res.start = (u64) start_pfn << PAGE_SHIFT;
274 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 282 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
275 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 283 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
276 orig_end = res.end; 284 orig_end = res.end;
277 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { 285 while ((res.start < res.end) &&
286 (find_next_system_ram(&res, "System RAM") >= 0)) {
278 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 287 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
279 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); 288 len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT);
280 ret = (*func)(pfn, len, arg); 289 ret = (*func)(pfn, len, arg);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..2f76e06bea58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 376
402#else 377#else
403 378
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
413{ 381{
@@ -537,14 +505,6 @@ struct root_domain {
537#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
538 struct cpupri cpupri; 506 struct cpupri cpupri;
539#endif 507#endif
540#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
541 /*
542 * Preferred wake up cpu nominated by sched_mc balance that will be
543 * used when most cpus are idle in the system indicating overall very
544 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
545 */
546 unsigned int sched_mc_preferred_wakeup_cpu;
547#endif
548}; 508};
549 509
550/* 510/*
@@ -616,6 +576,7 @@ struct rq {
616 576
617 unsigned char idle_at_tick; 577 unsigned char idle_at_tick;
618 /* For active balancing */ 578 /* For active balancing */
579 int post_schedule;
619 int active_balance; 580 int active_balance;
620 int push_cpu; 581 int push_cpu;
621 /* cpu of this runqueue: */ 582 /* cpu of this runqueue: */
@@ -626,6 +587,9 @@ struct rq {
626 587
627 struct task_struct *migration_thread; 588 struct task_struct *migration_thread;
628 struct list_head migration_queue; 589 struct list_head migration_queue;
590
591 u64 rt_avg;
592 u64 age_stamp;
629#endif 593#endif
630 594
631 /* calc_load related fields */ 595 /* calc_load related fields */
@@ -665,9 +629,10 @@ struct rq {
665 629
666static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
667 631
668static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
669{ 634{
670 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
671} 636}
672 637
673static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 658#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 659#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 660#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
661#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 662
697inline void update_rq_clock(struct rq *rq) 663inline void update_rq_clock(struct rq *rq)
698{ 664{
@@ -715,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
715 * This interface allows printk to be called with the runqueue lock 681 * This interface allows printk to be called with the runqueue lock
716 * held and know whether or not it is OK to wake up the klogd. 682 * held and know whether or not it is OK to wake up the klogd.
717 */ 683 */
718int runqueue_is_locked(void) 684int runqueue_is_locked(int cpu)
719{ 685{
720 int cpu = get_cpu(); 686 return spin_is_locked(&cpu_rq(cpu)->lock);
721 struct rq *rq = cpu_rq(cpu);
722 int ret;
723
724 ret = spin_is_locked(&rq->lock);
725 put_cpu();
726 return ret;
727} 687}
728 688
729/* 689/*
@@ -861,6 +821,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 821unsigned int sysctl_sched_shares_thresh = 4;
862 822
863/* 823/*
824 * period over which we average the RT time consumption, measured
825 * in ms.
826 *
827 * default: 1s
828 */
829const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
830
831/*
864 * period over which we measure -rt task cpu usage in us. 832 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 833 * default: 1s
866 */ 834 */
@@ -1278,12 +1246,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1246}
1279#endif /* CONFIG_NO_HZ */ 1247#endif /* CONFIG_NO_HZ */
1280 1248
1249static u64 sched_avg_period(void)
1250{
1251 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1252}
1253
1254static void sched_avg_update(struct rq *rq)
1255{
1256 s64 period = sched_avg_period();
1257
1258 while ((s64)(rq->clock - rq->age_stamp) > period) {
1259 rq->age_stamp += period;
1260 rq->rt_avg /= 2;
1261 }
1262}
1263
1264static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1265{
1266 rq->rt_avg += rt_delta;
1267 sched_avg_update(rq);
1268}
1269
1281#else /* !CONFIG_SMP */ 1270#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1271static void resched_task(struct task_struct *p)
1283{ 1272{
1284 assert_spin_locked(&task_rq(p)->lock); 1273 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1274 set_tsk_need_resched(p);
1286} 1275}
1276
1277static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1278{
1279}
1287#endif /* CONFIG_SMP */ 1280#endif /* CONFIG_SMP */
1288 1281
1289#if BITS_PER_LONG == 32 1282#if BITS_PER_LONG == 32
@@ -1494,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data)
1494#endif 1487#endif
1495 1488
1496#ifdef CONFIG_SMP 1489#ifdef CONFIG_SMP
1497static unsigned long source_load(int cpu, int type); 1490/* Used instead of source_load when we know the type == 0 */
1498static unsigned long target_load(int cpu, int type); 1491static unsigned long weighted_cpuload(const int cpu)
1492{
1493 return cpu_rq(cpu)->load.weight;
1494}
1495
1496/*
1497 * Return a low guess at the load of a migration-source cpu weighted
1498 * according to the scheduling class and "nice" value.
1499 *
1500 * We want to under-estimate the load of migration sources, to
1501 * balance conservatively.
1502 */
1503static unsigned long source_load(int cpu, int type)
1504{
1505 struct rq *rq = cpu_rq(cpu);
1506 unsigned long total = weighted_cpuload(cpu);
1507
1508 if (type == 0 || !sched_feat(LB_BIAS))
1509 return total;
1510
1511 return min(rq->cpu_load[type-1], total);
1512}
1513
1514/*
1515 * Return a high guess at the load of a migration-target cpu weighted
1516 * according to the scheduling class and "nice" value.
1517 */
1518static unsigned long target_load(int cpu, int type)
1519{
1520 struct rq *rq = cpu_rq(cpu);
1521 unsigned long total = weighted_cpuload(cpu);
1522
1523 if (type == 0 || !sched_feat(LB_BIAS))
1524 return total;
1525
1526 return max(rq->cpu_load[type-1], total);
1527}
1528
1529static struct sched_group *group_of(int cpu)
1530{
1531 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1532
1533 if (!sd)
1534 return NULL;
1535
1536 return sd->groups;
1537}
1538
1539static unsigned long power_of(int cpu)
1540{
1541 struct sched_group *group = group_of(cpu);
1542
1543 if (!group)
1544 return SCHED_LOAD_SCALE;
1545
1546 return group->cpu_power;
1547}
1548
1499static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1549static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1500 1550
1501static unsigned long cpu_avg_load_per_task(int cpu) 1551static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1513,28 +1563,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1563
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1564#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1565
1566struct update_shares_data {
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1572static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1573
1518/* 1574/*
1519 * Calculate and set the cpu's group shares. 1575 * Calculate and set the cpu's group shares.
1520 */ 1576 */
1521static void 1577static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1578 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1579 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd)
1524{ 1581{
1525 unsigned long shares; 1582 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1583 int boost = 0;
1527
1528 if (!tg->se[cpu])
1529 return;
1530 1584
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1585 rq_weight = usd->rq_weight[cpu];
1586 if (!rq_weight) {
1587 boost = 1;
1588 rq_weight = NICE_0_LOAD;
1589 }
1532 1590
1533 /* 1591 /*
1534 * \Sum shares * rq_weight 1592 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1593 * shares_i = -----------------------------
1536 * \Sum rq_weight 1594 * \Sum_j rq_weight_j
1537 *
1538 */ 1595 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1596 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1597 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1602,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1602 unsigned long flags;
1546 1603
1547 spin_lock_irqsave(&rq->lock, flags); 1604 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1605 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1606 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1607 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1608 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1609 }
@@ -1559,22 +1616,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1616 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1617static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1618{
1562 unsigned long weight, rq_weight = 0; 1619 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1620 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1621 struct sched_domain *sd = data;
1622 unsigned long flags;
1565 int i; 1623 int i;
1566 1624
1625 if (!tg->se[0])
1626 return 0;
1627
1628 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data);
1630
1567 for_each_cpu(i, sched_domain_span(sd)) { 1631 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight;
1634
1568 /* 1635 /*
1569 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1638 * run here it will not get delayed by group starvation.
1572 */ 1639 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1640 if (!weight)
1575 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1576 1642
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1643 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1580 } 1645 }
@@ -1586,7 +1651,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1651 shares = tg->shares;
1587 1652
1588 for_each_cpu(i, sched_domain_span(sd)) 1653 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1655
1656 local_irq_restore(flags);
1590 1657
1591 return 0; 1658 return 0;
1592} 1659}
@@ -1616,8 +1683,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1683
1617static void update_shares(struct sched_domain *sd) 1684static void update_shares(struct sched_domain *sd)
1618{ 1685{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1686 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1687 u64 now;
1688
1689 if (root_task_group_empty())
1690 return;
1691
1692 now = cpu_clock(raw_smp_processor_id());
1693 elapsed = now - sd->last_update;
1621 1694
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1695 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1696 sd->last_update = now;
@@ -1627,6 +1700,9 @@ static void update_shares(struct sched_domain *sd)
1627 1700
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1701static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1702{
1703 if (root_task_group_empty())
1704 return;
1705
1630 spin_unlock(&rq->lock); 1706 spin_unlock(&rq->lock);
1631 update_shares(sd); 1707 update_shares(sd);
1632 spin_lock(&rq->lock); 1708 spin_lock(&rq->lock);
@@ -1634,6 +1710,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1710
1635static void update_h_load(long cpu) 1711static void update_h_load(long cpu)
1636{ 1712{
1713 if (root_task_group_empty())
1714 return;
1715
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1716 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1717}
1639 1718
@@ -1651,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1651 1730
1652#ifdef CONFIG_PREEMPT 1731#ifdef CONFIG_PREEMPT
1653 1732
1733static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1734
1654/* 1735/*
1655 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1736 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1656 * way at the expense of forcing extra atomic operations in all 1737 * way at the expense of forcing extra atomic operations in all
@@ -1915,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1915} 1996}
1916 1997
1917#ifdef CONFIG_SMP 1998#ifdef CONFIG_SMP
1918
1919/* Used instead of source_load when we know the type == 0 */
1920static unsigned long weighted_cpuload(const int cpu)
1921{
1922 return cpu_rq(cpu)->load.weight;
1923}
1924
1925/* 1999/*
1926 * Is this task likely cache-hot: 2000 * Is this task likely cache-hot:
1927 */ 2001 */
@@ -1979,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1979 if (task_hot(p, old_rq->clock, NULL)) 2053 if (task_hot(p, old_rq->clock, NULL))
1980 schedstat_inc(p, se.nr_forced2_migrations); 2054 schedstat_inc(p, se.nr_forced2_migrations);
1981#endif 2055#endif
1982 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1983 1, 1, NULL, 0); 2057 1, 1, NULL, 0);
1984 } 2058 }
1985 p->se.vruntime -= old_cfsrq->min_vruntime - 2059 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2195,186 +2269,6 @@ void kick_process(struct task_struct *p)
2195 preempt_enable(); 2269 preempt_enable();
2196} 2270}
2197EXPORT_SYMBOL_GPL(kick_process); 2271EXPORT_SYMBOL_GPL(kick_process);
2198
2199/*
2200 * Return a low guess at the load of a migration-source cpu weighted
2201 * according to the scheduling class and "nice" value.
2202 *
2203 * We want to under-estimate the load of migration sources, to
2204 * balance conservatively.
2205 */
2206static unsigned long source_load(int cpu, int type)
2207{
2208 struct rq *rq = cpu_rq(cpu);
2209 unsigned long total = weighted_cpuload(cpu);
2210
2211 if (type == 0 || !sched_feat(LB_BIAS))
2212 return total;
2213
2214 return min(rq->cpu_load[type-1], total);
2215}
2216
2217/*
2218 * Return a high guess at the load of a migration-target cpu weighted
2219 * according to the scheduling class and "nice" value.
2220 */
2221static unsigned long target_load(int cpu, int type)
2222{
2223 struct rq *rq = cpu_rq(cpu);
2224 unsigned long total = weighted_cpuload(cpu);
2225
2226 if (type == 0 || !sched_feat(LB_BIAS))
2227 return total;
2228
2229 return max(rq->cpu_load[type-1], total);
2230}
2231
2232/*
2233 * find_idlest_group finds and returns the least busy CPU group within the
2234 * domain.
2235 */
2236static struct sched_group *
2237find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2238{
2239 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2240 unsigned long min_load = ULONG_MAX, this_load = 0;
2241 int load_idx = sd->forkexec_idx;
2242 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2243
2244 do {
2245 unsigned long load, avg_load;
2246 int local_group;
2247 int i;
2248
2249 /* Skip over this group if it has no CPUs allowed */
2250 if (!cpumask_intersects(sched_group_cpus(group),
2251 &p->cpus_allowed))
2252 continue;
2253
2254 local_group = cpumask_test_cpu(this_cpu,
2255 sched_group_cpus(group));
2256
2257 /* Tally up the load of all CPUs in the group */
2258 avg_load = 0;
2259
2260 for_each_cpu(i, sched_group_cpus(group)) {
2261 /* Bias balancing toward cpus of our domain */
2262 if (local_group)
2263 load = source_load(i, load_idx);
2264 else
2265 load = target_load(i, load_idx);
2266
2267 avg_load += load;
2268 }
2269
2270 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group,
2272 avg_load * SCHED_LOAD_SCALE);
2273
2274 if (local_group) {
2275 this_load = avg_load;
2276 this = group;
2277 } else if (avg_load < min_load) {
2278 min_load = avg_load;
2279 idlest = group;
2280 }
2281 } while (group = group->next, group != sd->groups);
2282
2283 if (!idlest || 100*this_load < imbalance*min_load)
2284 return NULL;
2285 return idlest;
2286}
2287
2288/*
2289 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2290 */
2291static int
2292find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2293{
2294 unsigned long load, min_load = ULONG_MAX;
2295 int idlest = -1;
2296 int i;
2297
2298 /* Traverse only the allowed CPUs */
2299 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2300 load = weighted_cpuload(i);
2301
2302 if (load < min_load || (load == min_load && i == this_cpu)) {
2303 min_load = load;
2304 idlest = i;
2305 }
2306 }
2307
2308 return idlest;
2309}
2310
2311/*
2312 * sched_balance_self: balance the current task (running on cpu) in domains
2313 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2314 * SD_BALANCE_EXEC.
2315 *
2316 * Balance, ie. select the least loaded group.
2317 *
2318 * Returns the target CPU number, or the same CPU if no balancing is needed.
2319 *
2320 * preempt must be disabled.
2321 */
2322static int sched_balance_self(int cpu, int flag)
2323{
2324 struct task_struct *t = current;
2325 struct sched_domain *tmp, *sd = NULL;
2326
2327 for_each_domain(cpu, tmp) {
2328 /*
2329 * If power savings logic is enabled for a domain, stop there.
2330 */
2331 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2332 break;
2333 if (tmp->flags & flag)
2334 sd = tmp;
2335 }
2336
2337 if (sd)
2338 update_shares(sd);
2339
2340 while (sd) {
2341 struct sched_group *group;
2342 int new_cpu, weight;
2343
2344 if (!(sd->flags & flag)) {
2345 sd = sd->child;
2346 continue;
2347 }
2348
2349 group = find_idlest_group(sd, t, cpu);
2350 if (!group) {
2351 sd = sd->child;
2352 continue;
2353 }
2354
2355 new_cpu = find_idlest_cpu(group, t, cpu);
2356 if (new_cpu == -1 || new_cpu == cpu) {
2357 /* Now try balancing at a lower domain level of cpu */
2358 sd = sd->child;
2359 continue;
2360 }
2361
2362 /* Now try balancing at a lower domain level of new_cpu */
2363 cpu = new_cpu;
2364 weight = cpumask_weight(sched_domain_span(sd));
2365 sd = NULL;
2366 for_each_domain(cpu, tmp) {
2367 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2368 break;
2369 if (tmp->flags & flag)
2370 sd = tmp;
2371 }
2372 /* while loop will break here if sd == NULL */
2373 }
2374
2375 return cpu;
2376}
2377
2378#endif /* CONFIG_SMP */ 2272#endif /* CONFIG_SMP */
2379 2273
2380/** 2274/**
@@ -2412,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p,
2412 * 2306 *
2413 * returns failure only if the task is already active. 2307 * returns failure only if the task is already active.
2414 */ 2308 */
2415static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2309static int try_to_wake_up(struct task_struct *p, unsigned int state,
2310 int wake_flags)
2416{ 2311{
2417 int cpu, orig_cpu, this_cpu, success = 0; 2312 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags; 2313 unsigned long flags;
2419 long old_state;
2420 struct rq *rq; 2314 struct rq *rq;
2421 2315
2422 if (!sched_feat(SYNC_WAKEUPS)) 2316 if (!sched_feat(SYNC_WAKEUPS))
2423 sync = 0; 2317 wake_flags &= ~WF_SYNC;
2424
2425#ifdef CONFIG_SMP
2426 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2427 struct sched_domain *sd;
2428 2318
2429 this_cpu = raw_smp_processor_id(); 2319 this_cpu = get_cpu();
2430 cpu = task_cpu(p);
2431
2432 for_each_domain(this_cpu, sd) {
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 update_shares(sd);
2435 break;
2436 }
2437 }
2438 }
2439#endif
2440 2320
2441 smp_wmb(); 2321 smp_wmb();
2442 rq = task_rq_lock(p, &flags); 2322 rq = task_rq_lock(p, &flags);
2443 update_rq_clock(rq); 2323 update_rq_clock(rq);
2444 old_state = p->state; 2324 if (!(p->state & state))
2445 if (!(old_state & state))
2446 goto out; 2325 goto out;
2447 2326
2448 if (p->se.on_rq) 2327 if (p->se.on_rq)
@@ -2450,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2450 2329
2451 cpu = task_cpu(p); 2330 cpu = task_cpu(p);
2452 orig_cpu = cpu; 2331 orig_cpu = cpu;
2453 this_cpu = smp_processor_id();
2454 2332
2455#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2456 if (unlikely(task_running(rq, p))) 2334 if (unlikely(task_running(rq, p)))
2457 goto out_activate; 2335 goto out_activate;
2458 2336
2459 cpu = p->sched_class->select_task_rq(p, sync); 2337 /*
2460 if (cpu != orig_cpu) { 2338 * In order to handle concurrent wakeups and release the rq->lock
2339 * we put the task in TASK_WAKING state.
2340 *
2341 * First fix up the nr_uninterruptible count:
2342 */
2343 if (task_contributes_to_load(p))
2344 rq->nr_uninterruptible--;
2345 p->state = TASK_WAKING;
2346 task_rq_unlock(rq, &flags);
2347
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu)
2461 set_task_cpu(p, cpu); 2350 set_task_cpu(p, cpu);
2462 task_rq_unlock(rq, &flags);
2463 /* might preempt at this point */
2464 rq = task_rq_lock(p, &flags);
2465 old_state = p->state;
2466 if (!(old_state & state))
2467 goto out;
2468 if (p->se.on_rq)
2469 goto out_running;
2470 2351
2471 this_cpu = smp_processor_id(); 2352 rq = task_rq_lock(p, &flags);
2472 cpu = task_cpu(p); 2353 WARN_ON(p->state != TASK_WAKING);
2473 } 2354 cpu = task_cpu(p);
2474 2355
2475#ifdef CONFIG_SCHEDSTATS 2356#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count); 2357 schedstat_inc(rq, ttwu_count);
@@ -2490,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2490out_activate: 2371out_activate:
2491#endif /* CONFIG_SMP */ 2372#endif /* CONFIG_SMP */
2492 schedstat_inc(p, se.nr_wakeups); 2373 schedstat_inc(p, se.nr_wakeups);
2493 if (sync) 2374 if (wake_flags & WF_SYNC)
2494 schedstat_inc(p, se.nr_wakeups_sync); 2375 schedstat_inc(p, se.nr_wakeups_sync);
2495 if (orig_cpu != cpu) 2376 if (orig_cpu != cpu)
2496 schedstat_inc(p, se.nr_wakeups_migrate); 2377 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2519,7 +2400,7 @@ out_activate:
2519 2400
2520out_running: 2401out_running:
2521 trace_sched_wakeup(rq, p, success); 2402 trace_sched_wakeup(rq, p, success);
2522 check_preempt_curr(rq, p, sync); 2403 check_preempt_curr(rq, p, wake_flags);
2523 2404
2524 p->state = TASK_RUNNING; 2405 p->state = TASK_RUNNING;
2525#ifdef CONFIG_SMP 2406#ifdef CONFIG_SMP
@@ -2528,6 +2409,7 @@ out_running:
2528#endif 2409#endif
2529out: 2410out:
2530 task_rq_unlock(rq, &flags); 2411 task_rq_unlock(rq, &flags);
2412 put_cpu();
2531 2413
2532 return success; 2414 return success;
2533} 2415}
@@ -2570,6 +2452,7 @@ static void __sched_fork(struct task_struct *p)
2570 p->se.avg_overlap = 0; 2452 p->se.avg_overlap = 0;
2571 p->se.start_runtime = 0; 2453 p->se.start_runtime = 0;
2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2454 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2455 p->se.avg_running = 0;
2573 2456
2574#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2575 p->se.wait_start = 0; 2458 p->se.wait_start = 0;
@@ -2631,18 +2514,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
2631 2514
2632 __sched_fork(p); 2515 __sched_fork(p);
2633 2516
2634#ifdef CONFIG_SMP
2635 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2636#endif
2637 set_task_cpu(p, cpu);
2638
2639 /* 2517 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2518 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2519 */
2642 p->prio = current->normal_prio; 2520 p->prio = current->normal_prio;
2521
2522 /*
2523 * Revert to default priority/policy on fork if requested.
2524 */
2525 if (unlikely(p->sched_reset_on_fork)) {
2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2527 p->policy = SCHED_NORMAL;
2528
2529 if (p->normal_prio < DEFAULT_PRIO)
2530 p->prio = DEFAULT_PRIO;
2531
2532 if (PRIO_TO_NICE(p->static_prio) < 0) {
2533 p->static_prio = NICE_TO_PRIO(0);
2534 set_load_weight(p);
2535 }
2536
2537 /*
2538 * We don't need the reset flag anymore after the fork. It has
2539 * fulfilled its duty:
2540 */
2541 p->sched_reset_on_fork = 0;
2542 }
2543
2643 if (!rt_prio(p->prio)) 2544 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2545 p->sched_class = &fair_sched_class;
2645 2546
2547#ifdef CONFIG_SMP
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2549#endif
2550 set_task_cpu(p, cpu);
2551
2646#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2552#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2647 if (likely(sched_info_on())) 2553 if (likely(sched_info_on()))
2648 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2554 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2688,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2688 inc_nr_running(rq); 2594 inc_nr_running(rq);
2689 } 2595 }
2690 trace_sched_wakeup_new(rq, p, 1); 2596 trace_sched_wakeup_new(rq, p, 1);
2691 check_preempt_curr(rq, p, 0); 2597 check_preempt_curr(rq, p, WF_FORK);
2692#ifdef CONFIG_SMP 2598#ifdef CONFIG_SMP
2693 if (p->sched_class->task_wake_up) 2599 if (p->sched_class->task_wake_up)
2694 p->sched_class->task_wake_up(rq, p); 2600 p->sched_class->task_wake_up(rq, p);
@@ -2796,12 +2702,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2702{
2797 struct mm_struct *mm = rq->prev_mm; 2703 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2704 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2705
2806 rq->prev_mm = NULL; 2706 rq->prev_mm = NULL;
2807 2707
@@ -2818,12 +2718,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2818 */ 2718 */
2819 prev_state = prev->state; 2719 prev_state = prev->state;
2820 finish_arch_switch(prev); 2720 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2721 perf_event_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2722 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2723
2828 fire_sched_in_preempt_notifiers(current); 2724 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2725 if (mm)
@@ -2838,6 +2734,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2734 }
2839} 2735}
2840 2736
2737#ifdef CONFIG_SMP
2738
2739/* assumes rq->lock is held */
2740static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2741{
2742 if (prev->sched_class->pre_schedule)
2743 prev->sched_class->pre_schedule(rq, prev);
2744}
2745
2746/* rq->lock is NOT held, but preemption is disabled */
2747static inline void post_schedule(struct rq *rq)
2748{
2749 if (rq->post_schedule) {
2750 unsigned long flags;
2751
2752 spin_lock_irqsave(&rq->lock, flags);
2753 if (rq->curr->sched_class->post_schedule)
2754 rq->curr->sched_class->post_schedule(rq);
2755 spin_unlock_irqrestore(&rq->lock, flags);
2756
2757 rq->post_schedule = 0;
2758 }
2759}
2760
2761#else
2762
2763static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2764{
2765}
2766
2767static inline void post_schedule(struct rq *rq)
2768{
2769}
2770
2771#endif
2772
2841/** 2773/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2774 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2775 * @prev: the thread we just switched away from.
@@ -2848,6 +2780,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2780 struct rq *rq = this_rq();
2849 2781
2850 finish_task_switch(rq, prev); 2782 finish_task_switch(rq, prev);
2783
2784 /*
2785 * FIXME: do we need to worry about rq being invalidated by the
2786 * task_switch?
2787 */
2788 post_schedule(rq);
2789
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2790#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2791 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2792 preempt_enable();
@@ -2965,6 +2904,19 @@ unsigned long nr_iowait(void)
2965 return sum; 2904 return sum;
2966} 2905}
2967 2906
2907unsigned long nr_iowait_cpu(void)
2908{
2909 struct rq *this = this_rq();
2910 return atomic_read(&this->nr_iowait);
2911}
2912
2913unsigned long this_cpu_load(void)
2914{
2915 struct rq *this = this_rq();
2916 return this->cpu_load[0];
2917}
2918
2919
2968/* Variables and functions for calc_load */ 2920/* Variables and functions for calc_load */
2969static atomic_long_t calc_load_tasks; 2921static atomic_long_t calc_load_tasks;
2970static unsigned long calc_load_update; 2922static unsigned long calc_load_update;
@@ -3164,7 +3116,7 @@ out:
3164void sched_exec(void) 3116void sched_exec(void)
3165{ 3117{
3166 int new_cpu, this_cpu = get_cpu(); 3118 int new_cpu, this_cpu = get_cpu();
3167 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3119 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3168 put_cpu(); 3120 put_cpu();
3169 if (new_cpu != this_cpu) 3121 if (new_cpu != this_cpu)
3170 sched_migrate_task(current, new_cpu); 3122 sched_migrate_task(current, new_cpu);
@@ -3379,9 +3331,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3331{
3380 const struct sched_class *class; 3332 const struct sched_class *class;
3381 3333
3382 for (class = sched_class_highest; class; class = class->next) 3334 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3335 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3336 return 1;
3337 }
3385 3338
3386 return 0; 3339 return 0;
3387} 3340}
@@ -3544,7 +3497,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3497 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3498 * from other group and save more power
3546 */ 3499 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3500 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3501 return;
3549 3502
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3503 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3583,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3583 *imbalance = sds->min_load_per_task; 3536 *imbalance = sds->min_load_per_task;
3584 sds->busiest = sds->group_min; 3537 sds->busiest = sds->group_min;
3585 3538
3586 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3587 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3588 group_first_cpu(sds->group_leader);
3589 }
3590
3591 return 1; 3539 return 1;
3592 3540
3593} 3541}
@@ -3612,6 +3560,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3560#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3561
3614 3562
3563unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3564{
3565 return SCHED_LOAD_SCALE;
3566}
3567
3568unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3569{
3570 return default_scale_freq_power(sd, cpu);
3571}
3572
3573unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3574{
3575 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3576 unsigned long smt_gain = sd->smt_gain;
3577
3578 smt_gain /= weight;
3579
3580 return smt_gain;
3581}
3582
3583unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3584{
3585 return default_scale_smt_power(sd, cpu);
3586}
3587
3588unsigned long scale_rt_power(int cpu)
3589{
3590 struct rq *rq = cpu_rq(cpu);
3591 u64 total, available;
3592
3593 sched_avg_update(rq);
3594
3595 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3596 available = total - rq->rt_avg;
3597
3598 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3599 total = SCHED_LOAD_SCALE;
3600
3601 total >>= SCHED_LOAD_SHIFT;
3602
3603 return div_u64(available, total);
3604}
3605
3606static void update_cpu_power(struct sched_domain *sd, int cpu)
3607{
3608 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3609 unsigned long power = SCHED_LOAD_SCALE;
3610 struct sched_group *sdg = sd->groups;
3611
3612 if (sched_feat(ARCH_POWER))
3613 power *= arch_scale_freq_power(sd, cpu);
3614 else
3615 power *= default_scale_freq_power(sd, cpu);
3616
3617 power >>= SCHED_LOAD_SHIFT;
3618
3619 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3620 if (sched_feat(ARCH_POWER))
3621 power *= arch_scale_smt_power(sd, cpu);
3622 else
3623 power *= default_scale_smt_power(sd, cpu);
3624
3625 power >>= SCHED_LOAD_SHIFT;
3626 }
3627
3628 power *= scale_rt_power(cpu);
3629 power >>= SCHED_LOAD_SHIFT;
3630
3631 if (!power)
3632 power = 1;
3633
3634 sdg->cpu_power = power;
3635}
3636
3637static void update_group_power(struct sched_domain *sd, int cpu)
3638{
3639 struct sched_domain *child = sd->child;
3640 struct sched_group *group, *sdg = sd->groups;
3641 unsigned long power;
3642
3643 if (!child) {
3644 update_cpu_power(sd, cpu);
3645 return;
3646 }
3647
3648 power = 0;
3649
3650 group = child->groups;
3651 do {
3652 power += group->cpu_power;
3653 group = group->next;
3654 } while (group != child->groups);
3655
3656 sdg->cpu_power = power;
3657}
3658
3615/** 3659/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3660 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3617 * @group: sched_group whose statistics are to be updated. 3661 * @group: sched_group whose statistics are to be updated.
@@ -3624,7 +3668,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3668 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3669 * @sgs: variable to hold the statistics for this group.
3626 */ 3670 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3671static inline void update_sg_lb_stats(struct sched_domain *sd,
3672 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3673 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3674 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3675 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3680,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3680 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3681 unsigned long avg_load_per_task;
3637 3682
3638 if (local_group) 3683 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3684 balance_cpu = group_first_cpu(group);
3685 if (balance_cpu == this_cpu)
3686 update_group_power(sd, this_cpu);
3687 }
3640 3688
3641 /* Tally up the load of all CPUs in the group */ 3689 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3690 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3733,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3733 }
3686 3734
3687 /* Adjust by relative CPU power of the group */ 3735 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3737
3691 3738
3692 /* 3739 /*
@@ -3698,14 +3745,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3745 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3746 * the hierarchy?
3700 */ 3747 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3749 group->cpu_power;
3703 3750
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3752 sgs->group_imb = 1;
3706 3753
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3754 sgs->group_capacity =
3708 3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3756}
3710 3757
3711/** 3758/**
@@ -3723,9 +3770,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3770 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3771 struct sd_lb_stats *sds)
3725{ 3772{
3773 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3774 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3775 struct sg_lb_stats sgs;
3728 int load_idx; 3776 int load_idx, prefer_sibling = 0;
3777
3778 if (child && child->flags & SD_PREFER_SIBLING)
3779 prefer_sibling = 1;
3729 3780
3730 init_sd_power_savings_stats(sd, sds, idle); 3781 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3782 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3787,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3787 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3788 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3791 local_group, cpus, balance, &sgs);
3741 3792
3742 if (local_group && balance && !(*balance)) 3793 if (local_group && balance && !(*balance))
3743 return; 3794 return;
3744 3795
3745 sds->total_load += sgs.group_load; 3796 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3797 sds->total_pwr += group->cpu_power;
3798
3799 /*
3800 * In case the child domain prefers tasks go to siblings
3801 * first, lower the group capacity to one so that we'll try
3802 * and move all the excess tasks away.
3803 */
3804 if (prefer_sibling)
3805 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3806
3748 if (local_group) { 3807 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3808 sds->this_load = sgs.avg_load;
@@ -3763,7 +3822,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3822 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3823 group = group->next;
3765 } while (group != sd->groups); 3824 } while (group != sd->groups);
3766
3767} 3825}
3768 3826
3769/** 3827/**
@@ -3801,28 +3859,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3859 * moving them.
3802 */ 3860 */
3803 3861
3804 pwr_now += sds->busiest->__cpu_power * 3862 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3863 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3864 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3865 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3866 pwr_now /= SCHED_LOAD_SCALE;
3809 3867
3810 /* Amount of load we'd subtract */ 3868 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3870 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3871 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3872 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 3873 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 3874
3817 /* Amount of load we'd add */ 3875 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 3876 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 3878 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 3879 sds->this->cpu_power;
3822 else 3880 else
3823 tmp = sg_div_cpu_power(sds->this, 3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3882 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 3883 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 3884 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 3885 pwr_move /= SCHED_LOAD_SCALE;
3828 3886
@@ -3857,8 +3915,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 3915 sds->max_load - sds->busiest_load_per_task);
3858 3916
3859 /* How much load to actually move to equalise the imbalance */ 3917 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3918 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 3920 / SCHED_LOAD_SCALE;
3863 3921
3864 /* 3922 /*
@@ -3988,15 +4046,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4046 int i;
3989 4047
3990 for_each_cpu(i, sched_group_cpus(group)) { 4048 for_each_cpu(i, sched_group_cpus(group)) {
4049 unsigned long power = power_of(i);
4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4051 unsigned long wl;
3992 4052
3993 if (!cpumask_test_cpu(i, cpus)) 4053 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4054 continue;
3995 4055
3996 rq = cpu_rq(i); 4056 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4058 wl /= power;
3998 4059
3999 if (rq->nr_running == 1 && wl > imbalance) 4060 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4061 continue;
4001 4062
4002 if (wl > max_load) { 4063 if (wl > max_load) {
@@ -5031,17 +5092,16 @@ void account_idle_time(cputime_t cputime)
5031 */ 5092 */
5032void account_process_tick(struct task_struct *p, int user_tick) 5093void account_process_tick(struct task_struct *p, int user_tick)
5033{ 5094{
5034 cputime_t one_jiffy = jiffies_to_cputime(1); 5095 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5035 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5036 struct rq *rq = this_rq(); 5096 struct rq *rq = this_rq();
5037 5097
5038 if (user_tick) 5098 if (user_tick)
5039 account_user_time(p, one_jiffy, one_jiffy_scaled); 5099 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5040 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5100 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5041 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 5101 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5042 one_jiffy_scaled); 5102 one_jiffy_scaled);
5043 else 5103 else
5044 account_idle_time(one_jiffy); 5104 account_idle_time(cputime_one_jiffy);
5045} 5105}
5046 5106
5047/* 5107/*
@@ -5145,7 +5205,7 @@ void scheduler_tick(void)
5145 curr->sched_class->task_tick(rq, curr, 0); 5205 curr->sched_class->task_tick(rq, curr, 0);
5146 spin_unlock(&rq->lock); 5206 spin_unlock(&rq->lock);
5147 5207
5148 perf_counter_task_tick(curr, cpu); 5208 perf_event_task_tick(curr, cpu);
5149 5209
5150#ifdef CONFIG_SMP 5210#ifdef CONFIG_SMP
5151 rq->idle_at_tick = idle_cpu(cpu); 5211 rq->idle_at_tick = idle_cpu(cpu);
@@ -5257,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev)
5257#endif 5317#endif
5258} 5318}
5259 5319
5260static void put_prev_task(struct rq *rq, struct task_struct *prev) 5320static void put_prev_task(struct rq *rq, struct task_struct *p)
5261{ 5321{
5262 if (prev->state == TASK_RUNNING) { 5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5263 u64 runtime = prev->se.sum_exec_runtime;
5264 5323
5265 runtime -= prev->se.prev_sum_exec_runtime; 5324 update_avg(&p->se.avg_running, runtime);
5266 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5267 5325
5326 if (p->state == TASK_RUNNING) {
5268 /* 5327 /*
5269 * In order to avoid avg_overlap growing stale when we are 5328 * In order to avoid avg_overlap growing stale when we are
5270 * indeed overlapping and hence not getting put to sleep, grow 5329 * indeed overlapping and hence not getting put to sleep, grow
@@ -5274,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5274 * correlates to the amount of cache footprint a task can 5333 * correlates to the amount of cache footprint a task can
5275 * build up. 5334 * build up.
5276 */ 5335 */
5277 update_avg(&prev->se.avg_overlap, runtime); 5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5337 update_avg(&p->se.avg_overlap, runtime);
5338 } else {
5339 update_avg(&p->se.avg_running, 0);
5278 } 5340 }
5279 prev->sched_class->put_prev_task(rq, prev); 5341 p->sched_class->put_prev_task(rq, p);
5280} 5342}
5281 5343
5282/* 5344/*
@@ -5325,7 +5387,7 @@ need_resched:
5325 preempt_disable(); 5387 preempt_disable();
5326 cpu = smp_processor_id(); 5388 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5389 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5390 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5391 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5392 switch_count = &prev->nivcsw;
5331 5393
@@ -5349,10 +5411,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5411 switch_count = &prev->nvcsw;
5350 } 5412 }
5351 5413
5352#ifdef CONFIG_SMP 5414 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5415
5357 if (unlikely(!rq->nr_running)) 5416 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5417 idle_balance(cpu, rq);
@@ -5362,7 +5421,7 @@ need_resched_nonpreemptible:
5362 5421
5363 if (likely(prev != next)) { 5422 if (likely(prev != next)) {
5364 sched_info_switch(prev, next); 5423 sched_info_switch(prev, next);
5365 perf_counter_task_sched_out(prev, next, cpu); 5424 perf_event_task_sched_out(prev, next, cpu);
5366 5425
5367 rq->nr_switches++; 5426 rq->nr_switches++;
5368 rq->curr = next; 5427 rq->curr = next;
@@ -5378,6 +5437,8 @@ need_resched_nonpreemptible:
5378 } else 5437 } else
5379 spin_unlock_irq(&rq->lock); 5438 spin_unlock_irq(&rq->lock);
5380 5439
5440 post_schedule(rq);
5441
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5442 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5443 goto need_resched_nonpreemptible;
5383 5444
@@ -5509,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5509 5570
5510#endif /* CONFIG_PREEMPT */ 5571#endif /* CONFIG_PREEMPT */
5511 5572
5512int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5573int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5513 void *key) 5574 void *key)
5514{ 5575{
5515 return try_to_wake_up(curr->private, mode, sync); 5576 return try_to_wake_up(curr->private, mode, wake_flags);
5516} 5577}
5517EXPORT_SYMBOL(default_wake_function); 5578EXPORT_SYMBOL(default_wake_function);
5518 5579
@@ -5526,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function);
5526 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5587 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5527 */ 5588 */
5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5589static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5529 int nr_exclusive, int sync, void *key) 5590 int nr_exclusive, int wake_flags, void *key)
5530{ 5591{
5531 wait_queue_t *curr, *next; 5592 wait_queue_t *curr, *next;
5532 5593
5533 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5534 unsigned flags = curr->flags; 5595 unsigned flags = curr->flags;
5535 5596
5536 if (curr->func(curr, mode, sync, key) && 5597 if (curr->func(curr, mode, wake_flags, key) &&
5537 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5538 break; 5599 break;
5539 } 5600 }
@@ -5594,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5594 int nr_exclusive, void *key) 5655 int nr_exclusive, void *key)
5595{ 5656{
5596 unsigned long flags; 5657 unsigned long flags;
5597 int sync = 1; 5658 int wake_flags = WF_SYNC;
5598 5659
5599 if (unlikely(!q)) 5660 if (unlikely(!q))
5600 return; 5661 return;
5601 5662
5602 if (unlikely(!nr_exclusive)) 5663 if (unlikely(!nr_exclusive))
5603 sync = 0; 5664 wake_flags = 0;
5604 5665
5605 spin_lock_irqsave(&q->lock, flags); 5666 spin_lock_irqsave(&q->lock, flags);
5606 __wake_up_common(q, mode, nr_exclusive, sync, key); 5667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5607 spin_unlock_irqrestore(&q->lock, flags); 5668 spin_unlock_irqrestore(&q->lock, flags);
5608} 5669}
5609EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6123,17 +6184,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6184 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6185 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6186 struct rq *rq;
6187 int reset_on_fork;
6126 6188
6127 /* may grab non-irq protected spin_locks */ 6189 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6190 BUG_ON(in_interrupt());
6129recheck: 6191recheck:
6130 /* double check policy once rq lock held */ 6192 /* double check policy once rq lock held */
6131 if (policy < 0) 6193 if (policy < 0) {
6194 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6195 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6196 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6197 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6198 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6199
6200 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6201 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6202 policy != SCHED_IDLE)
6203 return -EINVAL;
6204 }
6205
6137 /* 6206 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6207 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6208 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6246,10 @@ recheck:
6177 /* can't change other user's priorities */ 6246 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6247 if (!check_same_owner(p))
6179 return -EPERM; 6248 return -EPERM;
6249
6250 /* Normal users shall not reset the sched_reset_on_fork flag */
6251 if (p->sched_reset_on_fork && !reset_on_fork)
6252 return -EPERM;
6180 } 6253 }
6181 6254
6182 if (user) { 6255 if (user) {
@@ -6220,6 +6293,8 @@ recheck:
6220 if (running) 6293 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6294 p->sched_class->put_prev_task(rq, p);
6222 6295
6296 p->sched_reset_on_fork = reset_on_fork;
6297
6223 oldprio = p->prio; 6298 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6299 __setscheduler(rq, p, policy, param->sched_priority);
6225 6300
@@ -6336,14 +6411,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6411 if (p) {
6337 retval = security_task_getscheduler(p); 6412 retval = security_task_getscheduler(p);
6338 if (!retval) 6413 if (!retval)
6339 retval = p->policy; 6414 retval = p->policy
6415 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6416 }
6341 read_unlock(&tasklist_lock); 6417 read_unlock(&tasklist_lock);
6342 return retval; 6418 return retval;
6343} 6419}
6344 6420
6345/** 6421/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6422 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6423 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6424 * @param: structure containing the RT priority.
6349 */ 6425 */
@@ -6571,19 +6647,9 @@ static inline int should_resched(void)
6571 6647
6572static void __cond_resched(void) 6648static void __cond_resched(void)
6573{ 6649{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6650 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6651 schedule();
6576#endif 6652 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6653}
6588 6654
6589int __sched _cond_resched(void) 6655int __sched _cond_resched(void)
@@ -6597,18 +6663,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6663EXPORT_SYMBOL(_cond_resched);
6598 6664
6599/* 6665/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6666 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6667 * call schedule, and on return reacquire the lock.
6602 * 6668 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6669 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6670 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6671 * spin_unlock(), once by hand).
6606 */ 6672 */
6607int cond_resched_lock(spinlock_t *lock) 6673int __cond_resched_lock(spinlock_t *lock)
6608{ 6674{
6609 int resched = should_resched(); 6675 int resched = should_resched();
6610 int ret = 0; 6676 int ret = 0;
6611 6677
6678 lockdep_assert_held(lock);
6679
6612 if (spin_needbreak(lock) || resched) { 6680 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6681 spin_unlock(lock);
6614 if (resched) 6682 if (resched)
@@ -6620,9 +6688,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6688 }
6621 return ret; 6689 return ret;
6622} 6690}
6623EXPORT_SYMBOL(cond_resched_lock); 6691EXPORT_SYMBOL(__cond_resched_lock);
6624 6692
6625int __sched cond_resched_softirq(void) 6693int __sched __cond_resched_softirq(void)
6626{ 6694{
6627 BUG_ON(!in_softirq()); 6695 BUG_ON(!in_softirq());
6628 6696
@@ -6634,7 +6702,7 @@ int __sched cond_resched_softirq(void)
6634 } 6702 }
6635 return 0; 6703 return 0;
6636} 6704}
6637EXPORT_SYMBOL(cond_resched_softirq); 6705EXPORT_SYMBOL(__cond_resched_softirq);
6638 6706
6639/** 6707/**
6640 * yield - yield the current processor to other threads. 6708 * yield - yield the current processor to other threads.
@@ -6658,11 +6726,13 @@ EXPORT_SYMBOL(yield);
6658 */ 6726 */
6659void __sched io_schedule(void) 6727void __sched io_schedule(void)
6660{ 6728{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6729 struct rq *rq = raw_rq();
6662 6730
6663 delayacct_blkio_start(); 6731 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6732 atomic_inc(&rq->nr_iowait);
6733 current->in_iowait = 1;
6665 schedule(); 6734 schedule();
6735 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6736 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6737 delayacct_blkio_end();
6668} 6738}
@@ -6670,12 +6740,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6740
6671long __sched io_schedule_timeout(long timeout) 6741long __sched io_schedule_timeout(long timeout)
6672{ 6742{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6743 struct rq *rq = raw_rq();
6674 long ret; 6744 long ret;
6675 6745
6676 delayacct_blkio_start(); 6746 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6747 atomic_inc(&rq->nr_iowait);
6748 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6749 ret = schedule_timeout(timeout);
6750 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6751 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6752 delayacct_blkio_end();
6681 return ret; 6753 return ret;
@@ -6759,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6759 if (retval) 6831 if (retval)
6760 goto out_unlock; 6832 goto out_unlock;
6761 6833
6762 /* 6834 time_slice = p->sched_class->get_rr_interval(p);
6763 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6764 * tasks that are on an otherwise idle runqueue:
6765 */
6766 time_slice = 0;
6767 if (p->policy == SCHED_RR) {
6768 time_slice = DEF_TIMESLICE;
6769 } else if (p->policy != SCHED_FIFO) {
6770 struct sched_entity *se = &p->se;
6771 unsigned long flags;
6772 struct rq *rq;
6773 6835
6774 rq = task_rq_lock(p, &flags);
6775 if (rq->cfs.load.weight)
6776 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6777 task_rq_unlock(rq, &flags);
6778 }
6779 read_unlock(&tasklist_lock); 6836 read_unlock(&tasklist_lock);
6780 jiffies_to_timespec(time_slice, &t); 6837 jiffies_to_timespec(time_slice, &t);
6781 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6838 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -6992,8 +7049,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7049
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7050 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7051 /* Need help from migration thread: drop lock and wait. */
7052 struct task_struct *mt = rq->migration_thread;
7053
7054 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7055 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7056 wake_up_process(rq->migration_thread);
7057 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7058 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7059 tlb_migrate_finish(p->mm);
6999 return 0; 7060 return 0;
@@ -7051,6 +7112,11 @@ fail:
7051 return ret; 7112 return ret;
7052} 7113}
7053 7114
7115#define RCU_MIGRATION_IDLE 0
7116#define RCU_MIGRATION_NEED_QS 1
7117#define RCU_MIGRATION_GOT_QS 2
7118#define RCU_MIGRATION_MUST_SYNC 3
7119
7054/* 7120/*
7055 * migration_thread - this is a highprio system thread that performs 7121 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7122 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7124,7 @@ fail:
7058 */ 7124 */
7059static int migration_thread(void *data) 7125static int migration_thread(void *data)
7060{ 7126{
7127 int badcpu;
7061 int cpu = (long)data; 7128 int cpu = (long)data;
7062 struct rq *rq; 7129 struct rq *rq;
7063 7130
@@ -7092,8 +7159,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7159 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7160 list_del_init(head->next);
7094 7161
7095 spin_unlock(&rq->lock); 7162 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7163 spin_unlock(&rq->lock);
7164 __migrate_task(req->task, cpu, req->dest_cpu);
7165 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7166 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7167 spin_unlock(&rq->lock);
7168 } else {
7169 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7170 spin_unlock(&rq->lock);
7171 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7172 }
7097 local_irq_enable(); 7173 local_irq_enable();
7098 7174
7099 complete(&req->done); 7175 complete(&req->done);
@@ -7607,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7607/* 7683/*
7608 * Register at high priority so that task migration (migrate_all_tasks) 7684 * Register at high priority so that task migration (migrate_all_tasks)
7609 * happens before everything else. This has to be lower priority than 7685 * happens before everything else. This has to be lower priority than
7610 * the notifier in the perf_counter subsystem, though. 7686 * the notifier in the perf_event subsystem, though.
7611 */ 7687 */
7612static struct notifier_block __cpuinitdata migration_notifier = { 7688static struct notifier_block __cpuinitdata migration_notifier = {
7613 .notifier_call = migration_call, 7689 .notifier_call = migration_call,
@@ -7625,7 +7701,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7701 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7702 register_cpu_notifier(&migration_notifier);
7627 7703
7628 return err; 7704 return 0;
7629} 7705}
7630early_initcall(migration_init); 7706early_initcall(migration_init);
7631#endif 7707#endif
@@ -7672,7 +7748,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7748 break;
7673 } 7749 }
7674 7750
7675 if (!group->__cpu_power) { 7751 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7752 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7753 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7754 "set\n");
@@ -7696,9 +7772,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7772 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7773
7698 printk(KERN_CONT " %s", str); 7774 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7775 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7776 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7777 group->cpu_power);
7702 } 7778 }
7703 7779
7704 group = group->next; 7780 group = group->next;
@@ -7763,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd)
7763 } 7839 }
7764 7840
7765 /* Following flags don't use groups */ 7841 /* Following flags don't use groups */
7766 if (sd->flags & (SD_WAKE_IDLE | 7842 if (sd->flags & (SD_WAKE_AFFINE))
7767 SD_WAKE_AFFINE |
7768 SD_WAKE_BALANCE))
7769 return 0; 7843 return 0;
7770 7844
7771 return 1; 7845 return 1;
@@ -7782,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7782 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7856 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7783 return 0; 7857 return 0;
7784 7858
7785 /* Does parent contain flags not in child? */
7786 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7787 if (cflags & SD_WAKE_AFFINE)
7788 pflags &= ~SD_WAKE_BALANCE;
7789 /* Flags needing groups don't count if only 1 group in parent */ 7859 /* Flags needing groups don't count if only 1 group in parent */
7790 if (parent->groups == parent->groups->next) { 7860 if (parent->groups == parent->groups->next) {
7791 pflags &= ~(SD_LOAD_BALANCE | 7861 pflags &= ~(SD_LOAD_BALANCE |
@@ -7841,7 +7911,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 7911 rq->rd = rd;
7842 7912
7843 cpumask_set_cpu(rq->cpu, rd->span); 7913 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7914 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 7915 set_rq_online(rq);
7846 7916
7847 spin_unlock_irqrestore(&rq->lock, flags); 7917 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7983,7 +8053,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8053 continue;
7984 8054
7985 cpumask_clear(sched_group_cpus(sg)); 8055 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8056 sg->cpu_power = 0;
7987 8057
7988 for_each_cpu(j, span) { 8058 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8059 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8161,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8161 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8162};
8093 8163
8164struct s_data {
8165#ifdef CONFIG_NUMA
8166 int sd_allnodes;
8167 cpumask_var_t domainspan;
8168 cpumask_var_t covered;
8169 cpumask_var_t notcovered;
8170#endif
8171 cpumask_var_t nodemask;
8172 cpumask_var_t this_sibling_map;
8173 cpumask_var_t this_core_map;
8174 cpumask_var_t send_covered;
8175 cpumask_var_t tmpmask;
8176 struct sched_group **sched_group_nodes;
8177 struct root_domain *rd;
8178};
8179
/*
 * How far the allocation of a struct s_data got.
 *
 * Each value names the last step that completed.  __free_domain_allocs()
 * switches on it and falls through every following case, so the enumerators
 * must stay in reverse order of allocation.
 */
enum s_alloc {
	sa_sched_groups = 0,	/* sched groups may exist */
	sa_rootdomain,		/* d->rd allocated */
	sa_tmpmask,		/* d->tmpmask allocated */
	sa_send_covered,	/* d->send_covered allocated */
	sa_this_core_map,	/* d->this_core_map allocated */
	sa_this_sibling_map,	/* d->this_sibling_map allocated */
	sa_nodemask,		/* d->nodemask allocated */
	sa_sched_group_nodes,	/* d->sched_group_nodes allocated (NUMA) */
#ifdef CONFIG_NUMA
	sa_notcovered,		/* d->notcovered allocated */
	sa_covered,		/* d->covered allocated */
	sa_domainspan,		/* d->domainspan allocated */
#endif
	sa_none,		/* nothing allocated, nothing to free */
};
8196
8094/* 8197/*
8095 * SMT sched-domains: 8198 * SMT sched-domains:
8096 */ 8199 */
@@ -8208,11 +8311,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8311 continue;
8209 } 8312 }
8210 8313
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8314 sg->cpu_power += sd->groups->cpu_power;
8212 } 8315 }
8213 sg = sg->next; 8316 sg = sg->next;
8214 } while (sg != group_head); 8317 } while (sg != group_head);
8215} 8318}
8319
8320static int build_numa_sched_groups(struct s_data *d,
8321 const struct cpumask *cpu_map, int num)
8322{
8323 struct sched_domain *sd;
8324 struct sched_group *sg, *prev;
8325 int n, j;
8326
8327 cpumask_clear(d->covered);
8328 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8329 if (cpumask_empty(d->nodemask)) {
8330 d->sched_group_nodes[num] = NULL;
8331 goto out;
8332 }
8333
8334 sched_domain_node_span(num, d->domainspan);
8335 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8336
8337 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8338 GFP_KERNEL, num);
8339 if (!sg) {
8340 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8341 num);
8342 return -ENOMEM;
8343 }
8344 d->sched_group_nodes[num] = sg;
8345
8346 for_each_cpu(j, d->nodemask) {
8347 sd = &per_cpu(node_domains, j).sd;
8348 sd->groups = sg;
8349 }
8350
8351 sg->cpu_power = 0;
8352 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8353 sg->next = sg;
8354 cpumask_or(d->covered, d->covered, d->nodemask);
8355
8356 prev = sg;
8357 for (j = 0; j < nr_node_ids; j++) {
8358 n = (num + j) % nr_node_ids;
8359 cpumask_complement(d->notcovered, d->covered);
8360 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8361 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8362 if (cpumask_empty(d->tmpmask))
8363 break;
8364 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8365 if (cpumask_empty(d->tmpmask))
8366 continue;
8367 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8368 GFP_KERNEL, num);
8369 if (!sg) {
8370 printk(KERN_WARNING
8371 "Can not alloc domain group for node %d\n", j);
8372 return -ENOMEM;
8373 }
8374 sg->cpu_power = 0;
8375 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8376 sg->next = prev->next;
8377 cpumask_or(d->covered, d->covered, d->tmpmask);
8378 prev->next = sg;
8379 prev = sg;
8380 }
8381out:
8382 return 0;
8383}
8216#endif /* CONFIG_NUMA */ 8384#endif /* CONFIG_NUMA */
8217 8385
8218#ifdef CONFIG_NUMA 8386#ifdef CONFIG_NUMA
@@ -8266,15 +8434,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8434 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pickup more load compared to the group having 8435 * having more cpu_power will pickup more load compared to the group having
8268 * less cpu_power. 8436 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8437 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8438static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8439{
8276 struct sched_domain *child; 8440 struct sched_domain *child;
8277 struct sched_group *group; 8441 struct sched_group *group;
8442 long power;
8443 int weight;
8278 8444
8279 WARN_ON(!sd || !sd->groups); 8445 WARN_ON(!sd || !sd->groups);
8280 8446
@@ -8283,28 +8449,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8449
8284 child = sd->child; 8450 child = sd->child;
8285 8451
8286 sd->groups->__cpu_power = 0; 8452 sd->groups->cpu_power = 0;
8287 8453
8288 /* 8454 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8455 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8456 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8457 /*
8292 * can handle only one task, when there are other idle groups in the 8458 * SMT siblings share the power of a single core.
8293 * same sched domain. 8459 * Usually multiple threads get a better yield out of
8294 */ 8460 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8461 * reflect that in sd->smt_gain.
8296 (child->flags & 8462 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8463 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8464 power *= sd->smt_gain;
8465 power /= weight;
8466 power >>= SCHED_LOAD_SHIFT;
8467 }
8468 sd->groups->cpu_power += power;
8299 return; 8469 return;
8300 } 8470 }
8301 8471
8302 /* 8472 /*
8303 * add cpu_power of each child group to this groups cpu_power 8473 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8474 */
8305 group = child->groups; 8475 group = child->groups;
8306 do { 8476 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8477 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8478 group = group->next;
8309 } while (group != child->groups); 8479 } while (group != child->groups);
8310} 8480}
@@ -8371,287 +8541,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8371 request = attr->relax_domain_level; 8541 request = attr->relax_domain_level;
8372 if (request < sd->level) { 8542 if (request < sd->level) {
8373 /* turn off idle balance on this domain */ 8543 /* turn off idle balance on this domain */
8374 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8544 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8375 } else { 8545 } else {
8376 /* turn on idle balance on this domain */ 8546 /* turn on idle balance on this domain */
8377 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8547 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8548 }
8549}
8550
8551static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8552 const struct cpumask *cpu_map)
8553{
8554 switch (what) {
8555 case sa_sched_groups:
8556 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8557 d->sched_group_nodes = NULL;
8558 case sa_rootdomain:
8559 free_rootdomain(d->rd); /* fall through */
8560 case sa_tmpmask:
8561 free_cpumask_var(d->tmpmask); /* fall through */
8562 case sa_send_covered:
8563 free_cpumask_var(d->send_covered); /* fall through */
8564 case sa_this_core_map:
8565 free_cpumask_var(d->this_core_map); /* fall through */
8566 case sa_this_sibling_map:
8567 free_cpumask_var(d->this_sibling_map); /* fall through */
8568 case sa_nodemask:
8569 free_cpumask_var(d->nodemask); /* fall through */
8570 case sa_sched_group_nodes:
8571#ifdef CONFIG_NUMA
8572 kfree(d->sched_group_nodes); /* fall through */
8573 case sa_notcovered:
8574 free_cpumask_var(d->notcovered); /* fall through */
8575 case sa_covered:
8576 free_cpumask_var(d->covered); /* fall through */
8577 case sa_domainspan:
8578 free_cpumask_var(d->domainspan); /* fall through */
8579#endif
8580 case sa_none:
8581 break;
8378 } 8582 }
8379} 8583}
8380 8584
8381/* 8585static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8586 const struct cpumask *cpu_map)
8383 * to the individual cpus
8384 */
8385static int __build_sched_domains(const struct cpumask *cpu_map,
8386 struct sched_domain_attr *attr)
8387{ 8587{
8388 int i, err = -ENOMEM;
8389 struct root_domain *rd;
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8391 tmpmask;
8392#ifdef CONFIG_NUMA 8588#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered; 8589 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8394 struct sched_group **sched_group_nodes = NULL; 8590 return sa_none;
8395 int sd_allnodes = 0; 8591 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8396 8592 return sa_domainspan;
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) 8593 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8398 goto out; 8594 return sa_covered;
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL)) 8595 /* Allocate the per-node list of sched groups */
8400 goto free_domainspan; 8596 d->sched_group_nodes = kcalloc(nr_node_ids,
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) 8597 sizeof(struct sched_group *), GFP_KERNEL);
8402 goto free_covered; 8598 if (!d->sched_group_nodes) {
8403#endif
8404
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415
8416#ifdef CONFIG_NUMA
8417 /*
8418 * Allocate the per-node list of sched groups
8419 */
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
8421 GFP_KERNEL);
8422 if (!sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8599 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8600 return sa_notcovered;
8425 } 8601 }
8426#endif 8602 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8603#endif
8428 rd = alloc_rootdomain(); 8604 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8605 return sa_sched_group_nodes;
8606 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8607 return sa_nodemask;
8608 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8609 return sa_this_sibling_map;
8610 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8611 return sa_this_core_map;
8612 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8613 return sa_send_covered;
8614 d->rd = alloc_rootdomain();
8615 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8616 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8617 return sa_tmpmask;
8432 } 8618 }
8619 return sa_rootdomain;
8620}
8433 8621
8622static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8623 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8624{
8625 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8626#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8627 struct sched_domain *parent;
8436#endif
8437
8438 /*
8439 * Set up domains for cpus specified by the cpu_map.
8440 */
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458 8628
8459 sd = &per_cpu(node_domains, i).sd; 8629 d->sd_allnodes = 0;
8460 SD_INIT(sd, NODE); 8630 if (cpumask_weight(cpu_map) >
8631 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8632 sd = &per_cpu(allnodes_domains, i).sd;
8633 SD_INIT(sd, ALLNODES);
8461 set_domain_attribute(sd, attr); 8634 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8635 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8636 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8637 d->sd_allnodes = 1;
8465 p->child = sd; 8638 }
8466 cpumask_and(sched_domain_span(sd), 8639 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8640
8641 sd = &per_cpu(node_domains, i).sd;
8642 SD_INIT(sd, NODE);
8643 set_domain_attribute(sd, attr);
8644 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8645 sd->parent = parent;
8646 if (parent)
8647 parent->child = sd;
8648 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8649#endif
8650 return sd;
8651}
8469 8652
8470 p = sd; 8653static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8654 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8655 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8656{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8657 struct sched_domain *sd;
8475 sd->parent = p; 8658 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8659 SD_INIT(sd, CPU);
8477 p->child = sd; 8660 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8661 cpumask_copy(sched_domain_span(sd), d->nodemask);
8662 sd->parent = parent;
8663 if (parent)
8664 parent->child = sd;
8665 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8666 return sd;
8667}
8479 8668
8669static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8670 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8671 struct sched_domain *parent, int i)
8672{
8673 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8674#ifdef CONFIG_SCHED_MC
8481 p = sd; 8675 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8676 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8677 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8678 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8679 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8680 parent->child = sd;
8487 sd->parent = p; 8681 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8682#endif
8683 return sd;
8684}
8491 8685
8686static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8687 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8688 struct sched_domain *parent, int i)
8689{
8690 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8691#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8692 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8693 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8694 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8695 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8696 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8697 parent->child = sd;
8499 sd->parent = p; 8698 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8699#endif
8503 } 8700 return sd;
8701}
8504 8702
8703static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8704 const struct cpumask *cpu_map, int cpu)
8705{
8706 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8707#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8708 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8709 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8710 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8711 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8712 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8713 &cpu_to_cpu_group,
8512 8714 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8715 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8716#endif
8518
8519#ifdef CONFIG_SCHED_MC 8717#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8718 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8719 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8720 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8721 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8722 &cpu_to_core_group,
8525 8723 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8724 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8725#endif
8531 8726 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8727 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8728 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8729 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8730 &cpu_to_phys_group,
8536 continue; 8731 d->send_covered, d->tmpmask);
8537 8732 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8733#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8734 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8735 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8736 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8737 break;
8548 send_covered, tmpmask); 8738#endif
8739 default:
8740 break;
8549 } 8741 }
8742}
8550 8743
8551 for (i = 0; i < nr_node_ids; i++) { 8744/*
8552 /* Set up node groups */ 8745 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8746 * to the individual cpus
8554 int j; 8747 */
8555 8748static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8749 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8750{
8558 if (cpumask_empty(nodemask)) { 8751 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8752 struct s_data d;
8560 continue; 8753 struct sched_domain *sd;
8561 } 8754 int i;
8755#ifdef CONFIG_NUMA
8756 d.sd_allnodes = 0;
8757#endif
8562 8758
8563 sched_domain_node_span(i, domainspan); 8759 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8760 if (alloc_state != sa_rootdomain)
8761 goto error;
8762 alloc_state = sa_sched_groups;
8565 8763
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8764 /*
8567 GFP_KERNEL, i); 8765 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8766 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8767 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8768 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8769 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8770
8577 sd = &per_cpu(node_domains, j).sd; 8771 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8772 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8773 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8774 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8775 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8776
8586 for (j = 0; j < nr_node_ids; j++) { 8777 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8778 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8779 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8780 }
8588 8781
8589 cpumask_complement(notcovered, covered); 8782 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8783 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8784 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8785
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8786#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8787 /* Set up node groups */
8597 continue; 8788 if (d.sd_allnodes)
8789 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8790
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8791 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8792 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8793 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8794#endif
8616 8795
8617 /* Calculate CPU power for physical packages and nodes */ 8796 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8797#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8798 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8799 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8800 init_sched_groups_power(i, sd);
8623 } 8801 }
8624#endif 8802#endif
8625#ifdef CONFIG_SCHED_MC 8803#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8804 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8805 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8806 init_sched_groups_power(i, sd);
8630 } 8807 }
8631#endif 8808#endif
8632 8809
8633 for_each_cpu(i, cpu_map) { 8810 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8811 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8812 init_sched_groups_power(i, sd);
8637 } 8813 }
8638 8814
8639#ifdef CONFIG_NUMA 8815#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8816 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8817 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8818
8643 if (sd_allnodes) { 8819 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8820 struct sched_group *sg;
8645 8821
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8822 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8823 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8824 init_numa_sched_groups_power(sg);
8649 } 8825 }
8650#endif 8826#endif
8651 8827
8652 /* Attach the domains */ 8828 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8829 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8830#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8831 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8832#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8834,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 8834#else
8660 sd = &per_cpu(phys_domains, i).sd; 8835 sd = &per_cpu(phys_domains, i).sd;
8661#endif 8836#endif
8662 cpu_attach_domain(sd, rd, i); 8837 cpu_attach_domain(sd, d.rd, i);
8663 } 8838 }
8664 8839
8665 err = 0; 8840 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 8841 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 8842 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 8843
8694#ifdef CONFIG_NUMA
8695error: 8844error:
8696 free_sched_groups(cpu_map, tmpmask); 8845 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 8846 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 8847}
8701 8848
8702static int build_sched_domains(const struct cpumask *cpu_map) 8849static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9015,6 +9162,7 @@ void __init sched_init_smp(void)
9015 cpumask_var_t non_isolated_cpus; 9162 cpumask_var_t non_isolated_cpus;
9016 9163
9017 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9164 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9165 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9018 9166
9019#if defined(CONFIG_NUMA) 9167#if defined(CONFIG_NUMA)
9020 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9168 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9046,7 +9194,6 @@ void __init sched_init_smp(void)
9046 sched_init_granularity(); 9194 sched_init_granularity();
9047 free_cpumask_var(non_isolated_cpus); 9195 free_cpumask_var(non_isolated_cpus);
9048 9196
9049 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9050 init_sched_rt_class(); 9197 init_sched_rt_class();
9051} 9198}
9052#else 9199#else
@@ -9304,11 +9451,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9451 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9452 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9453 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9454 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9455 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9456 */
9310 init_tg_cfs_entry(&init_task_group, 9457 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9458 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9459 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9460 root_task_group.se[i]);
9314 9461
@@ -9334,6 +9481,7 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9481#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9482 rq->sd = NULL;
9336 rq->rd = NULL; 9483 rq->rd = NULL;
9484 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9485 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9486 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9487 rq->push_cpu = 0;
@@ -9392,19 +9540,26 @@ void __init sched_init(void)
9392 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9393#endif /* SMP */ 9541#endif /* SMP */
9394 9542
9395 perf_counter_init(); 9543 perf_event_init();
9396 9544
9397 scheduler_running = 1; 9545 scheduler_running = 1;
9398} 9546}
9399 9547
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9548#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9549static inline int preempt_count_equals(int preempt_offset)
9550{
9551 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9552
9553 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9554}
9555
9556void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9557{
9403#ifdef in_atomic 9558#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9559 static unsigned long prev_jiffy; /* ratelimiting */
9405 9560
9406 if ((!in_atomic() && !irqs_disabled()) || 9561 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9562 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9563 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9564 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9565 return;
@@ -10581,3 +10736,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10736 .subsys_id = cpuacct_subsys_id,
10582}; 10737};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10738#endif /* CONFIG_CGROUP_CPUACCT */
10739
10740#ifndef CONFIG_SMP
10741
10742int rcu_expedited_torture_stats(char *page)
10743{
10744 return 0;
10745}
10746EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10747
10748void synchronize_sched_expedited(void)
10749{
10750}
10751EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10752
10753#else /* #ifndef CONFIG_SMP */
10754
10755static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10756static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10757
10758#define RCU_EXPEDITED_STATE_POST -2
10759#define RCU_EXPEDITED_STATE_IDLE -1
10760
10761static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10762
10763int rcu_expedited_torture_stats(char *page)
10764{
10765 int cnt = 0;
10766 int cpu;
10767
10768 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10769 for_each_online_cpu(cpu) {
10770 cnt += sprintf(&page[cnt], " %d:%d",
10771 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10772 }
10773 cnt += sprintf(&page[cnt], "\n");
10774 return cnt;
10775}
10776EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10777
10778static long synchronize_sched_expedited_count;
10779
10780/*
10781 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10782 * approach to force grace period to end quickly. This consumes
10783 * significant time on all CPUs, and is thus not recommended for
10784 * any sort of common-case code.
10785 *
10786 * Note that it is illegal to call this function while holding any
10787 * lock that is acquired by a CPU-hotplug notifier. Failing to
10788 * observe this restriction will result in deadlock.
10789 */
10790void synchronize_sched_expedited(void)
10791{
10792 int cpu;
10793 unsigned long flags;
10794 bool need_full_sync = 0;
10795 struct rq *rq;
10796 struct migration_req *req;
10797 long snap;
10798 int trycount = 0;
10799
10800 smp_mb(); /* ensure prior mod happens before capturing snap. */
10801 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10802 get_online_cpus();
10803 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10804 put_online_cpus();
10805 if (trycount++ < 10)
10806 udelay(trycount * num_online_cpus());
10807 else {
10808 synchronize_sched();
10809 return;
10810 }
10811 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10812 smp_mb(); /* ensure test happens before caller kfree */
10813 return;
10814 }
10815 get_online_cpus();
10816 }
10817 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10818 for_each_online_cpu(cpu) {
10819 rq = cpu_rq(cpu);
10820 req = &per_cpu(rcu_migration_req, cpu);
10821 init_completion(&req->done);
10822 req->task = NULL;
10823 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10824 spin_lock_irqsave(&rq->lock, flags);
10825 list_add(&req->list, &rq->migration_queue);
10826 spin_unlock_irqrestore(&rq->lock, flags);
10827 wake_up_process(rq->migration_thread);
10828 }
10829 for_each_online_cpu(cpu) {
10830 rcu_expedited_state = cpu;
10831 req = &per_cpu(rcu_migration_req, cpu);
10832 rq = cpu_rq(cpu);
10833 wait_for_completion(&req->done);
10834 spin_lock_irqsave(&rq->lock, flags);
10835 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10836 need_full_sync = 1;
10837 req->dest_cpu = RCU_MIGRATION_IDLE;
10838 spin_unlock_irqrestore(&rq->lock, flags);
10839 }
10840 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10841 mutex_unlock(&rcu_sched_expedited_mutex);
10842 put_online_cpus();
10843 if (need_full_sync)
10844 synchronize_sched();
10845}
10846EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10847
10848#endif /* #else #ifndef CONFIG_SMP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both locks
149 * taken to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
127 127
128 /* 128 /*
129 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
130 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
131 */ 134 */
132 if (likely(oldpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
134
135 spin_lock_irqsave(&vec->lock, flags);
136
137 vec->count--;
138 if (!vec->count)
139 clear_bit(oldpri, cp->pri_active);
140 cpumask_clear_cpu(cpu, vec->mask);
141
142 spin_unlock_irqrestore(&vec->lock, flags);
143 }
144
145 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
146 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
147 137
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 144
155 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
156 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
157 159
158 *currpri = newpri; 160 *currpri = newpri;
159} 161}
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
@@ -409,6 +410,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 410 PN(se.wait_max);
410 PN(se.wait_sum); 411 PN(se.wait_sum);
411 P(se.wait_count); 412 P(se.wait_count);
413 PN(se.iowait_sum);
414 P(se.iowait_count);
412 P(sched_info.bkl_count); 415 P(sched_info.bkl_count);
413 P(se.nr_migrations); 416 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 417 P(se.nr_migrations_cold);
@@ -479,6 +482,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 482 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 483 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 484 p->se.wait_count = 0;
485 p->se.iowait_sum = 0;
486 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 487 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 488 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 489 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..ecc637a0d591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -505,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
505 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
506 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
507 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
508 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
509 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
510 } 519 }
@@ -537,6 +546,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
537 schedstat_set(se->wait_count, se->wait_count + 1); 546 schedstat_set(se->wait_count, se->wait_count + 1);
538 schedstat_set(se->wait_sum, se->wait_sum + 547 schedstat_set(se->wait_sum, se->wait_sum +
539 rq_of(cfs_rq)->clock - se->wait_start); 548 rq_of(cfs_rq)->clock - se->wait_start);
549#ifdef CONFIG_SCHEDSTATS
550 if (entity_is_task(se)) {
551 trace_sched_stat_wait(task_of(se),
552 rq_of(cfs_rq)->clock - se->wait_start);
553 }
554#endif
540 schedstat_set(se->wait_start, 0); 555 schedstat_set(se->wait_start, 0);
541} 556}
542 557
@@ -628,8 +643,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
628 se->sleep_start = 0; 643 se->sleep_start = 0;
629 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
630 645
631 if (tsk) 646 if (tsk) {
632 account_scheduler_latency(tsk, delta >> 10, 1); 647 account_scheduler_latency(tsk, delta >> 10, 1);
648 trace_sched_stat_sleep(tsk, delta);
649 }
633 } 650 }
634 if (se->block_start) { 651 if (se->block_start) {
635 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 652 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -644,6 +661,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
644 se->sum_sleep_runtime += delta; 661 se->sum_sleep_runtime += delta;
645 662
646 if (tsk) { 663 if (tsk) {
664 if (tsk->in_iowait) {
665 se->iowait_sum += delta;
666 se->iowait_count++;
667 trace_sched_stat_iowait(tsk, delta);
668 }
669
647 /* 670 /*
648 * Blocking time is in units of nanosecs, so shift by 671 * Blocking time is in units of nanosecs, so shift by
649 * 20 to get a milliseconds-range estimation of the 672 * 20 to get a milliseconds-range estimation of the
@@ -687,29 +710,33 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
687 if (initial && sched_feat(START_DEBIT)) 710 if (initial && sched_feat(START_DEBIT))
688 vruntime += sched_vslice(cfs_rq, se); 711 vruntime += sched_vslice(cfs_rq, se);
689 712
690 if (!initial) { 713 /* sleeps up to a single latency don't count. */
691 /* sleeps upto a single latency don't count. */ 714 if (!initial && sched_feat(FAIR_SLEEPERS)) {
692 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 unsigned long thresh = sysctl_sched_latency;
693 unsigned long thresh = sysctl_sched_latency;
694 716
695 /* 717 /*
696 * Convert the sleeper threshold into virtual time. 718 * Convert the sleeper threshold into virtual time.
697 * SCHED_IDLE is a special sub-class. We care about 719 * SCHED_IDLE is a special sub-class. We care about
698 * fairness only relative to other SCHED_IDLE tasks, 720 * fairness only relative to other SCHED_IDLE tasks,
699 * all of which have the same weight. 721 * all of which have the same weight.
700 */ 722 */
701 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
702 (!entity_is_task(se) || 724 task_of(se)->policy != SCHED_IDLE))
703 task_of(se)->policy != SCHED_IDLE)) 725 thresh = calc_delta_fair(thresh, se);
704 thresh = calc_delta_fair(thresh, se);
705 726
706 vruntime -= thresh; 727 /*
707 } 728 * Halve their sleep time's effect, to allow
729 * for a gentler effect of sleepers:
730 */
731 if (sched_feat(GENTLE_FAIR_SLEEPERS))
732 thresh >>= 1;
708 733
709 /* ensure we never gain time by being placed backwards. */ 734 vruntime -= thresh;
710 vruntime = max_vruntime(se->vruntime, vruntime);
711 } 735 }
712 736
737 /* ensure we never gain time by being placed backwards. */
738 vruntime = max_vruntime(se->vruntime, vruntime);
739
713 se->vruntime = vruntime; 740 se->vruntime = vruntime;
714} 741}
715 742
@@ -735,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
735 762
736static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 763static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
737{ 764{
738 if (cfs_rq->last == se) 765 if (!se || cfs_rq->last == se)
739 cfs_rq->last = NULL; 766 cfs_rq->last = NULL;
740 767
741 if (cfs_rq->next == se) 768 if (!se || cfs_rq->next == se)
742 cfs_rq->next = NULL; 769 cfs_rq->next = NULL;
743} 770}
744 771
@@ -1040,79 +1067,6 @@ static void yield_task_fair(struct rq *rq)
1040 se->vruntime = rightmost->vruntime + 1; 1067 se->vruntime = rightmost->vruntime + 1;
1041} 1068}
1042 1069
1043/*
1044 * wake_idle() will wake a task on an idle cpu if task->cpu is
1045 * not idle and an idle cpu is available. The span of cpus to
1046 * search starts with cpus closest then further out as needed,
1047 * so we always favor a closer, idle cpu.
1048 * Domains may include CPUs that are not usable for migration,
1049 * hence we need to mask them out (cpu_active_mask)
1050 *
1051 * Returns the CPU we should wake onto.
1052 */
1053#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1054static int wake_idle(int cpu, struct task_struct *p)
1055{
1056 struct sched_domain *sd;
1057 int i;
1058 unsigned int chosen_wakeup_cpu;
1059 int this_cpu;
1060
1061 /*
1062 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1063 * are idle and this is not a kernel thread and this task's affinity
1064 * allows it to be moved to preferred cpu, then just move!
1065 */
1066
1067 this_cpu = smp_processor_id();
1068 chosen_wakeup_cpu =
1069 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1070
1071 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1072 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1073 p->mm && !(p->flags & PF_KTHREAD) &&
1074 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1075 return chosen_wakeup_cpu;
1076
1077 /*
1078 * If it is idle, then it is the best cpu to run this task.
1079 *
1080 * This cpu is also the best, if it has more than one task already.
1081 * Siblings must be also busy(in most cases) as they didn't already
1082 * pickup the extra load from this cpu and hence we need not check
1083 * sibling runqueue info. This will avoid the checks and cache miss
1084 * penalities associated with that.
1085 */
1086 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1087 return cpu;
1088
1089 for_each_domain(cpu, sd) {
1090 if ((sd->flags & SD_WAKE_IDLE)
1091 || ((sd->flags & SD_WAKE_IDLE_FAR)
1092 && !task_hot(p, task_rq(p)->clock, sd))) {
1093 for_each_cpu_and(i, sched_domain_span(sd),
1094 &p->cpus_allowed) {
1095 if (cpu_active(i) && idle_cpu(i)) {
1096 if (i != task_cpu(p)) {
1097 schedstat_inc(p,
1098 se.nr_wakeups_idle);
1099 }
1100 return i;
1101 }
1102 }
1103 } else {
1104 break;
1105 }
1106 }
1107 return cpu;
1108}
1109#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1110static inline int wake_idle(int cpu, struct task_struct *p)
1111{
1112 return cpu;
1113}
1114#endif
1115
1116#ifdef CONFIG_SMP 1070#ifdef CONFIG_SMP
1117 1071
1118#ifdef CONFIG_FAIR_GROUP_SCHED 1072#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1199,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1199 1153
1200#endif 1154#endif
1201 1155
1202static int 1156static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1203wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1204 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1205 int idx, unsigned long load, unsigned long this_load,
1206 unsigned int imbalance)
1207{ 1157{
1208 struct task_struct *curr = this_rq->curr; 1158 struct task_struct *curr = current;
1209 struct task_group *tg; 1159 unsigned long this_load, load;
1210 unsigned long tl = this_load; 1160 int idx, this_cpu, prev_cpu;
1211 unsigned long tl_per_task; 1161 unsigned long tl_per_task;
1162 unsigned int imbalance;
1163 struct task_group *tg;
1212 unsigned long weight; 1164 unsigned long weight;
1213 int balanced; 1165 int balanced;
1214 1166
1215 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1167 idx = sd->wake_idx;
1216 return 0; 1168 this_cpu = smp_processor_id();
1169 prev_cpu = task_cpu(p);
1170 load = source_load(prev_cpu, idx);
1171 this_load = target_load(this_cpu, idx);
1217 1172
1218 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1173 if (sync) {
1219 p->se.avg_overlap > sysctl_sched_migration_cost)) 1174 if (sched_feat(SYNC_LESS) &&
1220 sync = 0; 1175 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1177 sync = 0;
1178 } else {
1179 if (sched_feat(SYNC_MORE) &&
1180 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1181 p->se.avg_overlap < sysctl_sched_migration_cost))
1182 sync = 1;
1183 }
1221 1184
1222 /* 1185 /*
1223 * If sync wakeup then subtract the (maximum possible) 1186 * If sync wakeup then subtract the (maximum possible)
@@ -1228,14 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1228 tg = task_group(current); 1191 tg = task_group(current);
1229 weight = current->se.load.weight; 1192 weight = current->se.load.weight;
1230 1193
1231 tl += effective_load(tg, this_cpu, -weight, -weight); 1194 this_load += effective_load(tg, this_cpu, -weight, -weight);
1232 load += effective_load(tg, prev_cpu, 0, -weight); 1195 load += effective_load(tg, prev_cpu, 0, -weight);
1233 } 1196 }
1234 1197
1235 tg = task_group(p); 1198 tg = task_group(p);
1236 weight = p->se.load.weight; 1199 weight = p->se.load.weight;
1237 1200
1238 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1201 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1202
1203 /*
1204 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1205 * due to the sync cause above having dropped this_load to 0, we'll
1206 * always have an imbalance, but there's really nothing you can do
1207 * about that, so that's good too.
1208 *
1209 * Otherwise check if either cpus are near enough in load to allow this
1210 * task to be woken on this_cpu.
1211 */
1212 balanced = !this_load ||
1213 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1239 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1214 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1240 1215
1241 /* 1216 /*
@@ -1249,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1249 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1224 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1250 tl_per_task = cpu_avg_load_per_task(this_cpu); 1225 tl_per_task = cpu_avg_load_per_task(this_cpu);
1251 1226
1252 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1227 if (balanced ||
1253 tl_per_task)) { 1228 (this_load <= load &&
1229 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1254 /* 1230 /*
1255 * This domain has SD_WAKE_AFFINE and 1231 * This domain has SD_WAKE_AFFINE and
1256 * p is cache cold in this domain, and 1232 * p is cache cold in this domain, and
1257 * there is no bad imbalance. 1233 * there is no bad imbalance.
1258 */ 1234 */
1259 schedstat_inc(this_sd, ttwu_move_affine); 1235 schedstat_inc(sd, ttwu_move_affine);
1260 schedstat_inc(p, se.nr_wakeups_affine); 1236 schedstat_inc(p, se.nr_wakeups_affine);
1261 1237
1262 return 1; 1238 return 1;
@@ -1264,67 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1264 return 0; 1240 return 0;
1265} 1241}
1266 1242
1267static int select_task_rq_fair(struct task_struct *p, int sync) 1243/*
1244 * find_idlest_group finds and returns the least busy CPU group within the
1245 * domain.
1246 */
1247static struct sched_group *
1248find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1249 int this_cpu, int load_idx)
1268{ 1250{
1269 struct sched_domain *sd, *this_sd = NULL; 1251 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1270 int prev_cpu, this_cpu, new_cpu; 1252 unsigned long min_load = ULONG_MAX, this_load = 0;
1271 unsigned long load, this_load; 1253 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1272 struct rq *this_rq;
1273 unsigned int imbalance;
1274 int idx;
1275 1254
1276 prev_cpu = task_cpu(p); 1255 do {
1277 this_cpu = smp_processor_id(); 1256 unsigned long load, avg_load;
1278 this_rq = cpu_rq(this_cpu); 1257 int local_group;
1279 new_cpu = prev_cpu; 1258 int i;
1280 1259
1281 if (prev_cpu == this_cpu) 1260 /* Skip over this group if it has no CPUs allowed */
1282 goto out; 1261 if (!cpumask_intersects(sched_group_cpus(group),
1283 /* 1262 &p->cpus_allowed))
1284 * 'this_sd' is the first domain that both 1263 continue;
1285 * this_cpu and prev_cpu are present in: 1264
1286 */ 1265 local_group = cpumask_test_cpu(this_cpu,
1287 for_each_domain(this_cpu, sd) { 1266 sched_group_cpus(group));
1288 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1267
1289 this_sd = sd; 1268 /* Tally up the load of all CPUs in the group */
1290 break; 1269 avg_load = 0;
1270
1271 for_each_cpu(i, sched_group_cpus(group)) {
1272 /* Bias balancing toward cpus of our domain */
1273 if (local_group)
1274 load = source_load(i, load_idx);
1275 else
1276 load = target_load(i, load_idx);
1277
1278 avg_load += load;
1279 }
1280
1281 /* Adjust by relative CPU power of the group */
1282 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1283
1284 if (local_group) {
1285 this_load = avg_load;
1286 this = group;
1287 } else if (avg_load < min_load) {
1288 min_load = avg_load;
1289 idlest = group;
1290 }
1291 } while (group = group->next, group != sd->groups);
1292
1293 if (!idlest || 100*this_load < imbalance*min_load)
1294 return NULL;
1295 return idlest;
1296}
1297
1298/*
1299 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1300 */
1301static int
1302find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1303{
1304 unsigned long load, min_load = ULONG_MAX;
1305 int idlest = -1;
1306 int i;
1307
1308 /* Traverse only the allowed CPUs */
1309 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1310 load = weighted_cpuload(i);
1311
1312 if (load < min_load || (load == min_load && i == this_cpu)) {
1313 min_load = load;
1314 idlest = i;
1291 } 1315 }
1292 } 1316 }
1293 1317
1294 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1318 return idlest;
1295 goto out; 1319}
1296 1320
1297 /* 1321/*
1298 * Check for affine wakeup and passive balancing possibilities. 1322 * sched_balance_self: balance the current task (running on cpu) in domains
1299 */ 1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1300 if (!this_sd) 1324 * SD_BALANCE_EXEC.
1325 *
1326 * Balance, ie. select the least loaded group.
1327 *
1328 * Returns the target CPU number, or the same CPU if no balancing is needed.
1329 *
1330 * preempt must be disabled.
1331 */
1332static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1333{
1334 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1335 int cpu = smp_processor_id();
1336 int prev_cpu = task_cpu(p);
1337 int new_cpu = cpu;
1338 int want_affine = 0;
1339 int want_sd = 1;
1340 int sync = wake_flags & WF_SYNC;
1341
1342 if (sd_flag & SD_BALANCE_WAKE) {
1343 if (sched_feat(AFFINE_WAKEUPS) &&
1344 cpumask_test_cpu(cpu, &p->cpus_allowed))
1345 want_affine = 1;
1346 new_cpu = prev_cpu;
1347 }
1348
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) {
1351 /*
1352 * If power savings logic is enabled for a domain, see if we
1353 * are not overloaded, if so, don't balance wider.
1354 */
1355 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1356 unsigned long power = 0;
1357 unsigned long nr_running = 0;
1358 unsigned long capacity;
1359 int i;
1360
1361 for_each_cpu(i, sched_domain_span(tmp)) {
1362 power += power_of(i);
1363 nr_running += cpu_rq(i)->cfs.nr_running;
1364 }
1365
1366 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1367
1368 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1369 nr_running /= 2;
1370
1371 if (nr_running < capacity)
1372 want_sd = 0;
1373 }
1374
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1377
1378 affine_sd = tmp;
1379 want_affine = 0;
1380 }
1381
1382 if (!want_sd && !want_affine)
1383 break;
1384
1385 if (!(tmp->flags & sd_flag))
1386 continue;
1387
1388 if (want_sd)
1389 sd = tmp;
1390 }
1391
1392 if (sched_feat(LB_SHARES_UPDATE)) {
1393 /*
1394 * Pick the largest domain to update shares over
1395 */
1396 tmp = sd;
1397 if (affine_sd && (!tmp ||
1398 cpumask_weight(sched_domain_span(affine_sd)) >
1399 cpumask_weight(sched_domain_span(sd))))
1400 tmp = affine_sd;
1401
1402 if (tmp)
1403 update_shares(tmp);
1404 }
1405
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1407 new_cpu = cpu;
1301 goto out; 1408 goto out;
1409 }
1410
1411 while (sd) {
1412 int load_idx = sd->forkexec_idx;
1413 struct sched_group *group;
1414 int weight;
1302 1415
1303 idx = this_sd->wake_idx; 1416 if (!(sd->flags & sd_flag)) {
1417 sd = sd->child;
1418 continue;
1419 }
1304 1420
1305 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1421 if (sd_flag & SD_BALANCE_WAKE)
1422 load_idx = sd->wake_idx;
1306 1423
1307 load = source_load(prev_cpu, idx); 1424 group = find_idlest_group(sd, p, cpu, load_idx);
1308 this_load = target_load(this_cpu, idx); 1425 if (!group) {
1426 sd = sd->child;
1427 continue;
1428 }
1309 1429
1310 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1430 new_cpu = find_idlest_cpu(group, p, cpu);
1311 load, this_load, imbalance)) 1431 if (new_cpu == -1 || new_cpu == cpu) {
1312 return this_cpu; 1432 /* Now try balancing at a lower domain level of cpu */
1433 sd = sd->child;
1434 continue;
1435 }
1313 1436
1314 /* 1437 /* Now try balancing at a lower domain level of new_cpu */
1315 * Start passive balancing when half the imbalance_pct 1438 cpu = new_cpu;
1316 * limit is reached. 1439 weight = cpumask_weight(sched_domain_span(sd));
1317 */ 1440 sd = NULL;
1318 if (this_sd->flags & SD_WAKE_BALANCE) { 1441 for_each_domain(cpu, tmp) {
1319 if (imbalance*this_load <= 100*load) { 1442 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1320 schedstat_inc(this_sd, ttwu_move_balance); 1443 break;
1321 schedstat_inc(p, se.nr_wakeups_passive); 1444 if (tmp->flags & sd_flag)
1322 return this_cpu; 1445 sd = tmp;
1323 } 1446 }
1447 /* while loop will break here if sd == NULL */
1324 } 1448 }
1325 1449
1326out: 1450out:
1327 return wake_idle(new_cpu, p); 1451 rcu_read_unlock();
1452 return new_cpu;
1328} 1453}
1329#endif /* CONFIG_SMP */ 1454#endif /* CONFIG_SMP */
1330 1455
@@ -1437,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
1437/* 1562/*
1438 * Preempt the current task with a newly woken task if needed: 1563 * Preempt the current task with a newly woken task if needed:
1439 */ 1564 */
1440static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1565static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1441{ 1566{
1442 struct task_struct *curr = rq->curr; 1567 struct task_struct *curr = rq->curr;
1443 struct sched_entity *se = &curr->se, *pse = &p->se; 1568 struct sched_entity *se = &curr->se, *pse = &p->se;
1444 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC;
1445 1571
1446 update_curr(cfs_rq); 1572 update_curr(cfs_rq);
1447 1573
@@ -1467,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1467 */ 1593 */
1468 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1469 set_last_buddy(se); 1595 set_last_buddy(se);
1470 set_next_buddy(pse); 1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse);
1471 1598
1472 /* 1599 /*
1473 * We can come here with TIF_NEED_RESCHED already set from new task 1600 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1489,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1489 return; 1616 return;
1490 } 1617 }
1491 1618
1492 if (!sched_feat(WAKEUP_PREEMPT)) 1619 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1493 return; 1620 (sched_feat(WAKEUP_OVERLAP) &&
1494 1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1495 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1496 (se->avg_overlap < sysctl_sched_migration_cost &&
1497 pse->avg_overlap < sysctl_sched_migration_cost))) {
1498 resched_task(curr); 1623 resched_task(curr);
1499 return; 1624 return;
1500 } 1625 }
1501 1626
1627 if (sched_feat(WAKEUP_RUNNING)) {
1628 if (pse->avg_running < se->avg_running) {
1629 set_next_buddy(pse);
1630 resched_task(curr);
1631 return;
1632 }
1633 }
1634
1635 if (!sched_feat(WAKEUP_PREEMPT))
1636 return;
1637
1502 find_matching_se(&se, &pse); 1638 find_matching_se(&se, &pse);
1503 1639
1504 BUG_ON(!pse); 1640 BUG_ON(!pse);
@@ -1521,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1521 /* 1657 /*
1522 * If se was a buddy, clear it so that it will have to earn 1658 * If se was a buddy, clear it so that it will have to earn
1523 * the favour again. 1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was elegible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1524 */ 1665 */
1525 __clear_buddies(cfs_rq, se); 1666 __clear_buddies(cfs_rq, NULL);
1526 set_next_entity(cfs_rq, se); 1667 set_next_entity(cfs_rq, se);
1527 cfs_rq = group_cfs_rq(se); 1668 cfs_rq = group_cfs_rq(se);
1528 } while (cfs_rq); 1669 } while (cfs_rq);
@@ -1721,6 +1862,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1721 sched_info_queued(p); 1862 sched_info_queued(p);
1722 1863
1723 update_curr(cfs_rq); 1864 update_curr(cfs_rq);
1865 if (curr)
1866 se->vruntime = curr->vruntime;
1724 place_entity(cfs_rq, se, 1); 1867 place_entity(cfs_rq, se, 1);
1725 1868
1726 /* 'curr' will be NULL if the child belongs to a different group */ 1869 /* 'curr' will be NULL if the child belongs to a different group */
@@ -1796,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
1796} 1939}
1797#endif 1940#endif
1798 1941
1942unsigned int get_rr_interval_fair(struct task_struct *task)
1943{
1944 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0;
1948
1949 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue:
1952 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957
1958 return rr_interval;
1959}
1960
1799/* 1961/*
1800 * All the scheduling class methods: 1962 * All the scheduling class methods:
1801 */ 1963 */
@@ -1824,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
1824 .prio_changed = prio_changed_fair, 1986 .prio_changed = prio_changed_fair,
1825 .switched_to = switched_to_fair, 1987 .switched_to = switched_to_fair,
1826 1988
1989 .get_rr_interval = get_rr_interval_fair,
1990
1827#ifdef CONFIG_FAIR_GROUP_SCHED 1991#ifdef CONFIG_FAIR_GROUP_SCHED
1828 .moved_group = moved_group_fair, 1992 .moved_group = moved_group_fair,
1829#endif 1993#endif
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
89 * wakeup-preemption), since its likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
102 * Consider buddies to be cache hot, decreases the likelyness of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 11 return task_cpu(p); /* IDLE tasks as never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task)
101{
102 return 0;
103}
104
100/* 105/*
101 * Simple, special scheduling class for the per-CPU idle tasks: 106 * Simple, special scheduling class for the per-CPU idle tasks:
102 */ 107 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
122 .set_curr_task = set_curr_task_idle, 127 .set_curr_task = set_curr_task_idle,
123 .task_tick = task_tick_idle, 128 .task_tick = task_tick_idle,
124 129
130 .get_rr_interval = get_rr_interval_idle,
131
125 .prio_changed = prio_changed_idle, 132 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle, 133 .switched_to = switched_to_idle,
127 134
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..a4d790cddb19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
16{ 19{
17 return rt_rq->rq; 20 return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
26 29
27#define rt_entity_is_task(rt_se) (1) 30#define rt_entity_is_task(rt_se) (1)
28 31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
30{ 38{
31 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
128 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
129} 137}
130 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
131#else 144#else
132 145
133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
602 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
603 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
604 617
618 sched_rt_avg_update(rq, delta_exec);
619
605 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
606 return; 621 return;
607 622
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
874 889
875 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
876 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
877
878 inc_cpu_load(rq, p->se.load.weight);
879} 892}
880 893
881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
886 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
887 900
888 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
889
890 dec_cpu_load(rq, p->se.load.weight);
891} 902}
892 903
893/* 904/*
@@ -927,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
927#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
928static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
929 940
930static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
931{ 942{
932 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
933 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
934 /* 948 /*
935 * If the current task is an RT task, then 949 * If the current task is an RT task, then
936 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -988,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
988/* 1002/*
989 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
990 */ 1004 */
991static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
992{ 1006{
993 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
994 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1064,6 +1078,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1064 if (p) 1078 if (p)
1065 dequeue_pushable_task(rq, p); 1079 dequeue_pushable_task(rq, p);
1066 1080
1081#ifdef CONFIG_SMP
1082 /*
1083 * We detect this state here so that we can avoid taking the RQ
1084 * lock again later if there is no need to push
1085 */
1086 rq->post_schedule = has_pushable_tasks(rq);
1087#endif
1088
1067 return p; 1089 return p;
1068} 1090}
1069 1091
@@ -1162,13 +1184,6 @@ static int find_lowest_rq(struct task_struct *task)
1162 return -1; /* No targets found */ 1184 return -1; /* No targets found */
1163 1185
1164 /* 1186 /*
1165 * Only consider CPUs that are usable for migration.
1166 * I guess we might want to change cpupri_find() to ignore those
1167 * in the first place.
1168 */
1169 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171 /*
1172 * At this point we have built a mask of cpus representing the 1187 * At this point we have built a mask of cpus representing the
1173 * lowest priority tasks in the system. Now we want to elect 1188 * lowest priority tasks in the system. Now we want to elect
1174 * the best one based on our affinity and topology. 1189 * the best one based on our affinity and topology.
@@ -1262,11 +1277,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1262 return lowest_rq; 1277 return lowest_rq;
1263} 1278}
1264 1279
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267 return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq) 1280static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{ 1281{
1272 struct task_struct *p; 1282 struct task_struct *p;
@@ -1466,23 +1476,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1466 pull_rt_task(rq); 1476 pull_rt_task(rq);
1467} 1477}
1468 1478
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474 return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq) 1479static void post_schedule_rt(struct rq *rq)
1478{ 1480{
1479 /*
1480 * This is only called if needs_post_schedule_rt() indicates that
1481 * we need to push tasks away
1482 */
1483 spin_lock_irq(&rq->lock);
1484 push_rt_tasks(rq); 1481 push_rt_tasks(rq);
1485 spin_unlock_irq(&rq->lock);
1486} 1482}
1487 1483
1488/* 1484/*
@@ -1738,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
1738 dequeue_pushable_task(rq, p); 1734 dequeue_pushable_task(rq, p);
1739} 1735}
1740 1736
1737unsigned int get_rr_interval_rt(struct task_struct *task)
1738{
1739 /*
1740 * Time slice is 0 for SCHED_FIFO tasks
1741 */
1742 if (task->policy == SCHED_RR)
1743 return DEF_TIMESLICE;
1744 else
1745 return 0;
1746}
1747
1741static const struct sched_class rt_sched_class = { 1748static const struct sched_class rt_sched_class = {
1742 .next = &fair_sched_class, 1749 .next = &fair_sched_class,
1743 .enqueue_task = enqueue_task_rt, 1750 .enqueue_task = enqueue_task_rt,
@@ -1758,7 +1765,6 @@ static const struct sched_class rt_sched_class = {
1758 .rq_online = rq_online_rt, 1765 .rq_online = rq_online_rt,
1759 .rq_offline = rq_offline_rt, 1766 .rq_offline = rq_offline_rt,
1760 .pre_schedule = pre_schedule_rt, 1767 .pre_schedule = pre_schedule_rt,
1761 .needs_post_schedule = needs_post_schedule_rt,
1762 .post_schedule = post_schedule_rt, 1768 .post_schedule = post_schedule_rt,
1763 .task_wake_up = task_wake_up_rt, 1769 .task_wake_up = task_wake_up_rt,
1764 .switched_from = switched_from_rt, 1770 .switched_from = switched_from_rt,
@@ -1767,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
1767 .set_curr_task = set_curr_task_rt, 1773 .set_curr_task = set_curr_task_rt,
1768 .task_tick = task_tick_rt, 1774 .task_tick = task_tick_rt,
1769 1775
1776 .get_rr_interval = get_rr_interval_rt,
1777
1770 .prio_changed = prio_changed_rt, 1778 .prio_changed = prio_changed_rt,
1771 .switched_to = switched_to_rt, 1779 .switched_to = switched_to_rt,
1772}; 1780};
diff --git a/kernel/smp.c b/kernel/smp.c
index 94188b8ecc33..fd47a256a24e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,8 +29,7 @@ enum {
29 29
30struct call_function_data { 30struct call_function_data {
31 struct call_single_data csd; 31 struct call_single_data csd;
32 spinlock_t lock; 32 atomic_t refs;
33 unsigned int refs;
34 cpumask_var_t cpumask; 33 cpumask_var_t cpumask;
35}; 34};
36 35
@@ -39,9 +38,7 @@ struct call_single_queue {
39 spinlock_t lock; 38 spinlock_t lock;
40}; 39};
41 40
42static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
43 .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
44};
45 42
46static int 43static int
47hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -177,6 +174,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 174 int cpu = get_cpu();
178 175
179 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online.
178 */
179 WARN_ON_ONCE(!cpu_online(cpu));
180
181 /*
180 * Ensure entry is visible on call_function_queue after we have 182 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 183 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 184 * If we don't have this, then we may miss an entry on the list
@@ -191,25 +193,18 @@ void generic_smp_call_function_interrupt(void)
191 list_for_each_entry_rcu(data, &call_function.queue, csd.list) { 193 list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
192 int refs; 194 int refs;
193 195
194 spin_lock(&data->lock); 196 if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
195 if (!cpumask_test_cpu(cpu, data->cpumask)) {
196 spin_unlock(&data->lock);
197 continue; 197 continue;
198 }
199 cpumask_clear_cpu(cpu, data->cpumask);
200 spin_unlock(&data->lock);
201 198
202 data->csd.func(data->csd.info); 199 data->csd.func(data->csd.info);
203 200
204 spin_lock(&data->lock); 201 refs = atomic_dec_return(&data->refs);
205 WARN_ON(data->refs == 0); 202 WARN_ON(refs < 0);
206 refs = --data->refs;
207 if (!refs) { 203 if (!refs) {
208 spin_lock(&call_function.lock); 204 spin_lock(&call_function.lock);
209 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
210 spin_unlock(&call_function.lock); 206 spin_unlock(&call_function.lock);
211 } 207 }
212 spin_unlock(&data->lock);
213 208
214 if (refs) 209 if (refs)
215 continue; 210 continue;
@@ -230,6 +225,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 225 unsigned int data_flags;
231 LIST_HEAD(list); 226 LIST_HEAD(list);
232 227
228 /*
229 * Shouldn't receive this interrupt on a cpu that is not yet online.
230 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232
233 spin_lock(&q->lock); 233 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 234 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 235 spin_unlock(&q->lock);
@@ -285,8 +285,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 285 */
286 this_cpu = get_cpu(); 286 this_cpu = get_cpu();
287 287
288 /* Can deadlock when called with interrupts disabled */ 288 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 289 * Can deadlock when called with interrupts disabled.
290 * We allow cpu's that are not yet online though, as no one else can
291 * send smp call function interrupt to this cpu and as such deadlocks
292 * can't happen.
293 */
294 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
295 && !oops_in_progress);
290 296
291 if (cpu == this_cpu) { 297 if (cpu == this_cpu) {
292 local_irq_save(flags); 298 local_irq_save(flags);
@@ -329,8 +335,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 335{
330 csd_lock(data); 336 csd_lock(data);
331 337
332 /* Can deadlock when called with interrupts disabled */ 338 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 339 * Can deadlock when called with interrupts disabled.
340 * We allow cpu's that are not yet online though, as no one else can
341 * send smp call function interrupt to this cpu and as such deadlocks
342 * can't happen.
343 */
344 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
345 && !oops_in_progress);
334 346
335 generic_exec_single(cpu, data, wait); 347 generic_exec_single(cpu, data, wait);
336} 348}
@@ -365,8 +377,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 377 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 378 int cpu, next_cpu, this_cpu = smp_processor_id();
367 379
368 /* Can deadlock when called with interrupts disabled */ 380 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 381 * Can deadlock when called with interrupts disabled.
382 * We allow cpu's that are not yet online though, as no one else can
383 * send smp call function interrupt to this cpu and as such deadlocks
384 * can't happen.
385 */
386 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
387 && !oops_in_progress);
370 388
371 /* So, what's a CPU they want? Ignoring this one. */ 389 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 390 cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -391,23 +409,20 @@ void smp_call_function_many(const struct cpumask *mask,
391 data = &__get_cpu_var(cfd_data); 409 data = &__get_cpu_var(cfd_data);
392 csd_lock(&data->csd); 410 csd_lock(&data->csd);
393 411
394 spin_lock_irqsave(&data->lock, flags);
395 data->csd.func = func; 412 data->csd.func = func;
396 data->csd.info = info; 413 data->csd.info = info;
397 cpumask_and(data->cpumask, mask, cpu_online_mask); 414 cpumask_and(data->cpumask, mask, cpu_online_mask);
398 cpumask_clear_cpu(this_cpu, data->cpumask); 415 cpumask_clear_cpu(this_cpu, data->cpumask);
399 data->refs = cpumask_weight(data->cpumask); 416 atomic_set(&data->refs, cpumask_weight(data->cpumask));
400 417
401 spin_lock(&call_function.lock); 418 spin_lock_irqsave(&call_function.lock, flags);
402 /* 419 /*
403 * Place entry at the _HEAD_ of the list, so that any cpu still 420 * Place entry at the _HEAD_ of the list, so that any cpu still
404 * observing the entry in generic_smp_call_function_interrupt() 421 * observing the entry in generic_smp_call_function_interrupt()
405 * will not miss any other list entries: 422 * will not miss any other list entries:
406 */ 423 */
407 list_add_rcu(&data->csd.list, &call_function.queue); 424 list_add_rcu(&data->csd.list, &call_function.queue);
408 spin_unlock(&call_function.lock); 425 spin_unlock_irqrestore(&call_function.lock, flags);
409
410 spin_unlock_irqrestore(&data->lock, flags);
411 426
412 /* 427 /*
413 * Make the list addition visible before sending the ipi. 428 * Make the list addition visible before sending the ipi.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a0485..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
722 cond_resched(); 722 cond_resched();
723 preempt_disable(); 723 preempt_disable();
724 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
725 } 725 }
726 preempt_enable(); 726 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..ebcb15611728 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/resource.h> 18#include <linux/resource.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1338 unsigned long flags; 1338 unsigned long flags;
1339 cputime_t utime, stime; 1339 cputime_t utime, stime;
1340 struct task_cputime cputime; 1340 struct task_cputime cputime;
1341 unsigned long maxrss = 0;
1341 1342
1342 memset((char *) r, 0, sizeof *r); 1343 memset((char *) r, 0, sizeof *r);
1343 utime = stime = cputime_zero; 1344 utime = stime = cputime_zero;
@@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1346 utime = task_utime(current); 1347 utime = task_utime(current);
1347 stime = task_stime(current); 1348 stime = task_stime(current);
1348 accumulate_thread_rusage(p, r); 1349 accumulate_thread_rusage(p, r);
1350 maxrss = p->signal->maxrss;
1349 goto out; 1351 goto out;
1350 } 1352 }
1351 1353
@@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1363 r->ru_majflt = p->signal->cmaj_flt; 1365 r->ru_majflt = p->signal->cmaj_flt;
1364 r->ru_inblock = p->signal->cinblock; 1366 r->ru_inblock = p->signal->cinblock;
1365 r->ru_oublock = p->signal->coublock; 1367 r->ru_oublock = p->signal->coublock;
1368 maxrss = p->signal->cmaxrss;
1366 1369
1367 if (who == RUSAGE_CHILDREN) 1370 if (who == RUSAGE_CHILDREN)
1368 break; 1371 break;
@@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1377 r->ru_majflt += p->signal->maj_flt; 1380 r->ru_majflt += p->signal->maj_flt;
1378 r->ru_inblock += p->signal->inblock; 1381 r->ru_inblock += p->signal->inblock;
1379 r->ru_oublock += p->signal->oublock; 1382 r->ru_oublock += p->signal->oublock;
1383 if (maxrss < p->signal->maxrss)
1384 maxrss = p->signal->maxrss;
1380 t = p; 1385 t = p;
1381 do { 1386 do {
1382 accumulate_thread_rusage(t, r); 1387 accumulate_thread_rusage(t, r);
@@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1392out: 1397out:
1393 cputime_to_timeval(utime, &r->ru_utime); 1398 cputime_to_timeval(utime, &r->ru_utime);
1394 cputime_to_timeval(stime, &r->ru_stime); 1399 cputime_to_timeval(stime, &r->ru_stime);
1400
1401 if (who != RUSAGE_CHILDREN) {
1402 struct mm_struct *mm = get_task_mm(p);
1403 if (mm) {
1404 setmax_mm_hiwater_rss(&maxrss, mm);
1405 mmput(mm);
1406 }
1407 }
1408 r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
1395} 1409}
1396 1410
1397int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1411int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1511 case PR_SET_TSC: 1525 case PR_SET_TSC:
1512 error = SET_TSC_CTL(arg2); 1526 error = SET_TSC_CTL(arg2);
1513 break; 1527 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE: 1528 case PR_TASK_PERF_EVENTS_DISABLE:
1515 error = perf_counter_task_disable(); 1529 error = perf_event_task_disable();
1516 break; 1530 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE: 1531 case PR_TASK_PERF_EVENTS_ENABLE:
1518 error = perf_counter_task_enable(); 1532 error = perf_event_task_enable();
1519 break; 1533 break;
1520 case PR_GET_TIMERSLACK: 1534 case PR_GET_TIMERSLACK:
1521 error = current->timer_slack_ns; 1535 error = current->timer_slack_ns;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..515bc230ac2a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -177,4 +177,4 @@ cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178 178
179/* performance counters: */ 179/* performance counters: */
180cond_syscall(sys_perf_counter_open); 180cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd0..0dfaa47d7cb6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,9 +49,8 @@
49#include <linux/acpi.h> 49#include <linux/acpi.h>
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
53#include <linux/slow-work.h> 52#include <linux/slow-work.h>
54#include <linux/perf_counter.h> 53#include <linux/perf_event.h>
55 54
56#include <asm/uaccess.h> 55#include <asm/uaccess.h>
57#include <asm/processor.h> 56#include <asm/processor.h>
@@ -92,6 +91,9 @@ extern int sysctl_nr_trim_pages;
92#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
93extern int rcutorture_runnable; 92extern int rcutorture_runnable;
94#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled;
96#endif
95 97
96/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
97#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -104,6 +106,9 @@ static int __maybe_unused one = 1;
104static int __maybe_unused two = 2; 106static int __maybe_unused two = 2;
105static unsigned long one_ul = 1; 107static unsigned long one_ul = 1;
106static int one_hundred = 100; 108static int one_hundred = 100;
109#ifdef CONFIG_PRINTK
110static int ten_thousand = 10000;
111#endif
107 112
108/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ 113/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
109static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; 114static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -246,6 +251,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
246#endif 251#endif
247 252
248static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
254 {
255 .ctl_name = CTL_UNNUMBERED,
256 .procname = "sched_child_runs_first",
257 .data = &sysctl_sched_child_runs_first,
258 .maxlen = sizeof(unsigned int),
259 .mode = 0644,
260 .proc_handler = &proc_dointvec,
261 },
249#ifdef CONFIG_SCHED_DEBUG 262#ifdef CONFIG_SCHED_DEBUG
250 { 263 {
251 .ctl_name = CTL_UNNUMBERED, 264 .ctl_name = CTL_UNNUMBERED,
@@ -300,14 +313,6 @@ static struct ctl_table kern_table[] = {
300 }, 313 },
301 { 314 {
302 .ctl_name = CTL_UNNUMBERED, 315 .ctl_name = CTL_UNNUMBERED,
303 .procname = "sched_child_runs_first",
304 .data = &sysctl_sched_child_runs_first,
305 .maxlen = sizeof(unsigned int),
306 .mode = 0644,
307 .proc_handler = &proc_dointvec,
308 },
309 {
310 .ctl_name = CTL_UNNUMBERED,
311 .procname = "sched_features", 316 .procname = "sched_features",
312 .data = &sysctl_sched_features, 317 .data = &sysctl_sched_features,
313 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
@@ -332,6 +337,14 @@ static struct ctl_table kern_table[] = {
332 }, 337 },
333 { 338 {
334 .ctl_name = CTL_UNNUMBERED, 339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_time_avg",
341 .data = &sysctl_sched_time_avg,
342 .maxlen = sizeof(unsigned int),
343 .mode = 0644,
344 .proc_handler = &proc_dointvec,
345 },
346 {
347 .ctl_name = CTL_UNNUMBERED,
335 .procname = "timer_migration", 348 .procname = "timer_migration",
336 .data = &sysctl_timer_migration, 349 .data = &sysctl_timer_migration,
337 .maxlen = sizeof(unsigned int), 350 .maxlen = sizeof(unsigned int),
@@ -712,6 +725,17 @@ static struct ctl_table kern_table[] = {
712 .mode = 0644, 725 .mode = 0644,
713 .proc_handler = &proc_dointvec, 726 .proc_handler = &proc_dointvec,
714 }, 727 },
728 {
729 .ctl_name = CTL_UNNUMBERED,
730 .procname = "printk_delay",
731 .data = &printk_delay_msec,
732 .maxlen = sizeof(int),
733 .mode = 0644,
734 .proc_handler = &proc_dointvec_minmax,
735 .strategy = &sysctl_intvec,
736 .extra1 = &zero,
737 .extra2 = &ten_thousand,
738 },
715#endif 739#endif
716 { 740 {
717 .ctl_name = KERN_NGROUPS_MAX, 741 .ctl_name = KERN_NGROUPS_MAX,
@@ -954,28 +978,28 @@ static struct ctl_table kern_table[] = {
954 .child = slow_work_sysctls, 978 .child = slow_work_sysctls,
955 }, 979 },
956#endif 980#endif
957#ifdef CONFIG_PERF_COUNTERS 981#ifdef CONFIG_PERF_EVENTS
958 { 982 {
959 .ctl_name = CTL_UNNUMBERED, 983 .ctl_name = CTL_UNNUMBERED,
960 .procname = "perf_counter_paranoid", 984 .procname = "perf_event_paranoid",
961 .data = &sysctl_perf_counter_paranoid, 985 .data = &sysctl_perf_event_paranoid,
962 .maxlen = sizeof(sysctl_perf_counter_paranoid), 986 .maxlen = sizeof(sysctl_perf_event_paranoid),
963 .mode = 0644, 987 .mode = 0644,
964 .proc_handler = &proc_dointvec, 988 .proc_handler = &proc_dointvec,
965 }, 989 },
966 { 990 {
967 .ctl_name = CTL_UNNUMBERED, 991 .ctl_name = CTL_UNNUMBERED,
968 .procname = "perf_counter_mlock_kb", 992 .procname = "perf_event_mlock_kb",
969 .data = &sysctl_perf_counter_mlock, 993 .data = &sysctl_perf_event_mlock,
970 .maxlen = sizeof(sysctl_perf_counter_mlock), 994 .maxlen = sizeof(sysctl_perf_event_mlock),
971 .mode = 0644, 995 .mode = 0644,
972 .proc_handler = &proc_dointvec, 996 .proc_handler = &proc_dointvec,
973 }, 997 },
974 { 998 {
975 .ctl_name = CTL_UNNUMBERED, 999 .ctl_name = CTL_UNNUMBERED,
976 .procname = "perf_counter_max_sample_rate", 1000 .procname = "perf_event_max_sample_rate",
977 .data = &sysctl_perf_counter_sample_rate, 1001 .data = &sysctl_perf_event_sample_rate,
978 .maxlen = sizeof(sysctl_perf_counter_sample_rate), 1002 .maxlen = sizeof(sysctl_perf_event_sample_rate),
979 .mode = 0644, 1003 .mode = 0644,
980 .proc_handler = &proc_dointvec, 1004 .proc_handler = &proc_dointvec,
981 }, 1005 },
@@ -990,7 +1014,16 @@ static struct ctl_table kern_table[] = {
990 .proc_handler = &proc_dointvec, 1014 .proc_handler = &proc_dointvec,
991 }, 1015 },
992#endif 1016#endif
993 1017#ifdef CONFIG_BLOCK
1018 {
1019 .ctl_name = CTL_UNNUMBERED,
1020 .procname = "blk_iopoll",
1021 .data = &blk_iopoll_enabled,
1022 .maxlen = sizeof(int),
1023 .mode = 0644,
1024 .proc_handler = &proc_dointvec,
1025 },
1026#endif
994/* 1027/*
995 * NOTE: do not add new entries to this table unless you have read 1028 * NOTE: do not add new entries to this table unless you have read
996 * Documentation/sysctl/ctl_unnumbered.txt 1029 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..09113347d328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
165 154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 mutex_lock(&clocksource_mutex);
300 398
301 list_for_each_entry(cs, &clocksource_list, list) { 399 list_for_each_entry(cs, &clocksource_list, list)
302 if (cs->resume) 400 if (cs->resume)
303 cs->resume(); 401 cs->resume();
304 }
305 402
306 clocksource_resume_watchdog(); 403 clocksource_resume_watchdog();
307 404
308 spin_unlock_irqrestore(&clocksource_lock, flags); 405 mutex_unlock(&clocksource_mutex);
309} 406}
310 407
311/** 408/**
@@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 417 clocksource_resume_watchdog();
321} 418}
322 419
420#ifdef CONFIG_GENERIC_TIME
421
323/** 422/**
324 * clocksource_get_next - Returns the selected clocksource 423 * clocksource_select - Select the best clocksource available
424 *
425 * Private function. Must hold clocksource_mutex when called.
325 * 426 *
427 * Select the clocksource with the best rating, or the clocksource,
428 * which is selected by userspace override.
326 */ 429 */
327struct clocksource *clocksource_get_next(void) 430static void clocksource_select(void)
328{ 431{
329 unsigned long flags; 432 struct clocksource *best, *cs;
330 433
331 spin_lock_irqsave(&clocksource_lock, flags); 434 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 435 return;
333 curr_clocksource = next_clocksource; 436 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 437 best = list_first_entry(&clocksource_list, struct clocksource, list);
438 /* Check for the override clocksource. */
439 list_for_each_entry(cs, &clocksource_list, list) {
440 if (strcmp(cs->name, override_name) != 0)
441 continue;
442 /*
443 * Check to make sure we don't switch to a non-highres
444 * capable clocksource if the tick code is in oneshot
445 * mode (highres or nohz)
446 */
447 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
448 tick_oneshot_mode_active()) {
449 /* Override clocksource cannot be used. */
450 printk(KERN_WARNING "Override clocksource %s is not "
451 "HRT compatible. Cannot switch while in "
452 "HRT/NOHZ mode\n", cs->name);
453 override_name[0] = 0;
454 } else
455 /* Override clocksource can be used. */
456 best = cs;
457 break;
458 }
459 if (curr_clocksource != best) {
460 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
461 curr_clocksource = best;
462 timekeeping_notify(curr_clocksource);
335 } 463 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 464}
340 465
341/** 466#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 467
343 * 468static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 469
470#endif
471
472/*
473 * clocksource_done_booting - Called near the end of core bootup
345 * 474 *
346 * Select the clocksource with the best rating, or the clocksource, 475 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 476 * We use fs_initcall because we want this to start before
477 * device_initcall but after subsys_initcall.
348 */ 478 */
349static struct clocksource *select_clocksource(void) 479static int __init clocksource_done_booting(void)
350{ 480{
351 struct clocksource *next; 481 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 482
362 if (next == curr_clocksource) 483 /*
363 return NULL; 484 * Run the watchdog first to eliminate unstable clock sources
485 */
486 clocksource_watchdog_kthread(NULL);
364 487
365 return next; 488 mutex_lock(&clocksource_mutex);
489 clocksource_select();
490 mutex_unlock(&clocksource_mutex);
491 return 0;
366} 492}
493fs_initcall(clocksource_done_booting);
367 494
368/* 495/*
369 * Enqueue the clocksource sorted by rating 496 * Enqueue the clocksource sorted by rating
370 */ 497 */
371static int clocksource_enqueue(struct clocksource *c) 498static void clocksource_enqueue(struct clocksource *cs)
372{ 499{
373 struct list_head *tmp, *entry = &clocksource_list; 500 struct list_head *entry = &clocksource_list;
501 struct clocksource *tmp;
374 502
375 list_for_each(tmp, &clocksource_list) { 503 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place, where to insert */ 504 /* Keep track of the place, where to insert */
382 if (cs->rating >= c->rating) 505 if (tmp->rating >= cs->rating)
383 entry = tmp; 506 entry = &tmp->list;
384 } 507 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 508}
393 509
394/** 510/**
@@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 513 *
398 * Returns -EBUSY if registration fails, zero otherwise. 514 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 515 */
400int clocksource_register(struct clocksource *c) 516int clocksource_register(struct clocksource *cs)
401{ 517{
402 unsigned long flags; 518 mutex_lock(&clocksource_mutex);
403 int ret; 519 clocksource_enqueue(cs);
404 520 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 521 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 522 mutex_unlock(&clocksource_mutex);
407 if (!ret) 523 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 524}
414EXPORT_SYMBOL(clocksource_register); 525EXPORT_SYMBOL(clocksource_register);
415 526
527static void __clocksource_change_rating(struct clocksource *cs, int rating)
528{
529 list_del(&cs->list);
530 cs->rating = rating;
531 clocksource_enqueue(cs);
532 clocksource_select();
533}
534
416/** 535/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 536 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 537 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 538void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 539{
422 unsigned long flags; 540 mutex_lock(&clocksource_mutex);
423 541 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 542 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 543}
544EXPORT_SYMBOL(clocksource_change_rating);
431 545
432/** 546/**
433 * clocksource_unregister - remove a registered clocksource 547 * clocksource_unregister - remove a registered clocksource
434 */ 548 */
435void clocksource_unregister(struct clocksource *cs) 549void clocksource_unregister(struct clocksource *cs)
436{ 550{
437 unsigned long flags; 551 mutex_lock(&clocksource_mutex);
438 552 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 553 list_del(&cs->list);
441 if (clocksource_override == cs) 554 clocksource_select();
442 clocksource_override = NULL; 555 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 556}
557EXPORT_SYMBOL(clocksource_unregister);
446 558
447#ifdef CONFIG_SYSFS 559#ifdef CONFIG_SYSFS
448/** 560/**
@@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 570{
459 ssize_t count = 0; 571 ssize_t count = 0;
460 572
461 spin_lock_irq(&clocksource_lock); 573 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 574 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 575 mutex_unlock(&clocksource_mutex);
464 576
465 return count; 577 return count;
466} 578}
@@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 590 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 591 const char *buf, size_t count)
480{ 592{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 593 size_t ret = count;
483 int len;
484 594
485 /* strings from sysfs write are not 0 terminated! */ 595 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 596 if (count >= sizeof(override_name))
@@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 600 if (buf[count-1] == '\n')
491 count--; 601 count--;
492 602
493 spin_lock_irq(&clocksource_lock); 603 mutex_lock(&clocksource_mutex);
494 604
495 if (count > 0) 605 if (count > 0)
496 memcpy(override_name, buf, count); 606 memcpy(override_name, buf, count);
497 override_name[count] = 0; 607 override_name[count] = 0;
608 clocksource_select();
498 609
499 len = strlen(override_name); 610 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 611
532 return ret; 612 return ret;
533} 613}
@@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 627 struct clocksource *src;
548 ssize_t count = 0; 628 ssize_t count = 0;
549 629
550 spin_lock_irq(&clocksource_lock); 630 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 631 list_for_each_entry(src, &clocksource_list, list) {
552 /* 632 /*
553 * Don't show non-HRES clocksource if the tick code is 633 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 639 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 640 "%s ", src->name);
561 } 641 }
562 spin_unlock_irq(&clocksource_lock); 642 mutex_unlock(&clocksource_mutex);
563 643
564 count += snprintf(buf + count, 644 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 645 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 694 */
615static int __init boot_override_clocksource(char* str) 695static int __init boot_override_clocksource(char* str)
616{ 696{
617 unsigned long flags; 697 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 698 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 699 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 700 mutex_unlock(&clocksource_mutex);
622 return 1; 701 return 1;
623} 702}
624 703
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/tick.h> 20#include <linux/tick.h>
21#include <linux/stop_machine.h>
22
23/* Structure holding internal timekeeping values. */
24struct timekeeper {
25 /* Current clocksource used for timekeeping. */
26 struct clocksource *clock;
27 /* The shift value of the current clocksource. */
28 int shift;
29
30 /* Number of clock cycles in one NTP interval. */
31 cycle_t cycle_interval;
32 /* Number of clock shifted nano seconds in one NTP interval. */
33 u64 xtime_interval;
34 /* Raw nano seconds accumulated per NTP interval. */
35 u32 raw_interval;
36
37 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
38 u64 xtime_nsec;
39 /* Difference between accumulated time and NTP time in ntp
40 * shifted nano seconds. */
41 s64 ntp_error;
42 /* Shift conversion between clock shifted nano seconds and
43 * ntp shifted nano seconds. */
44 int ntp_error_shift;
45 /* NTP adjusted clock multiplier */
46 u32 mult;
47};
48
49struct timekeeper timekeeper;
50
51/**
52 * timekeeper_setup_internals - Set up internals to use clocksource clock.
53 *
54 * @clock: Pointer to clocksource.
55 *
56 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
57 * pair and interval request.
58 *
59 * Unless you're the timekeeping code, you should not be using this!
60 */
61static void timekeeper_setup_internals(struct clocksource *clock)
62{
63 cycle_t interval;
64 u64 tmp;
65
66 timekeeper.clock = clock;
67 clock->cycle_last = clock->read(clock);
21 68
69 /* Do the ns -> cycle conversion first, using original mult */
70 tmp = NTP_INTERVAL_LENGTH;
71 tmp <<= clock->shift;
72 tmp += clock->mult/2;
73 do_div(tmp, clock->mult);
74 if (tmp == 0)
75 tmp = 1;
76
77 interval = (cycle_t) tmp;
78 timekeeper.cycle_interval = interval;
79
80 /* Go back from cycles -> shifted ns */
81 timekeeper.xtime_interval = (u64) interval * clock->mult;
82 timekeeper.raw_interval =
83 ((u64) interval * clock->mult) >> clock->shift;
84
85 timekeeper.xtime_nsec = 0;
86 timekeeper.shift = clock->shift;
87
88 timekeeper.ntp_error = 0;
89 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
90
91 /*
92 * The timekeeper keeps its own mult values for the currently
93 * active clocksource. These value will be adjusted via NTP
94 * to counteract clock drifting.
95 */
96 timekeeper.mult = clock->mult;
97}
98
99/* Timekeeper helper functions. */
100static inline s64 timekeeping_get_ns(void)
101{
102 cycle_t cycle_now, cycle_delta;
103 struct clocksource *clock;
104
105 /* read clocksource: */
106 clock = timekeeper.clock;
107 cycle_now = clock->read(clock);
108
109 /* calculate the delta since the last update_wall_time: */
110 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
111
112 /* return delta convert to nanoseconds using ntp adjusted mult. */
113 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
114 timekeeper.shift);
115}
116
117static inline s64 timekeeping_get_ns_raw(void)
118{
119 cycle_t cycle_now, cycle_delta;
120 struct clocksource *clock;
121
122 /* read clocksource: */
123 clock = timekeeper.clock;
124 cycle_now = clock->read(clock);
125
126 /* calculate the delta since the last update_wall_time: */
127 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
128
129 /* return delta convert to nanoseconds using ntp adjusted mult. */
130 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
131}
22 132
23/* 133/*
24 * This read-write spinlock protects us from races in SMP while 134 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 154 */
45struct timespec xtime __attribute__ ((aligned (16))); 155struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 156struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 157static struct timespec total_sleep_time;
158
159/*
160 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
161 */
162struct timespec raw_time;
48 163
49/* flag for if timekeeping is suspended */ 164/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 165int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 171 timespec_add_ns(&xtime_cache, nsec);
57} 172}
58 173
59struct clocksource *clock; 174/* must hold xtime_lock */
60 175void timekeeping_leap_insert(int leapsecond)
176{
177 xtime.tv_sec += leapsecond;
178 wall_to_monotonic.tv_sec -= leapsecond;
179 update_vsyscall(&xtime, timekeeper.clock);
180}
61 181
62#ifdef CONFIG_GENERIC_TIME 182#ifdef CONFIG_GENERIC_TIME
183
63/** 184/**
64 * clocksource_forward_now - update clock to the current time 185 * timekeeping_forward_now - update clock to the current time
65 * 186 *
66 * Forward the current clock to update its state since the last call to 187 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 188 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 189 * as it avoids having to deal with this time offset explicitly.
69 */ 190 */
70static void clocksource_forward_now(void) 191static void timekeeping_forward_now(void)
71{ 192{
72 cycle_t cycle_now, cycle_delta; 193 cycle_t cycle_now, cycle_delta;
194 struct clocksource *clock;
73 s64 nsec; 195 s64 nsec;
74 196
75 cycle_now = clocksource_read(clock); 197 clock = timekeeper.clock;
198 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 200 clock->cycle_last = cycle_now;
78 201
79 nsec = cyc2ns(clock, cycle_delta); 202 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
203 timekeeper.shift);
80 204
81 /* If arch requires, add in gettimeoffset() */ 205 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 206 nsec += arch_gettimeoffset();
83 207
84 timespec_add_ns(&xtime, nsec); 208 timespec_add_ns(&xtime, nsec);
85 209
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 210 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 211 timespec_add_ns(&raw_time, nsec);
88} 212}
89 213
90/** 214/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
95 */ 219 */
96void getnstimeofday(struct timespec *ts) 220void getnstimeofday(struct timespec *ts)
97{ 221{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 222 unsigned long seq;
100 s64 nsecs; 223 s64 nsecs;
101 224
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 228 seq = read_seqbegin(&xtime_lock);
106 229
107 *ts = xtime; 230 *ts = xtime;
108 231 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 232
118 /* If arch requires, add in gettimeoffset() */ 233 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 234 nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
125 240
126EXPORT_SYMBOL(getnstimeofday); 241EXPORT_SYMBOL(getnstimeofday);
127 242
243ktime_t ktime_get(void)
244{
245 unsigned int seq;
246 s64 secs, nsecs;
247
248 WARN_ON(timekeeping_suspended);
249
250 do {
251 seq = read_seqbegin(&xtime_lock);
252 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
253 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
254 nsecs += timekeeping_get_ns();
255
256 } while (read_seqretry(&xtime_lock, seq));
257 /*
258 * Use ktime_set/ktime_add_ns to create a proper ktime on
259 * 32-bit architectures without CONFIG_KTIME_SCALAR.
260 */
261 return ktime_add_ns(ktime_set(secs, 0), nsecs);
262}
263EXPORT_SYMBOL_GPL(ktime_get);
264
265/**
266 * ktime_get_ts - get the monotonic clock in timespec format
267 * @ts: pointer to timespec variable
268 *
269 * The function calculates the monotonic clock from the realtime
270 * clock and the wall_to_monotonic offset and stores the result
271 * in normalized timespec format in the variable pointed to by @ts.
272 */
273void ktime_get_ts(struct timespec *ts)
274{
275 struct timespec tomono;
276 unsigned int seq;
277 s64 nsecs;
278
279 WARN_ON(timekeeping_suspended);
280
281 do {
282 seq = read_seqbegin(&xtime_lock);
283 *ts = xtime;
284 tomono = wall_to_monotonic;
285 nsecs = timekeeping_get_ns();
286
287 } while (read_seqretry(&xtime_lock, seq));
288
289 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
290 ts->tv_nsec + tomono.tv_nsec + nsecs);
291}
292EXPORT_SYMBOL_GPL(ktime_get_ts);
293
128/** 294/**
129 * do_gettimeofday - Returns the time of day in a timeval 295 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 296 * @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
157 323
158 write_seqlock_irqsave(&xtime_lock, flags); 324 write_seqlock_irqsave(&xtime_lock, flags);
159 325
160 clocksource_forward_now(); 326 timekeeping_forward_now();
161 327
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 328 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 329 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
167 333
168 update_xtime_cache(0); 334 update_xtime_cache(0);
169 335
170 clock->error = 0; 336 timekeeper.ntp_error = 0;
171 ntp_clear(); 337 ntp_clear();
172 338
173 update_vsyscall(&xtime, clock); 339 update_vsyscall(&xtime, timekeeper.clock);
174 340
175 write_sequnlock_irqrestore(&xtime_lock, flags); 341 write_sequnlock_irqrestore(&xtime_lock, flags);
176 342
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 353 *
188 * Accumulates current time interval and initializes new clocksource 354 * Accumulates current time interval and initializes new clocksource
189 */ 355 */
190static void change_clocksource(void) 356static int change_clocksource(void *data)
191{ 357{
192 struct clocksource *new, *old; 358 struct clocksource *new, *old;
193 359
194 new = clocksource_get_next(); 360 new = (struct clocksource *) data;
361
362 timekeeping_forward_now();
363 if (!new->enable || new->enable(new) == 0) {
364 old = timekeeper.clock;
365 timekeeper_setup_internals(new);
366 if (old->disable)
367 old->disable(old);
368 }
369 return 0;
370}
195 371
196 if (clock == new) 372/**
373 * timekeeping_notify - Install a new clock source
374 * @clock: pointer to the clock source
375 *
376 * This function is called from clocksource.c after a new, better clock
377 * source has been registered. The caller holds the clocksource_mutex.
378 */
379void timekeeping_notify(struct clocksource *clock)
380{
381 if (timekeeper.clock == clock)
197 return; 382 return;
383 stop_machine(change_clocksource, clock, NULL);
384 tick_clock_notify();
385}
198 386
199 clocksource_forward_now(); 387#else /* GENERIC_TIME */
200 388
201 if (clocksource_enable(new)) 389static inline void timekeeping_forward_now(void) { }
202 return;
203 390
204 new->raw_time = clock->raw_time; 391/**
205 old = clock; 392 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 393 *
207 clocksource_disable(old); 394 * returns the time in ktime_t format
395 */
396ktime_t ktime_get(void)
397{
398 struct timespec now;
208 399
209 clock->cycle_last = 0; 400 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 401
215 tick_clock_notify(); 402 return timespec_to_ktime(now);
403}
404EXPORT_SYMBOL_GPL(ktime_get);
216 405
217 /* 406/**
218 * We're holding xtime lock and waking up klogd would deadlock 407 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 408 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 409 *
221 clock->name); 410 * The function calculates the monotonic clock from the realtime
222 */ 411 * clock and the wall_to_monotonic offset and stores the result
412 * in normalized timespec format in the variable pointed to by @ts.
413 */
414void ktime_get_ts(struct timespec *ts)
415{
416 struct timespec tomono;
417 unsigned long seq;
418
419 do {
420 seq = read_seqbegin(&xtime_lock);
421 getnstimeofday(ts);
422 tomono = wall_to_monotonic;
423
424 } while (read_seqretry(&xtime_lock, seq));
425
426 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
427 ts->tv_nsec + tomono.tv_nsec);
223} 428}
224#else 429EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 430
226static inline void change_clocksource(void) { } 431#endif /* !GENERIC_TIME */
227#endif 432
433/**
434 * ktime_get_real - get the real (wall-) time in ktime_t format
435 *
436 * returns the time in ktime_t format
437 */
438ktime_t ktime_get_real(void)
439{
440 struct timespec now;
441
442 getnstimeofday(&now);
443
444 return timespec_to_ktime(now);
445}
446EXPORT_SYMBOL_GPL(ktime_get_real);
228 447
229/** 448/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 449 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 455{
237 unsigned long seq; 456 unsigned long seq;
238 s64 nsecs; 457 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 458
241 do { 459 do {
242 seq = read_seqbegin(&xtime_lock); 460 seq = read_seqbegin(&xtime_lock);
243 461 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 462 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 463
255 } while (read_seqretry(&xtime_lock, seq)); 464 } while (read_seqretry(&xtime_lock, seq));
256 465
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
270 do { 479 do {
271 seq = read_seqbegin(&xtime_lock); 480 seq = read_seqbegin(&xtime_lock);
272 481
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 482 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 483
275 } while (read_seqretry(&xtime_lock, seq)); 484 } while (read_seqretry(&xtime_lock, seq));
276 485
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
278} 487}
279 488
280/** 489/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 490 * read_persistent_clock - Return time from the persistent clock.
282 * 491 *
283 * Weak dummy function for arches that do not yet support it. 492 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 493 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 494 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 495 *
287 * XXX - Do be sure to remove it once all arches implement it. 496 * XXX - Do be sure to remove it once all arches implement it.
288 */ 497 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 498void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 499{
291 return 0; 500 ts->tv_sec = 0;
501 ts->tv_nsec = 0;
502}
503
504/**
505 * read_boot_clock - Return time of the system start.
506 *
507 * Weak dummy function for arches that do not yet support it.
508 * Function to read the exact time the system has been started.
509 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
510 *
511 * XXX - Do be sure to remove it once all arches implement it.
512 */
513void __attribute__((weak)) read_boot_clock(struct timespec *ts)
514{
515 ts->tv_sec = 0;
516 ts->tv_nsec = 0;
292} 517}
293 518
294/* 519/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 521 */
297void __init timekeeping_init(void) 522void __init timekeeping_init(void)
298{ 523{
524 struct clocksource *clock;
299 unsigned long flags; 525 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 526 struct timespec now, boot;
527
528 read_persistent_clock(&now);
529 read_boot_clock(&boot);
301 530
302 write_seqlock_irqsave(&xtime_lock, flags); 531 write_seqlock_irqsave(&xtime_lock, flags);
303 532
304 ntp_init(); 533 ntp_init();
305 534
306 clock = clocksource_get_next(); 535 clock = clocksource_default_clock();
307 clocksource_enable(clock); 536 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 537 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 538 timekeeper_setup_internals(clock);
310 539
311 xtime.tv_sec = sec; 540 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 541 xtime.tv_nsec = now.tv_nsec;
542 raw_time.tv_sec = 0;
543 raw_time.tv_nsec = 0;
544 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
545 boot.tv_sec = xtime.tv_sec;
546 boot.tv_nsec = xtime.tv_nsec;
547 }
313 set_normalized_timespec(&wall_to_monotonic, 548 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 549 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 550 update_xtime_cache(0);
316 total_sleep_time = 0; 551 total_sleep_time.tv_sec = 0;
552 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 553 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 554}
319 555
320/* time in seconds when suspend began */ 556/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 557static struct timespec timekeeping_suspend_time;
322 558
323/** 559/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 560 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 567static int timekeeping_resume(struct sys_device *dev)
332{ 568{
333 unsigned long flags; 569 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 570 struct timespec ts;
571
572 read_persistent_clock(&ts);
335 573
336 clocksource_resume(); 574 clocksource_resume();
337 575
338 write_seqlock_irqsave(&xtime_lock, flags); 576 write_seqlock_irqsave(&xtime_lock, flags);
339 577
340 if (now && (now > timekeeping_suspend_time)) { 578 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 579 ts = timespec_sub(ts, timekeeping_suspend_time);
342 580 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 581 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 582 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 583 }
347 update_xtime_cache(0); 584 update_xtime_cache(0);
348 /* re-base the last cycle value */ 585 /* re-base the last cycle value */
349 clock->cycle_last = 0; 586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 587 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 588 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 589 write_sequnlock_irqrestore(&xtime_lock, flags);
354 590
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 602{
367 unsigned long flags; 603 unsigned long flags;
368 604
369 timekeeping_suspend_time = read_persistent_clock(); 605 read_persistent_clock(&timekeeping_suspend_time);
370 606
371 write_seqlock_irqsave(&xtime_lock, flags); 607 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 608 timekeeping_forward_now();
373 timekeeping_suspended = 1; 609 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 610 write_sequnlock_irqrestore(&xtime_lock, flags);
375 611
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 640 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 641 * to compensate for late or lost adjustments.
406 */ 642 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 643static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 644 s64 *offset)
409{ 645{
410 s64 tick_error, i; 646 s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 656 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 657 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 658 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 659 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 660 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 661 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 662 error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 665 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 666 * remove the single look ahead already included in the error.
431 */ 667 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 668 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 669 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 670 error = ((error - tick_error) >> look_ahead) + tick_error;
435 671
436 /* Finally calculate the adjustment shift value. */ 672 /* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 691 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 692 * for other values we can do a bit more work.
457 */ 693 */
458static void clocksource_adjust(s64 offset) 694static void timekeeping_adjust(s64 offset)
459{ 695{
460 s64 error, interval = clock->cycle_interval; 696 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 697 int adj;
462 698
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 699 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 700 if (error > interval) {
465 error >>= 2; 701 error >>= 2;
466 if (likely(error <= interval)) 702 if (likely(error <= interval))
467 adj = 1; 703 adj = 1;
468 else 704 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 705 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 706 } else if (error < -interval) {
471 error >>= 2; 707 error >>= 2;
472 if (likely(error >= -interval)) { 708 if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 710 interval = -interval;
475 offset = -offset; 711 offset = -offset;
476 } else 712 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 713 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 714 } else
479 return; 715 return;
480 716
481 clock->mult += adj; 717 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 718 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 719 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 720 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 721 timekeeper.ntp_error_shift;
486} 722}
487 723
488/** 724/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
492 */ 728 */
493void update_wall_time(void) 729void update_wall_time(void)
494{ 730{
731 struct clocksource *clock;
495 cycle_t offset; 732 cycle_t offset;
733 u64 nsecs;
496 734
497 /* Make sure we're fully resumed: */ 735 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 736 if (unlikely(timekeeping_suspended))
499 return; 737 return;
500 738
739 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 740#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 741 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 742#else
504 offset = clock->cycle_interval; 743 offset = timekeeper.cycle_interval;
505#endif 744#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 745 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 746
508 /* normally this loop will run just once, however in the 747 /* normally this loop will run just once, however in the
509 * case of lost or late ticks, it will accumulate correctly. 748 * case of lost or late ticks, it will accumulate correctly.
510 */ 749 */
511 while (offset >= clock->cycle_interval) { 750 while (offset >= timekeeper.cycle_interval) {
751 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
752
512 /* accumulate one interval */ 753 /* accumulate one interval */
513 offset -= clock->cycle_interval; 754 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 755 clock->cycle_last += timekeeper.cycle_interval;
515 756
516 clock->xtime_nsec += clock->xtime_interval; 757 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 758 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 759 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 760 xtime.tv_sec++;
520 second_overflow(); 761 second_overflow();
521 } 762 }
522 763
523 clock->raw_time.tv_nsec += clock->raw_interval; 764 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 765 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 766 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 767 raw_time.tv_sec++;
527 } 768 }
528 769
529 /* accumulate error between NTP and clock interval */ 770 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 771 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 772 timekeeper.ntp_error -= timekeeper.xtime_interval <<
773 timekeeper.ntp_error_shift;
532 } 774 }
533 775
534 /* correct the clock when NTP error is too big */ 776 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 777 timekeeping_adjust(offset);
536 778
537 /* 779 /*
538 * Since in the loop above, we accumulate any amount of time 780 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, its possible for 781 * in xtime_nsec over a second into xtime.tv_sec, its possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 782 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 783 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * its possible the required corrective factor to xtime_nsec could 784 * its possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 785 * cause it to underflow.
544 * 786 *
@@ -550,24 +792,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 792 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 793 * xtime_nsec is not as small.
552 */ 794 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 795 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 796 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 797 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 798 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 799 }
558 800
559 /* store full nanoseconds into xtime after rounding it up and 801 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 802 * add the remainder to the error difference.
561 */ 803 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 804 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 805 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 806 timekeeper.ntp_error += timekeeper.xtime_nsec <<
807 timekeeper.ntp_error_shift;
565 808
566 update_xtime_cache(cyc2ns(clock, offset)); 809 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
810 update_xtime_cache(nsecs);
567 811
568 /* check to see if there is a new clocksource to use */ 812 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 813 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 814}
572 815
573/** 816/**
@@ -583,9 +826,12 @@ void update_wall_time(void)
583 */ 826 */
584void getboottime(struct timespec *ts) 827void getboottime(struct timespec *ts)
585{ 828{
586 set_normalized_timespec(ts, 829 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 830 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 831 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
832 };
833
834 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 835}
590 836
591/** 837/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
594 */ 840 */
595void monotonic_to_bootbased(struct timespec *ts) 841void monotonic_to_bootbased(struct timespec *ts)
596{ 842{
597 ts->tv_sec += total_sleep_time; 843 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 844}
599 845
600unsigned long get_seconds(void) 846unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
603} 849}
604EXPORT_SYMBOL(get_seconds); 850EXPORT_SYMBOL(get_seconds);
605 851
852struct timespec __current_kernel_time(void)
853{
854 return xtime_cache;
855}
606 856
607struct timespec current_kernel_time(void) 857struct timespec current_kernel_time(void)
608{ 858{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
618 return now; 868 return now;
619} 869}
620EXPORT_SYMBOL(current_kernel_time); 870EXPORT_SYMBOL(current_kernel_time);
871
872struct timespec get_monotonic_coarse(void)
873{
874 struct timespec now, mono;
875 unsigned long seq;
876
877 do {
878 seq = read_seqbegin(&xtime_lock);
879
880 now = xtime_cache;
881 mono = wall_to_monotonic;
882 } while (read_seqretry(&xtime_lock, seq));
883
884 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
885 now.tv_nsec + mono.tv_nsec);
886 return now;
887}
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..5db5a8d26811 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42 42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -46,6 +46,9 @@
46#include <asm/timex.h> 46#include <asm/timex.h>
47#include <asm/io.h> 47#include <asm/io.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/timer.h>
51
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; 52u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50 53
51EXPORT_SYMBOL(jiffies_64); 54EXPORT_SYMBOL(jiffies_64);
@@ -72,6 +75,7 @@ struct tvec_base {
72 spinlock_t lock; 75 spinlock_t lock;
73 struct timer_list *running_timer; 76 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 77 unsigned long timer_jiffies;
78 unsigned long next_timer;
75 struct tvec_root tv1; 79 struct tvec_root tv1;
76 struct tvec tv2; 80 struct tvec tv2;
77 struct tvec tv3; 81 struct tvec tv3;
@@ -520,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
520static inline void debug_timer_deactivate(struct timer_list *timer) { } 524static inline void debug_timer_deactivate(struct timer_list *timer) { }
521#endif 525#endif
522 526
527static inline void debug_init(struct timer_list *timer)
528{
529 debug_timer_init(timer);
530 trace_timer_init(timer);
531}
532
533static inline void
534debug_activate(struct timer_list *timer, unsigned long expires)
535{
536 debug_timer_activate(timer);
537 trace_timer_start(timer, expires);
538}
539
540static inline void debug_deactivate(struct timer_list *timer)
541{
542 debug_timer_deactivate(timer);
543 trace_timer_cancel(timer);
544}
545
523static void __init_timer(struct timer_list *timer, 546static void __init_timer(struct timer_list *timer,
524 const char *name, 547 const char *name,
525 struct lock_class_key *key) 548 struct lock_class_key *key)
@@ -548,7 +571,7 @@ void init_timer_key(struct timer_list *timer,
548 const char *name, 571 const char *name,
549 struct lock_class_key *key) 572 struct lock_class_key *key)
550{ 573{
551 debug_timer_init(timer); 574 debug_init(timer);
552 __init_timer(timer, name, key); 575 __init_timer(timer, name, key);
553} 576}
554EXPORT_SYMBOL(init_timer_key); 577EXPORT_SYMBOL(init_timer_key);
@@ -567,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer,
567{ 590{
568 struct list_head *entry = &timer->entry; 591 struct list_head *entry = &timer->entry;
569 592
570 debug_timer_deactivate(timer); 593 debug_deactivate(timer);
571 594
572 __list_del(entry->prev, entry->next); 595 __list_del(entry->prev, entry->next);
573 if (clear_pending) 596 if (clear_pending)
@@ -622,13 +645,16 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 645
623 if (timer_pending(timer)) { 646 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 647 detach_timer(timer, 0);
648 if (timer->expires == base->next_timer &&
649 !tbase_get_deferrable(timer->base))
650 base->next_timer = base->timer_jiffies;
625 ret = 1; 651 ret = 1;
626 } else { 652 } else {
627 if (pending_only) 653 if (pending_only)
628 goto out_unlock; 654 goto out_unlock;
629 } 655 }
630 656
631 debug_timer_activate(timer); 657 debug_activate(timer, expires);
632 658
633 new_base = __get_cpu_var(tvec_bases); 659 new_base = __get_cpu_var(tvec_bases);
634 660
@@ -663,6 +689,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 689 }
664 690
665 timer->expires = expires; 691 timer->expires = expires;
692 if (time_before(timer->expires, base->next_timer) &&
693 !tbase_get_deferrable(timer->base))
694 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 695 internal_add_timer(base, timer);
667 696
668out_unlock: 697out_unlock:
@@ -780,7 +809,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
780 BUG_ON(timer_pending(timer) || !timer->function); 809 BUG_ON(timer_pending(timer) || !timer->function);
781 spin_lock_irqsave(&base->lock, flags); 810 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 811 timer_set_base(timer, base);
783 debug_timer_activate(timer); 812 debug_activate(timer, timer->expires);
813 if (time_before(timer->expires, base->next_timer) &&
814 !tbase_get_deferrable(timer->base))
815 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 816 internal_add_timer(base, timer);
785 /* 817 /*
786 * Check whether the other CPU is idle and needs to be 818 * Check whether the other CPU is idle and needs to be
@@ -817,6 +849,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 849 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 850 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 851 detach_timer(timer, 1);
852 if (timer->expires == base->next_timer &&
853 !tbase_get_deferrable(timer->base))
854 base->next_timer = base->timer_jiffies;
820 ret = 1; 855 ret = 1;
821 } 856 }
822 spin_unlock_irqrestore(&base->lock, flags); 857 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +885,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 885 ret = 0;
851 if (timer_pending(timer)) { 886 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 887 detach_timer(timer, 1);
888 if (timer->expires == base->next_timer &&
889 !tbase_get_deferrable(timer->base))
890 base->next_timer = base->timer_jiffies;
853 ret = 1; 891 ret = 1;
854 } 892 }
855out: 893out:
@@ -984,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base)
984 */ 1022 */
985 lock_map_acquire(&lockdep_map); 1023 lock_map_acquire(&lockdep_map);
986 1024
1025 trace_timer_expire_entry(timer);
987 fn(data); 1026 fn(data);
1027 trace_timer_expire_exit(timer);
988 1028
989 lock_map_release(&lockdep_map); 1029 lock_map_release(&lockdep_map);
990 1030
@@ -1007,8 +1047,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1047#ifdef CONFIG_NO_HZ
1008/* 1048/*
1009 * Find out when the next timer event is due to happen. This 1049 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1050 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1051 * This function needs to be called with interrupts disabled.
1012 */ 1052 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1053static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1054{
@@ -1134,7 +1174,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1174 unsigned long expires;
1135 1175
1136 spin_lock(&base->lock); 1176 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1177 if (time_before_eq(base->next_timer, base->timer_jiffies))
1178 base->next_timer = __next_timer_interrupt(base);
1179 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1180 spin_unlock(&base->lock);
1139 1181
1140 if (time_before_eq(expires, now)) 1182 if (time_before_eq(expires, now))
@@ -1156,8 +1198,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1198 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1199 account_process_tick(p, user_tick);
1158 run_local_timers(); 1200 run_local_timers();
1159 if (rcu_pending(cpu)) 1201 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1202 printk_tick();
1162 scheduler_tick(); 1203 scheduler_tick();
1163 run_posix_cpu_timers(p); 1204 run_posix_cpu_timers(p);
@@ -1170,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h)
1170{ 1211{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1212 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1213
1173 perf_counter_do_pending(); 1214 perf_event_do_pending();
1174 1215
1175 hrtimer_run_pending(); 1216 hrtimer_run_pending();
1176 1217
@@ -1523,6 +1564,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1523 INIT_LIST_HEAD(base->tv1.vec + j); 1564 INIT_LIST_HEAD(base->tv1.vec + j);
1524 1565
1525 base->timer_jiffies = jiffies; 1566 base->timer_jiffies = jiffies;
1567 base->next_timer = base->timer_jiffies;
1526 return 0; 1568 return 0;
1527} 1569}
1528 1570
@@ -1535,6 +1577,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1535 timer = list_first_entry(head, struct timer_list, entry); 1577 timer = list_first_entry(head, struct timer_list, entry);
1536 detach_timer(timer, 0); 1578 detach_timer(timer, 0);
1537 timer_set_base(timer, new_base); 1579 timer_set_base(timer, new_base);
1580 if (time_before(timer->expires, new_base->next_timer) &&
1581 !tbase_get_deferrable(timer->base))
1582 new_base->next_timer = timer->expires;
1538 internal_add_timer(new_base, timer); 1583 internal_add_timer(new_base, timer);
1539 } 1584 }
1540} 1585}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e78dcbde1a81..15372a9f2399 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_SYSCALL_TRACEPOINTS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -73,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP
73# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
74# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
75# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
76# hidding of the automatic options options. 86# hidding of the automatic options.
77 87
78config TRACING 88config TRACING
79 bool 89 bool
@@ -481,6 +491,18 @@ config FTRACE_STARTUP_TEST
481 functioning properly. It will do tests on all the configured 491 functioning properly. It will do tests on all the configured
482 tracers of ftrace. 492 tracers of ftrace.
483 493
494config EVENT_TRACE_TEST_SYSCALLS
495 bool "Run selftest on syscall events"
496 depends on FTRACE_STARTUP_TEST
497 help
498 This option will also enable testing every syscall event.
499 It only enables the event and disables it and runs various loads
500 with the event enabled. This adds a bit more time for kernel boot
501 up since it runs this on every system call defined.
502
503 TBD - enable a way to actually call the syscalls as we test their
504 events
505
484config MMIOTRACE 506config MMIOTRACE
485 bool "Memory mapped IO tracing" 507 bool "Memory mapped IO tracing"
486 depends on HAVE_MMIOTRACE_SUPPORT && PCI 508 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 7c00a1ec1496..c8cb75d7f280 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -55,5 +54,6 @@ obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
57obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_TRACER) += trace_kprobe.o
57obj-$(CONFIG_EVENT_TRACING) += power-traces.o
58 58
59libftrace-y := ftrace.o 59libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c804e24f96f..23df7771c937 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1323 1323
1324enum { 1324enum {
1325 FTRACE_ITER_FILTER = (1 << 0), 1325 FTRACE_ITER_FILTER = (1 << 0),
1326 FTRACE_ITER_CONT = (1 << 1), 1326 FTRACE_ITER_NOTRACE = (1 << 1),
1327 FTRACE_ITER_NOTRACE = (1 << 2), 1327 FTRACE_ITER_FAILURES = (1 << 2),
1328 FTRACE_ITER_FAILURES = (1 << 3), 1328 FTRACE_ITER_PRINTALL = (1 << 3),
1329 FTRACE_ITER_PRINTALL = (1 << 4), 1329 FTRACE_ITER_HASH = (1 << 4),
1330 FTRACE_ITER_HASH = (1 << 5),
1331}; 1330};
1332 1331
1333#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1332#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1337,8 +1336,7 @@ struct ftrace_iterator {
1337 int hidx; 1336 int hidx;
1338 int idx; 1337 int idx;
1339 unsigned flags; 1338 unsigned flags;
1340 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1339 struct trace_parser parser;
1341 unsigned buffer_idx;
1342}; 1340};
1343 1341
1344static void * 1342static void *
@@ -1407,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v)
1407 if (rec->ops->print) 1405 if (rec->ops->print)
1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1406 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1409 1407
1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); 1408 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1411 1409
1412 if (rec->data) 1410 if (rec->data)
1413 seq_printf(m, ":%p", rec->data); 1411 seq_printf(m, ":%p", rec->data);
@@ -1517,12 +1515,12 @@ static int t_show(struct seq_file *m, void *v)
1517 if (!rec) 1515 if (!rec)
1518 return 0; 1516 return 0;
1519 1517
1520 seq_printf(m, "%pf\n", (void *)rec->ip); 1518 seq_printf(m, "%ps\n", (void *)rec->ip);
1521 1519
1522 return 0; 1520 return 0;
1523} 1521}
1524 1522
1525static struct seq_operations show_ftrace_seq_ops = { 1523static const struct seq_operations show_ftrace_seq_ops = {
1526 .start = t_start, 1524 .start = t_start,
1527 .next = t_next, 1525 .next = t_next,
1528 .stop = t_stop, 1526 .stop = t_stop,
@@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1604 if (!iter) 1602 if (!iter)
1605 return -ENOMEM; 1603 return -ENOMEM;
1606 1604
1605 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1606 kfree(iter);
1607 return -ENOMEM;
1608 }
1609
1607 mutex_lock(&ftrace_regex_lock); 1610 mutex_lock(&ftrace_regex_lock);
1608 if ((file->f_mode & FMODE_WRITE) && 1611 if ((file->f_mode & FMODE_WRITE) &&
1609 (file->f_flags & O_TRUNC)) 1612 (file->f_flags & O_TRUNC))
@@ -2059,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2059 int i, len = 0; 2062 int i, len = 0;
2060 char *search; 2063 char *search;
2061 2064
2062 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2065 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2063 glob = NULL; 2066 glob = NULL;
2064 else { 2067 else if (glob) {
2065 int not; 2068 int not;
2066 2069
2067 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos, int enable) 2199 size_t cnt, loff_t *ppos, int enable)
2197{ 2200{
2198 struct ftrace_iterator *iter; 2201 struct ftrace_iterator *iter;
2199 char ch; 2202 struct trace_parser *parser;
2200 size_t read = 0; 2203 ssize_t ret, read;
2201 ssize_t ret;
2202 2204
2203 if (!cnt || cnt < 0) 2205 if (!cnt || cnt < 0)
2204 return 0; 2206 return 0;
@@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2211 } else 2213 } else
2212 iter = file->private_data; 2214 iter = file->private_data;
2213 2215
2214 if (!*ppos) { 2216 parser = &iter->parser;
2215 iter->flags &= ~FTRACE_ITER_CONT; 2217 read = trace_get_user(parser, ubuf, cnt, ppos);
2216 iter->buffer_idx = 0;
2217 }
2218 2218
2219 ret = get_user(ch, ubuf++); 2219 if (trace_parser_loaded(parser) &&
2220 if (ret) 2220 !trace_parser_cont(parser)) {
2221 goto out; 2221 ret = ftrace_process_regex(parser->buffer,
2222 read++; 2222 parser->idx, enable);
2223 cnt--;
2224
2225 /*
2226 * If the parser haven't finished with the last write,
2227 * continue reading the user input without skipping spaces.
2228 */
2229 if (!(iter->flags & FTRACE_ITER_CONT)) {
2230 /* skip white space */
2231 while (cnt && isspace(ch)) {
2232 ret = get_user(ch, ubuf++);
2233 if (ret)
2234 goto out;
2235 read++;
2236 cnt--;
2237 }
2238
2239 /* only spaces were written */
2240 if (isspace(ch)) {
2241 *ppos += read;
2242 ret = read;
2243 goto out;
2244 }
2245
2246 iter->buffer_idx = 0;
2247 }
2248
2249 while (cnt && !isspace(ch)) {
2250 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2251 iter->buffer[iter->buffer_idx++] = ch;
2252 else {
2253 ret = -EINVAL;
2254 goto out;
2255 }
2256 ret = get_user(ch, ubuf++);
2257 if (ret) 2223 if (ret)
2258 goto out; 2224 goto out;
2259 read++;
2260 cnt--;
2261 }
2262 2225
2263 if (isspace(ch)) { 2226 trace_parser_clear(parser);
2264 iter->buffer[iter->buffer_idx] = 0;
2265 ret = ftrace_process_regex(iter->buffer,
2266 iter->buffer_idx, enable);
2267 if (ret)
2268 goto out;
2269 iter->buffer_idx = 0;
2270 } else {
2271 iter->flags |= FTRACE_ITER_CONT;
2272 iter->buffer[iter->buffer_idx++] = ch;
2273 } 2227 }
2274 2228
2275 *ppos += read;
2276 ret = read; 2229 ret = read;
2277 out:
2278 mutex_unlock(&ftrace_regex_lock);
2279 2230
2231 mutex_unlock(&ftrace_regex_lock);
2232out:
2280 return ret; 2233 return ret;
2281} 2234}
2282 2235
@@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2381{ 2334{
2382 struct seq_file *m = (struct seq_file *)file->private_data; 2335 struct seq_file *m = (struct seq_file *)file->private_data;
2383 struct ftrace_iterator *iter; 2336 struct ftrace_iterator *iter;
2337 struct trace_parser *parser;
2384 2338
2385 mutex_lock(&ftrace_regex_lock); 2339 mutex_lock(&ftrace_regex_lock);
2386 if (file->f_mode & FMODE_READ) { 2340 if (file->f_mode & FMODE_READ) {
@@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2390 } else 2344 } else
2391 iter = file->private_data; 2345 iter = file->private_data;
2392 2346
2393 if (iter->buffer_idx) { 2347 parser = &iter->parser;
2394 iter->buffer[iter->buffer_idx] = 0; 2348 if (trace_parser_loaded(parser)) {
2395 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2349 parser->buffer[parser->idx] = 0;
2350 ftrace_match_records(parser->buffer, parser->idx, enable);
2396 } 2351 }
2397 2352
2398 mutex_lock(&ftrace_lock); 2353 mutex_lock(&ftrace_lock);
@@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2400 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2355 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2401 mutex_unlock(&ftrace_lock); 2356 mutex_unlock(&ftrace_lock);
2402 2357
2358 trace_parser_put(parser);
2403 kfree(iter); 2359 kfree(iter);
2360
2404 mutex_unlock(&ftrace_regex_lock); 2361 mutex_unlock(&ftrace_regex_lock);
2405 return 0; 2362 return 0;
2406} 2363}
@@ -2457,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2457static void * 2414static void *
2458__g_next(struct seq_file *m, loff_t *pos) 2415__g_next(struct seq_file *m, loff_t *pos)
2459{ 2416{
2460 unsigned long *array = m->private;
2461
2462 if (*pos >= ftrace_graph_count) 2417 if (*pos >= ftrace_graph_count)
2463 return NULL; 2418 return NULL;
2464 return &array[*pos]; 2419 return &ftrace_graph_funcs[*pos];
2465} 2420}
2466 2421
2467static void * 2422static void *
@@ -2499,12 +2454,12 @@ static int g_show(struct seq_file *m, void *v)
2499 return 0; 2454 return 0;
2500 } 2455 }
2501 2456
2502 seq_printf(m, "%pf\n", v); 2457 seq_printf(m, "%ps\n", (void *)*ptr);
2503 2458
2504 return 0; 2459 return 0;
2505} 2460}
2506 2461
2507static struct seq_operations ftrace_graph_seq_ops = { 2462static const struct seq_operations ftrace_graph_seq_ops = {
2508 .start = g_start, 2463 .start = g_start,
2509 .next = g_next, 2464 .next = g_next,
2510 .stop = g_stop, 2465 .stop = g_stop,
@@ -2525,16 +2480,10 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2525 ftrace_graph_count = 0; 2480 ftrace_graph_count = 0;
2526 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2481 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2527 } 2482 }
2483 mutex_unlock(&graph_lock);
2528 2484
2529 if (file->f_mode & FMODE_READ) { 2485 if (file->f_mode & FMODE_READ)
2530 ret = seq_open(file, &ftrace_graph_seq_ops); 2486 ret = seq_open(file, &ftrace_graph_seq_ops);
2531 if (!ret) {
2532 struct seq_file *m = file->private_data;
2533 m->private = ftrace_graph_funcs;
2534 }
2535 } else
2536 file->private_data = ftrace_graph_funcs;
2537 mutex_unlock(&graph_lock);
2538 2487
2539 return ret; 2488 return ret;
2540} 2489}
@@ -2602,12 +2551,9 @@ static ssize_t
2602ftrace_graph_write(struct file *file, const char __user *ubuf, 2551ftrace_graph_write(struct file *file, const char __user *ubuf,
2603 size_t cnt, loff_t *ppos) 2552 size_t cnt, loff_t *ppos)
2604{ 2553{
2605 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2554 struct trace_parser parser;
2606 unsigned long *array;
2607 size_t read = 0; 2555 size_t read = 0;
2608 ssize_t ret; 2556 ssize_t ret;
2609 int index = 0;
2610 char ch;
2611 2557
2612 if (!cnt || cnt < 0) 2558 if (!cnt || cnt < 0)
2613 return 0; 2559 return 0;
@@ -2619,57 +2565,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2619 goto out; 2565 goto out;
2620 } 2566 }
2621 2567
2622 if (file->f_mode & FMODE_READ) { 2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2623 struct seq_file *m = file->private_data; 2569 ret = -ENOMEM;
2624 array = m->private;
2625 } else
2626 array = file->private_data;
2627
2628 ret = get_user(ch, ubuf++);
2629 if (ret)
2630 goto out; 2570 goto out;
2631 read++;
2632 cnt--;
2633
2634 /* skip white space */
2635 while (cnt && isspace(ch)) {
2636 ret = get_user(ch, ubuf++);
2637 if (ret)
2638 goto out;
2639 read++;
2640 cnt--;
2641 } 2571 }
2642 2572
2643 if (isspace(ch)) { 2573 read = trace_get_user(&parser, ubuf, cnt, ppos);
2644 *ppos += read;
2645 ret = read;
2646 goto out;
2647 }
2648 2574
2649 while (cnt && !isspace(ch)) { 2575 if (trace_parser_loaded((&parser))) {
2650 if (index < FTRACE_BUFF_MAX) 2576 parser.buffer[parser.idx] = 0;
2651 buffer[index++] = ch; 2577
2652 else { 2578 /* we allow only one expression at a time */
2653 ret = -EINVAL; 2579 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2654 goto out; 2580 parser.buffer);
2655 }
2656 ret = get_user(ch, ubuf++);
2657 if (ret) 2581 if (ret)
2658 goto out; 2582 goto out;
2659 read++;
2660 cnt--;
2661 } 2583 }
2662 buffer[index] = 0;
2663
2664 /* we allow only one expression at a time */
2665 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2666 if (ret)
2667 goto out;
2668
2669 file->f_pos += read;
2670 2584
2671 ret = read; 2585 ret = read;
2672 out: 2586 out:
2587 trace_parser_put(&parser);
2673 mutex_unlock(&graph_lock); 2588 mutex_unlock(&graph_lock);
2674 2589
2675 return ret; 2590 return ret;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 454e74e718cf..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -701,8 +699,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
701 699
702 val &= ~RB_FLAG_MASK; 700 val &= ~RB_FLAG_MASK;
703 701
704 ret = (unsigned long)cmpxchg(&list->next, 702 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag); 703 val | old_flag, val | new_flag);
706 704
707 /* check if the reader took the page */ 705 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val) 706 if ((ret & ~RB_FLAG_MASK) != val)
@@ -794,7 +792,7 @@ static int rb_head_page_replace(struct buffer_page *old,
794 val = *ptr & ~RB_FLAG_MASK; 792 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD; 793 val |= RB_PAGE_HEAD;
796 794
797 ret = cmpxchg(ptr, val, &new->list); 795 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798 796
799 return ret == val; 797 return ret == val;
800} 798}
@@ -2997,15 +2995,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2997} 2995}
2998 2996
2999static struct ring_buffer_event * 2997static struct ring_buffer_event *
3000rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 2998rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3001{ 2999{
3002 struct ring_buffer_per_cpu *cpu_buffer;
3003 struct ring_buffer_event *event; 3000 struct ring_buffer_event *event;
3004 struct buffer_page *reader; 3001 struct buffer_page *reader;
3005 int nr_loops = 0; 3002 int nr_loops = 0;
3006 3003
3007 cpu_buffer = buffer->buffers[cpu];
3008
3009 again: 3004 again:
3010 /* 3005 /*
3011 * We repeat when a timestamp is encountered. It is possible 3006 * We repeat when a timestamp is encountered. It is possible
@@ -3049,7 +3044,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3049 case RINGBUF_TYPE_DATA: 3044 case RINGBUF_TYPE_DATA:
3050 if (ts) { 3045 if (ts) {
3051 *ts = cpu_buffer->read_stamp + event->time_delta; 3046 *ts = cpu_buffer->read_stamp + event->time_delta;
3052 ring_buffer_normalize_time_stamp(buffer, 3047 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3053 cpu_buffer->cpu, ts); 3048 cpu_buffer->cpu, ts);
3054 } 3049 }
3055 return event; 3050 return event;
@@ -3168,7 +3163,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3168 local_irq_save(flags); 3163 local_irq_save(flags);
3169 if (dolock) 3164 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock); 3165 spin_lock(&cpu_buffer->reader_lock);
3171 event = rb_buffer_peek(buffer, cpu, ts); 3166 event = rb_buffer_peek(cpu_buffer, ts);
3172 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3167 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer); 3168 rb_advance_reader(cpu_buffer);
3174 if (dolock) 3169 if (dolock)
@@ -3237,7 +3232,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3237 if (dolock) 3232 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock); 3233 spin_lock(&cpu_buffer->reader_lock);
3239 3234
3240 event = rb_buffer_peek(buffer, cpu, ts); 3235 event = rb_buffer_peek(cpu_buffer, ts);
3241 if (event) 3236 if (event)
3242 rb_advance_reader(cpu_buffer); 3237 rb_advance_reader(cpu_buffer);
3243 3238
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c75deeefe30..6c0f6a8a22eb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -125,13 +125,13 @@ int ftrace_dump_on_oops;
125 125
126static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
127 127
128#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
129static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
130static char *default_bootup_tracer; 130static char *default_bootup_tracer;
131 131
132static int __init set_ftrace(char *str) 132static int __init set_ftrace(char *str)
133{ 133{
134 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
135 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
136 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
137 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
@@ -242,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
242static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
243 243
244/* 244/*
245 * max_tracer_type_len is used to simplify the allocating of
246 * buffers to read userspace tracer names. We keep track of
247 * the longest tracer name registered.
248 */
249static int max_tracer_type_len;
250
251/*
252 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
253 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
254 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -275,12 +268,18 @@ static DEFINE_SPINLOCK(tracing_start_lock);
275 */ 268 */
276void trace_wake_up(void) 269void trace_wake_up(void)
277{ 270{
271 int cpu;
272
273 if (trace_flags & TRACE_ITER_BLOCK)
274 return;
278 /* 275 /*
279 * The runqueue_is_locked() can fail, but this is the best we 276 * The runqueue_is_locked() can fail, but this is the best we
280 * have for now: 277 * have for now:
281 */ 278 */
282 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) 279 cpu = get_cpu();
280 if (!runqueue_is_locked(cpu))
283 wake_up(&trace_wait); 281 wake_up(&trace_wait);
282 put_cpu();
284} 283}
285 284
286static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
@@ -339,6 +338,112 @@ static struct {
339 338
340int trace_clock_id; 339int trace_clock_id;
341 340
341/*
342 * trace_parser_get_init - gets the buffer for trace parser
343 */
344int trace_parser_get_init(struct trace_parser *parser, int size)
345{
346 memset(parser, 0, sizeof(*parser));
347
348 parser->buffer = kmalloc(size, GFP_KERNEL);
349 if (!parser->buffer)
350 return 1;
351
352 parser->size = size;
353 return 0;
354}
355
356/*
357 * trace_parser_put - frees the buffer for trace parser
358 */
359void trace_parser_put(struct trace_parser *parser)
360{
361 kfree(parser->buffer);
362}
363
364/*
365 * trace_get_user - reads the user input string separated by space
366 * (matched by isspace(ch))
367 *
368 * For each string found the 'struct trace_parser' is updated,
369 * and the function returns.
370 *
371 * Returns number of bytes read.
372 *
373 * See kernel/trace/trace.h for 'struct trace_parser' details.
374 */
375int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
376 size_t cnt, loff_t *ppos)
377{
378 char ch;
379 size_t read = 0;
380 ssize_t ret;
381
382 if (!*ppos)
383 trace_parser_clear(parser);
384
385 ret = get_user(ch, ubuf++);
386 if (ret)
387 goto out;
388
389 read++;
390 cnt--;
391
392 /*
393 * The parser is not finished with the last write,
394 * continue reading the user input without skipping spaces.
395 */
396 if (!parser->cont) {
397 /* skip white space */
398 while (cnt && isspace(ch)) {
399 ret = get_user(ch, ubuf++);
400 if (ret)
401 goto out;
402 read++;
403 cnt--;
404 }
405
406 /* only spaces were written */
407 if (isspace(ch)) {
408 *ppos += read;
409 ret = read;
410 goto out;
411 }
412
413 parser->idx = 0;
414 }
415
416 /* read the non-space input */
417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size)
419 parser->buffer[parser->idx++] = ch;
420 else {
421 ret = -EINVAL;
422 goto out;
423 }
424 ret = get_user(ch, ubuf++);
425 if (ret)
426 goto out;
427 read++;
428 cnt--;
429 }
430
431 /* We either got finished input or we have to wait for another call. */
432 if (isspace(ch)) {
433 parser->buffer[parser->idx] = 0;
434 parser->cont = false;
435 } else {
436 parser->cont = true;
437 parser->buffer[parser->idx++] = ch;
438 }
439
440 *ppos += read;
441 ret = read;
442
443out:
444 return ret;
445}
446
342ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 447ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
343{ 448{
344 int len; 449 int len;
@@ -513,7 +618,6 @@ __releases(kernel_lock)
513__acquires(kernel_lock) 618__acquires(kernel_lock)
514{ 619{
515 struct tracer *t; 620 struct tracer *t;
516 int len;
517 int ret = 0; 621 int ret = 0;
518 622
519 if (!type->name) { 623 if (!type->name) {
@@ -521,6 +625,11 @@ __acquires(kernel_lock)
521 return -1; 625 return -1;
522 } 626 }
523 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
524 /* 633 /*
525 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
526 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -535,7 +644,7 @@ __acquires(kernel_lock)
535 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
536 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
537 /* already found */ 646 /* already found */
538 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
539 type->name); 648 type->name);
540 ret = -1; 649 ret = -1;
541 goto out; 650 goto out;
@@ -586,9 +695,6 @@ __acquires(kernel_lock)
586 695
587 type->next = trace_types; 696 type->next = trace_types;
588 trace_types = type; 697 trace_types = type;
589 len = strlen(type->name);
590 if (len > max_tracer_type_len)
591 max_tracer_type_len = len;
592 698
593 out: 699 out:
594 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -597,7 +703,7 @@ __acquires(kernel_lock)
597 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
598 goto out_unlock; 704 goto out_unlock;
599 705
600 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
601 goto out_unlock; 707 goto out_unlock;
602 708
603 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -619,14 +725,13 @@ __acquires(kernel_lock)
619void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
620{ 726{
621 struct tracer **t; 727 struct tracer **t;
622 int len;
623 728
624 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
625 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
626 if (*t == type) 731 if (*t == type)
627 goto found; 732 goto found;
628 } 733 }
629 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
630 goto out; 735 goto out;
631 736
632 found: 737 found:
@@ -639,17 +744,7 @@ void unregister_tracer(struct tracer *type)
639 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
640 current_trace = &nop_trace; 745 current_trace = &nop_trace;
641 } 746 }
642 747out:
643 if (strlen(type->name) != max_tracer_type_len)
644 goto out;
645
646 max_tracer_type_len = 0;
647 for (t = &trace_types; *t; t = &(*t)->next) {
648 len = strlen((*t)->name);
649 if (len > max_tracer_type_len)
650 max_tracer_type_len = len;
651 }
652 out:
653 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
654} 749}
655 750
@@ -719,6 +814,11 @@ static void trace_init_cmdlines(void)
719 cmdline_idx = 0; 814 cmdline_idx = 0;
720} 815}
721 816
817int is_tracing_stopped(void)
818{
819 return trace_stop_count;
820}
821
722/** 822/**
723 * ftrace_off_permanent - disable all ftrace code permanently 823 * ftrace_off_permanent - disable all ftrace code permanently
724 * 824 *
@@ -886,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
886 986
887 entry->preempt_count = pc & 0xff; 987 entry->preempt_count = pc & 0xff;
888 entry->pid = (tsk) ? tsk->pid : 0; 988 entry->pid = (tsk) ? tsk->pid : 0;
889 entry->tgid = (tsk) ? tsk->tgid : 0; 989 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
890 entry->flags = 990 entry->flags =
891#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 991#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
892 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 992 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1068,6 +1168,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1068 return; 1168 return;
1069 entry = ring_buffer_event_data(event); 1169 entry = ring_buffer_event_data(event);
1070 1170
1171 entry->tgid = current->tgid;
1071 memset(&entry->caller, 0, sizeof(entry->caller)); 1172 memset(&entry->caller, 0, sizeof(entry->caller));
1072 1173
1073 trace.nr_entries = 0; 1174 trace.nr_entries = 0;
@@ -1094,6 +1195,7 @@ ftrace_trace_special(void *__tr,
1094 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1195 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1095 int pc) 1196 int pc)
1096{ 1197{
1198 struct ftrace_event_call *call = &event_special;
1097 struct ring_buffer_event *event; 1199 struct ring_buffer_event *event;
1098 struct trace_array *tr = __tr; 1200 struct trace_array *tr = __tr;
1099 struct ring_buffer *buffer = tr->buffer; 1201 struct ring_buffer *buffer = tr->buffer;
@@ -1107,7 +1209,9 @@ ftrace_trace_special(void *__tr,
1107 entry->arg1 = arg1; 1209 entry->arg1 = arg1;
1108 entry->arg2 = arg2; 1210 entry->arg2 = arg2;
1109 entry->arg3 = arg3; 1211 entry->arg3 = arg3;
1110 trace_buffer_unlock_commit(buffer, event, 0, pc); 1212
1213 if (!filter_check_discard(call, entry, buffer, event))
1214 trace_buffer_unlock_commit(buffer, event, 0, pc);
1111} 1215}
1112 1216
1113void 1217void
@@ -1530,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m)
1530 seq_puts(m, "# | / _----=> need-resched \n"); 1634 seq_puts(m, "# | / _----=> need-resched \n");
1531 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1635 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1532 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1636 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1533 seq_puts(m, "# |||| / \n"); 1637 seq_puts(m, "# |||| /_--=> lock-depth \n");
1534 seq_puts(m, "# ||||| delay \n"); 1638 seq_puts(m, "# |||||/ delay \n");
1535 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1639 seq_puts(m, "# cmd pid |||||| time | caller \n");
1536 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1640 seq_puts(m, "# \\ / |||||| \\ | / \n");
1537} 1641}
1538 1642
1539static void print_func_help_header(struct seq_file *m) 1643static void print_func_help_header(struct seq_file *m)
@@ -1845,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v)
1845 return 0; 1949 return 0;
1846} 1950}
1847 1951
1848static struct seq_operations tracer_seq_ops = { 1952static const struct seq_operations tracer_seq_ops = {
1849 .start = s_start, 1953 .start = s_start,
1850 .next = s_next, 1954 .next = s_next,
1851 .stop = s_stop, 1955 .stop = s_stop,
@@ -2059,7 +2163,7 @@ static int t_show(struct seq_file *m, void *v)
2059 return 0; 2163 return 0;
2060} 2164}
2061 2165
2062static struct seq_operations show_traces_seq_ops = { 2166static const struct seq_operations show_traces_seq_ops = {
2063 .start = t_start, 2167 .start = t_start,
2064 .next = t_next, 2168 .next = t_next,
2065 .stop = t_stop, 2169 .stop = t_stop,
@@ -2489,7 +2593,7 @@ static ssize_t
2489tracing_set_trace_read(struct file *filp, char __user *ubuf, 2593tracing_set_trace_read(struct file *filp, char __user *ubuf,
2490 size_t cnt, loff_t *ppos) 2594 size_t cnt, loff_t *ppos)
2491{ 2595{
2492 char buf[max_tracer_type_len+2]; 2596 char buf[MAX_TRACER_SIZE+2];
2493 int r; 2597 int r;
2494 2598
2495 mutex_lock(&trace_types_lock); 2599 mutex_lock(&trace_types_lock);
@@ -2639,15 +2743,15 @@ static ssize_t
2639tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2743tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2640 size_t cnt, loff_t *ppos) 2744 size_t cnt, loff_t *ppos)
2641{ 2745{
2642 char buf[max_tracer_type_len+1]; 2746 char buf[MAX_TRACER_SIZE+1];
2643 int i; 2747 int i;
2644 size_t ret; 2748 size_t ret;
2645 int err; 2749 int err;
2646 2750
2647 ret = cnt; 2751 ret = cnt;
2648 2752
2649 if (cnt > max_tracer_type_len) 2753 if (cnt > MAX_TRACER_SIZE)
2650 cnt = max_tracer_type_len; 2754 cnt = MAX_TRACER_SIZE;
2651 2755
2652 if (copy_from_user(&buf, ubuf, cnt)) 2756 if (copy_from_user(&buf, ubuf, cnt))
2653 return -EFAULT; 2757 return -EFAULT;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 821064914c80..104c1a72418f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,10 +7,10 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 16#include <linux/ftrace_event.h>
@@ -36,163 +36,59 @@ enum trace_type {
36 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
37 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
38 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
39 TRACE_POWER,
40 TRACE_BLK, 39 TRACE_BLK,
41 40
42 __TRACE_LAST_TYPE, 41 __TRACE_LAST_TYPE,
43}; 42};
44 43
45/* 44enum kmemtrace_type_id {
46 * Function trace entry - function address and parent function addres: 45 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
47 */ 46 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
48struct ftrace_entry { 47 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
49 struct trace_entry ent;
50 unsigned long ip;
51 unsigned long parent_ip;
52};
53
54/* Function call entry */
55struct ftrace_graph_ent_entry {
56 struct trace_entry ent;
57 struct ftrace_graph_ent graph_ent;
58}; 48};
59 49
60/* Function return entry */
61struct ftrace_graph_ret_entry {
62 struct trace_entry ent;
63 struct ftrace_graph_ret ret;
64};
65extern struct tracer boot_tracer; 50extern struct tracer boot_tracer;
66 51
67/* 52#undef __field
68 * Context switch trace entry - which task (and prio) we switched from/to: 53#define __field(type, item) type item;
69 */
70struct ctx_switch_entry {
71 struct trace_entry ent;
72 unsigned int prev_pid;
73 unsigned char prev_prio;
74 unsigned char prev_state;
75 unsigned int next_pid;
76 unsigned char next_prio;
77 unsigned char next_state;
78 unsigned int next_cpu;
79};
80
81/*
82 * Special (free-form) trace entry:
83 */
84struct special_entry {
85 struct trace_entry ent;
86 unsigned long arg1;
87 unsigned long arg2;
88 unsigned long arg3;
89};
90
91/*
92 * Stack-trace entry:
93 */
94
95#define FTRACE_STACK_ENTRIES 8
96 54
97struct stack_entry { 55#undef __field_struct
98 struct trace_entry ent; 56#define __field_struct(type, item) __field(type, item)
99 unsigned long caller[FTRACE_STACK_ENTRIES];
100};
101 57
102struct userstack_entry { 58#undef __field_desc
103 struct trace_entry ent; 59#define __field_desc(type, container, item)
104 unsigned long caller[FTRACE_STACK_ENTRIES];
105};
106 60
107/* 61#undef __array
108 * trace_printk entry: 62#define __array(type, item, size) type item[size];
109 */
110struct bprint_entry {
111 struct trace_entry ent;
112 unsigned long ip;
113 const char *fmt;
114 u32 buf[];
115};
116 63
117struct print_entry { 64#undef __array_desc
118 struct trace_entry ent; 65#define __array_desc(type, container, item, size)
119 unsigned long ip;
120 char buf[];
121};
122
123#define TRACE_OLD_SIZE 88
124
125struct trace_field_cont {
126 unsigned char type;
127 /* Temporary till we get rid of this completely */
128 char buf[TRACE_OLD_SIZE - 1];
129};
130 66
131struct trace_mmiotrace_rw { 67#undef __dynamic_array
132 struct trace_entry ent; 68#define __dynamic_array(type, item) type item[];
133 struct mmiotrace_rw rw;
134};
135 69
136struct trace_mmiotrace_map { 70#undef F_STRUCT
137 struct trace_entry ent; 71#define F_STRUCT(args...) args
138 struct mmiotrace_map map;
139};
140
141struct trace_boot_call {
142 struct trace_entry ent;
143 struct boot_trace_call boot_call;
144};
145 72
146struct trace_boot_ret { 73#undef FTRACE_ENTRY
147 struct trace_entry ent; 74#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
148 struct boot_trace_ret boot_ret; 75 struct struct_name { \
149}; 76 struct trace_entry ent; \
77 tstruct \
78 }
150 79
151#define TRACE_FUNC_SIZE 30 80#undef TP_ARGS
152#define TRACE_FILE_SIZE 20 81#define TP_ARGS(args...) args
153struct trace_branch {
154 struct trace_entry ent;
155 unsigned line;
156 char func[TRACE_FUNC_SIZE+1];
157 char file[TRACE_FILE_SIZE+1];
158 char correct;
159};
160 82
161struct hw_branch_entry { 83#undef FTRACE_ENTRY_DUP
162 struct trace_entry ent; 84#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
163 u64 from;
164 u64 to;
165};
166 85
167struct trace_power { 86#include "trace_entries.h"
168 struct trace_entry ent;
169 struct power_trace state_data;
170};
171
172enum kmemtrace_type_id {
173 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
174 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
175 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
176};
177
178struct kmemtrace_alloc_entry {
179 struct trace_entry ent;
180 enum kmemtrace_type_id type_id;
181 unsigned long call_site;
182 const void *ptr;
183 size_t bytes_req;
184 size_t bytes_alloc;
185 gfp_t gfp_flags;
186 int node;
187};
188
189struct kmemtrace_free_entry {
190 struct trace_entry ent;
191 enum kmemtrace_type_id type_id;
192 unsigned long call_site;
193 const void *ptr;
194};
195 87
88/*
89 * syscalls are special, and need special handling, this is why
90 * they are not included in trace_entries.h
91 */
196struct syscall_trace_enter { 92struct syscall_trace_enter {
197 struct trace_entry ent; 93 struct trace_entry ent;
198 int nr; 94 int nr;
@@ -228,14 +124,12 @@ struct kretprobe_trace_entry {
228 (offsetof(struct kretprobe_trace_entry, args) + \ 124 (offsetof(struct kretprobe_trace_entry, args) + \
229 (sizeof(unsigned long) * (n))) 125 (sizeof(unsigned long) * (n)))
230 126
231
232
233/* 127/*
234 * trace_flag_type is an enumeration that holds different 128 * trace_flag_type is an enumeration that holds different
235 * states when a trace occurs. These are: 129 * states when a trace occurs. These are:
236 * IRQS_OFF - interrupts were disabled 130 * IRQS_OFF - interrupts were disabled
237 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 131 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
238 * NEED_RESCED - reschedule is requested 132 * NEED_RESCHED - reschedule is requested
239 * HARDIRQ - inside an interrupt handler 133 * HARDIRQ - inside an interrupt handler
240 * SOFTIRQ - inside a softirq handler 134 * SOFTIRQ - inside a softirq handler
241 */ 135 */
@@ -334,7 +228,6 @@ extern void __ftrace_bad_type(void);
334 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 228 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
335 TRACE_GRAPH_RET); \ 229 TRACE_GRAPH_RET); \
336 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 230 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
337 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
338 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 231 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
339 TRACE_KMEM_ALLOC); \ 232 TRACE_KMEM_ALLOC); \
340 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 233 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -414,7 +307,6 @@ struct tracer {
414 struct tracer *next; 307 struct tracer *next;
415 int print_max; 308 int print_max;
416 struct tracer_flags *flags; 309 struct tracer_flags *flags;
417 struct tracer_stat *stats;
418}; 310};
419 311
420 312
@@ -493,6 +385,7 @@ void tracing_stop_sched_switch_record(void);
493void tracing_start_sched_switch_record(void); 385void tracing_start_sched_switch_record(void);
494int register_tracer(struct tracer *type); 386int register_tracer(struct tracer *type);
495void unregister_tracer(struct tracer *type); 387void unregister_tracer(struct tracer *type);
388int is_tracing_stopped(void);
496 389
497extern unsigned long nsecs_to_usecs(unsigned long nsecs); 390extern unsigned long nsecs_to_usecs(unsigned long nsecs);
498 391
@@ -533,20 +426,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
533 426
534extern cycle_t ftrace_now(int cpu); 427extern cycle_t ftrace_now(int cpu);
535 428
536#ifdef CONFIG_CONTEXT_SWITCH_TRACER
537typedef void
538(*tracer_switch_func_t)(void *private,
539 void *__rq,
540 struct task_struct *prev,
541 struct task_struct *next);
542
543struct tracer_switch_ops {
544 tracer_switch_func_t func;
545 void *private;
546 struct tracer_switch_ops *next;
547};
548#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
549
550extern void trace_find_cmdline(int pid, char comm[]); 429extern void trace_find_cmdline(int pid, char comm[]);
551 430
552#ifdef CONFIG_DYNAMIC_FTRACE 431#ifdef CONFIG_DYNAMIC_FTRACE
@@ -662,6 +541,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
662#endif 541#endif
663 542
664/* 543/*
544 * struct trace_parser - serves for reading the user input separated by spaces
545 * @cont: set if the input is not complete - no final space char was found
546 * @buffer: holds the parsed user input
547 * @idx: user input length
548 * @size: buffer size
549 */
550struct trace_parser {
551 bool cont;
552 char *buffer;
553 unsigned idx;
554 unsigned size;
555};
556
557static inline bool trace_parser_loaded(struct trace_parser *parser)
558{
559 return (parser->idx != 0);
560}
561
562static inline bool trace_parser_cont(struct trace_parser *parser)
563{
564 return parser->cont;
565}
566
567static inline void trace_parser_clear(struct trace_parser *parser)
568{
569 parser->cont = false;
570 parser->idx = 0;
571}
572
573extern int trace_parser_get_init(struct trace_parser *parser, int size);
574extern void trace_parser_put(struct trace_parser *parser);
575extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
576 size_t cnt, loff_t *ppos);
577
578/*
665 * trace_iterator_flags is an enumeration that defines bit 579 * trace_iterator_flags is an enumeration that defines bit
666 * positions into trace_flags that controls the output. 580 * positions into trace_flags that controls the output.
667 * 581 *
@@ -847,58 +761,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
847 return 0; 761 return 0;
848} 762}
849 763
850#define DEFINE_COMPARISON_PRED(type) \
851static int filter_pred_##type(struct filter_pred *pred, void *event, \
852 int val1, int val2) \
853{ \
854 type *addr = (type *)(event + pred->offset); \
855 type val = (type)pred->val; \
856 int match = 0; \
857 \
858 switch (pred->op) { \
859 case OP_LT: \
860 match = (*addr < val); \
861 break; \
862 case OP_LE: \
863 match = (*addr <= val); \
864 break; \
865 case OP_GT: \
866 match = (*addr > val); \
867 break; \
868 case OP_GE: \
869 match = (*addr >= val); \
870 break; \
871 default: \
872 break; \
873 } \
874 \
875 return match; \
876}
877
878#define DEFINE_EQUALITY_PRED(size) \
879static int filter_pred_##size(struct filter_pred *pred, void *event, \
880 int val1, int val2) \
881{ \
882 u##size *addr = (u##size *)(event + pred->offset); \
883 u##size val = (u##size)pred->val; \
884 int match; \
885 \
886 match = (val == *addr) ^ pred->not; \
887 \
888 return match; \
889}
890
891extern struct mutex event_mutex; 764extern struct mutex event_mutex;
892extern struct list_head ftrace_events; 765extern struct list_head ftrace_events;
893 766
894extern const char *__start___trace_bprintk_fmt[]; 767extern const char *__start___trace_bprintk_fmt[];
895extern const char *__stop___trace_bprintk_fmt[]; 768extern const char *__stop___trace_bprintk_fmt[];
896 769
897#undef TRACE_EVENT_FORMAT 770#undef FTRACE_ENTRY
898#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 771#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
899 extern struct ftrace_event_call event_##call; 772 extern struct ftrace_event_call event_##call;
900#undef TRACE_EVENT_FORMAT_NOFILTER 773#undef FTRACE_ENTRY_DUP
901#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 774#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
902#include "trace_event_types.h" 775 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
776#include "trace_entries.h"
903 777
904#endif /* _LINUX_KERNEL_TRACE_H */ 778#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 19bfc75d467e..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =
129 129
130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
131{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
132 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
133 struct ring_buffer *buffer; 134 struct ring_buffer *buffer;
134 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
@@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(buffer, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
161 struct ring_buffer *buffer; 164 struct ring_buffer *buffer;
162 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
@@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
175 goto out; 178 goto out;
176 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
177 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
178 trace_buffer_unlock_commit(buffer, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
179 out: 183 out:
180 preempt_enable(); 184 preempt_enable();
181} 185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..ead3d724599d
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,366 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
56 * Function trace entry - function address and parent function addres:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..e812f1c1264c 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,8 +5,60 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
11/*
12 * alloc_percpu() takes a type, not a size,
13 * so create a dummy type that matches the desired size.
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count++) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable(event);
49 if (!ret)
50 return 0;
51
52 kfree(trace_profile_buf_nmi);
53fail_buf_nmi:
54 kfree(trace_profile_buf);
55fail_buf:
56 total_profile_count--;
57 atomic_dec(&event->profile_count);
58
59 return ret;
60}
61
10int ftrace_profile_enable(int event_id) 62int ftrace_profile_enable(int event_id)
11{ 63{
12 struct ftrace_event_call *event; 64 struct ftrace_event_call *event;
@@ -14,8 +66,9 @@ int ftrace_profile_enable(int event_id)
14 66
15 mutex_lock(&event_mutex); 67 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 68 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 69 if (event->id == event_id && event->profile_enable &&
18 ret = event->profile_enable(event); 70 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event);
19 break; 72 break;
20 } 73 }
21 } 74 }
@@ -24,6 +77,33 @@ int ftrace_profile_enable(int event_id)
24 return ret; 77 return ret;
25} 78}
26 79
80static void ftrace_profile_disable_event(struct ftrace_event_call *event)
81{
82 char *buf, *nmi_buf;
83
84 if (!atomic_add_negative(-1, &event->profile_count))
85 return;
86
87 event->profile_disable(event);
88
89 if (!--total_profile_count) {
90 buf = trace_profile_buf;
91 rcu_assign_pointer(trace_profile_buf, NULL);
92
93 nmi_buf = trace_profile_buf_nmi;
94 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
95
96 /*
97 * Ensure every events in profiling have finished before
98 * releasing the buffers
99 */
100 synchronize_sched();
101
102 free_percpu(buf);
103 free_percpu(nmi_buf);
104 }
105}
106
27void ftrace_profile_disable(int event_id) 107void ftrace_profile_disable(int event_id)
28{ 108{
29 struct ftrace_event_call *event; 109 struct ftrace_event_call *event;
@@ -31,7 +111,8 @@ void ftrace_profile_disable(int event_id)
31 mutex_lock(&event_mutex); 111 mutex_lock(&event_mutex);
32 list_for_each_entry(event, &ftrace_events, list) { 112 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 113 if (event->id == event_id) {
34 event->profile_disable(event); 114 ftrace_profile_disable_event(event);
115 module_put(event->mod);
35 break; 116 break;
36 } 117 }
37 } 118 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index e74f0906ab1a..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO(char, buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO(char, buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f85b0f1cb942..a4b7c9a9130c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,6 +21,7 @@
21 21
22#include "trace_output.h" 22#include "trace_output.h"
23 23
24#undef TRACE_SYSTEM
24#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
25 26
26DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
@@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
86 __common_field(unsigned char, flags); 87 __common_field(unsigned char, flags);
87 __common_field(unsigned char, preempt_count); 88 __common_field(unsigned char, preempt_count);
88 __common_field(int, pid); 89 __common_field(int, pid);
89 __common_field(int, tgid); 90 __common_field(int, lock_depth);
90 91
91 return ret; 92 return ret;
92} 93}
@@ -226,11 +227,9 @@ static ssize_t
226ftrace_event_write(struct file *file, const char __user *ubuf, 227ftrace_event_write(struct file *file, const char __user *ubuf,
227 size_t cnt, loff_t *ppos) 228 size_t cnt, loff_t *ppos)
228{ 229{
230 struct trace_parser parser;
229 size_t read = 0; 231 size_t read = 0;
230 int i, set = 1;
231 ssize_t ret; 232 ssize_t ret;
232 char *buf;
233 char ch;
234 233
235 if (!cnt || cnt < 0) 234 if (!cnt || cnt < 0)
236 return 0; 235 return 0;
@@ -239,60 +238,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
239 if (ret < 0) 238 if (ret < 0)
240 return ret; 239 return ret;
241 240
242 ret = get_user(ch, ubuf++); 241 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
243 if (ret)
244 return ret;
245 read++;
246 cnt--;
247
248 /* skip white space */
249 while (cnt && isspace(ch)) {
250 ret = get_user(ch, ubuf++);
251 if (ret)
252 return ret;
253 read++;
254 cnt--;
255 }
256
257 /* Only white space found? */
258 if (isspace(ch)) {
259 file->f_pos += read;
260 ret = read;
261 return ret;
262 }
263
264 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
265 if (!buf)
266 return -ENOMEM; 242 return -ENOMEM;
267 243
268 if (cnt > EVENT_BUF_SIZE) 244 read = trace_get_user(&parser, ubuf, cnt, ppos);
269 cnt = EVENT_BUF_SIZE; 245
246 if (trace_parser_loaded((&parser))) {
247 int set = 1;
270 248
271 i = 0; 249 if (*parser.buffer == '!')
272 while (cnt && !isspace(ch)) {
273 if (!i && ch == '!')
274 set = 0; 250 set = 0;
275 else
276 buf[i++] = ch;
277 251
278 ret = get_user(ch, ubuf++); 252 parser.buffer[parser.idx] = 0;
253
254 ret = ftrace_set_clr_event(parser.buffer + !set, set);
279 if (ret) 255 if (ret)
280 goto out_free; 256 goto out_put;
281 read++;
282 cnt--;
283 } 257 }
284 buf[i] = 0;
285
286 file->f_pos += read;
287
288 ret = ftrace_set_clr_event(buf, set);
289 if (ret)
290 goto out_free;
291 258
292 ret = read; 259 ret = read;
293 260
294 out_free: 261 out_put:
295 kfree(buf); 262 trace_parser_put(&parser);
296 263
297 return ret; 264 return ret;
298} 265}
@@ -300,42 +267,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
300static void * 267static void *
301t_next(struct seq_file *m, void *v, loff_t *pos) 268t_next(struct seq_file *m, void *v, loff_t *pos)
302{ 269{
303 struct list_head *list = m->private; 270 struct ftrace_event_call *call = v;
304 struct ftrace_event_call *call;
305 271
306 (*pos)++; 272 (*pos)++;
307 273
308 for (;;) { 274 list_for_each_entry_continue(call, &ftrace_events, list) {
309 if (list == &ftrace_events)
310 return NULL;
311
312 call = list_entry(list, struct ftrace_event_call, list);
313
314 /* 275 /*
315 * The ftrace subsystem is for showing formats only. 276 * The ftrace subsystem is for showing formats only.
316 * They can not be enabled or disabled via the event files. 277 * They can not be enabled or disabled via the event files.
317 */ 278 */
318 if (call->regfunc) 279 if (call->regfunc)
319 break; 280 return call;
320
321 list = list->next;
322 } 281 }
323 282
324 m->private = list->next; 283 return NULL;
325
326 return call;
327} 284}
328 285
329static void *t_start(struct seq_file *m, loff_t *pos) 286static void *t_start(struct seq_file *m, loff_t *pos)
330{ 287{
331 struct ftrace_event_call *call = NULL; 288 struct ftrace_event_call *call;
332 loff_t l; 289 loff_t l;
333 290
334 mutex_lock(&event_mutex); 291 mutex_lock(&event_mutex);
335 292
336 m->private = ftrace_events.next; 293 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
337 for (l = 0; l <= *pos; ) { 294 for (l = 0; l <= *pos; ) {
338 call = t_next(m, NULL, &l); 295 call = t_next(m, call, &l);
339 if (!call) 296 if (!call)
340 break; 297 break;
341 } 298 }
@@ -345,37 +302,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
345static void * 302static void *
346s_next(struct seq_file *m, void *v, loff_t *pos) 303s_next(struct seq_file *m, void *v, loff_t *pos)
347{ 304{
348 struct list_head *list = m->private; 305 struct ftrace_event_call *call = v;
349 struct ftrace_event_call *call;
350 306
351 (*pos)++; 307 (*pos)++;
352 308
353 retry: 309 list_for_each_entry_continue(call, &ftrace_events, list) {
354 if (list == &ftrace_events) 310 if (call->enabled)
355 return NULL; 311 return call;
356
357 call = list_entry(list, struct ftrace_event_call, list);
358
359 if (!call->enabled) {
360 list = list->next;
361 goto retry;
362 } 312 }
363 313
364 m->private = list->next; 314 return NULL;
365
366 return call;
367} 315}
368 316
369static void *s_start(struct seq_file *m, loff_t *pos) 317static void *s_start(struct seq_file *m, loff_t *pos)
370{ 318{
371 struct ftrace_event_call *call = NULL; 319 struct ftrace_event_call *call;
372 loff_t l; 320 loff_t l;
373 321
374 mutex_lock(&event_mutex); 322 mutex_lock(&event_mutex);
375 323
376 m->private = ftrace_events.next; 324 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
377 for (l = 0; l <= *pos; ) { 325 for (l = 0; l <= *pos; ) {
378 call = s_next(m, NULL, &l); 326 call = s_next(m, call, &l);
379 if (!call) 327 if (!call)
380 break; 328 break;
381 } 329 }
@@ -574,7 +522,7 @@ static int trace_write_header(struct trace_seq *s)
574 FIELD(unsigned char, flags), 522 FIELD(unsigned char, flags),
575 FIELD(unsigned char, preempt_count), 523 FIELD(unsigned char, preempt_count),
576 FIELD(int, pid), 524 FIELD(int, pid),
577 FIELD(int, tgid)); 525 FIELD(int, lock_depth));
578} 526}
579 527
580static ssize_t 528static ssize_t
@@ -1242,7 +1190,7 @@ static int trace_module_notify(struct notifier_block *self,
1242} 1190}
1243#endif /* CONFIG_MODULES */ 1191#endif /* CONFIG_MODULES */
1244 1192
1245struct notifier_block trace_module_nb = { 1193static struct notifier_block trace_module_nb = {
1246 .notifier_call = trace_module_notify, 1194 .notifier_call = trace_module_notify,
1247 .priority = 0, 1195 .priority = 0,
1248}; 1196};
@@ -1414,6 +1362,18 @@ static __init void event_trace_self_tests(void)
1414 if (!call->regfunc) 1362 if (!call->regfunc)
1415 continue; 1363 continue;
1416 1364
1365/*
1366 * Testing syscall events here is pretty useless, but
1367 * we still do it if configured. But this is time consuming.
1368 * What we really need is a user thread to perform the
1369 * syscalls as we test.
1370 */
1371#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1372 if (call->system &&
1373 strcmp(call->system, "syscalls") == 0)
1374 continue;
1375#endif
1376
1417 pr_info("Testing event %s: ", call->name); 1377 pr_info("Testing event %s: ", call->name);
1418 1378
1419 /* 1379 /*
@@ -1487,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1487 1447
1488#ifdef CONFIG_FUNCTION_TRACER 1448#ifdef CONFIG_FUNCTION_TRACER
1489 1449
1490static DEFINE_PER_CPU(atomic_t, test_event_disable); 1450static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1491 1451
1492static void 1452static void
1493function_test_events_call(unsigned long ip, unsigned long parent_ip) 1453function_test_events_call(unsigned long ip, unsigned long parent_ip)
@@ -1504,7 +1464,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1504 pc = preempt_count(); 1464 pc = preempt_count();
1505 resched = ftrace_preempt_disable(); 1465 resched = ftrace_preempt_disable();
1506 cpu = raw_smp_processor_id(); 1466 cpu = raw_smp_processor_id();
1507 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1467 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1508 1468
1509 if (disabled != 1) 1469 if (disabled != 1)
1510 goto out; 1470 goto out;
@@ -1523,7 +1483,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1523 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); 1483 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1524 1484
1525 out: 1485 out:
1526 atomic_dec(&per_cpu(test_event_disable, cpu)); 1486 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1527 ftrace_preempt_enable(resched); 1487 ftrace_preempt_enable(resched);
1528} 1488}
1529 1489
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 93660fbbf629..23245785927f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
121 } operand; 121 } operand;
122}; 122};
123 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
124DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index a79ef6f193c0..ed7d48083520 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,147 +15,124 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
35 83
36#undef TRACE_FIELD_SPECIAL 84#undef __array
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 85#define __array(type, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 87 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
41 (unsigned int)sizeof(field.item)); \ 89 sizeof(field.item)); \
42 if (!ret) \ 90 if (!ret) \
43 return 0; 91 return 0;
44 92
45#undef TRACE_FIELD_ZERO 93#undef __array_desc
46#define TRACE_FIELD_ZERO(type, item) \ 94#define __array_desc(type, container, item, len) \
47 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
48 "offset:%u;\tsize:0;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 97 offsetof(typeof(field), container.item), \
98 sizeof(field.container.item)); \
50 if (!ret) \ 99 if (!ret) \
51 return 0; 100 return 0;
52 101
53#undef TRACE_FIELD_SIGN 102#undef __dynamic_array
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 103#define __dynamic_array(type, item) \
55 TRACE_FIELD(type, item, assign) 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
105 "offset:%zu;\tsize:0;\n", \
106 offsetof(typeof(field), item)); \
107 if (!ret) \
108 return 0;
56 109
57#undef TP_RAW_FMT 110#undef F_printk
58#define TP_RAW_FMT(args...) args 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
59 112
60#undef TRACE_EVENT_FORMAT 113#undef __entry
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 114#define __entry REC
62static int \
63ftrace_format_##call(struct ftrace_event_call *unused, \
64 struct trace_seq *s) \
65{ \
66 struct args field; \
67 int ret; \
68 \
69 tstruct; \
70 \
71 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
72 \
73 return ret; \
74}
75 115
76#undef TRACE_EVENT_FORMAT_NOFILTER 116#undef FTRACE_ENTRY
77#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
78 tpfmt) \
79static int \ 118static int \
80ftrace_format_##call(struct ftrace_event_call *unused, \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
81 struct trace_seq *s) \ 120 struct trace_seq *s) \
82{ \ 121{ \
83 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
84 int ret; \ 123 int ret = 0; \
85 \ 124 \
86 tstruct; \ 125 tstruct; \
87 \ 126 \
88 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
89 \ 128 \
90 return ret; \ 129 return ret; \
91} 130}
92 131
93#include "trace_event_types.h" 132#include "trace_entries.h"
94
95#undef TRACE_FIELD
96#define TRACE_FIELD(type, item, assign)\
97 entry->item = assign;
98
99#undef TRACE_FIELD
100#define TRACE_FIELD(type, item, assign)\
101 entry->item = assign;
102
103#undef TRACE_FIELD_SIGN
104#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
105 TRACE_FIELD(type, item, assign)
106
107#undef TRACE_FIELD_ZERO
108#define TRACE_FIELD_ZERO(type, item)
109
110#undef TP_CMD
111#define TP_CMD(cmd...) cmd
112
113#undef TRACE_ENTRY
114#define TRACE_ENTRY entry
115
116#undef TRACE_FIELD_SPECIAL
117#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
118 cmd;
119
120static int ftrace_raw_init_event(struct ftrace_event_call *event_call)
121{
122 INIT_LIST_HEAD(&event_call->fields);
123
124 return 0;
125}
126
127#undef TRACE_EVENT_FORMAT
128#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
129int ftrace_define_fields_##call(struct ftrace_event_call *event_call); \
130 \
131struct ftrace_event_call __used \
132__attribute__((__aligned__(4))) \
133__attribute__((section("_ftrace_events"))) event_##call = { \
134 .name = #call, \
135 .id = proto, \
136 .system = __stringify(TRACE_SYSTEM), \
137 .raw_init = ftrace_raw_init_event, \
138 .show_format = ftrace_format_##call, \
139 .define_fields = ftrace_define_fields_##call, \
140}; \
141
142#undef TRACE_EVENT_FORMAT_NOFILTER
143#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
144 tpfmt) \
145 \
146struct ftrace_event_call __used \
147__attribute__((__aligned__(4))) \
148__attribute__((section("_ftrace_events"))) event_##call = { \
149 .name = #call, \
150 .id = proto, \
151 .system = __stringify(TRACE_SYSTEM), \
152 .show_format = ftrace_format_##call, \
153};
154
155#include "trace_event_types.h"
156 133
157#undef TRACE_FIELD 134#undef __field
158#define TRACE_FIELD(type, item, assign) \ 135#define __field(type, item) \
159 ret = trace_define_field(event_call, #type, #item, \ 136 ret = trace_define_field(event_call, #type, #item, \
160 offsetof(typeof(field), item), \ 137 offsetof(typeof(field), item), \
161 sizeof(field.item), \ 138 sizeof(field.item), \
@@ -163,32 +140,45 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
163 if (ret) \ 140 if (ret) \
164 return ret; 141 return ret;
165 142
166#undef TRACE_FIELD_SPECIAL 143#undef __field_desc
167#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ 144#define __field_desc(type, container, item) \
145 ret = trace_define_field(event_call, #type, #item, \
146 offsetof(typeof(field), \
147 container.item), \
148 sizeof(field.container.item), \
149 is_signed_type(type), FILTER_OTHER); \
150 if (ret) \
151 return ret;
152
153#undef __array
154#define __array(type, item, len) \
155 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 156 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), item), \ 157 offsetof(typeof(field), item), \
170 sizeof(field.item), 0, FILTER_OTHER); \ 158 sizeof(field.item), 0, FILTER_OTHER); \
171 if (ret) \ 159 if (ret) \
172 return ret; 160 return ret;
173 161
174#undef TRACE_FIELD_SIGN 162#undef __array_desc
175#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 163#define __array_desc(type, container, item, len) \
176 ret = trace_define_field(event_call, #type, #item, \ 164 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
177 offsetof(typeof(field), item), \ 165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
178 sizeof(field.item), is_signed, \ 166 offsetof(typeof(field), \
167 container.item), \
168 sizeof(field.container.item), 0, \
179 FILTER_OTHER); \ 169 FILTER_OTHER); \
180 if (ret) \ 170 if (ret) \
181 return ret; 171 return ret;
182 172
183#undef TRACE_FIELD_ZERO 173#undef __dynamic_array
184#define TRACE_FIELD_ZERO(type, item) 174#define __dynamic_array(type, item)
185 175
186#undef TRACE_EVENT_FORMAT 176#undef FTRACE_ENTRY
187#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 177#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
188int \ 178int \
189ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ 179ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
190{ \ 180{ \
191 struct args field; \ 181 struct struct_name field; \
192 int ret; \ 182 int ret; \
193 \ 183 \
194 ret = trace_define_common_fields(event_call); \ 184 ret = trace_define_common_fields(event_call); \
@@ -200,8 +190,41 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \
200 return ret; \ 190 return ret; \
201} 191}
202 192
203#undef TRACE_EVENT_FORMAT_NOFILTER 193#include "trace_entries.h"
204#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 194
205 tpfmt) 195static int ftrace_raw_init_event(struct ftrace_event_call *call)
196{
197 INIT_LIST_HEAD(&call->fields);
198 return 0;
199}
200
201#undef __field
202#define __field(type, item)
203
204#undef __field_desc
205#define __field_desc(type, container, item)
206
207#undef __array
208#define __array(type, item, len)
209
210#undef __array_desc
211#define __array_desc(type, container, item, len)
212
213#undef __dynamic_array
214#define __dynamic_array(type, item)
215
216#undef FTRACE_ENTRY
217#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
218 \
219struct ftrace_event_call __used \
220__attribute__((__aligned__(4))) \
221__attribute__((section("_ftrace_events"))) event_##call = { \
222 .name = #call, \
223 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \
227 .define_fields = ftrace_define_fields_##call, \
228}; \
206 229
207#include "trace_event_types.h" 230#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5b01b94518fc..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
290{ 290{
291 long count = (long)data; 291 long count = (long)data;
292 292
293 seq_printf(m, "%pf:", (void *)ip); 293 seq_printf(m, "%ps:", (void *)ip);
294 294
295 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b3749a2c3132..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
364} 364}
365 365
366 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
367/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
368static enum print_line_t 377static enum print_line_t
369verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
521 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
522 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
523 } 532 }
533
524 /* Proc */ 534 /* Proc */
525 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
526 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
659 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
660 } 670 }
661 671
662 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
663 if (!ret) 673 if (!ret)
664 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
665 675
@@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
702 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
703 } 713 }
704 714
705 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
706 if (!ret) 716 if (!ret)
707 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
708 718
@@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
758 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
759 } 769 }
760 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
761 return 0; 778 return 0;
762} 779}
763 780
@@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
952 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
953} 970}
954 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = " " /* 16 spaces */
975 " " /* 4 spaces */
976 " "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
955static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
956{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
957 /* 1st line */ 1001 /* 1st line */
958 seq_printf(s, "# "); 1002 seq_printf(s, "#");
959 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
960 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
961 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
962 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
963 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
964 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
965 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
966 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
967 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
968 1014
969 /* 2nd line */ 1015 /* 2nd line */
970 seq_printf(s, "# "); 1016 seq_printf(s, "#");
971 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
972 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
973 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
974 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
975 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
976 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
977 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
978 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
979 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ca7d7c4d0c2a..23b63859130e 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
155 seq_print_ip_sym(seq, it->from, symflags) && 155 seq_print_ip_sym(seq, it->from, symflags) &&
156 trace_seq_printf(seq, "\n")) 156 trace_seq_printf(seq, "\n"))
157 return TRACE_TYPE_HANDLED; 157 return TRACE_TYPE_HANDLED;
158 return TRACE_TYPE_PARTIAL_LINE;; 158 return TRACE_TYPE_PARTIAL_LINE;
159 } 159 }
160 return TRACE_TYPE_UNHANDLED; 160 return TRACE_TYPE_UNHANDLED;
161} 161}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d12..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index f6821f16227e..09cba270392d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -28,7 +28,7 @@
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_counter.h> 31#include <linux/perf_event.h>
32 32
33#include "trace.h" 33#include "trace.h"
34#include "trace_output.h" 34#include "trace_output.h"
@@ -1176,7 +1176,7 @@ static __kprobes int kprobe_profile_func(struct kprobe *kp,
1176 entry->ip = (unsigned long)kp->addr; 1176 entry->ip = (unsigned long)kp->addr;
1177 for (i = 0; i < tp->nr_args; i++) 1177 for (i = 0; i < tp->nr_args; i++)
1178 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1178 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1179 perf_tpcounter_event(call->id, entry->ip, 1, entry, size); 1179 perf_tp_event(call->id, entry->ip, 1, entry, size);
1180 } while (0); 1180 } while (0);
1181 return 0; 1181 return 0;
1182} 1182}
@@ -1213,7 +1213,7 @@ static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1213 entry->ret_ip = (unsigned long)ri->ret_addr; 1213 entry->ret_ip = (unsigned long)ri->ret_addr;
1214 for (i = 0; i < tp->nr_args; i++) 1214 for (i = 0; i < tp->nr_args; i++)
1215 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1215 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1216 perf_tpcounter_event(call->id, entry->ret_ip, 1, entry, size); 1216 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1217 } while (0); 1217 } while (0);
1218 return 0; 1218 return 0;
1219} 1219}
@@ -1222,10 +1222,8 @@ static int probe_profile_enable(struct ftrace_event_call *call)
1222{ 1222{
1223 struct trace_probe *tp = (struct trace_probe *)call->data; 1223 struct trace_probe *tp = (struct trace_probe *)call->data;
1224 1224
1225 if (atomic_inc_return(&call->profile_count))
1226 return 0;
1227
1228 tp->flags |= TP_FLAG_PROFILE; 1225 tp->flags |= TP_FLAG_PROFILE;
1226
1229 if (probe_is_return(tp)) 1227 if (probe_is_return(tp))
1230 return enable_kretprobe(&tp->rp); 1228 return enable_kretprobe(&tp->rp);
1231 else 1229 else
@@ -1236,10 +1234,9 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1236{ 1234{
1237 struct trace_probe *tp = (struct trace_probe *)call->data; 1235 struct trace_probe *tp = (struct trace_probe *)call->data;
1238 1236
1239 if (atomic_add_negative(-1, &call->profile_count)) 1237 tp->flags &= ~TP_FLAG_PROFILE;
1240 tp->flags &= ~TP_FLAG_PROFILE;
1241 1238
1242 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { 1239 if (!(tp->flags & TP_FLAG_TRACE)) {
1243 if (probe_is_return(tp)) 1240 if (probe_is_return(tp))
1244 disable_kretprobe(&tp->rp); 1241 disable_kretprobe(&tp->rp);
1245 else 1242 else
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d3..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
310 struct ring_buffer *buffer = tr->buffer; 311 struct ring_buffer *buffer = tr->buffer;
311 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
312 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
320 } 321 }
321 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
322 entry->rw = *rw; 323 entry->rw = *rw;
323 trace_buffer_unlock_commit(buffer, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
324} 327}
325 328
326void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
334 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
335 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
336{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
337 struct ring_buffer *buffer = tr->buffer; 341 struct ring_buffer *buffer = tr->buffer;
338 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
339 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
347 } 351 }
348 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
349 entry->map = *map; 353 entry->map = *map;
350 trace_buffer_unlock_commit(buffer, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
351} 357}
352 358
353void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..f572f44c6e1e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 407 * since individual threads might have already quit!
408 */ 408 */
409 rcu_read_lock(); 409 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
411 if (task) 411 if (task)
412 mm = get_task_mm(task); 412 mm = get_task_mm(task);
413 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 460 return ret;
461} 461}
462 462
463static int 463/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 472{
466 int hardirq, softirq; 473 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 474 int ret;
468 475
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 478
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 482 'X' : '.',
@@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
481 hardirq ? 'h' : softirq ? 's' : '.')) 486 hardirq ? 'h' : softirq ? 's' : '.'))
482 return 0; 487 return 0;
483 488
489 if (entry->lock_depth < 0)
490 ret = trace_seq_putc(s, '.');
491 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth);
493 if (!ret)
494 return 0;
495
484 if (entry->preempt_count) 496 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 497 return trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 498 return trace_seq_putc(s, '.');
499}
500
501static int
502lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
503{
504 char comm[TASK_COMM_LEN];
505
506 trace_find_cmdline(entry->pid, comm);
507
508 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
509 comm, entry->pid, cpu))
510 return 0;
511
512 return trace_print_lat_fmt(s, entry);
487} 513}
488 514
489static unsigned long preempt_mark_thresh = 100; 515static unsigned long preempt_mark_thresh = 100;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
42 struct trace_power *entry;
43 struct trace_array_cpu *data;
44 struct trace_array *tr = power_trace;
45
46 if (!trace_power_enabled)
47 return;
48
49 buffer = tr->buffer;
50
51 preempt_disable();
52 it->end = ktime_get();
53 data = tr->data[smp_processor_id()];
54
55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
56 sizeof(*entry), 0, 0);
57 if (!event)
58 goto out;
59 entry = ring_buffer_event_data(event);
60 entry->state_data = *it;
61 if (!filter_check_discard(call, entry, buffer, event))
62 trace_buffer_unlock_commit(buffer, event, 0, 0);
63 out:
64 preempt_enable();
65}
66
67static void probe_power_mark(struct power_trace *it, unsigned int type,
68 unsigned int level)
69{
70 struct ftrace_event_call *call = &event_power;
71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
73 struct trace_power *entry;
74 struct trace_array_cpu *data;
75 struct trace_array *tr = power_trace;
76
77 if (!trace_power_enabled)
78 return;
79
80 buffer = tr->buffer;
81
82 memset(it, 0, sizeof(struct power_trace));
83 it->state = level;
84 it->type = type;
85 it->stamp = ktime_get();
86 preempt_disable();
87 it->end = it->stamp;
88 data = tr->data[smp_processor_id()];
89
90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
91 sizeof(*entry), 0, 0);
92 if (!event)
93 goto out;
94 entry = ring_buffer_event_data(event);
95 entry->state_data = *it;
96 if (!filter_check_discard(call, entry, buffer, event))
97 trace_buffer_unlock_commit(buffer, event, 0, 0);
98 out:
99 preempt_enable();
100}
101
102static int tracing_power_register(void)
103{
104 int ret;
105
106 ret = register_trace_power_start(probe_power_start);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_start\n");
110 return ret;
111 }
112 ret = register_trace_power_end(probe_power_end);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_end\n");
116 goto fail_start;
117 }
118 ret = register_trace_power_mark(probe_power_mark);
119 if (ret) {
120 pr_info("power trace: Couldn't activate tracepoint"
121 " probe to trace_power_mark\n");
122 goto fail_end;
123 }
124 return ret;
125fail_end:
126 unregister_trace_power_end(probe_power_end);
127fail_start:
128 unregister_trace_power_start(probe_power_start);
129 return ret;
130}
131
132static void start_power_trace(struct trace_array *tr)
133{
134 trace_power_enabled = 1;
135}
136
137static void stop_power_trace(struct trace_array *tr)
138{
139 trace_power_enabled = 0;
140}
141
142static void power_trace_reset(struct trace_array *tr)
143{
144 trace_power_enabled = 0;
145 unregister_trace_power_start(probe_power_start);
146 unregister_trace_power_end(probe_power_end);
147 unregister_trace_power_mark(probe_power_mark);
148}
149
150
151static int power_trace_init(struct trace_array *tr)
152{
153 power_trace = tr;
154
155 trace_power_enabled = 1;
156 tracing_power_register();
157
158 tracing_reset_online_cpus(tr);
159 return 0;
160}
161
162static enum print_line_t power_print_line(struct trace_iterator *iter)
163{
164 int ret = 0;
165 struct trace_entry *entry = iter->ent;
166 struct trace_power *field ;
167 struct power_trace *it;
168 struct trace_seq *s = &iter->seq;
169 struct timespec stamp;
170 struct timespec duration;
171
172 trace_assign_type(field, entry);
173 it = &field->state_data;
174 stamp = ktime_to_timespec(it->stamp);
175 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
176
177 if (entry->type == TRACE_POWER) {
178 if (it->type == POWER_CSTATE)
179 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
180 stamp.tv_sec,
181 stamp.tv_nsec,
182 it->state, iter->cpu,
183 duration.tv_sec,
184 duration.tv_nsec);
185 if (it->type == POWER_PSTATE)
186 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
187 stamp.tv_sec,
188 stamp.tv_nsec,
189 it->state, iter->cpu);
190 if (!ret)
191 return TRACE_TYPE_PARTIAL_LINE;
192 return TRACE_TYPE_HANDLED;
193 }
194 return TRACE_TYPE_UNHANDLED;
195}
196
197static void power_print_header(struct seq_file *s)
198{
199 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
200 seq_puts(s, "# | | |\n");
201}
202
203static struct tracer power_tracer __read_mostly =
204{
205 .name = "power",
206 .init = power_trace_init,
207 .start = start_power_trace,
208 .stop = stop_power_trace,
209 .reset = power_trace_reset,
210 .print_line = power_print_line,
211 .print_header = power_print_header,
212};
213
214static int init_power_trace(void)
215{
216 return register_tracer(&power_tracer);
217}
218device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 687699d365ae..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c6..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
244 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
245 236
246 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
247 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
248 240
249 wakeup_task = p; 241 wakeup_task = p;
@@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
293 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
294 } 286 }
295 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
296 wakeup_reset(tr); 295 wakeup_reset(tr);
297 296
298 /* 297 /*
@@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
328} 328}
329 329
330static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index dfc55fed2099..1b050ab47120 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,7 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h> 4#include <linux/ftrace.h>
5#include <linux/perf_counter.h> 5#include <linux/perf_event.h>
6#include <asm/syscall.h> 6#include <asm/syscall.h>
7 7
8#include "trace_output.h" 8#include "trace_output.h"
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
384 384
385static void prof_syscall_enter(struct pt_regs *regs, long id) 385static void prof_syscall_enter(struct pt_regs *regs, long id)
386{ 386{
387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data; 387 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec;
389 unsigned long flags;
390 char *raw_data;
389 int syscall_nr; 391 int syscall_nr;
390 int size; 392 int size;
393 int cpu;
391 394
392 syscall_nr = syscall_get_nr(current, regs); 395 syscall_nr = syscall_get_nr(current, regs);
393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
402 size = ALIGN(size + sizeof(u32), sizeof(u64)); 405 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32); 406 size -= sizeof(u32);
404 407
405 do { 408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
406 char raw_data[size]; 409 "profile buffer not large enough"))
410 return;
411
412 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags);
407 414
408 /* zero the dead bytes from align to not leak stack to user */ 415 cpu = smp_processor_id();
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
410 424
411 rec = (struct syscall_trace_enter *) raw_data; 425 raw_data = per_cpu_ptr(raw_data, cpu);
412 tracing_generic_entry_update(&rec->ent, 0, 0); 426
413 rec->ent.type = sys_data->enter_id; 427 /* zero the dead bytes from align to not leak stack to user */
414 rec->nr = syscall_nr; 428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 429
416 (unsigned long *)&rec->args); 430 rec = (struct syscall_trace_enter *) raw_data;
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); 431 tracing_generic_entry_update(&rec->ent, 0, 0);
418 } while(0); 432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
437
438end:
439 local_irq_restore(flags);
419} 440}
420 441
421int reg_prof_syscall_enter(char *name) 442int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
460static void prof_syscall_exit(struct pt_regs *regs, long ret) 481static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{ 482{
462 struct syscall_metadata *sys_data; 483 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec; 484 struct syscall_trace_exit *rec;
485 unsigned long flags;
464 int syscall_nr; 486 int syscall_nr;
487 char *raw_data;
488 int size;
489 int cpu;
465 490
466 syscall_nr = syscall_get_nr(current, regs); 491 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
471 if (!sys_data) 496 if (!sys_data)
472 return; 497 return;
473 498
474 tracing_generic_entry_update(&rec.ent, 0, 0); 499 /* We can probably do that at build time */
475 rec.ent.type = sys_data->exit_id; 500 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
476 rec.nr = syscall_nr; 501 size -= sizeof(u32);
477 rec.ret = syscall_get_return_value(current, regs);
478 502
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); 503 /*
504 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime?
506 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
508 "exit event has grown above profile buffer size"))
509 return;
510
511 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags);
513 cpu = smp_processor_id();
514
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs);
534
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
536
537end:
538 local_irq_restore(flags);
480} 539}
481 540
482int reg_prof_syscall_exit(char *name) 541int reg_prof_syscall_exit(char *name)
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 9489a0a9b1be..cc89be5bc0f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
48 48
49/* 49/*
50 * Note about RCU : 50 * Note about RCU :
51 * It is used to to delay the free of multiple probes array until a quiescent 51 * It is used to delay the free of multiple probes array until a quiescent
52 * state is reached. 52 * state is reached.
53 * Tracepoint entries modifications are protected by the tracepoints_mutex. 53 * Tracepoint entries modifications are protected by the tracepoints_mutex.
54 */ 54 */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
600 * schedule_work - put work task in global workqueue 598 * schedule_work - put work task in global workqueue
601 * @work: job to be done 599 * @work: job to be done
602 * 600 *
603 * This puts a job in the kernel-global workqueue. 601 * Returns zero if @work was already on the kernel-global workqueue and
602 * non-zero otherwise.
603 *
604 * This puts a job in the kernel-global workqueue if it was not already
605 * queued and leaves it in the same position on the kernel-global
606 * workqueue otherwise.
604 */ 607 */
605int schedule_work(struct work_struct *work) 608int schedule_work(struct work_struct *work)
606{ 609{