path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/acct.c | 8
-rw-r--r--  kernel/cgroup.c | 1
-rw-r--r--  kernel/cpu.c | 15
-rw-r--r--  kernel/cred.c | 296
-rw-r--r--  kernel/delayacct.c | 1
-rw-r--r--  kernel/dma-coherent.c | 176
-rw-r--r--  kernel/exit.c | 5
-rw-r--r--  kernel/fork.c | 11
-rw-r--r--  kernel/futex.c | 47
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/hrtimer.c | 59
-rw-r--r--  kernel/irq/chip.c | 74
-rw-r--r--  kernel/irq/handle.c | 5
-rw-r--r--  kernel/irq/internals.h | 13
-rw-r--r--  kernel/irq/manage.c | 102
-rw-r--r--  kernel/irq/pm.c | 8
-rw-r--r--  kernel/irq/resend.c | 3
-rw-r--r--  kernel/irq/spurious.c | 1
-rw-r--r--  kernel/kmod.c | 9
-rw-r--r--  kernel/kprobes.c | 30
-rw-r--r--  kernel/kthread.c | 4
-rw-r--r--  kernel/lockdep.c | 792
-rw-r--r--  kernel/lockdep_internals.h | 2
-rw-r--r--  kernel/lockdep_proc.c | 128
-rw-r--r--  kernel/module.c | 17
-rw-r--r--  kernel/perf_counter.c | 183
-rw-r--r--  kernel/posix-timers.c | 35
-rw-r--r--  kernel/power/Kconfig | 14
-rw-r--r--  kernel/power/hibernate.c | 21
-rw-r--r--  kernel/power/main.c | 17
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/snapshot.c | 412
-rw-r--r--  kernel/printk.c | 181
-rw-r--r--  kernel/ptrace.c | 2
-rw-r--r--  kernel/rcuclassic.c | 807
-rw-r--r--  kernel/rcupdate.c | 44
-rw-r--r--  kernel/rcupreempt.c | 1539
-rw-r--r--  kernel/rcupreempt_trace.c | 334
-rw-r--r--  kernel/rcutorture.c | 202
-rw-r--r--  kernel/rcutree.c | 280
-rw-r--r--  kernel/rcutree.h | 253
-rw-r--r--  kernel/rcutree_plugin.h | 532
-rw-r--r--  kernel/rcutree_trace.c | 88
-rw-r--r--  kernel/sched.c | 1632
-rw-r--r--  kernel/sched_cpupri.c | 30
-rw-r--r--  kernel/sched_debug.c | 5
-rw-r--r--  kernel/sched_fair.c | 474
-rw-r--r--  kernel/sched_features.h | 122
-rw-r--r--  kernel/sched_idletask.c | 4
-rw-r--r--  kernel/sched_rt.c | 69
-rw-r--r--  kernel/smp.c | 40
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/spinlock.c | 230
-rw-r--r--  kernel/sysctl.c | 39
-rw-r--r--  kernel/taskstats.c | 10
-rw-r--r--  kernel/time.c | 9
-rw-r--r--  kernel/time/clocksource.c | 529
-rw-r--r--  kernel/time/jiffies.c | 6
-rw-r--r--  kernel/time/ntp.c | 7
-rw-r--r--  kernel/time/timekeeping.c | 535
-rw-r--r--  kernel/timer.c | 31
-rw-r--r--  kernel/trace/Kconfig | 41
-rw-r--r--  kernel/trace/blktrace.c | 12
-rw-r--r--  kernel/trace/ftrace.c | 259
-rw-r--r--  kernel/trace/kmemtrace.c | 149
-rw-r--r--  kernel/trace/ring_buffer.c | 1123
-rw-r--r--  kernel/trace/trace.c | 782
-rw-r--r--  kernel/trace/trace.h | 352
-rw-r--r--  kernel/trace/trace_boot.c | 20
-rw-r--r--  kernel/trace/trace_clock.c | 24
-rw-r--r--  kernel/trace/trace_entries.h | 383
-rw-r--r--  kernel/trace/trace_event_profile.c | 5
-rw-r--r--  kernel/trace/trace_event_types.h | 178
-rw-r--r--  kernel/trace/trace_events.c | 229
-rw-r--r--  kernel/trace/trace_events_filter.c | 302
-rw-r--r--  kernel/trace/trace_export.c | 290
-rw-r--r--  kernel/trace/trace_functions.c | 4
-rw-r--r--  kernel/trace/trace_functions_graph.c | 228
-rw-r--r--  kernel/trace/trace_irqsoff.c | 19
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 16
-rw-r--r--  kernel/trace/trace_output.c | 42
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_power.c | 22
-rw-r--r--  kernel/trace/trace_sched_switch.c | 59
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 59
-rw-r--r--  kernel/trace/trace_selftest.c | 1
-rw-r--r--  kernel/trace/trace_stack.c | 43
-rw-r--r--  kernel/trace/trace_stat.c | 17
-rw-r--r--  kernel/trace/trace_stat.h | 2
-rw-r--r--  kernel/trace/trace_syscalls.c | 471
-rw-r--r--  kernel/trace/trace_workqueue.c | 32
-rw-r--r--  kernel/tracepoint.c | 50
-rw-r--r--  kernel/workqueue.c | 9
94 files changed, 8712 insertions, 7053 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..3d9c7e27e3f9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,11 +80,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
-obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_TREE_RCU) += rcutree.o
-obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
-obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@ -92,7 +90,6 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
@@ -119,7 +116,7 @@ $(obj)/config_data.gz: .config FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG $@
-      cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
+      cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
 	$(call if_changed,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index 9f3391090b3e..9a4715a2f6bf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 	u64 run_time;
 	struct timespec uptime;
 	struct tty_struct *tty;
+	const struct cred *orig_cred;
+
+	/* Perform file operations on behalf of whoever enabled accounting */
+	orig_cred = override_creds(file->f_cred);
 
 	/*
 	 * First check to see if there is enough free_space to continue
 	 * the process accounting system.
 	 */
 	if (!check_free_space(acct, file))
-		return;
+		goto out;
 
 	/*
 	 * Fill the accounting struct with the needed info as recorded
@@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
 			sizeof(acct_t), &file->f_pos);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
 	set_fs(fs);
+out:
+	revert_creds(orig_cred);
 }
 
 /**
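For context, the do_acct_process() change above is the usual override_creds()/revert_creds() pairing. A minimal illustrative sketch of that pattern follows; the helper name and the elided file operation are hypothetical, only the two credential calls come from the API used by the patch.

#include <linux/cred.h>
#include <linux/fs.h>

/* Sketch only: perform file operations under the credentials of whoever
 * opened the file (file->f_cred), then restore the caller's own credentials. */
static void do_work_as_file_opener(struct file *file)
{
	const struct cred *orig_cred;

	orig_cred = override_creds(file->f_cred);	/* temporarily adopt file->f_cred */
	/* ... write the accounting record / do the privileged work here ... */
	revert_creds(orig_cred);			/* always restore the original creds */
}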
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..c7ece8f027f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
 static struct file_operations proc_cgroupstats_operations;
 
 static struct backing_dev_info cgroup_backing_dev_info = {
+	.name = "cgroup",
 	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
 			break;
 		}
 	}
+
 	if (!error) {
 		BUG_ON(num_online_cpus() > 1);
 		/* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
 	return error;
 }
 
+void __weak arch_enable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_enable_nonboot_cpus_end(void)
+{
+}
+
 void __ref enable_nonboot_cpus(void)
 {
 	int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
 		goto out;
 
 	printk("Enabling non-boot CPUs ...\n");
+
+	arch_enable_nonboot_cpus_begin();
+
 	for_each_cpu(cpu, frozen_cpus) {
 		error = _cpu_up(cpu, 1);
 		if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
 		}
 		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
 	}
+
+	arch_enable_nonboot_cpus_end();
+
 	cpumask_clear(frozen_cpus);
 out:
 	cpu_maps_update_done();
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..d7f7a01082eb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
 #include <linux/cn_proc.h>
 #include "cred-internals.h"
 
+#if 0
+#define kdebug(FMT, ...) \
+	printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#else
+static inline __attribute__((format(printf, 1, 2)))
+void no_printk(const char *fmt, ...)
+{
+}
+#define kdebug(FMT, ...) \
+	no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
+#endif
+
 static struct kmem_cache *cred_jar;
 
 /*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
 */
 struct cred init_cred = {
 	.usage = ATOMIC_INIT(4),
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	.subscribers = ATOMIC_INIT(2),
+	.magic = CRED_MAGIC,
+#endif
 	.securebits = SECUREBITS_DEFAULT,
 	.cap_inheritable = CAP_INIT_INH_SET,
 	.cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
 #endif
 };
 
+static inline void set_cred_subscribers(struct cred *cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	atomic_set(&cred->subscribers, n);
+#endif
+}
+
+static inline int read_cred_subscribers(const struct cred *cred)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	return atomic_read(&cred->subscribers);
+#else
+	return 0;
+#endif
+}
+
+static inline void alter_cred_subscribers(const struct cred *_cred, int n)
+{
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	struct cred *cred = (struct cred *) _cred;
+
+	atomic_add(n, &cred->subscribers);
+#endif
+}
+
 /*
  * Dispose of the shared task group credentials
  */
@@ -85,15 +126,29 @@ static void put_cred_rcu(struct rcu_head *rcu)
 {
 	struct cred *cred = container_of(rcu, struct cred, rcu);
 
+	kdebug("put_cred_rcu(%p)", cred);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	if (cred->magic != CRED_MAGIC_DEAD ||
+	    atomic_read(&cred->usage) != 0 ||
+	    read_cred_subscribers(cred) != 0)
+		panic("CRED: put_cred_rcu() sees %p with"
+		      " mag %x, put %p, usage %d, subscr %d\n",
+		      cred, cred->magic, cred->put_addr,
+		      atomic_read(&cred->usage),
+		      read_cred_subscribers(cred));
+#else
 	if (atomic_read(&cred->usage) != 0)
 		panic("CRED: put_cred_rcu() sees %p with usage %d\n",
 		      cred, atomic_read(&cred->usage));
+#endif
 
 	security_cred_free(cred);
 	key_put(cred->thread_keyring);
 	key_put(cred->request_key_auth);
 	release_tgcred(cred);
-	put_group_info(cred->group_info);
+	if (cred->group_info)
+		put_group_info(cred->group_info);
 	free_uid(cred->user);
 	kmem_cache_free(cred_jar, cred);
 }
@@ -106,12 +161,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
 */
 void __put_cred(struct cred *cred)
 {
+	kdebug("__put_cred(%p{%d,%d})", cred,
+	       atomic_read(&cred->usage),
+	       read_cred_subscribers(cred));
+
 	BUG_ON(atomic_read(&cred->usage) != 0);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(cred) != 0);
+	cred->magic = CRED_MAGIC_DEAD;
+	cred->put_addr = __builtin_return_address(0);
+#endif
+	BUG_ON(cred == current->cred);
+	BUG_ON(cred == current->real_cred);
 
 	call_rcu(&cred->rcu, put_cred_rcu);
 }
 EXPORT_SYMBOL(__put_cred);
 
+/*
+ * Clean up a task's credentials when it exits
+ */
+void exit_creds(struct task_struct *tsk)
+{
+	struct cred *cred;
+
+	kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
+	       atomic_read(&tsk->cred->usage),
+	       read_cred_subscribers(tsk->cred));
+
+	cred = (struct cred *) tsk->real_cred;
+	tsk->real_cred = NULL;
+	validate_creds(cred);
+	alter_cred_subscribers(cred, -1);
+	put_cred(cred);
+
+	cred = (struct cred *) tsk->cred;
+	tsk->cred = NULL;
+	validate_creds(cred);
+	alter_cred_subscribers(cred, -1);
+	put_cred(cred);
+
+	cred = (struct cred *) tsk->replacement_session_keyring;
+	if (cred) {
+		tsk->replacement_session_keyring = NULL;
+		validate_creds(cred);
+		put_cred(cred);
+	}
+}
+
+/*
+ * Allocate blank credentials, such that the credentials can be filled in at a
+ * later date without risk of ENOMEM.
+ */
+struct cred *cred_alloc_blank(void)
+{
+	struct cred *new;
+
+	new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+#ifdef CONFIG_KEYS
+	new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
+	if (!new->tgcred) {
+		kfree(new);
+		return NULL;
+	}
+	atomic_set(&new->tgcred->usage, 1);
+#endif
+
+	atomic_set(&new->usage, 1);
+
+	if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
+		goto error;
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	new->magic = CRED_MAGIC;
+#endif
+	return new;
+
+error:
+	abort_creds(new);
+	return NULL;
+}
+
 /**
  * prepare_creds - Prepare a new set of credentials for modification
  *
@@ -132,16 +265,19 @@ struct cred *prepare_creds(void)
 	const struct cred *old;
 	struct cred *new;
 
-	BUG_ON(atomic_read(&task->real_cred->usage) < 1);
+	validate_process_creds();
 
 	new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_creds() alloc %p", new);
+
 	old = task->cred;
 	memcpy(new, old, sizeof(struct cred));
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 
@@ -157,6 +293,7 @@ struct cred *prepare_creds(void)
 
 	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 		goto error;
+	validate_creds(new);
 	return new;
 
 error:
@@ -229,9 +366,12 @@ struct cred *prepare_usermodehelper_creds(void)
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_usermodehelper_creds() alloc %p", new);
+
 	memcpy(new, &init_cred, sizeof(struct cred));
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_group_info(new->group_info);
 	get_uid(new->user);
 
@@ -250,6 +390,7 @@ struct cred *prepare_usermodehelper_creds(void)
 #endif
 	if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
 		goto error;
+	validate_creds(new);
 
 	BUG_ON(atomic_read(&new->usage) != 1);
 	return new;
@@ -286,6 +427,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	    ) {
 		p->real_cred = get_cred(p->cred);
 		get_cred(p->cred);
+		alter_cred_subscribers(p->cred, 2);
+		kdebug("share_creds(%p{%d,%d})",
+		       p->cred, atomic_read(&p->cred->usage),
+		       read_cred_subscribers(p->cred));
 		atomic_inc(&p->cred->user->processes);
 		return 0;
 	}
@@ -331,6 +476,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 
 	atomic_inc(&new->user->processes);
 	p->cred = p->real_cred = get_cred(new);
+	alter_cred_subscribers(new, 2);
+	validate_creds(new);
 	return 0;
 
 error_put:
@@ -355,13 +502,20 @@ error_put:
 int commit_creds(struct cred *new)
 {
 	struct task_struct *task = current;
-	const struct cred *old;
+	const struct cred *old = task->real_cred;
 
-	BUG_ON(task->cred != task->real_cred);
-	BUG_ON(atomic_read(&task->real_cred->usage) < 2);
+	kdebug("commit_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+	BUG_ON(task->cred != old);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(old) < 2);
+	validate_creds(old);
+	validate_creds(new);
+#endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 
-	old = task->real_cred;
 	security_commit_creds(new, old);
 
 	get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +544,14 @@ int commit_creds(struct cred *new)
 	 * cheaply with the new uid cache, so if it matters
 	 * we should be checking for it. -DaveM
 	 */
+	alter_cred_subscribers(new, 2);
 	if (new->user != old->user)
 		atomic_inc(&new->user->processes);
 	rcu_assign_pointer(task->real_cred, new);
 	rcu_assign_pointer(task->cred, new);
 	if (new->user != old->user)
 		atomic_dec(&old->user->processes);
+	alter_cred_subscribers(old, -2);
 
 	sched_switch_user(task);
 
@@ -428,6 +584,13 @@ EXPORT_SYMBOL(commit_creds);
 */
 void abort_creds(struct cred *new)
 {
+	kdebug("abort_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	BUG_ON(read_cred_subscribers(new) != 0);
+#endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 	put_cred(new);
 }
@@ -444,7 +607,20 @@ const struct cred *override_creds(const struct cred *new)
 {
 	const struct cred *old = current->cred;
 
-	rcu_assign_pointer(current->cred, get_cred(new));
+	kdebug("override_creds(%p{%d,%d})", new,
+	       atomic_read(&new->usage),
+	       read_cred_subscribers(new));
+
+	validate_creds(old);
+	validate_creds(new);
+	get_cred(new);
+	alter_cred_subscribers(new, 1);
+	rcu_assign_pointer(current->cred, new);
+	alter_cred_subscribers(old, -1);
+
+	kdebug("override_creds() = %p{%d,%d}", old,
+	       atomic_read(&old->usage),
+	       read_cred_subscribers(old));
 	return old;
 }
 EXPORT_SYMBOL(override_creds);
@@ -460,7 +636,15 @@ void revert_creds(const struct cred *old)
 {
 	const struct cred *override = current->cred;
 
+	kdebug("revert_creds(%p{%d,%d})", old,
+	       atomic_read(&old->usage),
+	       read_cred_subscribers(old));
+
+	validate_creds(old);
+	validate_creds(override);
+	alter_cred_subscribers(old, 1);
 	rcu_assign_pointer(current->cred, old);
+	alter_cred_subscribers(override, -1);
 	put_cred(override);
 }
 EXPORT_SYMBOL(revert_creds);
@@ -502,11 +686,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (!new)
 		return NULL;
 
+	kdebug("prepare_kernel_cred() alloc %p", new);
+
 	if (daemon)
 		old = get_task_cred(daemon);
 	else
 		old = get_cred(&init_cred);
 
+	validate_creds(old);
+
 	*new = *old;
 	get_uid(new->user);
 	get_group_info(new->group_info);
@@ -526,7 +714,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 		goto error;
 
 	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	put_cred(old);
+	validate_creds(new);
 	return new;
 
 error:
@@ -589,3 +779,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
 	return security_kernel_create_files_as(new, inode);
 }
 EXPORT_SYMBOL(set_create_files_as);
+
+#ifdef CONFIG_DEBUG_CREDENTIALS
+
+/*
+ * dump invalid credentials
+ */
+static void dump_invalid_creds(const struct cred *cred, const char *label,
+			       const struct task_struct *tsk)
+{
+	printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
+	       label, cred,
+	       cred == &init_cred ? "[init]" : "",
+	       cred == tsk->real_cred ? "[real]" : "",
+	       cred == tsk->cred ? "[eff]" : "");
+	printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
+	       cred->magic, cred->put_addr);
+	printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
+	       atomic_read(&cred->usage),
+	       read_cred_subscribers(cred));
+	printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
+	       cred->uid, cred->euid, cred->suid, cred->fsuid);
+	printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
+	       cred->gid, cred->egid, cred->sgid, cred->fsgid);
+#ifdef CONFIG_SECURITY
+	printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
+	if ((unsigned long) cred->security >= PAGE_SIZE &&
+	    (((unsigned long) cred->security & 0xffffff00) !=
+	     (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
+		printk(KERN_ERR "CRED: ->security {%x, %x}\n",
+		       ((u32*)cred->security)[0],
+		       ((u32*)cred->security)[1]);
+#endif
+}
+
+/*
+ * report use of invalid credentials
+ */
+void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+{
+	printk(KERN_ERR "CRED: Invalid credentials\n");
+	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+	dump_invalid_creds(cred, "Specified", current);
+	BUG();
+}
+EXPORT_SYMBOL(__invalid_creds);
+
+/*
+ * check the credentials on a process
+ */
+void __validate_process_creds(struct task_struct *tsk,
+			      const char *file, unsigned line)
+{
+	if (tsk->cred == tsk->real_cred) {
+		if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
+			     creds_are_invalid(tsk->cred)))
+			goto invalid_creds;
+	} else {
+		if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
+			     read_cred_subscribers(tsk->cred) < 1 ||
+			     creds_are_invalid(tsk->real_cred) ||
+			     creds_are_invalid(tsk->cred)))
+			goto invalid_creds;
+	}
+	return;
+
+invalid_creds:
+	printk(KERN_ERR "CRED: Invalid process credentials\n");
+	printk(KERN_ERR "CRED: At %s:%u\n", file, line);
+
+	dump_invalid_creds(tsk->real_cred, "Real", tsk);
+	if (tsk->cred != tsk->real_cred)
+		dump_invalid_creds(tsk->cred, "Effective", tsk);
+	else
+		printk(KERN_ERR "CRED: Effective creds == Real creds\n");
+	BUG();
+}
+EXPORT_SYMBOL(__validate_process_creds);
+
+/*
+ * check creds for do_exit()
+ */
+void validate_creds_for_do_exit(struct task_struct *tsk)
+{
+	kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
+	       tsk->real_cred, tsk->cred,
+	       atomic_read(&tsk->cred->usage),
+	       read_cred_subscribers(tsk->cred));
+
+	__validate_process_creds(tsk, __FILE__, __LINE__);
+}
+
+#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/taskstats.h>
 #include <linux/time.h>
 #include <linux/sysctl.h>
 #include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Coherent per-device memory handling.
- * Borrowed from i386
- */
-#include <linux/kernel.h>
-#include <linux/dma-mapping.h>
-
-struct dma_coherent_mem {
-	void *virt_base;
-	u32 device_base;
-	int size;
-	int flags;
-	unsigned long *bitmap;
-};
-
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
-				dma_addr_t device_addr, size_t size, int flags)
-{
-	void __iomem *mem_base = NULL;
-	int pages = size >> PAGE_SHIFT;
-	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
-	if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
-		goto out;
-	if (!size)
-		goto out;
-	if (dev->dma_mem)
-		goto out;
-
-	/* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
-	mem_base = ioremap(bus_addr, size);
-	if (!mem_base)
-		goto out;
-
-	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
-	if (!dev->dma_mem)
-		goto out;
-	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-	if (!dev->dma_mem->bitmap)
-		goto free1_out;
-
-	dev->dma_mem->virt_base = mem_base;
-	dev->dma_mem->device_base = device_addr;
-	dev->dma_mem->size = pages;
-	dev->dma_mem->flags = flags;
-
-	if (flags & DMA_MEMORY_MAP)
-		return DMA_MEMORY_MAP;
-
-	return DMA_MEMORY_IO;
-
- free1_out:
-	kfree(dev->dma_mem);
- out:
-	if (mem_base)
-		iounmap(mem_base);
-	return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-
-	if (!mem)
-		return;
-	dev->dma_mem = NULL;
-	iounmap(mem->virt_base);
-	kfree(mem->bitmap);
-	kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
-					dma_addr_t device_addr, size_t size)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-	int pos, err;
-
-	size += device_addr & ~PAGE_MASK;
-
-	if (!mem)
-		return ERR_PTR(-EINVAL);
-
-	pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
-	err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
-	if (err != 0)
-		return ERR_PTR(err);
-	return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-/**
- * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
- *
- * @dev: device from which we allocate memory
- * @size: size of requested memory area
- * @dma_handle: This will be filled with the correct dma handle
- * @ret: This pointer will be filled with the virtual address
- * to allocated area.
- *
- * This function should be only called from per-arch dma_alloc_coherent()
- * to support allocation from per-device coherent memory pools.
- *
- * Returns 0 if dma_alloc_coherent should continue with allocating from
- * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
- */
-int dma_alloc_from_coherent(struct device *dev, ssize_t size,
-			    dma_addr_t *dma_handle, void **ret)
-{
-	struct dma_coherent_mem *mem;
-	int order = get_order(size);
-	int pageno;
-
-	if (!dev)
-		return 0;
-	mem = dev->dma_mem;
-	if (!mem)
-		return 0;
-
-	*ret = NULL;
-
-	if (unlikely(size > (mem->size << PAGE_SHIFT)))
-		goto err;
-
-	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
-	if (unlikely(pageno < 0))
-		goto err;
-
-	/*
-	 * Memory was found in the per-device area.
-	 */
-	*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
-	*ret = mem->virt_base + (pageno << PAGE_SHIFT);
-	memset(*ret, 0, size);
-
-	return 1;
-
-err:
-	/*
-	 * In the case where the allocation can not be satisfied from the
-	 * per-device area, try to fall back to generic memory if the
-	 * constraints allow it.
-	 */
-	return mem->flags & DMA_MEMORY_EXCLUSIVE;
-}
-EXPORT_SYMBOL(dma_alloc_from_coherent);
-
-/**
- * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
- * @dev: device from which the memory was allocated
- * @order: the order of pages allocated
- * @vaddr: virtual address of allocated pages
- *
- * This checks whether the memory was allocated from the per-device
- * coherent memory pool and if so, releases that memory.
- *
- * Returns 1 if we correctly released the memory, or 0 if
- * dma_release_coherent() should proceed with releasing memory from
- * generic pools.
- */
-int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
-{
-	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
-	if (mem && vaddr >= mem->virt_base && vaddr <
-	    (mem->virt_base + (mem->size << PAGE_SHIFT))) {
-		int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
-		bitmap_release_region(mem->bitmap, page, order);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(dma_release_from_coherent);
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733e..ae5d8660ddff 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -901,6 +901,8 @@ NORET_TYPE void do_exit(long code)
 
 	tracehook_report_exit(&code);
 
+	validate_creds_for_do_exit(tsk);
+
 	/*
 	 * We're taking recursive faults here in do_exit. Safest is to just
 	 * leave this task alone and wait for reboot.
@@ -1009,7 +1011,10 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
+	validate_creds_for_do_exit(tsk);
+
 	preempt_disable();
+	exit_rcu();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
 	schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index e6c04d462ab2..bfee931ee3fb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -152,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
-	put_cred(tsk->real_cred);
-	put_cred(tsk->cred);
+	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 
 	if (!profile_handoff_task(tsk))
@@ -1008,10 +1007,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-#ifdef CONFIG_PREEMPT_RCU
-	p->rcu_read_lock_nesting = 0;
-	p->rcu_flipctr_idx = 0;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
+	rcu_copy_process(p);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 
@@ -1297,8 +1293,7 @@ bad_fork_cleanup_put_domain:
 	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
-	put_cred(p->real_cred);
-	put_cred(p->cred);
+	exit_creds(p);
 bad_fork_free:
 	free_task(p);
 fork_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc7190..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
 	/* rt_waiter storage for requeue_pi: */
 	struct rt_mutex_waiter *rt_waiter;
 
+	/* The expected requeue pi target futex key: */
+	union futex_key *requeue_pi_key;
+
 	/* Bitset for the optional bitmasked wakeup */
 	u32 bitset;
 };
@@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	if (!top_waiter)
 		return 0;
 
+	/* Ensure we requeue to the expected futex. */
+	if (!match_futex(top_waiter->requeue_pi_key, key2))
+		return -EINVAL;
+
 	/*
 	 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
 	 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1283,12 @@ retry_private:
 			continue;
 		}
 
+		/* Ensure we requeue to the expected futex for requeue_pi. */
+		if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+			ret = -EINVAL;
+			break;
+		}
+
 		/*
 		 * Requeue nr_requeue waiters and possibly one more in the case
 		 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 	q.pi_state = NULL;
 	q.bitset = bitset;
 	q.rt_waiter = NULL;
+	q.requeue_pi_key = NULL;
 
 	if (abs_time) {
 		to = &timeout;
@@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 
 	q.pi_state = NULL;
 	q.rt_waiter = NULL;
+	q.requeue_pi_key = NULL;
 retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 * We call schedule in futex_wait_queue_me() when we enqueue and return there
 * via the following:
 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue and subsequent unlock
- * 3) signal (before or after requeue)
- * 4) timeout (before or after requeue)
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
 *
- * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ * If 3, cleanup and return -ERESTARTNOINTR.
 *
 * If 2, we may then block on trying to take the rt_mutex and return via:
 * 5) successful lock
@@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 * 7) timeout
 * 8) other lock acquisition failure
 *
- * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
 *
 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
 *
@@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	debug_rt_mutex_init_waiter(&rt_waiter);
 	rt_waiter.task = NULL;
 
-	q.pi_state = NULL;
-	q.bitset = bitset;
-	q.rt_waiter = &rt_waiter;
-
 	key2 = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
 	if (unlikely(ret != 0))
 		goto out;
 
+	q.pi_state = NULL;
+	q.bitset = bitset;
+	q.rt_waiter = &rt_waiter;
+	q.requeue_pi_key = &key2;
+
 	/* Prepare to wait on uaddr. */
 	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
 	if (ret)
@@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 			rt_mutex_unlock(pi_mutex);
 	} else if (ret == -EINTR) {
 		/*
-		 * We've already been requeued, but we have no way to
-		 * restart by calling futex_lock_pi() directly. We
-		 * could restart the syscall, but that will look at
-		 * the user space value and return right away. So we
-		 * drop back with EWOULDBLOCK to tell user space that
-		 * "val" has been changed. That's the same what the
-		 * restart of the syscall would do in
-		 * futex_wait_setup().
+		 * We've already been requeued, but cannot restart by calling
+		 * futex_lock_pi() directly. We could restart this syscall, but
+		 * it would detect that the user space "val" changed and return
+		 * -EWOULDBLOCK. Save the overhead of the restart and return
+		 * -EWOULDBLOCK directly.
 		 */
 		ret = -EWOULDBLOCK;
 	}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..654efd09f6a9 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
 config GCOV_PROFILE_ALL
 	bool "Profile entire Kernel"
 	depends on GCOV_KERNEL
-	depends on S390 || X86
+	depends on S390 || X86 || (PPC && EXPERIMENTAL)
 	default n
 	---help---
 	This options activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 49da79ab8486..c03f221fee44 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,37 +48,6 @@
 
 #include <asm/uaccess.h>
 
-/**
- * ktime_get - get the monotonic time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get(void)
-{
-	struct timespec now;
-
-	ktime_get_ts(&now);
-
-	return timespec_to_ktime(now);
-}
-EXPORT_SYMBOL_GPL(ktime_get);
-
-/**
- * ktime_get_real - get the real (wall-) time in ktime_t format
- *
- * returns the time in ktime_t format
- */
-ktime_t ktime_get_real(void)
-{
-	struct timespec now;
-
-	getnstimeofday(&now);
-
-	return timespec_to_ktime(now);
-}
-
-EXPORT_SYMBOL_GPL(ktime_get_real);
-
 /*
  * The timer bases:
 *
@@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 	}
 };
 
-/**
- * ktime_get_ts - get the monotonic clock in timespec format
- * @ts: pointer to timespec variable
- *
- * The function calculates the monotonic clock from the realtime
- * clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
- */
-void ktime_get_ts(struct timespec *ts)
-{
-	struct timespec tomono;
-	unsigned long seq;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		getnstimeofday(ts);
-		tomono = wall_to_monotonic;
-
-	} while (read_seqretry(&xtime_lock, seq));
-
-	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
-				ts->tv_nsec + tomono.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(ktime_get_ts);
-
 /*
  * Get the coarse grained time at the softirq based on xtime and
  * wall_to_monotonic.
@@ -485,6 +429,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
 	debug_object_init_on_stack(timer, &hrtimer_debug_descr);
 	__hrtimer_init(timer, clock_id, mode);
 }
+EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
 
 void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
@@ -1154,7 +1099,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 
 	timer->base = &cpu_base->clock_base[clock_id];
-	INIT_LIST_HEAD(&timer->cb_entry);
 	hrtimer_init_timer_hres(timer);
 
 #ifdef CONFIG_TIMER_STATS
@@ -1477,6 +1421,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 }
+EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
 
 static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
 {
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
 }
 EXPORT_SYMBOL(set_irq_chip_data);
 
+/**
+ * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
+ *
+ * @irq: Interrupt number
+ * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
+ *
+ * The IRQ_NESTED_THREAD flag indicates that on
+ * request_threaded_irq() no separate interrupt thread should be
+ * created for the irq as the handler are called nested in the
+ * context of a demultiplexing interrupt handler thread.
+ */
+void set_irq_nested_thread(unsigned int irq, int nest)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	if (!desc)
+		return;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	if (nest)
+		desc->status |= IRQ_NESTED_THREAD;
+	else
+		desc->status &= ~IRQ_NESTED_THREAD;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+EXPORT_SYMBOL_GPL(set_irq_nested_thread);
+
 /*
  * default enable function
  */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
 	}
 }
 
+/*
+ * handle_nested_irq - Handle a nested irq from a irq thread
+ * @irq: the interrupt number
+ *
+ * Handle interrupts which are nested into a threaded interrupt
+ * handler. The handler function is called inside the calling
+ * threads context.
+ */
+void handle_nested_irq(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irqaction *action;
+	irqreturn_t action_ret;
+
+	might_sleep();
+
+	spin_lock_irq(&desc->lock);
+
+	kstat_incr_irqs_this_cpu(irq, desc);
+
+	action = desc->action;
+	if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+		goto out_unlock;
+
+	desc->status |= IRQ_INPROGRESS;
+	spin_unlock_irq(&desc->lock);
+
+	action_ret = action->thread_fn(action->irq, action->dev_id);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret);
+
+	spin_lock_irq(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+
+out_unlock:
+	spin_unlock_irq(&desc->lock);
+}
+EXPORT_SYMBOL_GPL(handle_nested_irq);
+
 /**
  * handle_simple_irq - Simple and software-decoded IRQs.
  * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
 	spin_lock(&desc->lock);
 	desc->status &= ~IRQ_INPROGRESS;
-	if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
+
+	if (unlikely(desc->status & IRQ_ONESHOT))
+		desc->status |= IRQ_MASKED;
+	else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
 		desc->chip->unmask(irq);
 out_unlock:
 	spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		desc->chip = &dummy_irq_chip;
 	}
 
+	chip_bus_lock(irq, desc);
 	spin_lock_irqsave(&desc->lock, flags);
 
 	/* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 		desc->chip->startup(irq);
 	}
 	spin_unlock_irqrestore(&desc->lock, flags);
+	chip_bus_sync_unlock(irq, desc);
 }
 EXPORT_SYMBOL_GPL(__set_irq_handler);
 
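For context, a demultiplexing driver sitting on a slow bus could use the new nested-thread support roughly as sketched below. The chip structure and register-read helper are hypothetical; only set_irq_nested_thread() and handle_nested_irq() come from the patch above, and the child irqs are assumed to have been marked with set_irq_nested_thread(irq, 1) at setup time.

#include <linux/interrupt.h>
#include <linux/irq.h>

struct my_expander {			/* hypothetical driver state */
	unsigned int irq_base;		/* first child interrupt number */
	unsigned int nr_irqs;		/* number of child interrupts */
};

static unsigned long my_expander_read_status(struct my_expander *exp)
{
	return 0;	/* placeholder: would read the pending mask over I2C/SPI */
}

/* Threaded handler of the parent interrupt: read the pending mask over the
 * slow bus, then invoke each child handler nested in this thread. */
static irqreturn_t my_expander_demux(int irq, void *dev_id)
{
	struct my_expander *exp = dev_id;
	unsigned long pending = my_expander_read_status(exp);
	unsigned int i;

	for (i = 0; i < exp->nr_irqs; i++)
		if (pending & (1UL << i))
			handle_nested_irq(exp->irq_base + i);

	return IRQ_HANDLED;
}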
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
 
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
-	node = first_online_node;
+	node = first_online_node;
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
 	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
+#ifdef CONFIG_SMP
+		desc[i].node = node;
+#endif
 		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 		alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb9..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
 
+/* Inline functions for support of irq chips on slow busses */
+static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
+{
+	if (unlikely(desc->chip->bus_lock))
+		desc->chip->bus_lock(irq);
+}
+
+static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
+{
+	if (unlikely(desc->chip->bus_sync_unlock))
+		desc->chip->bus_sync_unlock(irq);
+}
+
 /*
  * Debugging printout:
 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0ec9ed831737..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
230 if (!desc) 230 if (!desc)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc);
233 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
234 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
235 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
236} 238}
237EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
238 240
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
294 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
295 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
296 * 298 *
297 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
298 */ 301 */
299void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
300{ 303{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
304 if (!desc) 307 if (!desc)
305 return; 308 return;
306 309
310 chip_bus_lock(irq, desc);
307 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
308 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
309 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
310} 315}
311EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
312 317
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 return ret; 441 return ret;
437} 442}
438 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
439static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
440{ 465{
441 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
451 return -1; 476 return -1;
452} 477}
453 478
479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler finished. unmask if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
454#ifdef CONFIG_SMP 496#ifdef CONFIG_SMP
455/* 497/*
456 * Check whether we need to change the affinity of the interrupt thread. 498 * Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
492 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
493 struct irqaction *action = data; 535 struct irqaction *action = data;
494 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
495 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
496 538
497 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
498 current->irqaction = action; 540 current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
518 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
519 561
520 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
521 } 566 }
522 567
523 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
565 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
566 const char *old_name = NULL; 611 const char *old_name = NULL;
567 unsigned long flags; 612 unsigned long flags;
568 int shared = 0; 613 int nested, shared = 0;
569 int ret; 614 int ret;
570 615
571 if (!desc) 616 if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
590 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
591 } 636 }
592 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided from
652 * the driver for non nested interrupt handling by the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
593 /* 658 /*
594 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
595 */ 662 */
596 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
597 struct task_struct *t; 664 struct task_struct *t;
598 665
599 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
662 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
663#endif 730#endif
664 731
665 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
666 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
667 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
668 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
669 desc->depth = 0; 739 desc->depth = 0;
670 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
875 */ 945 */
876void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
877{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
878 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
879} 956}
880EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
881 958
@@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
884 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
885 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
886 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
887 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
888 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
889 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
963 1042
964 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
965 return -EINVAL; 1044 return -EINVAL;
966 if (!handler) 1045
967 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
968 1051
969 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
970 if (!action) 1053 if (!action)
@@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
976 action->name = devname; 1059 action->name = devname;
977 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
978 1061
1062 chip_bus_lock(irq, desc);
979 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
980 if (retval) 1066 if (retval)
981 kfree(action); 1067 kfree(action);
982 1068
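
Note on the manage.c hunks above: they add two driver-visible behaviours. A NULL primary handler now falls back to irq_default_primary_handler() when a thread_fn is supplied, and IRQF_ONESHOT keeps the line masked until the handler thread has finished (and is rejected in combination with IRQF_SHARED). A minimal sketch of how a driver might use this; struct foo_device and the foo_* names are purely illustrative and not part of the patch (assumes <linux/interrupt.h>):

struct foo_device {
	int irq;
	unsigned long events;
};

static irqreturn_t foo_irq_thread(int irq, void *dev_id)
{
	struct foo_device *foo = dev_id;

	/* talk to the hardware here; this context may sleep */
	foo->events++;
	return IRQ_HANDLED;
}

static int foo_request_irq(struct foo_device *foo)
{
	/*
	 * handler == NULL: irq_default_primary_handler() is installed.
	 * IRQF_ONESHOT: the line stays masked until foo_irq_thread()
	 * returns, so a level-triggered source cannot re-fire in between.
	 * Adding IRQF_SHARED here would now make __setup_irq() fail
	 * with -EINVAL.
	 */
	return request_threaded_irq(foo->irq, NULL, foo_irq_thread,
				    IRQF_ONESHOT, "foo", foo);
}
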
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 385c31a1bdbf..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -37,6 +37,8 @@
37#include <linux/suspend.h> 37#include <linux/suspend.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39 39
40#include <trace/events/module.h>
41
40extern int max_threads; 42extern int max_threads;
41 43
42static struct workqueue_struct *khelper_wq; 44static struct workqueue_struct *khelper_wq;
@@ -78,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
78#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
79 static int kmod_loop_msg; 81 static int kmod_loop_msg;
80 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
81 va_start(args, fmt); 87 va_start(args, fmt);
82 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
83 va_end(args); 89 va_end(args);
@@ -108,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
108 return -ENOMEM; 114 return -ENOMEM;
109 } 115 }
110 116
117 trace_module_request(module_name, wait, _RET_IP_);
118
111 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper(modprobe_path, argv, envp,
112 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
113 atomic_dec(&kmod_concurrent); 121 atomic_dec(&kmod_concurrent);
@@ -462,6 +470,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
462 int retval = 0; 470 int retval = 0;
463 471
464 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
465 474
466 helper_lock(); 475 helper_lock();
467 if (sub_info->path[0] == '\0') 476 if (sub_info->path[0] == '\0')
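
With the kmod.c change above, every module auto-load request now passes through the LSM hook security_kernel_module_request() and fires the module_request tracepoint before modprobe is invoked. A hedged sketch of what a caller sees; example_load_fs() is an illustrative helper and "fs-%s" is only one common alias pattern (assumes <linux/kmod.h>):

/* Illustrative only: request_module() expands to __request_module(true, ...),
 * so a security module can now veto the load before usermode is called. */
static int example_load_fs(const char *fstype)
{
	int err;

	err = request_module("fs-%s", fstype);
	if (err)
		return err;	/* may now be an LSM veto, not just -ENOMEM */

	return 0;
}
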
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29ab..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 234
241 /* Ensure no-one is preempted on the garbage slots */ 235
242 if (check_safety()) 236 if (check_safety())
243 return -EAGAIN; 237 return -EAGAIN;
244 238
245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
246 int i; 240 int i;
247 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
248 continue; 242 continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
260void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
261{ 255{
262 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
263 struct hlist_node *pos;
264 257
265 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
266 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
267 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
268 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
269 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
270 if (dirty) { 263 if (dirty) {
271 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
272 kip->ngarbage++; 265 kip->ngarbage++;
273 } else { 266 } else
274 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
275 }
276 break; 268 break;
277 } 269 }
278 } 270 }
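
The kprobes hunks above replace the hlist of instruction pages with a plain list_head, which turns the "keep at least one page cached" logic into a simple list_is_singular() test and lets the garbage collector free pages while walking. The generic pattern, sketched with an illustrative struct rather than the real kprobe_insn_page (assumes <linux/list.h> and <linux/slab.h>):

struct demo_page {
	struct list_head list;
	int ngarbage;
};

static LIST_HEAD(demo_pages);

static void demo_collect(void)
{
	struct demo_page *p, *next;

	/* _safe variant because p may be freed inside the loop body */
	list_for_each_entry_safe(p, next, &demo_pages, list) {
		if (!p->ngarbage)
			continue;
		/* keep the last page around, mirroring collect_one_slot() */
		if (list_is_singular(&demo_pages))
			continue;
		list_del(&p->list);
		kfree(p);
	}
}
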
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 220 ignore_signals(tsk);
224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
227 223
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..f74d2d7aa605 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
371 * Some daft arches put -1 at the end to indicate it's a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 840}
899 841
900/* 842/*
843 * For efficient modulo arithmetic, we use a power of 2 for the queue size
844 */
845#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
846#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
847
848/*
849 * The circular_queue and helpers are used to implement the
850 * breadth-first search (BFS) algorithm, with which we can build
851 * the shortest path from the next lock to be acquired back to a
852 * previously held lock if there is a circular dependency between them.
853 */
854struct circular_queue {
855 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
856 unsigned int front, rear;
857};
858
859static struct circular_queue lock_cq;
860
861unsigned int max_bfs_queue_depth;
862
863static unsigned int lockdep_dependency_gen_id;
864
865static inline void __cq_init(struct circular_queue *cq)
866{
867 cq->front = cq->rear = 0;
868 lockdep_dependency_gen_id++;
869}
870
871static inline int __cq_empty(struct circular_queue *cq)
872{
873 return (cq->front == cq->rear);
874}
875
876static inline int __cq_full(struct circular_queue *cq)
877{
878 return ((cq->rear + 1) & CQ_MASK) == cq->front;
879}
880
881static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
882{
883 if (__cq_full(cq))
884 return -1;
885
886 cq->element[cq->rear] = elem;
887 cq->rear = (cq->rear + 1) & CQ_MASK;
888 return 0;
889}
890
891static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
892{
893 if (__cq_empty(cq))
894 return -1;
895
896 *elem = cq->element[cq->front];
897 cq->front = (cq->front + 1) & CQ_MASK;
898 return 0;
899}
900
901static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
902{
903 return (cq->rear - cq->front) & CQ_MASK;
904}
905
906static inline void mark_lock_accessed(struct lock_list *lock,
907 struct lock_list *parent)
908{
909 unsigned long nr;
910
911 nr = lock - list_entries;
912 WARN_ON(nr >= nr_list_entries);
913 lock->parent = parent;
914 lock->class->dep_gen_id = lockdep_dependency_gen_id;
915}
916
917static inline unsigned long lock_accessed(struct lock_list *lock)
918{
919 unsigned long nr;
920
921 nr = lock - list_entries;
922 WARN_ON(nr >= nr_list_entries);
923 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
924}
925
926static inline struct lock_list *get_lock_parent(struct lock_list *child)
927{
928 return child->parent;
929}
930
931static inline int get_lock_depth(struct lock_list *child)
932{
933 int depth = 0;
934 struct lock_list *parent;
935
936 while ((parent = get_lock_parent(child))) {
937 child = parent;
938 depth++;
939 }
940 return depth;
941}
942
943static int __bfs(struct lock_list *source_entry,
944 void *data,
945 int (*match)(struct lock_list *entry, void *data),
946 struct lock_list **target_entry,
947 int forward)
948{
949 struct lock_list *entry;
950 struct list_head *head;
951 struct circular_queue *cq = &lock_cq;
952 int ret = 1;
953
954 if (match(source_entry, data)) {
955 *target_entry = source_entry;
956 ret = 0;
957 goto exit;
958 }
959
960 if (forward)
961 head = &source_entry->class->locks_after;
962 else
963 head = &source_entry->class->locks_before;
964
965 if (list_empty(head))
966 goto exit;
967
968 __cq_init(cq);
969 __cq_enqueue(cq, (unsigned long)source_entry);
970
971 while (!__cq_empty(cq)) {
972 struct lock_list *lock;
973
974 __cq_dequeue(cq, (unsigned long *)&lock);
975
976 if (!lock->class) {
977 ret = -2;
978 goto exit;
979 }
980
981 if (forward)
982 head = &lock->class->locks_after;
983 else
984 head = &lock->class->locks_before;
985
986 list_for_each_entry(entry, head, entry) {
987 if (!lock_accessed(entry)) {
988 unsigned int cq_depth;
989 mark_lock_accessed(entry, lock);
990 if (match(entry, data)) {
991 *target_entry = entry;
992 ret = 0;
993 goto exit;
994 }
995
996 if (__cq_enqueue(cq, (unsigned long)entry)) {
997 ret = -1;
998 goto exit;
999 }
1000 cq_depth = __cq_get_elem_count(cq);
1001 if (max_bfs_queue_depth < cq_depth)
1002 max_bfs_queue_depth = cq_depth;
1003 }
1004 }
1005 }
1006exit:
1007 return ret;
1008}
1009
1010static inline int __bfs_forwards(struct lock_list *src_entry,
1011 void *data,
1012 int (*match)(struct lock_list *entry, void *data),
1013 struct lock_list **target_entry)
1014{
1015 return __bfs(src_entry, data, match, target_entry, 1);
1016
1017}
1018
1019static inline int __bfs_backwards(struct lock_list *src_entry,
1020 void *data,
1021 int (*match)(struct lock_list *entry, void *data),
1022 struct lock_list **target_entry)
1023{
1024 return __bfs(src_entry, data, match, target_entry, 0);
1025
1026}
1027
1028/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1029 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1030 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1031 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1032 */
909static struct held_lock *check_source, *check_target;
910 1033
911/* 1034/*
912 * Print a dependency chain entry (this is only done when a deadlock 1035 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1036 * has been detected):
914 */ 1037 */
915static noinline int 1038static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1039print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1040{
918 if (debug_locks_silent) 1041 if (debug_locks_silent)
919 return 0; 1042 return 0;
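
The circular queue added in the hunk above sizes itself to a power of two so that index wrapping and the element count reduce to mask operations. A standalone sketch of that arithmetic, using a small illustrative size in place of 4096 (DEMO_* stands in for MAX_CIRCULAR_QUEUE_SIZE/CQ_MASK and is not part of the patch):

#define DEMO_SIZE 8u
#define DEMO_MASK (DEMO_SIZE - 1)

static inline unsigned int demo_count(unsigned int front, unsigned int rear)
{
	return (rear - front) & DEMO_MASK;	/* correct across wraparound */
}

/*
 * Example: front = 6; three enqueues advance rear 6 -> 7 -> 0 -> 1
 * (each step is rear = (rear + 1) & DEMO_MASK), and
 * demo_count(6, 1) == (1 - 6) & 7 == 3, matching __cq_get_elem_count().
 */
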
@@ -930,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1053 * header first:
931 */ 1054 */
932static noinline int 1055static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1056print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1057 struct held_lock *check_src,
1058 struct held_lock *check_tgt)
934{ 1059{
935 struct task_struct *curr = current; 1060 struct task_struct *curr = current;
936 1061
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1062 if (debug_locks_silent)
938 return 0; 1063 return 0;
939 1064
940 printk("\n=======================================================\n"); 1065 printk("\n=======================================================\n");
@@ -943,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1068 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1069 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1070 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1071 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1072 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1073 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1074 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1075 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1076
@@ -954,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1079 return 0;
955} 1080}
956 1081
957static noinline int print_circular_bug_tail(void) 1082static inline int class_equal(struct lock_list *entry, void *data)
1083{
1084 return entry->class == data;
1085}
1086
1087static noinline int print_circular_bug(struct lock_list *this,
1088 struct lock_list *target,
1089 struct held_lock *check_src,
1090 struct held_lock *check_tgt)
958{ 1091{
959 struct task_struct *curr = current; 1092 struct task_struct *curr = current;
960 struct lock_list this; 1093 struct lock_list *parent;
1094 int depth;
961 1095
962 if (debug_locks_silent) 1096 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1097 return 0;
964 1098
965 this.class = hlock_class(check_source); 1099 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1100 return 0;
968 1101
969 print_circular_bug_entry(&this, 0); 1102 depth = get_lock_depth(target);
1103
1104 print_circular_bug_header(target, depth, check_src, check_tgt);
1105
1106 parent = get_lock_parent(target);
1107
1108 while (parent) {
1109 print_circular_bug_entry(parent, --depth);
1110 parent = get_lock_parent(parent);
1111 }
970 1112
971 printk("\nother info that might help us debug this:\n\n"); 1113 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1114 lockdep_print_held_locks(curr);
@@ -977,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1119 return 0;
978} 1120}
979 1121
980#define RECURSION_LIMIT 40 1122static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1123{
984 if (!debug_locks_off_graph_unlock()) 1124 if (!debug_locks_off_graph_unlock())
985 return 0; 1125 return 0;
986 1126
987 WARN_ON(1); 1127 WARN(1, "lockdep bfs error:%d\n", ret);
988 1128
989 return 0; 1129 return 0;
990} 1130}
991 1131
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1132static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1133{
995 struct lock_list *entry; 1134 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1135 return 0;
1136}
997 1137
998 if (lockdep_dependency_visit(class, depth)) 1138unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1139{
1140 unsigned long count = 0;
1141 struct lock_list *uninitialized_var(target_entry);
1000 1142
1001 /* 1143 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1144
1007 return ret; 1145 return count;
1008} 1146}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1147unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1148{
1012 unsigned long ret, flags; 1149 unsigned long ret, flags;
1150 struct lock_list this;
1151
1152 this.parent = NULL;
1153 this.class = class;
1013 1154
1014 local_irq_save(flags); 1155 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1156 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1157 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1158 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1159 local_irq_restore(flags);
1019 1160
1020 return ret; 1161 return ret;
1021} 1162}
1022 1163
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1164unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1165{
1026 struct lock_list *entry; 1166 unsigned long count = 0;
1027 unsigned long ret = 1; 1167 struct lock_list *uninitialized_var(target_entry);
1028 1168
1029 if (lockdep_dependency_visit(class, depth)) 1169 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1170
1037 return ret; 1171 return count;
1038} 1172}
1039 1173
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1174unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1175{
1042 unsigned long ret, flags; 1176 unsigned long ret, flags;
1177 struct lock_list this;
1178
1179 this.parent = NULL;
1180 this.class = class;
1043 1181
1044 local_irq_save(flags); 1182 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1183 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1184 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1185 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1186 local_irq_restore(flags);
1049 1187
@@ -1055,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1193 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1194 */
1057static noinline int 1195static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1196check_noncircular(struct lock_list *root, struct lock_class *target,
1197 struct lock_list **target_entry)
1059{ 1198{
1060 struct lock_list *entry; 1199 int result;
1061 1200
1062 if (lockdep_dependency_visit(source, depth)) 1201 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1202
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1203 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1204
1067 max_recursion_depth = depth; 1205 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1206}
1082 1207
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1208#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1211 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1212 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1213 */
1089static enum lock_usage_bit find_usage_bit; 1214
1090static struct lock_class *forwards_match, *backwards_match; 1215static inline int usage_match(struct lock_list *entry, void *bit)
1216{
1217 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1218}
1219
1220
1091 1221
1092/* 1222/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1223 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1224 * at @root->class that matches @bit.
1095 * 1225 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1226 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1227 * into *@target_entry.
1098 * 1228 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1229 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1230 * Return <0 on error.
1101 */ 1231 */
1102static noinline int 1232static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1233find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1234 struct lock_list **target_entry)
1104{ 1235{
1105 struct lock_list *entry; 1236 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1237
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1238 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1239
1122 /* 1240 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1241
1124 */ 1242 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1243}
1133 1244
1134/* 1245/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1246 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1247 * at @root->class that matches @bit.
1137 * 1248 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1249 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1250 * into *@target_entry.
1140 * 1251 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1252 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1253 * Return <0 on error.
1143 */ 1254 */
1144static noinline int 1255static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1256find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1257 struct lock_list **target_entry)
1146{ 1258{
1147 struct lock_list *entry; 1259 int result;
1148 int ret;
1149 1260
1150 if (lockdep_dependency_visit(source, depth)) 1261 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1262
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1263 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1264
1156 if (depth > max_recursion_depth) 1265 return result;
1157 max_recursion_depth = depth; 1266}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1267
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1268static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1269{
1163 backwards_match = source; 1270 int bit;
1164 return 2;
1165 }
1166 1271
1167 if (!source && debug_locks_off_graph_unlock()) { 1272 printk("%*s->", depth, "");
1168 WARN_ON(1); 1273 print_lock_name(class);
1169 return 0; 1274 printk(" ops: %lu", class->ops);
1170 } 1275 printk(" {\n");
1171 1276
1172 /* 1277 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1278 if (class->usage_mask & (1 << bit)) {
1174 */ 1279 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1280
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1281 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1282 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1283 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1284 }
1180 } 1285 }
1181 return 1; 1286 printk("%*s }\n", depth, "");
1287
1288 printk("%*s ... key at: ",depth,"");
1289 print_ip_sym((unsigned long)class->key);
1290}
1291
1292/*
1293 * printk the shortest lock dependencies from @start to @end in reverse order:
1294 */
1295static void __used
1296print_shortest_lock_dependencies(struct lock_list *leaf,
1297 struct lock_list *root)
1298{
1299 struct lock_list *entry = leaf;
1300 int depth;
1301
1302 /*compute depth from generated tree by BFS*/
1303 depth = get_lock_depth(leaf);
1304
1305 do {
1306 print_lock_class_header(entry->class, depth);
1307 printk("%*s ... acquired at:\n", depth, "");
1308 print_stack_trace(&entry->trace, 2);
1309 printk("\n");
1310
1311 if (depth == 0 && (entry != root)) {
1312 printk("lockdep:%s bad BFS generated tree\n", __func__);
1313 break;
1314 }
1315
1316 entry = get_lock_parent(entry);
1317 depth--;
1318 } while (entry && (depth >= 0));
1319
1320 return;
1182} 1321}
1183 1322
1184static int 1323static int
1185print_bad_irq_dependency(struct task_struct *curr, 1324print_bad_irq_dependency(struct task_struct *curr,
1325 struct lock_list *prev_root,
1326 struct lock_list *next_root,
1327 struct lock_list *backwards_entry,
1328 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1329 struct held_lock *prev,
1187 struct held_lock *next, 1330 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1331 enum lock_usage_bit bit1,
@@ -1215,26 +1358,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1358
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1359 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1360 irqclass);
1218 print_lock_name(backwards_match); 1361 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1362 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1363
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1364 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1365
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1366 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1367 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1368 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1369 printk("...");
1227 1370
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1371 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1372
1230 printk("\nother info that might help us debug this:\n\n"); 1373 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1374 lockdep_print_held_locks(curr);
1232 1375
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1376 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1377 printk(" and the holding lock:\n");
1378 if (!save_trace(&prev_root->trace))
1379 return 0;
1380 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1381
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1382 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1383 printk(" and %s-irq-unsafe lock:\n", irqclass);
1384 if (!save_trace(&next_root->trace))
1385 return 0;
1386 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1387
1239 printk("\nstack backtrace:\n"); 1388 printk("\nstack backtrace:\n");
1240 dump_stack(); 1389 dump_stack();
@@ -1248,19 +1397,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1397 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1398{
1250 int ret; 1399 int ret;
1400 struct lock_list this, that;
1401 struct lock_list *uninitialized_var(target_entry);
1402 struct lock_list *uninitialized_var(target_entry1);
1251 1403
1252 find_usage_bit = bit_backwards; 1404 this.parent = NULL;
1253 /* fills in <backwards_match> */ 1405
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1406 this.class = hlock_class(prev);
1255 if (!ret || ret == 1) 1407 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1408 if (ret < 0)
1409 return print_bfs_bug(ret);
1410 if (ret == 1)
1256 return ret; 1411 return ret;
1257 1412
1258 find_usage_bit = bit_forwards; 1413 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1414 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1415 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1416 if (ret < 0)
1417 return print_bfs_bug(ret);
1418 if (ret == 1)
1261 return ret; 1419 return ret;
1262 /* ret == 2 */ 1420
1263 return print_bad_irq_dependency(curr, prev, next, 1421 return print_bad_irq_dependency(curr, &this, &that,
1422 target_entry, target_entry1,
1423 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1424 bit_backwards, bit_forwards, irqclass);
1265} 1425}
1266 1426
@@ -1472,6 +1632,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1632{
1473 struct lock_list *entry; 1633 struct lock_list *entry;
1474 int ret; 1634 int ret;
1635 struct lock_list this;
1636 struct lock_list *uninitialized_var(target_entry);
1475 1637
1476 /* 1638 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1639 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1644,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1644 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1645 * keep the stackframe size of the recursive functions low:
1484 */ 1646 */
1485 check_source = next; 1647 this.class = hlock_class(next);
1486 check_target = prev; 1648 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1649 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1650 if (unlikely(!ret))
1651 return print_circular_bug(&this, target_entry, next, prev);
1652 else if (unlikely(ret < 0))
1653 return print_bfs_bug(ret);
1489 1654
1490 if (!check_prev_add_irq(curr, prev, next)) 1655 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1656 return 0;
@@ -1884,7 +2049,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2049 * print irq inversion bug:
1885 */ 2050 */
1886static int 2051static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2052print_irq_inversion_bug(struct task_struct *curr,
2053 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2054 struct held_lock *this, int forwards,
1889 const char *irqclass) 2055 const char *irqclass)
1890{ 2056{
@@ -1902,17 +2068,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2068 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2069 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2070 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2071 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2072 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2073
1908 printk("\nother info that might help us debug this:\n"); 2074 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2075 lockdep_print_held_locks(curr);
1910 2076
1911 printk("\nthe first lock's dependencies:\n"); 2077 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2078 if (!save_trace(&root->trace))
1913 2079 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2080 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2081
1917 printk("\nstack backtrace:\n"); 2082 printk("\nstack backtrace:\n");
1918 dump_stack(); 2083 dump_stack();
@@ -1929,14 +2094,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2094 enum lock_usage_bit bit, const char *irqclass)
1930{ 2095{
1931 int ret; 2096 int ret;
1932 2097 struct lock_list root;
1933 find_usage_bit = bit; 2098 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2099
1935 ret = find_usage_forwards(hlock_class(this), 0); 2100 root.parent = NULL;
1936 if (!ret || ret == 1) 2101 root.class = hlock_class(this);
2102 ret = find_usage_forwards(&root, bit, &target_entry);
2103 if (ret < 0)
2104 return print_bfs_bug(ret);
2105 if (ret == 1)
1937 return ret; 2106 return ret;
1938 2107
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2108 return print_irq_inversion_bug(curr, &root, target_entry,
2109 this, 1, irqclass);
1940} 2110}
1941 2111
1942/* 2112/*
@@ -1948,14 +2118,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2118 enum lock_usage_bit bit, const char *irqclass)
1949{ 2119{
1950 int ret; 2120 int ret;
1951 2121 struct lock_list root;
1952 find_usage_bit = bit; 2122 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2123
1954 ret = find_usage_backwards(hlock_class(this), 0); 2124 root.parent = NULL;
1955 if (!ret || ret == 1) 2125 root.class = hlock_class(this);
2126 ret = find_usage_backwards(&root, bit, &target_entry);
2127 if (ret < 0)
2128 return print_bfs_bug(ret);
2129 if (ret == 1)
1956 return ret; 2130 return ret;
1957 2131
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2132 return print_irq_inversion_bug(curr, &root, target_entry,
2133 this, 1, irqclass);
1959} 2134}
1960 2135
1961void print_irqtrace_events(struct task_struct *curr) 2136void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2705,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2705 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2706static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2707 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2708 struct lockdep_map *nest_lock, unsigned long ip,
2709 int references)
2534{ 2710{
2535 struct task_struct *curr = current; 2711 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2712 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2713 struct held_lock *hlock;
2538 unsigned int depth, id; 2714 unsigned int depth, id;
2539 int chain_head = 0; 2715 int chain_head = 0;
2716 int class_idx;
2540 u64 chain_key; 2717 u64 chain_key;
2541 2718
2542 if (!prove_locking) 2719 if (!prove_locking)
@@ -2584,10 +2761,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2761 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2762 return 0;
2586 2763
2764 class_idx = class - lock_classes + 1;
2765
2766 if (depth) {
2767 hlock = curr->held_locks + depth - 1;
2768 if (hlock->class_idx == class_idx && nest_lock) {
2769 if (hlock->references)
2770 hlock->references++;
2771 else
2772 hlock->references = 2;
2773
2774 return 1;
2775 }
2776 }
2777
2587 hlock = curr->held_locks + depth; 2778 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2779 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2780 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2781 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2782 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2783 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2784 hlock->nest_lock = nest_lock;
@@ -2595,6 +2786,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2786 hlock->read = read;
2596 hlock->check = check; 2787 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2788 hlock->hardirqs_off = !!hardirqs_off;
2789 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2790#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2791 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2792 hlock->holdtime_stamp = sched_clock();
@@ -2703,6 +2895,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2895 return 1;
2704} 2896}
2705 2897
2898static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2899{
2900 if (hlock->instance == lock)
2901 return 1;
2902
2903 if (hlock->references) {
2904 struct lock_class *class = lock->class_cache;
2905
2906 if (!class)
2907 class = look_up_lock_class(lock, 0);
2908
2909 if (DEBUG_LOCKS_WARN_ON(!class))
2910 return 0;
2911
2912 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2913 return 0;
2914
2915 if (hlock->class_idx == class - lock_classes + 1)
2916 return 1;
2917 }
2918
2919 return 0;
2920}
2921
2706static int 2922static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2923__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2924 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2942,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2942 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2943 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2944 break;
2729 if (hlock->instance == lock) 2945 if (match_held_lock(hlock, lock))
2730 goto found_it; 2946 goto found_it;
2731 prev_hlock = hlock; 2947 prev_hlock = hlock;
2732 } 2948 }
@@ -2745,7 +2961,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2961 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2962 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2963 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2964 hlock->nest_lock, hlock->acquire_ip,
2965 hlock->references))
2749 return 0; 2966 return 0;
2750 } 2967 }
2751 2968
@@ -2784,20 +3001,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3001 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3002 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3003 break;
2787 if (hlock->instance == lock) 3004 if (match_held_lock(hlock, lock))
2788 goto found_it; 3005 goto found_it;
2789 prev_hlock = hlock; 3006 prev_hlock = hlock;
2790 } 3007 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3008 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3009
2793found_it: 3010found_it:
2794 lock_release_holdtime(hlock); 3011 if (hlock->instance == lock)
3012 lock_release_holdtime(hlock);
3013
3014 if (hlock->references) {
3015 hlock->references--;
3016 if (hlock->references) {
3017 /*
3018 * We had, and after removing one, still have
3019 * references, the current lock stack is still
3020 * valid. We're done!
3021 */
3022 return 1;
3023 }
3024 }
2795 3025
2796 /* 3026 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3027 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3028 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3029 * entries (if any), recalculating the hash along the way:
2800 */ 3030 */
3031
2801 curr->lockdep_depth = i; 3032 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3033 curr->curr_chain_key = hlock->prev_chain_key;
2803 3034
@@ -2806,7 +3037,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3037 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3038 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3039 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3040 hlock->nest_lock, hlock->acquire_ip,
3041 hlock->references))
2810 return 0; 3042 return 0;
2811 } 3043 }
2812 3044
@@ -2836,7 +3068,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3068 /*
2837 * Is the unlock non-nested: 3069 * Is the unlock non-nested:
2838 */ 3070 */
2839 if (hlock->instance != lock) 3071 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3072 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3073 curr->lockdep_depth--;
2842 3074
@@ -2881,6 +3113,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3113 check_chain_key(curr);
2882} 3114}
2883 3115
3116static int __lock_is_held(struct lockdep_map *lock)
3117{
3118 struct task_struct *curr = current;
3119 int i;
3120
3121 for (i = 0; i < curr->lockdep_depth; i++) {
3122 struct held_lock *hlock = curr->held_locks + i;
3123
3124 if (match_held_lock(hlock, lock))
3125 return 1;
3126 }
3127
3128 return 0;
3129}
3130
2884/* 3131/*
2885 * Check whether we follow the irq-flags state precisely: 3132 * Check whether we follow the irq-flags state precisely:
2886 */ 3133 */
@@ -2957,7 +3204,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3204
2958 current->lockdep_recursion = 1; 3205 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3206 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3207 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3208 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3209 raw_local_irq_restore(flags);
2963} 3210}
@@ -2982,6 +3229,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3229}
2983EXPORT_SYMBOL_GPL(lock_release); 3230EXPORT_SYMBOL_GPL(lock_release);
2984 3231
3232int lock_is_held(struct lockdep_map *lock)
3233{
3234 unsigned long flags;
3235 int ret = 0;
3236
3237 if (unlikely(current->lockdep_recursion))
3238 return ret;
3239
3240 raw_local_irq_save(flags);
3241 check_flags(flags);
3242
3243 current->lockdep_recursion = 1;
3244 ret = __lock_is_held(lock);
3245 current->lockdep_recursion = 0;
3246 raw_local_irq_restore(flags);
3247
3248 return ret;
3249}
3250EXPORT_SYMBOL_GPL(lock_is_held);
3251
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3252void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3253{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3254 current->lockdep_reclaim_gfp = gfp_mask;
@@ -3041,7 +3308,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3308 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3309 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3310 break;
3044 if (hlock->instance == lock) 3311 if (match_held_lock(hlock, lock))
3045 goto found_it; 3312 goto found_it;
3046 prev_hlock = hlock; 3313 prev_hlock = hlock;
3047 } 3314 }
@@ -3049,6 +3316,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3316 return;
3050 3317
3051found_it: 3318found_it:
3319 if (hlock->instance != lock)
3320 return;
3321
3052 hlock->waittime_stamp = sched_clock(); 3322 hlock->waittime_stamp = sched_clock();
3053 3323
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3358,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3358 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3359 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3360 break;
3091 if (hlock->instance == lock) 3361 if (match_held_lock(hlock, lock))
3092 goto found_it; 3362 goto found_it;
3093 prev_hlock = hlock; 3363 prev_hlock = hlock;
3094 } 3364 }
@@ -3096,6 +3366,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3366 return;
3097 3367
3098found_it: 3368found_it:
3369 if (hlock->instance != lock)
3370 return;
3371
3099 cpu = smp_processor_id(); 3372 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3373 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3374 now = sched_clock();
@@ -3326,7 +3599,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3599 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3600 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3601 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3602 sizeof(struct list_head) * CHAINHASH_SIZE
3603#ifdef CONFIG_PROVE_LOCKING
3604 + sizeof(struct circular_queue)
3605#endif
3606 ) / 1024
3607 );
3330 3608
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3609 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3610 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e94caa666dba..d4b3dbc79fdb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
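The open-coded list walkers in l_start()/l_next() are replaced by the generic seq_list_start_head()/seq_list_next() helpers; the list head itself now plays the role of SEQ_START_TOKEN, which is why l_show() below compares against &all_lock_classes. A minimal sketch of the same idiom for a hypothetical list-backed seq_file:

    #include <linux/list.h>
    #include <linux/seq_file.h>

    struct my_item { struct list_head node; int val; };
    static LIST_HEAD(my_list);          /* hypothetical list, locked elsewhere */

    static void *my_start(struct seq_file *m, loff_t *pos)
    {
            return seq_list_start_head(&my_list, *pos);
    }

    static void *my_next(struct seq_file *m, void *v, loff_t *pos)
    {
            return seq_list_next(v, &my_list, pos);
    }

    static void my_stop(struct seq_file *m, void *v) { }

    static int my_show(struct seq_file *m, void *v)
    {
            if (v == &my_list) {        /* the head acts as the header token */
                    seq_puts(m, "items:\n");
                    return 0;
            }
            seq_printf(m, "%d\n", list_entry(v, struct my_item, node)->val);
            return 0;
    }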
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
diff --git a/kernel/module.c b/kernel/module.c
index 2d537186191f..05ce49ced8f6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -55,6 +55,11 @@
55#include <linux/percpu.h> 55#include <linux/percpu.h>
56#include <linux/kmemleak.h> 56#include <linux/kmemleak.h>
57 57
58#define CREATE_TRACE_POINTS
59#include <trace/events/module.h>
60
61EXPORT_TRACEPOINT_SYMBOL(module_get);
62
58#if 0 63#if 0
59#define DEBUGP printk 64#define DEBUGP printk
60#else 65#else
@@ -364,7 +369,7 @@ EXPORT_SYMBOL_GPL(find_module);
364 369
365#ifdef CONFIG_SMP 370#ifdef CONFIG_SMP
366 371
367#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 372#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
368 373
369static void *percpu_modalloc(unsigned long size, unsigned long align, 374static void *percpu_modalloc(unsigned long size, unsigned long align,
370 const char *name) 375 const char *name)
@@ -389,7 +394,7 @@ static void percpu_modfree(void *freeme)
389 free_percpu(freeme); 394 free_percpu(freeme);
390} 395}
391 396
392#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 397#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
393 398
394/* Number of blocks used and allocated. */ 399/* Number of blocks used and allocated. */
395static unsigned int pcpu_num_used, pcpu_num_allocated; 400static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +540,7 @@ static int percpu_modinit(void)
535} 540}
536__initcall(percpu_modinit); 541__initcall(percpu_modinit);
537 542
538#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 543#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
539 544
540static unsigned int find_pcpusec(Elf_Ehdr *hdr, 545static unsigned int find_pcpusec(Elf_Ehdr *hdr,
541 Elf_Shdr *sechdrs, 546 Elf_Shdr *sechdrs,
@@ -942,6 +947,8 @@ void module_put(struct module *module)
942 if (module) { 947 if (module) {
943 unsigned int cpu = get_cpu(); 948 unsigned int cpu = get_cpu();
944 local_dec(__module_ref_addr(module, cpu)); 949 local_dec(__module_ref_addr(module, cpu));
950 trace_module_put(module, _RET_IP_,
951 local_read(__module_ref_addr(module, cpu)));
945 /* Maybe they're waiting for us to drop reference? */ 952 /* Maybe they're waiting for us to drop reference? */
946 if (unlikely(!module_is_live(module))) 953 if (unlikely(!module_is_live(module)))
947 wake_up_process(module->waiter); 954 wake_up_process(module->waiter);
@@ -1497,6 +1504,8 @@ static int __unlink_module(void *_mod)
1497/* Free a module, remove from lists, etc (must hold module_mutex). */ 1504/* Free a module, remove from lists, etc (must hold module_mutex). */
1498static void free_module(struct module *mod) 1505static void free_module(struct module *mod)
1499{ 1506{
1507 trace_module_free(mod);
1508
1500 /* Delete from various lists */ 1509 /* Delete from various lists */
1501 stop_machine(__unlink_module, mod, NULL); 1510 stop_machine(__unlink_module, mod, NULL);
1502 remove_notes_attrs(mod); 1511 remove_notes_attrs(mod);
@@ -2364,6 +2373,8 @@ static noinline struct module *load_module(void __user *umod,
2364 /* Get rid of temporary copy */ 2373 /* Get rid of temporary copy */
2365 vfree(hdr); 2374 vfree(hdr);
2366 2375
2376 trace_module_load(mod);
2377
2367 /* Done! */ 2378 /* Done! */
2368 return mod; 2379 return mod;
2369 2380
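The module loader now instantiates its tracepoints by defining CREATE_TRACE_POINTS before including the event header, and fires trace_module_load(), trace_module_free() and trace_module_put() at the obvious points. The general pattern, sketched for a hypothetical event header (the TRACE_EVENT() definitions would live in include/trace/events/foo.h, which does not exist; it only illustrates the convention):

    /* exactly one .c file per event header defines the tracepoints */
    #define CREATE_TRACE_POINTS
    #include <trace/events/foo.h>       /* hypothetical header */

    void foo_do_work(struct foo *f)
    {
            trace_foo_work(f);          /* compiles to a no-op unless enabled */
            /* ... */
    }

Every other file that wants to call trace_foo_work() just includes the header without the CREATE_TRACE_POINTS define.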
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f274e1959885..8cb94a52d1bb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,11 +46,17 @@ static atomic_t nr_task_counters __read_mostly;
46 46
47/* 47/*
48 * perf counter paranoia level: 48 * perf counter paranoia level:
49 * 0 - not paranoid 49 * -1 - not paranoid at all
50 * 1 - disallow cpu counters to unpriv 50 * 0 - disallow raw tracepoint access for unpriv
51 * 2 - disallow kernel profiling to unpriv 51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
52 */ 53 */
53int sysctl_perf_counter_paranoid __read_mostly; 54int sysctl_perf_counter_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
54 60
55static inline bool perf_paranoid_cpu(void) 61static inline bool perf_paranoid_cpu(void)
56{ 62{
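With the default raised to 1 and a -1 "fully permissive" level added, the helpers follow a simple threshold pattern: the new perf_paranoid_tracepoint_raw() tests > -1, while the existing cpu/kernel helpers just below test progressively higher values. The checks gate unprivileged callers in the usual capability style, as in this fragment from the open path later in this diff:

    if (!attr.exclude_kernel) {
            if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                    return -EACCES;
    }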
@@ -100,16 +106,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,
100 106
101void __weak perf_counter_print_debug(void) { } 107void __weak perf_counter_print_debug(void) { }
102 108
103static DEFINE_PER_CPU(int, disable_count); 109static DEFINE_PER_CPU(int, perf_disable_count);
104 110
105void __perf_disable(void) 111void __perf_disable(void)
106{ 112{
107 __get_cpu_var(disable_count)++; 113 __get_cpu_var(perf_disable_count)++;
108} 114}
109 115
110bool __perf_enable(void) 116bool __perf_enable(void)
111{ 117{
112 return !--__get_cpu_var(disable_count); 118 return !--__get_cpu_var(perf_disable_count);
113} 119}
114 120
115void perf_disable(void) 121void perf_disable(void)
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
469 struct perf_counter_context *ctx = counter->ctx; 475 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end; 476 u64 run_end;
471 477
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE) 478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
473 return; 480 return;
474 481
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled; 482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
518 */ 525 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { 526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx); 527 update_context_time(ctx);
521 update_counter_times(counter); 528 update_group_times(counter);
522 if (counter == counter->group_leader) 529 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx); 530 group_sched_out(counter, cpuctx, ctx);
524 else 531 else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
573 * in, so we can change the state safely. 580 * in, so we can change the state safely.
574 */ 581 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter); 583 update_group_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF; 584 counter->state = PERF_COUNTER_STATE_OFF;
578 } 585 }
579 586
@@ -851,6 +858,27 @@ retry:
851} 858}
852 859
853/* 860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
854 * Cross CPU call to enable a performance counter 882 * Cross CPU call to enable a performance counter
855 */ 883 */
856static void __perf_counter_enable(void *info) 884static void __perf_counter_enable(void *info)
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
877 905
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock; 907 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE; 908 __perf_counter_mark_enabled(counter, ctx);
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882 909
883 /* 910 /*
884 * If the counter is in a group and isn't the group leader, 911 * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
971 * Since we have the lock this context can't be scheduled 998 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely. 999 * in, so we can change the state safely.
973 */ 1000 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) { 1001 if (counter->state == PERF_COUNTER_STATE_OFF)
975 counter->state = PERF_COUNTER_STATE_INACTIVE; 1002 __perf_counter_mark_enabled(counter, ctx);
976 counter->tstamp_enabled = 1003
977 ctx->time - counter->total_time_enabled;
978 }
979 out: 1004 out:
980 spin_unlock_irq(&ctx->lock); 1005 spin_unlock_irq(&ctx->lock);
981} 1006}
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
1479 counter->attr.enable_on_exec = 0; 1504 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue; 1506 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE; 1507 __perf_counter_mark_enabled(counter, ctx);
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1; 1508 enabled = 1;
1486 } 1509 }
1487 1510
@@ -1675,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
1675 atomic_dec(&nr_task_counters); 1698 atomic_dec(&nr_task_counters);
1676 } 1699 }
1677 1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1705
1678 if (counter->destroy) 1706 if (counter->destroy)
1679 counter->destroy(counter); 1707 counter->destroy(counter);
1680 1708
@@ -1960,6 +1988,8 @@ unlock:
1960 return ret; 1988 return ret;
1961} 1989}
1962 1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{ 1994{
1965 struct perf_counter *counter = file->private_data; 1995 struct perf_counter *counter = file->private_data;
@@ -1983,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1983 case PERF_COUNTER_IOC_PERIOD: 2013 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg); 2014 return perf_counter_period(counter, (u64 __user *)arg);
1985 2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
1986 default: 2019 default:
1987 return -ENOTTY; 2020 return -ENOTTY;
1988 } 2021 }
@@ -2253,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2253 2286
2254 WARN_ON_ONCE(counter->ctx->parent_ctx); 2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex); 2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2256 if (atomic_inc_not_zero(&counter->mmap_count)) { 2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages) 2295 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL; 2296 ret = -EINVAL;
@@ -2638,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size, 2676 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample) 2677 int nmi, int sample)
2640{ 2678{
2679 struct perf_counter *output_counter;
2641 struct perf_mmap_data *data; 2680 struct perf_mmap_data *data;
2642 unsigned int offset, head; 2681 unsigned int offset, head;
2643 int have_lost; 2682 int have_lost;
@@ -2647,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
2647 u64 lost; 2686 u64 lost;
2648 } lost_event; 2687 } lost_event;
2649 2688
2689 rcu_read_lock();
2650 /* 2690 /*
2651 * For inherited counters we send all the output towards the parent. 2691 * For inherited counters we send all the output towards the parent.
2652 */ 2692 */
2653 if (counter->parent) 2693 if (counter->parent)
2654 counter = counter->parent; 2694 counter = counter->parent;
2655 2695
2656 rcu_read_lock(); 2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2657 data = rcu_dereference(counter->data); 2700 data = rcu_dereference(counter->data);
2658 if (!data) 2701 if (!data)
2659 goto out; 2702 goto out;
@@ -3934,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3934 * have these. 3977 * have these.
3935 */ 3978 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && 3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3937 !capable(CAP_SYS_ADMIN)) 3981 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM); 3982 return ERR_PTR(-EPERM);
3939 3983
@@ -4066,6 +4110,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
4066 hwc->sample_period = attr->sample_period; 4110 hwc->sample_period = attr->sample_period;
4067 if (attr->freq && attr->sample_freq) 4111 if (attr->freq && attr->sample_freq)
4068 hwc->sample_period = 1; 4112 hwc->sample_period = 1;
4113 hwc->last_period = hwc->sample_period;
4069 4114
4070 atomic64_set(&hwc->period_left, hwc->sample_period); 4115 atomic64_set(&hwc->period_left, hwc->sample_period);
4071 4116
@@ -4170,6 +4215,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
4170 if (val) 4215 if (val)
4171 goto err_size; 4216 goto err_size;
4172 } 4217 }
4218 size = sizeof(*attr);
4173 } 4219 }
4174 4220
4175 ret = copy_from_user(attr, uattr, size); 4221 ret = copy_from_user(attr, uattr, size);
@@ -4201,6 +4247,57 @@ err_size:
4201 goto out; 4247 goto out;
4202} 4248}
4203 4249
4250int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4251{
4252 struct perf_counter *output_counter = NULL;
4253 struct file *output_file = NULL;
4254 struct perf_counter *old_output;
4255 int fput_needed = 0;
4256 int ret = -EINVAL;
4257
4258 if (!output_fd)
4259 goto set;
4260
4261 output_file = fget_light(output_fd, &fput_needed);
4262 if (!output_file)
4263 return -EBADF;
4264
4265 if (output_file->f_op != &perf_fops)
4266 goto out;
4267
4268 output_counter = output_file->private_data;
4269
4270 /* Don't chain output fds */
4271 if (output_counter->output)
4272 goto out;
4273
4274 /* Don't set an output fd when we already have an output channel */
4275 if (counter->data)
4276 goto out;
4277
4278 atomic_long_inc(&output_file->f_count);
4279
4280set:
4281 mutex_lock(&counter->mmap_mutex);
4282 old_output = counter->output;
4283 rcu_assign_pointer(counter->output, output_counter);
4284 mutex_unlock(&counter->mmap_mutex);
4285
4286 if (old_output) {
4287 /*
4288 * we need to make sure no existing perf_output_*()
4289 * is still referencing this counter.
4290 */
4291 synchronize_rcu();
4292 fput(old_output->filp);
4293 }
4294
4295 ret = 0;
4296out:
4297 fput_light(output_file, fput_needed);
4298 return ret;
4299}
4300
4204/** 4301/**
4205 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu 4302 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4206 * 4303 *
@@ -4220,15 +4317,15 @@ SYSCALL_DEFINE5(perf_counter_open,
4220 struct file *group_file = NULL; 4317 struct file *group_file = NULL;
4221 int fput_needed = 0; 4318 int fput_needed = 0;
4222 int fput_needed2 = 0; 4319 int fput_needed2 = 0;
4223 int ret; 4320 int err;
4224 4321
4225 /* for future expandability... */ 4322 /* for future expandability... */
4226 if (flags) 4323 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4227 return -EINVAL; 4324 return -EINVAL;
4228 4325
4229 ret = perf_copy_attr(attr_uptr, &attr); 4326 err = perf_copy_attr(attr_uptr, &attr);
4230 if (ret) 4327 if (err)
4231 return ret; 4328 return err;
4232 4329
4233 if (!attr.exclude_kernel) { 4330 if (!attr.exclude_kernel) {
4234 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 4331 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4251,8 +4348,8 @@ SYSCALL_DEFINE5(perf_counter_open,
4251 * Look up the group leader (we will attach this counter to it): 4348 * Look up the group leader (we will attach this counter to it):
4252 */ 4349 */
4253 group_leader = NULL; 4350 group_leader = NULL;
4254 if (group_fd != -1) { 4351 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4255 ret = -EINVAL; 4352 err = -EINVAL;
4256 group_file = fget_light(group_fd, &fput_needed); 4353 group_file = fget_light(group_fd, &fput_needed);
4257 if (!group_file) 4354 if (!group_file)
4258 goto err_put_context; 4355 goto err_put_context;
@@ -4281,18 +4378,24 @@ SYSCALL_DEFINE5(perf_counter_open,
4281 4378
4282 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4379 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4283 NULL, GFP_KERNEL); 4380 NULL, GFP_KERNEL);
4284 ret = PTR_ERR(counter); 4381 err = PTR_ERR(counter);
4285 if (IS_ERR(counter)) 4382 if (IS_ERR(counter))
4286 goto err_put_context; 4383 goto err_put_context;
4287 4384
4288 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); 4385 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4289 if (ret < 0) 4386 if (err < 0)
4290 goto err_free_put_context; 4387 goto err_free_put_context;
4291 4388
4292 counter_file = fget_light(ret, &fput_needed2); 4389 counter_file = fget_light(err, &fput_needed2);
4293 if (!counter_file) 4390 if (!counter_file)
4294 goto err_free_put_context; 4391 goto err_free_put_context;
4295 4392
4393 if (flags & PERF_FLAG_FD_OUTPUT) {
4394 err = perf_counter_set_output(counter, group_fd);
4395 if (err)
4396 goto err_fput_free_put_context;
4397 }
4398
4296 counter->filp = counter_file; 4399 counter->filp = counter_file;
4297 WARN_ON_ONCE(ctx->parent_ctx); 4400 WARN_ON_ONCE(ctx->parent_ctx);
4298 mutex_lock(&ctx->mutex); 4401 mutex_lock(&ctx->mutex);
@@ -4306,20 +4409,20 @@ SYSCALL_DEFINE5(perf_counter_open,
4306 list_add_tail(&counter->owner_entry, &current->perf_counter_list); 4409 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4307 mutex_unlock(&current->perf_counter_mutex); 4410 mutex_unlock(&current->perf_counter_mutex);
4308 4411
4412err_fput_free_put_context:
4309 fput_light(counter_file, fput_needed2); 4413 fput_light(counter_file, fput_needed2);
4310 4414
4311out_fput:
4312 fput_light(group_file, fput_needed);
4313
4314 return ret;
4315
4316err_free_put_context: 4415err_free_put_context:
4317 kfree(counter); 4416 if (err < 0)
4417 kfree(counter);
4318 4418
4319err_put_context: 4419err_put_context:
4320 put_ctx(ctx); 4420 if (err < 0)
4421 put_ctx(ctx);
4422
4423 fput_light(group_file, fput_needed);
4321 4424
4322 goto out_fput; 4425 return err;
4323} 4426}
4324 4427
4325/* 4428/*
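Taken together, perf_counter_set_output(), the PERF_COUNTER_IOC_SET_OUTPUT ioctl and the PERF_FLAG_FD_OUTPUT open flag let one counter redirect its samples into another counter's mmap buffer, so a single ring buffer can serve several events. A hedged userspace sketch (error handling omitted; sys_perf_counter_open() stands for a thin syscall() wrapper and the attr structures are assumed to be set up elsewhere):

    /* the leader owns the mmap buffer: 1 control page + 8 data pages */
    int leader = sys_perf_counter_open(&attr_leader, pid, cpu, -1, 0);
    void *buf  = mmap(NULL, (1 + 8) * page_size, PROT_READ | PROT_WRITE,
                      MAP_SHARED, leader, 0);

    /* second counter keeps no buffer of its own */
    int other  = sys_perf_counter_open(&attr_other, pid, cpu, -1, 0);
    ioctl(other, PERF_COUNTER_IOC_SET_OUTPUT, leader);

Equivalently, per the flag handling added to sys_perf_counter_open() above, the second counter can be opened with group_fd pointing at the leader and PERF_FLAG_FD_OUTPUT in flags, setting the output at creation time.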
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index d089d052c4a9..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -242,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
242 return 0; 242 return 0;
243} 243}
244 244
245
246static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
247{
248 *tp = current_kernel_time();
249 return 0;
250}
251
252static int posix_get_monotonic_coarse(clockid_t which_clock,
253 struct timespec *tp)
254{
255 *tp = get_monotonic_coarse();
256 return 0;
257}
258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0;
263}
245/* 264/*
246 * Initialize everything, well, just everything in Posix clocks/timers ;) 265 * Initialize everything, well, just everything in Posix clocks/timers ;)
247 */ 266 */
@@ -262,10 +281,26 @@ static __init int init_posix_timers(void)
262 .timer_create = no_timer_create, 281 .timer_create = no_timer_create,
263 .nsleep = no_nsleep, 282 .nsleep = no_nsleep,
264 }; 283 };
284 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 };
291 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime,
295 .timer_create = no_timer_create,
296 .nsleep = no_nsleep,
297 };
265 298
266 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 299 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
267 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
268 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
269 304
270 posix_timers_cache = kmem_cache_create("posix_timers_cache", 305 posix_timers_cache = kmem_cache_create("posix_timers_cache",
271 sizeof (struct k_itimer), 0, SLAB_PANIC, 306 sizeof (struct k_itimer), 0, SLAB_PANIC,
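The two new coarse clock IDs return the tick-granular kernel time without touching the clocksource, which is much cheaper than CLOCK_REALTIME/CLOCK_MONOTONIC at the cost of resolution (clock_getres() reports KTIME_LOW_RES, roughly one tick). A minimal userspace sketch, assuming the libc headers expose the new constants (otherwise the numeric IDs from <linux/time.h> can be used):

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts, res;

            clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
            clock_getres(CLOCK_MONOTONIC_COARSE, &res);
            printf("coarse time %ld.%09ld, resolution %ld ns\n",
                   (long)ts.tv_sec, ts.tv_nsec, res.tv_nsec);
            return 0;
    }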
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04b3a83d686f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
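pm_wq is created as a freezable workqueue early in boot so that runtime-PM work items are held back while tasks are frozen during a system sleep transition. A minimal sketch of queueing work on it, assuming pm_wq is visible to the caller and with a hypothetical work item and handler:

    static void my_autosuspend_fn(struct work_struct *work)
    {
            /* put the idle device into a low-power state */
    }

    static DECLARE_WORK(my_autosuspend_work, my_autosuspend_fn);

    /* e.g. from an inactivity timer: */
    queue_work(pm_wq, &my_autosuspend_work);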
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..97955b0e44f4 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid attempting to free too much memory too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
 1219 * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
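The sizing arithmetic in hibernate_preallocate_memory() is easier to follow as two numbers. With count usable page frames and size metadata pages,

    max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES

is the largest number of saveable pages allowed to stay in memory, so the minimum preallocation is

    count - max_size = (count + size + PAGES_FOR_IO) / 2 + 2 * SPARE_PAGES

(up to integer rounding), which matches the "([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES" formula quoted in the function comment.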
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
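As an illustration only (not part of this diff): after this change swsusp_alloc() merely tops up the shortfall beyond what was preallocated earlier into alloc_normal/alloc_highmem, rather than allocating every image page itself. A simplified userspace sketch of that "allocate only the difference" pattern; top_up(), MAX_PAGES and the counters are hypothetical stand-ins:

#include <stdio.h>
#include <stdlib.h>

#define MAX_PAGES 256

static void *pages[MAX_PAGES];
static unsigned int nr_allocated;
static unsigned int alloc_normal = 100;	/* pages grabbed by earlier preallocation */

/* Allocate only the pages still missing; 0 on success, -1 on failure. */
static int top_up(unsigned int nr_pages)
{
	if (nr_pages <= alloc_normal)
		return 0;	/* the preallocated pool already covers the request */
	nr_pages -= alloc_normal;
	while (nr_pages-- > 0) {
		void *p;

		if (nr_allocated >= MAX_PAGES)
			return -1;	/* the real code would call swsusp_free() here */
		p = malloc(4096);	/* stand-in for alloc_image_page() */
		if (!p)
			return -1;
		pages[nr_allocated++] = p;	/* the real code marks the pfn in copy_bm */
	}
	return 0;
}

int main(void)
{
	int ret = top_up(150);

	printf("top_up: %d (allocated %u extra pages)\n", ret, nr_allocated);
	while (nr_allocated)
		free(pages[--nr_allocated]);
	return ret;
}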
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..602033acd6c7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
40 * for_each_console() allows you to iterate on each console
41 */
42#define for_each_console(con) \
43 for (con = console_drivers; con != NULL; con = con->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
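As an illustration only (not part of this diff): for_each_console() is just a walk over the singly linked console_drivers list. A standalone sketch of the same pattern, with a stripped-down stand-in for the kernel's struct console:

#include <stdio.h>

#define CON_ENABLED 1

struct console {
	const char *name;
	int flags;
	struct console *next;
};

static struct console *console_drivers;	/* head of the list */

#define for_each_console(con) \
	for (con = console_drivers; con != NULL; con = con->next)

int main(void)
{
	struct console serial = { "ttyS0", CON_ENABLED, NULL };
	struct console vt     = { "tty0",  CON_ENABLED, &serial };
	struct console *con;

	console_drivers = &vt;
	for_each_console(con)
		if (con->flags & CON_ENABLED)
			printf("enabled console: %s\n", con->name);
	return 0;
}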
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -372,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 380 logged_chars = 0;
373 break; 381 break;
374 case 6: /* Disable logging to console */ 382 case 6: /* Disable logging to console */
383 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 385 console_loglevel = minimum_console_loglevel;
376 break; 386 break;
377 case 7: /* Enable logging to console */ 387 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 388 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1;
391 }
379 break; 392 break;
380 case 8: /* Set level of messages printed to console */ 393 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 394 error = -EINVAL;
@@ -384,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 397 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 398 len = minimum_console_loglevel;
386 console_loglevel = len; 399 console_loglevel = len;
400 /* Implicitly re-enable logging to console */
401 saved_console_loglevel = -1;
387 error = 0; 402 error = 0;
388 break; 403 break;
389 case 9: /* Number of chars in the log buffer */ 404 case 9: /* Number of chars in the log buffer */
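As an illustration only (not part of this diff): the -1 sentinel makes the disable/enable pair idempotent - the loglevel is saved only on the first disable, restore is a no-op unless something was saved, and an explicit set drops the saved value. A userspace sketch of that small state machine (the initial values are illustrative):

#include <stdio.h>

static int console_loglevel = 7;
static const int minimum_console_loglevel = 1;
static int saved_console_loglevel = -1;		/* -1 means "nothing saved" */

static void console_logging_off(void)		/* syslog action 6 */
{
	if (saved_console_loglevel == -1)
		saved_console_loglevel = console_loglevel;
	console_loglevel = minimum_console_loglevel;
}

static void console_logging_on(void)		/* syslog action 7 */
{
	if (saved_console_loglevel != -1) {
		console_loglevel = saved_console_loglevel;
		saved_console_loglevel = -1;
	}
}

static void console_set_level(int level)	/* syslog action 8 */
{
	console_loglevel = level;
	saved_console_loglevel = -1;		/* implicitly re-enables logging */
}

int main(void)
{
	console_logging_off();
	console_logging_off();	/* a second disable must not clobber the saved value */
	console_logging_on();
	printf("restored loglevel: %d\n", console_loglevel);	/* prints 7 */
	console_set_level(4);
	return 0;
}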
@@ -412,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 427{
413 struct console *con; 428 struct console *con;
414 429
415 for (con = console_drivers; con; con = con->next) { 430 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 431 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 432 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 433 (con->flags & CON_ANYTIME)))
@@ -544,7 +559,7 @@ static int have_callable_console(void)
544{ 559{
545 struct console *con; 560 struct console *con;
546 561
547 for (con = console_drivers; con; con = con->next) 562 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 563 if (con->flags & CON_ANYTIME)
549 return 1; 564 return 1;
550 565
@@ -1060,12 +1075,6 @@ void __sched console_conditional_schedule(void)
1060} 1075}
1061EXPORT_SYMBOL(console_conditional_schedule); 1076EXPORT_SYMBOL(console_conditional_schedule);
1062 1077
1063void console_print(const char *s)
1064{
1065 printk(KERN_EMERG "%s", s);
1066}
1067EXPORT_SYMBOL(console_print);
1068
1069void console_unblank(void) 1078void console_unblank(void)
1070{ 1079{
1071 struct console *c; 1080 struct console *c;
@@ -1082,7 +1091,7 @@ void console_unblank(void)
1082 1091
1083 console_locked = 1; 1092 console_locked = 1;
1084 console_may_schedule = 0; 1093 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1094 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1095 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1096 c->unblank();
1088 release_console_sem(); 1097 release_console_sem();
@@ -1097,7 +1106,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1106 struct tty_driver *driver = NULL;
1098 1107
1099 acquire_console_sem(); 1108 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1109 for_each_console(c) {
1101 if (!c->device) 1110 if (!c->device)
1102 continue; 1111 continue;
1103 driver = c->device(c, index); 1112 driver = c->device(c, index);
@@ -1134,25 +1143,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1143 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1144 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1145 * console driver was initialized.
1146 *
1147 * This can happen pretty early during the boot process (because of
1148 * early_printk), sometimes before setup_arch() completes. Be careful
1149 * about which kernel features are used - they may not be initialised yet.
1150 *
1151 * There are two types of consoles - bootconsoles (early_printk) and
1152 * "real" consoles (everything which is not a bootconsole) which are
1153 * handled differently.
1154 * - Any number of bootconsoles can be registered at any time.
1155 * - As soon as a "real" console is registered, all bootconsoles
1156 * will be unregistered automatically.
1157 * - Once a "real" console is registered, any attempt to register a
1158 * bootconsole will be rejected.
1137 */ 1159 */
1138void register_console(struct console *console) 1160void register_console(struct console *newcon)
1139{ 1161{
1140 int i; 1162 int i;
1141 unsigned long flags; 1163 unsigned long flags;
1142 struct console *bootconsole = NULL; 1164 struct console *bcon = NULL;
1143 1165
1144 if (console_drivers) { 1166 /*
1145 if (console->flags & CON_BOOT) 1167 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1168 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1169 */
1148 bootconsole = console_drivers; 1170 if (console_drivers && newcon->flags & CON_BOOT) {
1171 /* find the last or real console */
1172 for_each_console(bcon) {
1173 if (!(bcon->flags & CON_BOOT)) {
1174 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1175 newcon->name, newcon->index);
1176 return;
1177 }
1178 }
1149 } 1179 }
1150 1180
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1181 if (console_drivers && console_drivers->flags & CON_BOOT)
1182 bcon = console_drivers;
1183
1184 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1185 preferred_console = selected_console;
1153 1186
1154 if (console->early_setup) 1187 if (newcon->early_setup)
1155 console->early_setup(); 1188 newcon->early_setup();
1156 1189
1157 /* 1190 /*
1158 * See if we want to use this console driver. If we 1191 * See if we want to use this console driver. If we
@@ -1160,13 +1193,13 @@ void register_console(struct console *console)
1160 * that registers here. 1193 * that registers here.
1161 */ 1194 */
1162 if (preferred_console < 0) { 1195 if (preferred_console < 0) {
1163 if (console->index < 0) 1196 if (newcon->index < 0)
1164 console->index = 0; 1197 newcon->index = 0;
1165 if (console->setup == NULL || 1198 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1199 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1200 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1201 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1202 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1203 preferred_console = 0;
1171 } 1204 }
1172 } 1205 }
@@ -1178,64 +1211,62 @@ void register_console(struct console *console)
1178 */ 1211 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1212 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1213 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1214 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1215 continue;
1183 if (console->index >= 0 && 1216 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1217 newcon->index != console_cmdline[i].index)
1185 continue; 1218 continue;
1186 if (console->index < 0) 1219 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1220 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1221#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1222 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1223 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1224 braille_register_console(newcon,
1192 console_cmdline[i].index, 1225 console_cmdline[i].index,
1193 console_cmdline[i].options, 1226 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1227 console_cmdline[i].brl_options);
1195 return; 1228 return;
1196 } 1229 }
1197#endif 1230#endif
1198 if (console->setup && 1231 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1232 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1233 break;
1201 console->flags |= CON_ENABLED; 1234 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1235 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1236 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1237 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1238 preferred_console = selected_console;
1206 } 1239 }
1207 break; 1240 break;
1208 } 1241 }
1209 1242
1210 if (!(console->flags & CON_ENABLED)) 1243 if (!(newcon->flags & CON_ENABLED))
1211 return; 1244 return;
1212 1245
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1246 /*
1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1247 * If we have a bootconsole, and are switching to a real console,
1215 bootconsole->name, bootconsole->index, 1248 * don't print everything out again, since when the boot console, and
1216 console->name, console->index); 1249 * the real console are the same physical device, it's annoying to
1217 unregister_console(bootconsole); 1250 * see the beginning boot messages twice
1218 console->flags &= ~CON_PRINTBUFFER; 1251 */
1219 } else { 1252 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1253 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1254
1224 /* 1255 /*
1225 * Put this console in the list - keep the 1256 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1257 * preferred driver at the head of the list.
1227 */ 1258 */
1228 acquire_console_sem(); 1259 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1260 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1261 newcon->next = console_drivers;
1231 console_drivers = console; 1262 console_drivers = newcon;
1232 if (console->next) 1263 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1264 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1265 } else {
1235 console->next = console_drivers->next; 1266 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1267 console_drivers->next = newcon;
1237 } 1268 }
1238 if (console->flags & CON_PRINTBUFFER) { 1269 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1270 /*
1240 * release_console_sem() will print out the buffered messages 1271 * release_console_sem() will print out the buffered messages
1241 * for us. 1272 * for us.
@@ -1245,6 +1276,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1276 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1277 }
1247 release_console_sem(); 1278 release_console_sem();
1279
1280 /*
1281 * By unregistering the bootconsoles after we enable the real console
1282 * we get the "console xxx enabled" message on all the consoles -
1283 * boot consoles, real consoles, etc - this is to ensure that end
1284 * users know there might be something in the kernel's log buffer that
1285 * went to the bootconsole (that they do not see on the real console)
1286 */
1287 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1288 /* we need to iterate through twice, to make sure we print
1289 * everything out, before we unregister the console(s)
1290 */
1291 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1292 newcon->name, newcon->index);
1293 for_each_console(bcon)
1294 if (bcon->flags & CON_BOOT)
1295 unregister_console(bcon);
1296 } else {
1297 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1298 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1299 newcon->name, newcon->index);
1300 }
1248} 1301}
1249EXPORT_SYMBOL(register_console); 1302EXPORT_SYMBOL(register_console);
1250 1303
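As an illustration only (not part of this diff): a module-style sketch of how a driver hands a console to this code. The name and write callback below are hypothetical; struct console, register_console() and unregister_console() are the interfaces touched by this hunk. With the new logic, registering such a "real" console also unregisters any remaining bootconsoles and prints the combined "enabled, bootconsole disabled" message.

#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>

/* Hypothetical hardware output routine, for illustration only. */
static void demo_console_write(struct console *con, const char *s, unsigned int n)
{
	/* push n bytes of s to the device FIFO here */
}

static struct console demo_console = {
	.name	= "demo",
	.write	= demo_console_write,
	.flags	= CON_PRINTBUFFER,	/* replay the log buffer once enabled */
	.index	= -1,			/* match any console= index on the command line */
};

static int __init demo_console_init(void)
{
	register_console(&demo_console);
	return 0;
}

static void __exit demo_console_exit(void)
{
	unregister_console(&demo_console);
}

module_init(demo_console_init);
module_exit(demo_console_exit);
MODULE_LICENSE("GPL");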
@@ -1287,11 +1340,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1340
1288static int __init disable_boot_consoles(void) 1341static int __init disable_boot_consoles(void)
1289{ 1342{
1290 if (console_drivers != NULL) { 1343 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1344
1345 for_each_console(con) {
1346 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1347 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1348 con->name, con->index);
1294 return unregister_console(console_drivers); 1349 unregister_console(con);
1295 } 1350 }
1296 } 1351 }
1297 return 0; 1352 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082c320e4dbf..307c285af59e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
152 if (!dumpable && !capable(CAP_SYS_PTRACE)) 152 if (!dumpable && !capable(CAP_SYS_PTRACE))
153 return -EPERM; 153 return -EPERM;
154 154
155 return security_ptrace_may_access(task, mode); 155 return security_ptrace_access_check(task, mode);
156} 156}
157 157
158bool ptrace_may_access(struct task_struct *task, unsigned int mode) 158bool ptrace_may_access(struct task_struct *task, unsigned int mode)
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in irqs disabled
119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * local variable "batch" and emits code like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2) rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * that batch# = rdp->batch, see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
290
291/**
292 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
304 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists out of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcasted to
383 * all cpus, they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcasted, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
411 * Barrier Otherwise it can cause tickless idle CPUs to be
412 * included in rcp->cpumask, which will extend graceperiods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
440 * switch). If so, and if it hasn't already done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
486 * locking requirements, the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed, so
570 * move these entries to the donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
609 * executed by the interrupted code must be seen before any RCU
610 * grace-period manipulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
620 * executed by the interrupted code must be seen after any RCU
621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for them hasn't been started or scheduled to start
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes rcu mechanism. Assumed to be called early.
790 * That is, before the local timer (SMP) or jiffies timer (UP) is set up.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
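As an illustration only (not part of this diff): the file deleted above supplied the "classic" implementation behind call_rcu() and call_rcu_bh(); the API itself survives in the tree-RCU code. A typical (hypothetical) updater fragment that defers a kfree() until a grace period has elapsed:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct item {
	int value;
	struct item *next;		/* reached by readers under rcu_read_lock() */
	struct rcu_head rcu;
};

static void item_free_rcu(struct rcu_head *head)
{
	/* runs after the grace period, when no reader can still hold a reference */
	kfree(container_of(head, struct item, rcu));
}

static void item_retire(struct item *it)
{
	/* caller has already unlinked 'it' from the reader-visible structure */
	call_rcu(&it->rcu, item_free_rcu);
}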
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..bd5d5c8e5140 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
98} 98}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 99EXPORT_SYMBOL_GPL(synchronize_rcu);
100 100
101/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 *
104 * Control will return to the caller some time after a full rcu_bh grace
105 * period has elapsed, in other words after all currently executing rcu_bh
106 * read-side critical sections have completed. RCU read-side critical
107 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
108 * and may be nested.
109 */
110void synchronize_rcu_bh(void)
111{
112 struct rcu_synchronize rcu;
113
114 if (rcu_blocking_is_gp())
115 return;
116
117 init_completion(&rcu.completion);
118 /* Will wake me after RCU finished. */
119 call_rcu_bh(&rcu.head, wakeme_after_rcu);
120 /* Wait for it. */
121 wait_for_completion(&rcu.completion);
122}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
124
101static void rcu_barrier_callback(struct rcu_head *notused) 125static void rcu_barrier_callback(struct rcu_head *notused)
102{ 126{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
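As an illustration only (not part of this diff): the synchronize_rcu_bh() added here gives updaters a blocking alternative to call_rcu_bh() - unlink the object, wait until every softirq-side reader has finished, then free it. A hypothetical fragment:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct entry {
	int key;
	struct entry *next;	/* traversed by readers under rcu_read_lock_bh() */
};

static void entry_remove(struct entry *e)
{
	/* 'e' has already been unlinked from the reader-visible structure */
	synchronize_rcu_bh();	/* wait for all in-flight _bh read-side sections */
	kfree(e);		/* no softirq reader can still see 'e' */
}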
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 153static inline void wait_migrated_callbacks(void)
130{ 154{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
132} 157}
133 158
134/* 159/*
@@ -192,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 217 wake_up(&rcu_migrate_wq);
193} 218}
194 219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 224 unsigned long action, void *hcpu)
197{ 225{
226 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 227 if (action == CPU_DYING) {
199 /* 228 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 244 wait_migrated_callbacks();
215 } 245 }
@@ -219,8 +249,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 249
220void __init rcu_init(void) 250void __init rcu_init(void)
221{ 251{
252 int i;
253
222 __rcu_init(); 254 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 255 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
256
257 /*
258 * We don't need protection against CPU-hotplug here because
259 * this is called early in boot, before either interrupts
260 * or the scheduler are operational.
261 */
262 for_each_online_cpu(i)
263 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 264}
225 265
226void rcu_scheduler_starting(void) 266void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
298 * - If we get scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
303 * we pick the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
334 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * It is now safe to decrement the per-CPU flip counter.
403 * NMIs that occur after this statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
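
Taken together, __rcu_read_lock() and __rcu_read_unlock() implement a two-counter
scheme: the outermost lock increments the per-CPU counter selected by the low-order
bit of the global .completed value and records that index in the task, and the
matching unlock decrements the same counter even if the global counter has flipped
in the meantime. The sketch below is a minimal, single-threaded userspace model of
that bookkeeping only; the names mirror the deleted file, but the rest is an
assumption for illustration, not kernel code.

	#include <assert.h>
	#include <stdio.h>

	static unsigned long completed;  /* global grace-period counter          */
	static long flipctr[2];          /* "per-CPU" pair of read-side counters */
	static int nesting;              /* per-task nesting depth               */
	static int flipctr_idx;          /* index captured by the outermost lock */

	static void model_read_lock(void)
	{
		if (nesting++ == 0) {
			/* Outermost lock: pick the current counter, remember it. */
			flipctr_idx = completed & 0x1;
			flipctr[flipctr_idx]++;
		}
	}

	static void model_read_unlock(void)
	{
		if (--nesting == 0) {
			/* Decrement the counter chosen at lock time, even if a
			 * flip happened while the critical section was running. */
			flipctr[flipctr_idx]--;
		}
	}

	int main(void)
	{
		model_read_lock();
		completed++;              /* a counter flip occurs mid-section */
		model_read_lock();        /* nested lock: no counter change    */
		model_read_unlock();
		model_read_unlock();      /* decrements the *old* counter      */
		assert(flipctr[0] == 0 && flipctr[1] == 0);
		printf("counters balanced across a flip\n");
		return 0;
	}

In the kernel code the same bookkeeping is done under local_irq_save() and with
ACCESS_ONCE() so that NMIs and the counter flip cannot observe it half-done; that
part has no analogue in this single-threaded model.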
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
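
__rcu_advance_callbacks() shifts callbacks one stage down a fixed pipeline
(nextlist, then waitlist[0..GP_STAGES-1], then donelist) each time it notices that
the global counter has flipped, using a tail pointer per list so that empty and
non-empty lists splice the same way. The following is a simplified, self-contained
model of that pipeline; GP_STAGES, the node type, and the helper names are
assumptions for illustration rather than the kernel's definitions.

	#include <stdio.h>

	#define GP_STAGES 2

	struct cb {
		struct cb *next;
		int id;
	};

	/* One "CPU"'s callback pipeline: singly linked lists with tail pointers. */
	static struct cb *nextlist, **nexttail = &nextlist;
	static struct cb *waitlist[GP_STAGES], **waittail[GP_STAGES];
	static struct cb *donelist, **donetail = &donelist;

	static void pipeline_init(void)
	{
		int i;

		for (i = 0; i < GP_STAGES; i++) {
			waitlist[i] = NULL;
			waittail[i] = &waitlist[i];
		}
	}

	static void enqueue(struct cb *p)
	{
		p->next = NULL;
		*nexttail = p;
		nexttail = &p->next;
	}

	/* Advance every stage by one, as done once per observed counter flip. */
	static void advance(void)
	{
		int i;

		if (waitlist[GP_STAGES - 1]) {          /* oldest stage is done  */
			*donetail = waitlist[GP_STAGES - 1];
			donetail = waittail[GP_STAGES - 1];
		}
		for (i = GP_STAGES - 2; i >= 0; i--) {  /* shift middle stages   */
			waitlist[i + 1] = waitlist[i];
			waittail[i + 1] = waitlist[i] ? waittail[i]
						      : &waitlist[i + 1];
		}
		waitlist[0] = nextlist;                 /* newly posted callbacks */
		waittail[0] = nextlist ? nexttail : &waitlist[0];
		nextlist = NULL;
		nexttail = &nextlist;
	}

	int main(void)
	{
		struct cb a = { NULL, 1 };
		struct cb *p;
		int flips;

		pipeline_init();
		enqueue(&a);
		for (flips = 0; flips <= GP_STAGES; flips++)
			advance();              /* GP_STAGES + 1 flips in this model */
		for (p = donelist; p; p = p->next)
			printf("callback %d ready to invoke\n", p->id);
		return 0;
	}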
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI which is the most important
506 * part.
507 *
508 * The only thing is that we would bring the counter back
509 * to the position it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513 * On return from the IRQ, the zero bit of the counter may
514 * be 0 and the counter may be the same as it was when the
515 * NMI/SMI returned. If the state machine is so unlucky as to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
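
The long comment above depends on one invariant: rcu_dyntick_sched.dynticks is even
while the CPU is idle with ticks stopped and odd while it is handling work, every
transition is a single increment, and rcu_update_flag tracks which irq is
responsible for the exit-side increment. The fragment below is a hedged userspace
illustration of that parity rule and nesting flag only; it is not the kernel's
dyntick handling.

	#include <stdio.h>

	static long dynticks;   /* even: idle with ticks stopped, odd: active */
	static int update_flag; /* irq nesting that began while idle          */

	static void model_irq_enter(void)
	{
		if (update_flag)
			update_flag++;          /* nested irq/NMI while active */
		if ((dynticks & 0x1) == 0) {    /* coming from idle            */
			dynticks++;             /* counter becomes odd         */
			update_flag++;
		}
	}

	static void model_irq_exit(void)
	{
		if (update_flag && --update_flag == 0)
			dynticks++;             /* back to idle: counter even  */
	}

	int main(void)
	{
		long snap = dynticks;           /* a poller snapshots the counter */

		model_irq_enter();
		model_irq_exit();

		/* curr != snap: the CPU left idle at least once since the snapshot,
		 * so the poller cannot assume it stayed quiescent the whole time. */
		printf("snap=%ld curr=%ld idle-throughout=%d\n",
		       snap, dynticks, dynticks == snap && (dynticks & 0x1) == 0);
		return 0;
	}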
546
547/**
548 * rcu_irq_exit - Called from exiting Hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563 * because a NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
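
rcu_try_flip_waitack_needed() and rcu_try_flip_waitmb_needed() both apply the same
snapshot test: if the dynticks counter is unchanged and even, the CPU was idle for
the whole stage, and if it has advanced far enough (or is now even), the CPU passed
through idle and the implied barriers already did the work. The helper below
restates the acknowledgement-side decision as ordinary C for illustration; the
function name is made up and the threshold of 2 simply mirrors the deleted code.

	#include <stdio.h>

	/*
	 * Return 0 if the CPU's dynticks history already proves the needed
	 * acknowledgement, 1 if we still have to wait for that CPU.
	 */
	static int still_need_cpu(long curr, long snap)
	{
		if (curr == snap && (curr & 0x1) == 0)
			return 0;   /* idle the whole time, nothing running     */
		if ((curr - snap) > 2 || (curr & 0x1) == 0)
			return 0;   /* passed through (or into) idle since snap */
		return 1;           /* stayed busy: must wait for it explicitly */
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       still_need_cpu(4, 4),    /* even and unchanged -> 0 */
		       still_need_cpu(9, 5),    /* moved by more than 2 -> 0 */
		       still_need_cpu(5, 5));   /* odd and unchanged -> 1 */
		return 0;
	}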
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
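
rcu_qsctr_inc_needed() is the non-dynticks half of the same test: each CPU bumps a
quiescent-state counter when it passes through schedule(), and the grace-period
thread compares the live value against the snapshot taken by save_qsctr_sched().
A minimal single-CPU illustration, with hypothetical helper names:

	#include <stdio.h>

	static long sched_qs;       /* bumped each time this "CPU" schedules */
	static long sched_qs_snap;  /* value recorded when the GP pass began */

	static void note_quiescent_state(void) { sched_qs++; }

	static int still_need_qs(void)
	{
		return sched_qs == sched_qs_snap;  /* no progress since snapshot */
	}

	int main(void)
	{
		sched_qs_snap = sched_qs;  /* start of a grace-period pass       */
		printf("need qs: %d\n", still_need_qs());  /* 1: must wait       */
		note_quiescent_state();    /* the CPU went through schedule()    */
		printf("need qs: %d\n", still_need_qs());  /* 0: done            */
		return 0;
	}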
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here.
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return non-zero once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
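
rcu_try_flip() advances a four-state machine by at most one step per call, and only
when it can take the flip lock without contention; each helper returns non-zero once
its stage has completed. The loop below models only that control flow in plain,
single-threaded C with stub stage functions; it is a sketch, not a replacement for
the kernel logic.

	#include <stdio.h>

	enum flip_state { FLIP_IDLE, FLIP_WAITACK, FLIP_WAITZERO, FLIP_WAITMB };

	static enum flip_state state = FLIP_IDLE;

	/* Stub stages: each returns non-zero once its work is complete. */
	static int stage_idle(void)     { return 1; }  /* start a new flip      */
	static int stage_waitack(void)  { return 1; }  /* all CPUs acked        */
	static int stage_waitzero(void) { return 1; }  /* old counters sum to 0 */
	static int stage_waitmb(void)   { return 1; }  /* all CPUs did smp_mb() */

	/* One call is at most one state transition, as in rcu_try_flip(). */
	static void try_flip_once(void)
	{
		switch (state) {
		case FLIP_IDLE:
			if (stage_idle())
				state = FLIP_WAITACK;
			break;
		case FLIP_WAITACK:
			if (stage_waitack())
				state = FLIP_WAITZERO;
			break;
		case FLIP_WAITZERO:
			if (stage_waitzero())
				state = FLIP_WAITMB;
			break;
		case FLIP_WAITMB:
			if (stage_waitmb())
				state = FLIP_IDLE;
			break;
		}
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 4; i++) {  /* one full counter flip: 4 calls here */
			try_flip_once();
			printf("after call %d: state=%d\n", i + 1, state);
		}
		return 0;
	}

Splitting the grace period into stages like this is what lets rcu_try_flip() bail
out immediately when the trylock fails: whichever CPU next calls it simply resumes
from the recorded state.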
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969 * this CPU has to have exited all prior preempt-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
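
rcu_offline_cpu_enqueue() splices an entire tail-pointer list onto a destination
list and resets the source to empty, touching the destination tail only when the
source was non-empty so that splicing an empty list is a no-op. The same operation
written out as a standalone helper function, with names assumed for illustration:

	#include <stdio.h>

	struct cb { struct cb *next; int id; };

	/* Move the whole (src, srctail) list onto (dst, dsttail), emptying src. */
	static void splice_list(struct cb **src, struct cb ***srctail,
				struct cb **dst, struct cb ***dsttail)
	{
		**dsttail = *src;              /* append source chain (may be NULL) */
		if (*src != NULL) {
			*dsttail = *srctail;   /* destination tail is now source's  */
			*src = NULL;           /* source becomes the empty list     */
			*srctail = src;
		}
	}

	int main(void)
	{
		struct cb a = { NULL, 1 }, b = { NULL, 2 };
		struct cb *src = &a, **srctail = &b.next;
		struct cb *dst = NULL, **dsttail = &dst;
		struct cb *p;

		a.next = &b;                   /* src holds callbacks 1 and 2 */
		splice_list(&src, &srctail, &dst, &dsttail);
		for (p = dst; p; p = p->next)
			printf("moved callback %d\n", p->id);
		printf("src empty: %d\n", src == NULL);
		return 0;
	}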
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042 * Otherwise rcu_barrier() will fail.
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
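
call_rcu() queues the new callback with two stores: append it at *nexttail, then
advance nexttail to the new element's next field. Because an empty list's tail
points at the list head, the same two stores work whether or not callbacks are
already queued. A standalone illustration of that idiom, with hypothetical names:

	#include <stdio.h>

	struct cb { struct cb *next; int id; };

	static struct cb *list;             /* empty list: head is NULL...     */
	static struct cb **tail = &list;    /* ...and tail points at the head. */

	static void enqueue(struct cb *p)
	{
		p->next = NULL;
		*tail = p;       /* works for empty and non-empty lists alike */
		tail = &p->next;
	}

	int main(void)
	{
		struct cb a = { NULL, 1 }, b = { NULL, 2 };
		struct cb *p;

		enqueue(&a);
		enqueue(&b);
		for (p = list; p; p = p->next)
			printf("queued %d\n", p->id);
		return 0;
	}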
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
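
__synchronize_sched() is the usual way of deriving a blocking primitive from an
asynchronous callback API: post a callback that completes a completion, then wait
for that completion. The sketch below shows the same pattern in portable userspace
C, with a pthread condition variable standing in for struct completion; every name
here is illustrative rather than kernel API.

	#include <pthread.h>
	#include <stdio.h>

	struct waiter {
		pthread_mutex_t lock;
		pthread_cond_t  cond;
		int             done;
	};

	/* The "callback": marks the work complete and wakes the waiter. */
	static void wake_waiter(struct waiter *w)
	{
		pthread_mutex_lock(&w->lock);
		w->done = 1;
		pthread_cond_signal(&w->cond);
		pthread_mutex_unlock(&w->lock);
	}

	/* Stand-in for the grace period: a thread that eventually calls back. */
	static void *async_work(void *arg)
	{
		wake_waiter(arg);
		return NULL;
	}

	int main(void)
	{
		struct waiter w = { PTHREAD_MUTEX_INITIALIZER,
				    PTHREAD_COND_INITIALIZER, 0 };
		pthread_t tid;

		pthread_create(&tid, NULL, async_work, &w);  /* post the callback */
		pthread_mutex_lock(&w.lock);
		while (!w.done)                              /* wait for it       */
			pthread_cond_wait(&w.cond, &w.lock);
		pthread_mutex_unlock(&w.lock);
		pthread_join(tid, NULL);
		printf("synchronous wait complete\n");
		return 0;   /* build with: cc -pthread example.c */
	}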
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The current CPU might have already gone
1300 * offline (between the for_each_online_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's bit in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
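
rcustats_read() formats the summed counters into a preallocated buffer and hands it
to simple_read_from_buffer(). Note that cnt stays zero between the two snprintf()
calls above, so the second call overwrites the first line instead of appending;
rcuctrs_read() below uses the accumulating form. A small standalone example of that
accumulate-with-snprintf idiom, with made-up report fields:

	#include <stdio.h>

	#define BUF_SIZE 128

	int main(void)
	{
		char buf[BUF_SIZE];
		int cnt = 0;

		/* Each call appends at buf[cnt] and advances cnt by what it wrote. */
		cnt += snprintf(&buf[cnt], sizeof(buf) - cnt,
				"ggp=%d rcc=%d\n", 42, 7);
		cnt += snprintf(&buf[cnt], sizeof(buf) - cnt,
				"na=%d nl=%d\n", 3, 1);

		fputs(buf, stdout);
		return 0;
	}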
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
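
rcupreempt_debugfs_init() follows the usual goto-unwind shape: create each entry in
turn, jump to a cleanup label on the first failure, and release only what was
actually created. A self-contained sketch of the same shape, with heap allocations
standing in for the debugfs entries:

	#include <stdio.h>
	#include <stdlib.h>

	static int setup(void)
	{
		char *a = NULL, *b = NULL, *c = NULL;

		a = malloc(16);
		if (!a)
			goto out;
		b = malloc(16);
		if (!b)
			goto free_out;
		c = malloc(16);
		if (!c)
			goto free_out;

		printf("all resources acquired\n");
		free(c);                /* demo only: release and report success */
		free(b);
		free(a);
		return 0;

	free_out:
		/* Release whatever was obtained before the failure. */
		free(b);                /* free(NULL) is a safe no-op */
		free(a);
	out:
		return 1;
	}

	int main(void)
	{
		return setup();
	}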
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..b33db539a8ad 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 320 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 321 rcu_torture_free(rp);
322 } else 322 } else
323 cur_ops->deferredfree(rp); 323 cur_ops->deferred_free(rp);
324} 324}
325 325
326static void rcu_torture_deferred_free(struct rcu_torture *p) 326static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 329}
330 330
331static struct rcu_torture_ops rcu_ops = { 331static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 332 .init = NULL,
333 .cleanup = NULL, 333 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 334 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 335 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 336 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 337 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 338 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 339 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 340 .cb_barrier = rcu_barrier,
341 .stats = NULL, 341 .stats = NULL,
342 .irqcapable = 1, 342 .irq_capable = 1,
343 .name = "rcu" 343 .name = "rcu"
344}; 344};
345 345
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 346static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
370} 370}
371 371
372static struct rcu_torture_ops rcu_sync_ops = { 372static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 373 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 374 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 375 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 376 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 377 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 378 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 379 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 380 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 381 .cb_barrier = NULL,
382 .stats = NULL, 382 .stats = NULL,
383 .irqcapable = 1, 383 .irq_capable = 1,
384 .name = "rcu_sync" 384 .name = "rcu_sync"
385}; 385};
386 386
387/* 387/*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
432} 432}
433 433
434static struct rcu_torture_ops rcu_bh_ops = { 434static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 435 .init = NULL,
436 .cleanup = NULL, 436 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 437 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 438 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 439 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 440 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 441 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 442 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 443 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 444 .stats = NULL,
445 .irqcapable = 1, 445 .irq_capable = 1,
446 .name = "rcu_bh" 446 .name = "rcu_bh"
447}; 447};
448 448
449static struct rcu_torture_ops rcu_bh_sync_ops = { 449static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 450 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 451 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 452 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 453 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 454 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 455 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 456 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 457 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 458 .cb_barrier = NULL,
459 .stats = NULL, 459 .stats = NULL,
460 .irqcapable = 1, 460 .irq_capable = 1,
461 .name = "rcu_bh_sync" 461 .name = "rcu_bh_sync"
462}; 462};
463 463
464/* 464/*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
530} 530}
531 531
532static struct rcu_torture_ops srcu_ops = { 532static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 533 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 534 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 535 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 536 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 537 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 538 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 539 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 540 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 541 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 542 .stats = srcu_torture_stats,
543 .name = "srcu" 543 .name = "srcu"
544}; 544};
545 545
546/* 546/*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
574} 574}
575 575
576static struct rcu_torture_ops sched_ops = { 576static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 577 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 578 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 579 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 580 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 581 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 582 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 583 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 584 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 585 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 586 .stats = NULL,
587 .irqcapable = 1, 587 .irq_capable = 1,
588 .name = "sched" 588 .name = "sched"
589}; 589};
590 590
591static struct rcu_torture_ops sched_ops_sync = { 591static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 592 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 593 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 594 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 595 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 596 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 597 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 598 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 599 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 600 .cb_barrier = NULL,
601 .stats = NULL, 601 .stats = NULL,
602 .name = "sched_sync" 602 .name = "sched_sync"
603};
604
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init,
609 .cleanup = NULL,
610 .readlock = sched_torture_read_lock,
611 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
612 .readunlock = sched_torture_read_unlock,
613 .completed = sched_torture_completed,
614 .deferred_free = rcu_sync_torture_deferred_free,
615 .sync = synchronize_sched_expedited,
616 .cb_barrier = NULL,
617 .stats = rcu_expedited_torture_stats,
618 .irq_capable = 1,
619 .name = "sched_expedited"
603}; 620};
604 621
605/* 622/*
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 652 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 653 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 654 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 655 cur_ops->deferred_free(old_rp);
639 } 656 }
640 rcu_torture_current_version++; 657 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 658 oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 717 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 718 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 719 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 720 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 721 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 722 spin_unlock(&rand_lock);
706 preempt_disable(); 723 preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
738 755
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 756 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 757 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 759 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 760
744 do { 761 do {
745 if (irqreader && cur_ops->irqcapable) { 762 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 763 if (!timer_pending(&t))
747 mod_timer(&t, 1); 764 mod_timer(&t, 1);
748 } 765 }
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
757 } 774 }
758 if (p->rtort_mbtest == 0) 775 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 776 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 777 cur_ops->read_delay(&rand);
761 preempt_disable(); 778 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 779 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 780 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 795 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 796 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 797 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 798 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 799 del_timer_sync(&t);
783 while (!kthread_should_stop()) 800 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 801 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1095 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1096 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1097 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1098 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1099 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1100
1083 mutex_lock(&fullstop_mutex); 1101 mutex_lock(&fullstop_mutex);
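How the new "sched_expedited" flavor gets selected (a rough sketch of existing rcutorture logic, assumed rather than shown in this hunk):

/*
 * Illustrative only, not part of this patch: rcu_torture_init() picks
 * the flavor by matching the "torture_type" module parameter against
 * each ops->name in torture_ops[], roughly:
 *
 *	for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
 *		cur_ops = torture_ops[i];
 *		if (strcmp(torture_type, cur_ops->name) == 0)
 *			break;
 *	}
 *
 * so loading the module with torture_type=sched_expedited exercises
 * the new ops table added above.
 */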
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..6b11b07cfe7f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,59 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 unsigned long flags;
111 struct rcu_data *rdp;
112
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu);
90 rdp->passed_quiesc = 1; 115 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed; 116 rdp->passed_quiesc_completed = rdp->completed;
117 rcu_preempt_qs(cpu);
118 local_irq_restore(flags);
92} 119}
93 120
94void rcu_bh_qsctr_inc(int cpu) 121void rcu_bh_qs(int cpu)
95{ 122{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 123 unsigned long flags;
124 struct rcu_data *rdp;
125
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed; 129 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags);
99} 131}
100 132
101#ifdef CONFIG_NO_HZ 133#ifdef CONFIG_NO_HZ
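/*
 * Illustrative only, not part of this patch: a context known to be a
 * quiescent state simply invokes the flavor-specific hook, e.g. from a
 * scheduling-clock path (see rcu_check_callbacks() later in this diff):
 *
 *	rcu_sched_qs(smp_processor_id());
 *	rcu_bh_qs(smp_processor_id());
 *
 * Each call merely latches the per-CPU passed_quiesc flag for the
 * current grace period; nothing counts how many such states occurred.
 */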
@@ -110,15 +142,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 142static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 143
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 144static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu);
113 146
114/* 147/*
115 * Return the number of RCU batches processed thus far for debug & stats. 148 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 149 */
117long rcu_batches_completed(void) 150long rcu_batches_completed_sched(void)
118{ 151{
119 return rcu_state.completed; 152 return rcu_sched_state.completed;
120} 153}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 154EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 155
123/* 156/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 157 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +214,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 214 return 1;
182 } 215 }
183 216
217 /* If preemptable RCU, no point in sending reschedule IPI. */
218 if (rdp->preemptable)
219 return 0;
220
184 /* The CPU is online, so send it a reschedule IPI. */ 221 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 222 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 223 smp_send_reschedule(rdp->cpu);
@@ -193,7 +230,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 230#endif /* #ifdef CONFIG_SMP */
194 231
195#ifdef CONFIG_NO_HZ 232#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 233
198/** 234/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 235 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +249,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 249 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 250 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 251 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 252 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 253 local_irq_restore(flags);
218} 254}
219 255
@@ -232,7 +268,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 268 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 269 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 270 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 271 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 272 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 273 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 274}
@@ -251,7 +287,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 287 if (rdtp->dynticks & 0x1)
252 return; 288 return;
253 rdtp->dynticks_nmi++; 289 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 290 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 291 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 292}
257 293
@@ -270,7 +306,7 @@ void rcu_nmi_exit(void)
270 return; 306 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 307 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 308 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 309 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 310}
275 311
276/** 312/**
@@ -286,7 +322,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 322 if (rdtp->dynticks_nesting++)
287 return; 323 return;
288 rdtp->dynticks++; 324 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 325 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 326 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 327}
292 328
@@ -305,10 +341,10 @@ void rcu_irq_exit(void)
305 return; 341 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 342 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 343 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 344 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 345
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 346 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 347 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 348 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 349 set_need_resched();
314} 350}
@@ -461,6 +497,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 497
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 498 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 499 for (; rnp_cur < rnp_end; rnp_cur++) {
500 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 501 if (rnp_cur->qsmask == 0)
465 continue; 502 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +506,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 506 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 507 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 508 smp_processor_id(), (long)(jiffies - rsp->gp_start));
509 trigger_all_cpu_backtrace();
510
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 511 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 512}
474 513
@@ -479,12 +518,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 518
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 519 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 520 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 521 trigger_all_cpu_backtrace();
522
483 spin_lock_irqsave(&rnp->lock, flags); 523 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 524 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 525 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 526 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 527 spin_unlock_irqrestore(&rnp->lock, flags);
528
488 set_need_resched(); /* kick ourselves to get things going. */ 529 set_need_resched(); /* kick ourselves to get things going. */
489} 530}
490 531
@@ -674,6 +715,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 715}
675 716
676/* 717/*
718 * Clean up after the prior grace period and let rcu_start_gp() start up
719 * the next grace period if one is needed. Note that the caller must
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock)
724{
725 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
728}
729
730/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 731 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 732 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 733 * group must be represented by the same leaf rcu_node structure.
@@ -694,7 +748,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 748 return;
695 } 749 }
696 rnp->qsmask &= ~mask; 750 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 751 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 752
699 /* Other bits still set at this level, so done. */ 753 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 754 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -714,14 +768,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
714 768
715 /* 769 /*
716 * Get here if we are the last CPU to pass through a quiescent 770 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 771 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 772 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 773 */
722 rsp->completed = rsp->gpnum; 774 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 775}
726 776
727/* 777/*
@@ -828,11 +878,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 878 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 879 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 880 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 881 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 882 break;
833 } 883 }
884 rcu_preempt_offline_tasks(rsp, rnp);
834 mask = rnp->grpmask; 885 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 886 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 887 rnp = rnp->parent;
837 } while (rnp != NULL); 888 } while (rnp != NULL);
838 lastcomp = rsp->completed; 889 lastcomp = rsp->completed;
@@ -845,7 +896,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
845 /* 896 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 897 * Move callbacks from the outgoing CPU to the running CPU.
847 * Note that the outgoing CPU is now quiscent, so it is now 898 * Note that the outgoing CPU is now quiscent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 899 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 900 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 901 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 902 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +927,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 927 */
877static void rcu_offline_cpu(int cpu) 928static void rcu_offline_cpu(int cpu)
878{ 929{
879 __rcu_offline_cpu(cpu, &rcu_state); 930 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 931 __rcu_offline_cpu(cpu, &rcu_bh_state);
932 rcu_preempt_offline_cpu(cpu);
881} 933}
882 934
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 935#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +1015,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 1015 */
964void rcu_check_callbacks(int cpu, int user) 1016void rcu_check_callbacks(int cpu, int user)
965{ 1017{
1018 if (!rcu_pending(cpu))
1019 return; /* if nothing for RCU to do. */
966 if (user || 1020 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1021 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1022 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1025,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1025 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1026 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1027 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1028 * a quiescent state, so note it.
975 * 1029 *
976 * No memory barrier is required here because both 1030 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1031 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1032 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1033 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1034 */
982 1035
983 rcu_qsctr_inc(cpu); 1036 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1037 rcu_bh_qs(cpu);
985 1038
986 } else if (!in_softirq()) { 1039 } else if (!in_softirq()) {
987 1040
@@ -989,11 +1042,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1042 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1043 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1044 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1045 * critical section, so note it.
993 */ 1046 */
994 1047
995 rcu_bh_qsctr_inc(cpu); 1048 rcu_bh_qs(cpu);
996 } 1049 }
1050 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1051 raise_softirq(RCU_SOFTIRQ);
998} 1052}
999 1053
@@ -1132,6 +1186,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1186{
1133 unsigned long flags; 1187 unsigned long flags;
1134 1188
1189 WARN_ON_ONCE(rdp->beenonline == 0);
1190
1135 /* 1191 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1192 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1193 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1226,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1226 */
1171 smp_mb(); /* See above block comment. */ 1227 smp_mb(); /* See above block comment. */
1172 1228
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1229 __rcu_process_callbacks(&rcu_sched_state,
1230 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1231 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1232 rcu_preempt_process_callbacks();
1175 1233
1176 /* 1234 /*
1177 * Memory references from any later RCU read-side critical sections 1235 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1285,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1285}
1228 1286
1229/* 1287/*
1230 * Queue an RCU callback for invocation after a grace period. 1288 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1289 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1290void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1291{
1234 __call_rcu(head, func, &rcu_state); 1292 __call_rcu(head, func, &rcu_sched_state);
1235} 1293}
1236EXPORT_SYMBOL_GPL(call_rcu); 1294EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1295
1238/* 1296/*
1239 * Queue an RCU for invocation after a quicker grace period. 1297 * Queue an RCU for invocation after a quicker grace period.
@@ -1305,10 +1363,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1363 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1364 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1365 */
1308int rcu_pending(int cpu) 1366static int rcu_pending(int cpu)
1309{ 1367{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1368 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1369 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1370 rcu_preempt_pending(cpu);
1312} 1371}
1313 1372
1314/* 1373/*
@@ -1320,27 +1379,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1379int rcu_needs_cpu(int cpu)
1321{ 1380{
1322 /* RCU callbacks either ready or pending? */ 1381 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1382 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1383 per_cpu(rcu_bh_data, cpu).nxtlist ||
1384 rcu_preempt_needs_cpu(cpu);
1325} 1385}
1326 1386
1327/* 1387/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1388 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1389 */
1339static void __cpuinit 1390static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1391rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1392{
1342 unsigned long flags; 1393 unsigned long flags;
1343 int i; 1394 int i;
1395 struct rcu_data *rdp = rsp->rda[cpu];
1396 struct rcu_node *rnp = rcu_get_root(rsp);
1397
1398 /* Set up local state, ensuring consistent view of global state. */
1399 spin_lock_irqsave(&rnp->lock, flags);
1400 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1401 rdp->nxtlist = NULL;
1402 for (i = 0; i < RCU_NEXT_SIZE; i++)
1403 rdp->nxttail[i] = &rdp->nxtlist;
1404 rdp->qlen = 0;
1405#ifdef CONFIG_NO_HZ
1406 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1407#endif /* #ifdef CONFIG_NO_HZ */
1408 rdp->cpu = cpu;
1409 spin_unlock_irqrestore(&rnp->lock, flags);
1410}
1411
1412/*
1413 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1414 * offline event can be happening at a given time. Note also that we
1415 * can accept some slop in the rsp->completed access due to the fact
1416 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1417 */
1418static void __cpuinit
1419rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1420{
1421 unsigned long flags;
1344 long lastcomp; 1422 long lastcomp;
1345 unsigned long mask; 1423 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1424 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1432,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1432 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1433 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1434 rdp->beenonline = 1; /* We have now been online. */
1435 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1436 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1437 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1438 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1439
1370 /* 1440 /*
@@ -1405,16 +1475,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1405 1475
1406static void __cpuinit rcu_online_cpu(int cpu) 1476static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1477{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1478 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1479 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1480 rcu_preempt_init_percpu_data(cpu);
1411} 1481}
1412 1482
1413/* 1483/*
1414 * Handle CPU online/offline notifcation events. 1484 * Handle CPU online/offline notification events.
1415 */ 1485 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1486int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1487 unsigned long action, void *hcpu)
1418{ 1488{
1419 long cpu = (long)hcpu; 1489 long cpu = (long)hcpu;
1420 1490
@@ -1486,6 +1556,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1556 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1558 spin_lock_init(&rnp->lock);
1559 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1560 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1561 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1562 rnp->grplo = j * cpustride;
@@ -1503,16 +1574,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1574 j / rsp->levelspread[i - 1];
1504 } 1575 }
1505 rnp->level = i; 1576 rnp->level = i;
1577 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1579 }
1507 } 1580 }
1508} 1581}
1509 1582
1510/* 1583/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1584 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1585 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1586 * structure.
1513 */ 1587 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1589do { \
1590 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1592 j = 0; \
1518 for_each_possible_cpu(i) { \ 1593 for_each_possible_cpu(i) { \
@@ -1520,32 +1595,43 @@ do { \
1520 j++; \ 1595 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1596 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1597 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1598 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1599 } \
1524} while (0) 1600} while (0)
1525 1601
1526static struct notifier_block __cpuinitdata rcu_nb = { 1602#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1603
1528}; 1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{
1617}
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1620
1530void __init __rcu_init(void) 1621void __init __rcu_init(void)
1531{ 1622{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1624 int j;
1534 struct rcu_node *rnp; 1625 struct rcu_node *rnp;
1535 1626
1536 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 1627 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1630#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1631 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1632 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1633 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549} 1635}
1550 1636
1551module_param(blimit, int, 0); 1637module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..bf8a6f9f134d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
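/*
 * Worked example, illustrative rather than part of this patch: with
 * CONFIG_RCU_FANOUT=64 and NR_CPUS=128, NR_CPUS exceeds RCU_FANOUT but
 * fits within RCU_FANOUT_SQ (4096), so the macros above evaluate to:
 *
 *	NUM_RCU_LVLS  = 2
 *	NUM_RCU_LVL_0 = 1			(single root rcu_node)
 *	NUM_RCU_LVL_1 = (128 + 63) / 64 = 2	(two leaf rcu_nodes)
 *	NUM_RCU_LVL_2 = 128			(the CPUs themselves)
 *	RCU_SUM       = 1 + 2 + 128 + 0 = 131
 *	NUM_RCU_NODES = 131 - 128 = 3
 *
 * i.e. a two-level tree: one root fanning out to two leaves, each leaf
 * covering up to 64 CPUs.
 */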
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
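/*
 * Illustrative timeline, not part of this patch: dynticks starts odd
 * (CPU not idle); rcu_enter_nohz() increments it to an even value and
 * rcu_exit_nohz() back to odd, e.g. 1 -> 2 (enter idle) -> 3 (exit).
 * A remote CPU that samples an even value, and later re-samples the
 * same value during the grace period, can report a quiescent state on
 * this CPU's behalf without disturbing it.
 */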
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
174
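/*
 * Illustrative sketch of how the segmented list above is used, not
 * part of this patch: __call_rcu() appends each new callback at the
 * RCU_NEXT tail, roughly:
 *
 *	head->func = func;
 *	head->next = NULL;
 *	*rdp->nxttail[RCU_NEXT_TAIL] = head;
 *	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 *	rdp->qlen++;
 *
 * As grace periods start and complete, the tail pointers advance so
 * that entries migrate toward the RCU_DONE segment, whose callbacks
 * rcu_do_batch() then invokes.
 */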
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
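/*
 * Illustrative layout, not part of this patch: for the 128-CPU,
 * fanout-64 example above, NUM_RCU_NODES == 3, so ->node[0] is the
 * root, ->node[1] and ->node[2] are the leaves, ->level[0] points at
 * ->node[0] and ->level[1] at ->node[1].  Each rda[cpu]->mynode then
 * references the leaf covering that CPU (node[1] for CPUs 0-63,
 * node[2] for CPUs 64-127).
 */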
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..47789369ea59
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs_record(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed;
72}
73
74/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd.
76 * If we are in an RCU read-side critical section, we need to reflect
77 * that in the state of the rcu_node structure corresponding to this CPU.
78 * Caller must disable hardirqs.
79 */
80static void rcu_preempt_qs(int cpu)
81{
82 struct task_struct *t = current;
83 int phase;
84 struct rcu_data *rdp;
85 struct rcu_node *rnp;
86
87 if (t->rcu_read_lock_nesting &&
88 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
89
90 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode;
93 spin_lock(&rnp->lock);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp;
96
97 /*
98 * If this CPU has already checked in, then this task
99 * will hold up the next grace period rather than the
100 * current grace period. Queue the task accordingly.
101 * If the task is queued for the current grace period
102 * (i.e., this CPU has not yet passed through a quiescent
103 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period
105 * cannot end.
106 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */
110 spin_unlock(&rnp->lock);
111 }
112
113 /*
114 * Either we were not in an RCU read-side critical section to
115 * begin with, or we have now recorded that critical section
116 * globally. Either way, we can now note a quiescent state
117 * for this CPU. Again, if we were in an RCU read-side critical
118 * section, and if that critical section was blocking the current
119 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period.
121 */
122 rcu_preempt_qs_record(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
124 RCU_READ_UNLOCK_GOT_QS);
125}
126
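/*
 * Worked example for the phase computation above, illustrative and not
 * part of this patch: the current grace period's queue is
 * blocked_tasks[rnp->gpnum & 0x1].  If this CPU's bit is still set in
 * rnp->qsmask (it has not yet passed a quiescent state),
 * !(rnp->qsmask & rdp->grpmask) is 0 and phase equals rnp->gpnum & 0x1,
 * so the task lands on the current GP's list and blocks it.  If the
 * CPU has already checked in, the bit is clear, phase selects the
 * other list, and the task only holds up the next grace period.
 */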
127/*
128 * Tree-preemptable RCU implementation for rcu_read_lock().
129 * Just increment ->rcu_read_lock_nesting, shared state will be updated
130 * if we block.
131 */
132void __rcu_read_lock(void)
133{
134 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
135 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
136}
137EXPORT_SYMBOL_GPL(__rcu_read_lock);
138
139static void rcu_read_unlock_special(struct task_struct *t)
140{
141 int empty;
142 unsigned long flags;
143 unsigned long mask;
144 struct rcu_node *rnp;
145 int special;
146
147 /* NMI handlers cannot block and cannot safely manipulate state. */
148 if (in_nmi())
149 return;
150
151 local_irq_save(flags);
152
153 /*
154 * If RCU core is waiting for this CPU to exit critical section,
155 * let it know that we have done so.
156 */
157 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
161 }
162
163 /* Hardware IRQ handlers cannot block. */
164 if (in_irq()) {
165 local_irq_restore(flags);
166 return;
167 }
168
169 /* Clean up if blocked during RCU read-side critical section. */
170 if (special & RCU_READ_UNLOCK_BLOCKED) {
171 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
172
173 /*
174 * Remove this task from the list it blocked on. The
175 * task can migrate while we acquire the lock, but at
176 * most one time. So at most two passes through loop.
177 */
178 for (;;) {
179 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock);
181 if (rnp == t->rcu_blocked_node)
182 break;
183 spin_unlock(&rnp->lock);
184 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
186 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL;
188
189 /*
190 * If this was the last task on the current list, and if
191 * we aren't waiting on any CPUs, report the quiescent state.
192 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
193 * drop rnp->lock and restore irq.
194 */
195 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197 t->rcu_read_unlock_special &=
198 ~(RCU_READ_UNLOCK_NEED_QS |
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
203 return;
204 }
205 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
211 return;
212 }
213 spin_unlock(&rnp->lock);
214 }
215 local_irq_restore(flags);
216}
217
218/*
219 * Tree-preemptable RCU implementation for rcu_read_unlock().
220 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
221 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
222 * invoke rcu_read_unlock_special() to clean up after a context switch
223 * in an RCU read-side critical section and other special cases.
224 */
225void __rcu_read_unlock(void)
226{
227 struct task_struct *t = current;
228
229 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
230 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
231 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
232 rcu_read_unlock_special(t);
233}
234EXPORT_SYMBOL_GPL(__rcu_read_unlock);
235
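/*
 * Illustrative usage, not part of this patch: a read-side critical
 * section looks identical under preemptable tree RCU; being preempted
 * while it is held is what the blocked_tasks[] machinery handles.
 * (global_ptr and do_something() are hypothetical names.)
 *
 *	rcu_read_lock();
 *	p = rcu_dereference(global_ptr);
 *	if (p)
 *		do_something(p);	/* may be preempted here */
 *	rcu_read_unlock();
 */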
236#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
237
238/*
239 * Scan the current list of tasks blocked within RCU read-side critical
240 * sections, printing out the tid of each.
241 */
242static void rcu_print_task_stall(struct rcu_node *rnp)
243{
244 unsigned long flags;
245 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1;
247 struct task_struct *t;
248
249 if (!list_empty(&rnp->blocked_tasks[phase])) {
250 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */
252 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid);
255 spin_unlock_irqrestore(&rnp->lock, flags);
256 }
257}
258
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260
261/*
262 * Check for preempted RCU readers for the specified rcu_node structure.
263 * If the caller needs a reliable answer, it must hold the rcu_node's
 264 * ->lock.
265 */
266static int rcu_preempted_readers(struct rcu_node *rnp)
267{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
269}
270
271#ifdef CONFIG_HOTPLUG_CPU
272
273/*
274 * Handle tasklist migration for case in which all CPUs covered by the
275 * specified rcu_node have gone offline. Move them up to the root
276 * rcu_node. The reason for not just moving them to the immediate
277 * parent is to remove the need for rcu_read_unlock_special() to
278 * make more than two attempts to acquire the target rcu_node's lock.
279 *
280 * The caller must hold rnp->lock with irqs disabled.
281 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp)
284{
285 int i;
286 struct list_head *lp;
287 struct list_head *lp_root;
288 struct rcu_node *rnp_root = rcu_get_root(rsp);
289 struct task_struct *tp;
290
291 if (rnp == rnp_root) {
292 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */
294 }
295
296 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the
298 * root rcu_node can be at most one ahead of the rest of the
299 * rcu_nodes in terms of gp_num value. This fact allows us to
300 * move the blocked_tasks[] array directly, element by element.
301 */
302 for (i = 0; i < 2; i++) {
303 lp = &rnp->blocked_tasks[i];
304 lp_root = &rnp_root->blocked_tasks[i];
305 while (!list_empty(lp)) {
306 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
307 spin_lock(&rnp_root->lock); /* irqs already disabled */
308 list_del(&tp->rcu_node_entry);
309 tp->rcu_blocked_node = rnp_root;
310 list_add(&tp->rcu_node_entry, lp_root);
311 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
312 }
313 }
314}
315
316/*
317 * Do CPU-offline processing for preemptable RCU.
318 */
319static void rcu_preempt_offline_cpu(int cpu)
320{
321 __rcu_offline_cpu(cpu, &rcu_preempt_state);
322}
323
324#endif /* #ifdef CONFIG_HOTPLUG_CPU */
325
326/*
327 * Check for a quiescent state from the current CPU. When a task blocks,
328 * the task is recorded in the corresponding CPU's rcu_node structure,
329 * which is checked elsewhere.
330 *
331 * Caller must disable hard irqs.
332 */
333static void rcu_preempt_check_callbacks(int cpu)
334{
335 struct task_struct *t = current;
336
337 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &=
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
340 rcu_preempt_qs_record(cpu);
341 return;
342 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352}
353
354/*
355 * Process callbacks for preemptable RCU.
356 */
357static void rcu_preempt_process_callbacks(void)
358{
359 __rcu_process_callbacks(&rcu_preempt_state,
360 &__get_cpu_var(rcu_preempt_data));
361}
362
363/*
364 * Queue a preemptable-RCU callback for invocation after a grace period.
365 */
366void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
367{
368 __call_rcu(head, func, &rcu_preempt_state);
369}
370EXPORT_SYMBOL_GPL(call_rcu);
371
372/*
373 * Check to see if there is any immediate preemptable-RCU-related work
374 * to be done.
375 */
376static int rcu_preempt_pending(int cpu)
377{
378 return __rcu_pending(&rcu_preempt_state,
379 &per_cpu(rcu_preempt_data, cpu));
380}
381
382/*
383 * Does preemptable RCU need the CPU to stay out of dynticks mode?
384 */
385static int rcu_preempt_needs_cpu(int cpu)
386{
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388}
389
390/*
391 * Initialize preemptable RCU's per-CPU data.
392 */
393static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
394{
395 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
396}
397
398/*
399 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep
402 * is enabled.
403 */
404void exit_rcu(void)
405{
406 struct task_struct *t = current;
407
408 if (t->rcu_read_lock_nesting == 0)
409 return;
410 t->rcu_read_lock_nesting = 1;
411 rcu_read_unlock();
412}
413
414#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
415
416/*
417 * Tell them what RCU they are running.
418 */
419static inline void rcu_bootup_announce(void)
420{
421 printk(KERN_INFO "Hierarchical RCU implementation.\n");
422}
423
424/*
425 * Return the number of RCU batches processed thus far for debug & stats.
426 */
427long rcu_batches_completed(void)
428{
429 return rcu_batches_completed_sched();
430}
431EXPORT_SYMBOL_GPL(rcu_batches_completed);
432
433/*
434 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states.
436 */
437static void rcu_preempt_qs(int cpu)
438{
439}
440
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
442
443/*
444 * Because preemptable RCU does not exist, we never have to check for
445 * tasks blocked within RCU read-side critical sections.
446 */
447static void rcu_print_task_stall(struct rcu_node *rnp)
448{
449}
450
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452
453/*
454 * Because preemptable RCU does not exist, there are never any preempted
455 * RCU readers.
456 */
457static int rcu_preempted_readers(struct rcu_node *rnp)
458{
459 return 0;
460}
461
462#ifdef CONFIG_HOTPLUG_CPU
463
464/*
465 * Because preemptable RCU does not exist, it never needs to migrate
466 * tasks that were blocked within RCU read-side critical sections.
467 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp)
470{
471}
472
473/*
474 * Because preemptable RCU does not exist, it never needs CPU-offline
475 * processing.
476 */
477static void rcu_preempt_offline_cpu(int cpu)
478{
479}
480
481#endif /* #ifdef CONFIG_HOTPLUG_CPU */
482
483/*
484 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check.
486 */
487void rcu_preempt_check_callbacks(int cpu)
488{
489}
490
491/*
492 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process.
494 */
495void rcu_preempt_process_callbacks(void)
496{
497}
498
499/*
500 * In classic RCU, call_rcu() is just call_rcu_sched().
501 */
502void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
503{
504 call_rcu_sched(head, func);
505}
506EXPORT_SYMBOL_GPL(call_rcu);
507
508/*
509 * Because preemptable RCU does not exist, it never has any work to do.
510 */
511static int rcu_preempt_pending(int cpu)
512{
513 return 0;
514}
515
516/*
517 * Because preemptable RCU does not exist, it never needs any CPU.
518 */
519static int rcu_preempt_needs_cpu(int cpu)
520{
521 return 0;
522}
523
524/*
525 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize.
527 */
528static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{
530}
531
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..0ea1bff69727 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
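With this change applied, rcuclassic_trace_init() keeps only the parent dentry and funnels every failure into one debugfs_remove_recursive() call instead of tracking each file. A condensed sketch of the resulting function, reconstructed from the hunk above (two of the six debugfs files shown; the others follow the same pattern):

static int __init rcuclassic_trace_init(void)
{
	struct dentry *retval;

	rcudir = debugfs_create_dir("rcu", NULL);
	if (!rcudir)
		goto free_out;

	retval = debugfs_create_file("rcudata", 0444, rcudir,
						NULL, &rcudata_fops);
	if (!retval)
		goto free_out;

	/* rcudata.csv, rcugp, rcuhier and rcu_pending are created the
	 * same way, each checking the same scratch pointer. */

	return 0;
free_out:
	/* One call tears down whatever subset was created. */
	debugfs_remove_recursive(rcudir);
	return 1;
}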
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..faf4d463bbff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 376
402#else 377#else
403 378
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
413{ 381{
@@ -537,14 +505,6 @@ struct root_domain {
537#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
538 struct cpupri cpupri; 506 struct cpupri cpupri;
539#endif 507#endif
540#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
541 /*
542 * Preferred wake up cpu nominated by sched_mc balance that will be
543 * used when most cpus are idle in the system indicating overall very
544 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
545 */
546 unsigned int sched_mc_preferred_wakeup_cpu;
547#endif
548}; 508};
549 509
550/* 510/*
@@ -616,6 +576,7 @@ struct rq {
616 576
617 unsigned char idle_at_tick; 577 unsigned char idle_at_tick;
618 /* For active balancing */ 578 /* For active balancing */
579 int post_schedule;
619 int active_balance; 580 int active_balance;
620 int push_cpu; 581 int push_cpu;
621 /* cpu of this runqueue: */ 582 /* cpu of this runqueue: */
@@ -626,6 +587,9 @@ struct rq {
626 587
627 struct task_struct *migration_thread; 588 struct task_struct *migration_thread;
628 struct list_head migration_queue; 589 struct list_head migration_queue;
590
591 u64 rt_avg;
592 u64 age_stamp;
629#endif 593#endif
630 594
631 /* calc_load related fields */ 595 /* calc_load related fields */
@@ -665,9 +629,10 @@ struct rq {
665 629
666static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
667 631
668static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
669{ 634{
670 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
671} 636}
672 637
673static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 658#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 659#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 660#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
661#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 662
697inline void update_rq_clock(struct rq *rq) 663inline void update_rq_clock(struct rq *rq)
698{ 664{
@@ -861,6 +827,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 827unsigned int sysctl_sched_shares_thresh = 4;
862 828
863/* 829/*
830 * period over which we average the RT time consumption, measured
831 * in ms.
832 *
833 * default: 1s
834 */
835const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
836
837/*
864 * period over which we measure -rt task cpu usage in us. 838 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 839 * default: 1s
866 */ 840 */
@@ -1278,12 +1252,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1252}
1279#endif /* CONFIG_NO_HZ */ 1253#endif /* CONFIG_NO_HZ */
1280 1254
1255static u64 sched_avg_period(void)
1256{
1257 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1258}
1259
1260static void sched_avg_update(struct rq *rq)
1261{
1262 s64 period = sched_avg_period();
1263
1264 while ((s64)(rq->clock - rq->age_stamp) > period) {
1265 rq->age_stamp += period;
1266 rq->rt_avg /= 2;
1267 }
1268}
1269
1270static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1271{
1272 rq->rt_avg += rt_delta;
1273 sched_avg_update(rq);
1274}
1275
1281#else /* !CONFIG_SMP */ 1276#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1277static void resched_task(struct task_struct *p)
1283{ 1278{
1284 assert_spin_locked(&task_rq(p)->lock); 1279 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1280 set_tsk_need_resched(p);
1286} 1281}
1282
1283static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1284{
1285}
1287#endif /* CONFIG_SMP */ 1286#endif /* CONFIG_SMP */
1288 1287
1289#if BITS_PER_LONG == 32 1288#if BITS_PER_LONG == 32
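sched_avg_update() above halves rq->rt_avg once for every half-window (sysctl_sched_time_avg / 2, i.e. 500ms by default) that has elapsed since age_stamp, so old RT activity decays geometrically. A small userspace model of that arithmetic (names and values invented for illustration):

#include <stdio.h>
#include <stdint.h>

#define PERIOD_NS (500ULL * 1000 * 1000)	/* half of a 1s averaging window */

struct toy_rq {
	uint64_t clock;		/* "now" in ns */
	uint64_t age_stamp;	/* start of the current decay window */
	uint64_t rt_avg;	/* decayed RT nanoseconds */
};

static void toy_avg_update(struct toy_rq *rq)
{
	while ((int64_t)(rq->clock - rq->age_stamp) > (int64_t)PERIOD_NS) {
		rq->age_stamp += PERIOD_NS;
		rq->rt_avg /= 2;
	}
}

int main(void)
{
	struct toy_rq rq = { .clock = 0, .age_stamp = 0, .rt_avg = 0 };

	rq.rt_avg += 400ULL * 1000 * 1000;	/* 400ms of RT time observed */
	rq.clock = 3ULL * PERIOD_NS / 2;	/* 0.75s later */
	toy_avg_update(&rq);			/* one half-period elapsed: 400ms -> 200ms */
	printf("rt_avg now %llu ms\n",
	       (unsigned long long)(rq.rt_avg / 1000000));
	return 0;
}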
@@ -1494,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)
1494#endif 1493#endif
1495 1494
1496#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
1497static unsigned long source_load(int cpu, int type); 1496/* Used instead of source_load when we know the type == 0 */
1498static unsigned long target_load(int cpu, int type); 1497static unsigned long weighted_cpuload(const int cpu)
1498{
1499 return cpu_rq(cpu)->load.weight;
1500}
1501
1502/*
1503 * Return a low guess at the load of a migration-source cpu weighted
1504 * according to the scheduling class and "nice" value.
1505 *
1506 * We want to under-estimate the load of migration sources, to
1507 * balance conservatively.
1508 */
1509static unsigned long source_load(int cpu, int type)
1510{
1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long total = weighted_cpuload(cpu);
1513
1514 if (type == 0 || !sched_feat(LB_BIAS))
1515 return total;
1516
1517 return min(rq->cpu_load[type-1], total);
1518}
1519
1520/*
1521 * Return a high guess at the load of a migration-target cpu weighted
1522 * according to the scheduling class and "nice" value.
1523 */
1524static unsigned long target_load(int cpu, int type)
1525{
1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long total = weighted_cpuload(cpu);
1528
1529 if (type == 0 || !sched_feat(LB_BIAS))
1530 return total;
1531
1532 return max(rq->cpu_load[type-1], total);
1533}
1534
1535static struct sched_group *group_of(int cpu)
1536{
1537 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1538
1539 if (!sd)
1540 return NULL;
1541
1542 return sd->groups;
1543}
1544
1545static unsigned long power_of(int cpu)
1546{
1547 struct sched_group *group = group_of(cpu);
1548
1549 if (!group)
1550 return SCHED_LOAD_SCALE;
1551
1552 return group->cpu_power;
1553}
1554
1499static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1500 1556
1501static unsigned long cpu_avg_load_per_task(int cpu) 1557static unsigned long cpu_avg_load_per_task(int cpu)
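source_load() and target_load() deliberately bias in opposite directions: a migration source is assumed to be at most as loaded as its decayed cpu_load history (min with the instantaneous weighted load), a target at least as loaded (max), so balancing errs on the side of not moving tasks. A tiny illustration with invented numbers:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	unsigned long hist_load = 1536;	/* cpu_load[type-1]: decayed history */
	unsigned long now_load  = 1024;	/* weighted_cpuload(): instantaneous */

	printf("source_load -> %lu\n", min_ul(hist_load, now_load));	/* 1024 */
	printf("target_load -> %lu\n", max_ul(hist_load, now_load));	/* 1536 */
	return 0;
}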
@@ -1513,28 +1569,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1569
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1570#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1571
1572struct update_shares_data {
1573 unsigned long rq_weight[NR_CPUS];
1574};
1575
1576static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1577
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1578static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1579
1518/* 1580/*
1519 * Calculate and set the cpu's group shares. 1581 * Calculate and set the cpu's group shares.
1520 */ 1582 */
1521static void 1583static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1584 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1585 unsigned long sd_rq_weight,
1586 struct update_shares_data *usd)
1524{ 1587{
1525 unsigned long shares; 1588 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1589 int boost = 0;
1527
1528 if (!tg->se[cpu])
1529 return;
1530 1590
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1591 rq_weight = usd->rq_weight[cpu];
1592 if (!rq_weight) {
1593 boost = 1;
1594 rq_weight = NICE_0_LOAD;
1595 }
1532 1596
1533 /* 1597 /*
1534 * \Sum shares * rq_weight 1598 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1599 * shares_i = -----------------------------
1536 * \Sum rq_weight 1600 * \Sum_j rq_weight_j
1537 *
1538 */ 1601 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1602 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1603 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1608,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1608 unsigned long flags;
1546 1609
1547 spin_lock_irqsave(&rq->lock, flags); 1610 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1611 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1612 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1613 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1614 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1615 }
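The formula in the comment splits a task group's total shares across CPUs in proportion to each CPU's runqueue weight (the result is then clamped to [MIN_SHARES, MAX_SHARES]). A worked example of that split, with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long sd_shares = 1024;				/* group's total shares */
	unsigned long rq_weight[3] = { 2048, 1024, 1024 };	/* per-cpu cfs weight */
	unsigned long sum = 0, i;

	for (i = 0; i < 3; i++)
		sum += rq_weight[i];
	for (i = 0; i < 3; i++)
		printf("cpu%lu: shares = %lu\n", i,
		       sd_shares * rq_weight[i] / sum);		/* 512, 256, 256 */
	return 0;
}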
@@ -1559,22 +1622,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1622 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1623static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1624{
1562 unsigned long weight, rq_weight = 0; 1625 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1626 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1627 struct sched_domain *sd = data;
1628 unsigned long flags;
1565 int i; 1629 int i;
1566 1630
1631 if (!tg->se[0])
1632 return 0;
1633
1634 local_irq_save(flags);
1635 usd = &__get_cpu_var(update_shares_data);
1636
1567 for_each_cpu(i, sched_domain_span(sd)) { 1637 for_each_cpu(i, sched_domain_span(sd)) {
1638 weight = tg->cfs_rq[i]->load.weight;
1639 usd->rq_weight[i] = weight;
1640
1568 /* 1641 /*
1569 * If there are currently no tasks on the cpu pretend there 1642 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1643 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1644 * run here it will not get delayed by group starvation.
1572 */ 1645 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1646 if (!weight)
1575 weight = NICE_0_LOAD; 1647 weight = NICE_0_LOAD;
1576 1648
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1649 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1650 shares += tg->cfs_rq[i]->shares;
1580 } 1651 }
@@ -1586,7 +1657,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1657 shares = tg->shares;
1587 1658
1588 for_each_cpu(i, sched_domain_span(sd)) 1659 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1660 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1661
1662 local_irq_restore(flags);
1590 1663
1591 return 0; 1664 return 0;
1592} 1665}
@@ -1616,8 +1689,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1689
1617static void update_shares(struct sched_domain *sd) 1690static void update_shares(struct sched_domain *sd)
1618{ 1691{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1692 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1693 u64 now;
1694
1695 if (root_task_group_empty())
1696 return;
1697
1698 now = cpu_clock(raw_smp_processor_id());
1699 elapsed = now - sd->last_update;
1621 1700
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1701 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1702 sd->last_update = now;
@@ -1627,6 +1706,9 @@ static void update_shares(struct sched_domain *sd)
1627 1706
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1707static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1708{
1709 if (root_task_group_empty())
1710 return;
1711
1630 spin_unlock(&rq->lock); 1712 spin_unlock(&rq->lock);
1631 update_shares(sd); 1713 update_shares(sd);
1632 spin_lock(&rq->lock); 1714 spin_lock(&rq->lock);
@@ -1634,6 +1716,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1716
1635static void update_h_load(long cpu) 1717static void update_h_load(long cpu)
1636{ 1718{
1719 if (root_task_group_empty())
1720 return;
1721
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1722 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1723}
1639 1724
@@ -1651,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1651 1736
1652#ifdef CONFIG_PREEMPT 1737#ifdef CONFIG_PREEMPT
1653 1738
1739static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1740
1654/* 1741/*
1655 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1742 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1656 * way at the expense of forcing extra atomic operations in all 1743 * way at the expense of forcing extra atomic operations in all
@@ -1915,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1915} 2002}
1916 2003
1917#ifdef CONFIG_SMP 2004#ifdef CONFIG_SMP
1918
1919/* Used instead of source_load when we know the type == 0 */
1920static unsigned long weighted_cpuload(const int cpu)
1921{
1922 return cpu_rq(cpu)->load.weight;
1923}
1924
1925/* 2005/*
1926 * Is this task likely cache-hot: 2006 * Is this task likely cache-hot:
1927 */ 2007 */
@@ -2195,186 +2275,6 @@ void kick_process(struct task_struct *p)
2195 preempt_enable(); 2275 preempt_enable();
2196} 2276}
2197EXPORT_SYMBOL_GPL(kick_process); 2277EXPORT_SYMBOL_GPL(kick_process);
2198
2199/*
2200 * Return a low guess at the load of a migration-source cpu weighted
2201 * according to the scheduling class and "nice" value.
2202 *
2203 * We want to under-estimate the load of migration sources, to
2204 * balance conservatively.
2205 */
2206static unsigned long source_load(int cpu, int type)
2207{
2208 struct rq *rq = cpu_rq(cpu);
2209 unsigned long total = weighted_cpuload(cpu);
2210
2211 if (type == 0 || !sched_feat(LB_BIAS))
2212 return total;
2213
2214 return min(rq->cpu_load[type-1], total);
2215}
2216
2217/*
2218 * Return a high guess at the load of a migration-target cpu weighted
2219 * according to the scheduling class and "nice" value.
2220 */
2221static unsigned long target_load(int cpu, int type)
2222{
2223 struct rq *rq = cpu_rq(cpu);
2224 unsigned long total = weighted_cpuload(cpu);
2225
2226 if (type == 0 || !sched_feat(LB_BIAS))
2227 return total;
2228
2229 return max(rq->cpu_load[type-1], total);
2230}
2231
2232/*
2233 * find_idlest_group finds and returns the least busy CPU group within the
2234 * domain.
2235 */
2236static struct sched_group *
2237find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2238{
2239 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2240 unsigned long min_load = ULONG_MAX, this_load = 0;
2241 int load_idx = sd->forkexec_idx;
2242 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2243
2244 do {
2245 unsigned long load, avg_load;
2246 int local_group;
2247 int i;
2248
2249 /* Skip over this group if it has no CPUs allowed */
2250 if (!cpumask_intersects(sched_group_cpus(group),
2251 &p->cpus_allowed))
2252 continue;
2253
2254 local_group = cpumask_test_cpu(this_cpu,
2255 sched_group_cpus(group));
2256
2257 /* Tally up the load of all CPUs in the group */
2258 avg_load = 0;
2259
2260 for_each_cpu(i, sched_group_cpus(group)) {
2261 /* Bias balancing toward cpus of our domain */
2262 if (local_group)
2263 load = source_load(i, load_idx);
2264 else
2265 load = target_load(i, load_idx);
2266
2267 avg_load += load;
2268 }
2269
2270 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group,
2272 avg_load * SCHED_LOAD_SCALE);
2273
2274 if (local_group) {
2275 this_load = avg_load;
2276 this = group;
2277 } else if (avg_load < min_load) {
2278 min_load = avg_load;
2279 idlest = group;
2280 }
2281 } while (group = group->next, group != sd->groups);
2282
2283 if (!idlest || 100*this_load < imbalance*min_load)
2284 return NULL;
2285 return idlest;
2286}
2287
2288/*
2289 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2290 */
2291static int
2292find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2293{
2294 unsigned long load, min_load = ULONG_MAX;
2295 int idlest = -1;
2296 int i;
2297
2298 /* Traverse only the allowed CPUs */
2299 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2300 load = weighted_cpuload(i);
2301
2302 if (load < min_load || (load == min_load && i == this_cpu)) {
2303 min_load = load;
2304 idlest = i;
2305 }
2306 }
2307
2308 return idlest;
2309}
2310
2311/*
2312 * sched_balance_self: balance the current task (running on cpu) in domains
2313 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2314 * SD_BALANCE_EXEC.
2315 *
2316 * Balance, ie. select the least loaded group.
2317 *
2318 * Returns the target CPU number, or the same CPU if no balancing is needed.
2319 *
2320 * preempt must be disabled.
2321 */
2322static int sched_balance_self(int cpu, int flag)
2323{
2324 struct task_struct *t = current;
2325 struct sched_domain *tmp, *sd = NULL;
2326
2327 for_each_domain(cpu, tmp) {
2328 /*
2329 * If power savings logic is enabled for a domain, stop there.
2330 */
2331 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2332 break;
2333 if (tmp->flags & flag)
2334 sd = tmp;
2335 }
2336
2337 if (sd)
2338 update_shares(sd);
2339
2340 while (sd) {
2341 struct sched_group *group;
2342 int new_cpu, weight;
2343
2344 if (!(sd->flags & flag)) {
2345 sd = sd->child;
2346 continue;
2347 }
2348
2349 group = find_idlest_group(sd, t, cpu);
2350 if (!group) {
2351 sd = sd->child;
2352 continue;
2353 }
2354
2355 new_cpu = find_idlest_cpu(group, t, cpu);
2356 if (new_cpu == -1 || new_cpu == cpu) {
2357 /* Now try balancing at a lower domain level of cpu */
2358 sd = sd->child;
2359 continue;
2360 }
2361
2362 /* Now try balancing at a lower domain level of new_cpu */
2363 cpu = new_cpu;
2364 weight = cpumask_weight(sched_domain_span(sd));
2365 sd = NULL;
2366 for_each_domain(cpu, tmp) {
2367 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2368 break;
2369 if (tmp->flags & flag)
2370 sd = tmp;
2371 }
2372 /* while loop will break here if sd == NULL */
2373 }
2374
2375 return cpu;
2376}
2377
2378#endif /* CONFIG_SMP */ 2278#endif /* CONFIG_SMP */
2379 2279
2380/** 2280/**
@@ -2412,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p,
2412 * 2312 *
2413 * returns failure only if the task is already active. 2313 * returns failure only if the task is already active.
2414 */ 2314 */
2415static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2315static int try_to_wake_up(struct task_struct *p, unsigned int state,
2316 int wake_flags)
2416{ 2317{
2417 int cpu, orig_cpu, this_cpu, success = 0; 2318 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags; 2319 unsigned long flags;
2419 long old_state;
2420 struct rq *rq; 2320 struct rq *rq;
2421 2321
2422 if (!sched_feat(SYNC_WAKEUPS)) 2322 if (!sched_feat(SYNC_WAKEUPS))
2423 sync = 0; 2323 wake_flags &= ~WF_SYNC;
2424
2425#ifdef CONFIG_SMP
2426 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2427 struct sched_domain *sd;
2428
2429 this_cpu = raw_smp_processor_id();
2430 cpu = task_cpu(p);
2431 2324
2432 for_each_domain(this_cpu, sd) { 2325 this_cpu = get_cpu();
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 update_shares(sd);
2435 break;
2436 }
2437 }
2438 }
2439#endif
2440 2326
2441 smp_wmb(); 2327 smp_wmb();
2442 rq = task_rq_lock(p, &flags); 2328 rq = task_rq_lock(p, &flags);
2443 update_rq_clock(rq); 2329 update_rq_clock(rq);
2444 old_state = p->state; 2330 if (!(p->state & state))
2445 if (!(old_state & state))
2446 goto out; 2331 goto out;
2447 2332
2448 if (p->se.on_rq) 2333 if (p->se.on_rq)
@@ -2450,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2450 2335
2451 cpu = task_cpu(p); 2336 cpu = task_cpu(p);
2452 orig_cpu = cpu; 2337 orig_cpu = cpu;
2453 this_cpu = smp_processor_id();
2454 2338
2455#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2456 if (unlikely(task_running(rq, p))) 2340 if (unlikely(task_running(rq, p)))
2457 goto out_activate; 2341 goto out_activate;
2458 2342
2459 cpu = p->sched_class->select_task_rq(p, sync); 2343 /*
2460 if (cpu != orig_cpu) { 2344 * In order to handle concurrent wakeups and release the rq->lock
2345 * we put the task in TASK_WAKING state.
2346 *
2347 * First fix up the nr_uninterruptible count:
2348 */
2349 if (task_contributes_to_load(p))
2350 rq->nr_uninterruptible--;
2351 p->state = TASK_WAKING;
2352 task_rq_unlock(rq, &flags);
2353
2354 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2355 if (cpu != orig_cpu)
2461 set_task_cpu(p, cpu); 2356 set_task_cpu(p, cpu);
2462 task_rq_unlock(rq, &flags);
2463 /* might preempt at this point */
2464 rq = task_rq_lock(p, &flags);
2465 old_state = p->state;
2466 if (!(old_state & state))
2467 goto out;
2468 if (p->se.on_rq)
2469 goto out_running;
2470 2357
2471 this_cpu = smp_processor_id(); 2358 rq = task_rq_lock(p, &flags);
2472 cpu = task_cpu(p); 2359 WARN_ON(p->state != TASK_WAKING);
2473 } 2360 cpu = task_cpu(p);
2474 2361
2475#ifdef CONFIG_SCHEDSTATS 2362#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count); 2363 schedstat_inc(rq, ttwu_count);
@@ -2490,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2490out_activate: 2377out_activate:
2491#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2492 schedstat_inc(p, se.nr_wakeups); 2379 schedstat_inc(p, se.nr_wakeups);
2493 if (sync) 2380 if (wake_flags & WF_SYNC)
2494 schedstat_inc(p, se.nr_wakeups_sync); 2381 schedstat_inc(p, se.nr_wakeups_sync);
2495 if (orig_cpu != cpu) 2382 if (orig_cpu != cpu)
2496 schedstat_inc(p, se.nr_wakeups_migrate); 2383 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2519,7 +2406,7 @@ out_activate:
2519 2406
2520out_running: 2407out_running:
2521 trace_sched_wakeup(rq, p, success); 2408 trace_sched_wakeup(rq, p, success);
2522 check_preempt_curr(rq, p, sync); 2409 check_preempt_curr(rq, p, wake_flags);
2523 2410
2524 p->state = TASK_RUNNING; 2411 p->state = TASK_RUNNING;
2525#ifdef CONFIG_SMP 2412#ifdef CONFIG_SMP
@@ -2528,6 +2415,7 @@ out_running:
2528#endif 2415#endif
2529out: 2416out:
2530 task_rq_unlock(rq, &flags); 2417 task_rq_unlock(rq, &flags);
2418 put_cpu();
2531 2419
2532 return success; 2420 return success;
2533} 2421}
@@ -2570,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)
2570 p->se.avg_overlap = 0; 2458 p->se.avg_overlap = 0;
2571 p->se.start_runtime = 0; 2459 p->se.start_runtime = 0;
2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2460 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2461 p->se.avg_running = 0;
2573 2462
2574#ifdef CONFIG_SCHEDSTATS 2463#ifdef CONFIG_SCHEDSTATS
2575 p->se.wait_start = 0; 2464 p->se.wait_start = 0;
@@ -2631,18 +2520,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
2631 2520
2632 __sched_fork(p); 2521 __sched_fork(p);
2633 2522
2634#ifdef CONFIG_SMP
2635 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2636#endif
2637 set_task_cpu(p, cpu);
2638
2639 /* 2523 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2524 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2525 */
2642 p->prio = current->normal_prio; 2526 p->prio = current->normal_prio;
2527
2528 /*
2529 * Revert to default priority/policy on fork if requested.
2530 */
2531 if (unlikely(p->sched_reset_on_fork)) {
2532 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2533 p->policy = SCHED_NORMAL;
2534
2535 if (p->normal_prio < DEFAULT_PRIO)
2536 p->prio = DEFAULT_PRIO;
2537
2538 if (PRIO_TO_NICE(p->static_prio) < 0) {
2539 p->static_prio = NICE_TO_PRIO(0);
2540 set_load_weight(p);
2541 }
2542
2543 /*
2544 * We don't need the reset flag anymore after the fork. It has
2545 * fulfilled its duty:
2546 */
2547 p->sched_reset_on_fork = 0;
2548 }
2549
2643 if (!rt_prio(p->prio)) 2550 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2551 p->sched_class = &fair_sched_class;
2645 2552
2553#ifdef CONFIG_SMP
2554 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2555#endif
2556 set_task_cpu(p, cpu);
2557
2646#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2558#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2647 if (likely(sched_info_on())) 2559 if (likely(sched_info_on()))
2648 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2560 memset(&p->sched_info, 0, sizeof(p->sched_info));
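The sched_reset_on_fork handling above is what backs the SCHED_RESET_ON_FORK flag: an RT parent can ask that its children start out with RT policy dropped and any negative nice reset to 0. A minimal userspace sketch of requesting it (needs root or CAP_SYS_NICE; the 0x40000000 value is the flag's definition from linux/sched.h, repeated here only because glibc headers may not carry it):

#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000	/* from linux/sched.h */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Run FIFO ourselves, but have the kernel strip RT policy and
	 * priority from any children we fork. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) == -1)
		perror("sched_setscheduler");

	if (fork() == 0) {
		printf("child policy: %d (expected 0, i.e. SCHED_OTHER)\n",
		       sched_getscheduler(0));
		_exit(0);
	}
	wait(NULL);
	return 0;
}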
@@ -2688,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2688 inc_nr_running(rq); 2600 inc_nr_running(rq);
2689 } 2601 }
2690 trace_sched_wakeup_new(rq, p, 1); 2602 trace_sched_wakeup_new(rq, p, 1);
2691 check_preempt_curr(rq, p, 0); 2603 check_preempt_curr(rq, p, WF_FORK);
2692#ifdef CONFIG_SMP 2604#ifdef CONFIG_SMP
2693 if (p->sched_class->task_wake_up) 2605 if (p->sched_class->task_wake_up)
2694 p->sched_class->task_wake_up(rq, p); 2606 p->sched_class->task_wake_up(rq, p);
@@ -2796,12 +2708,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2708{
2797 struct mm_struct *mm = rq->prev_mm; 2709 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2710 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2711
2806 rq->prev_mm = NULL; 2712 rq->prev_mm = NULL;
2807 2713
@@ -2820,10 +2726,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2820 finish_arch_switch(prev); 2726 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2727 perf_counter_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2728 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2729
2828 fire_sched_in_preempt_notifiers(current); 2730 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2731 if (mm)
@@ -2838,6 +2740,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2740 }
2839} 2741}
2840 2742
2743#ifdef CONFIG_SMP
2744
2745/* assumes rq->lock is held */
2746static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2747{
2748 if (prev->sched_class->pre_schedule)
2749 prev->sched_class->pre_schedule(rq, prev);
2750}
2751
2752/* rq->lock is NOT held, but preemption is disabled */
2753static inline void post_schedule(struct rq *rq)
2754{
2755 if (rq->post_schedule) {
2756 unsigned long flags;
2757
2758 spin_lock_irqsave(&rq->lock, flags);
2759 if (rq->curr->sched_class->post_schedule)
2760 rq->curr->sched_class->post_schedule(rq);
2761 spin_unlock_irqrestore(&rq->lock, flags);
2762
2763 rq->post_schedule = 0;
2764 }
2765}
2766
2767#else
2768
2769static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2770{
2771}
2772
2773static inline void post_schedule(struct rq *rq)
2774{
2775}
2776
2777#endif
2778
2841/** 2779/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2780 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2781 * @prev: the thread we just switched away from.
@@ -2848,6 +2786,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2786 struct rq *rq = this_rq();
2849 2787
2850 finish_task_switch(rq, prev); 2788 finish_task_switch(rq, prev);
2789
2790 /*
2791 * FIXME: do we need to worry about rq being invalidated by the
2792 * task_switch?
2793 */
2794 post_schedule(rq);
2795
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2796#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2797 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2798 preempt_enable();
@@ -3164,7 +3109,7 @@ out:
3164void sched_exec(void) 3109void sched_exec(void)
3165{ 3110{
3166 int new_cpu, this_cpu = get_cpu(); 3111 int new_cpu, this_cpu = get_cpu();
3167 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3112 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3168 put_cpu(); 3113 put_cpu();
3169 if (new_cpu != this_cpu) 3114 if (new_cpu != this_cpu)
3170 sched_migrate_task(current, new_cpu); 3115 sched_migrate_task(current, new_cpu);
@@ -3379,9 +3324,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3324{
3380 const struct sched_class *class; 3325 const struct sched_class *class;
3381 3326
3382 for (class = sched_class_highest; class; class = class->next) 3327 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3328 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3329 return 1;
3330 }
3385 3331
3386 return 0; 3332 return 0;
3387} 3333}
@@ -3544,7 +3490,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3490 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3491 * from other group and save more power
3546 */ 3492 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3493 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3494 return;
3549 3495
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3496 if (sgs->sum_nr_running > sds->leader_nr_running ||
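The rewritten capacity check is not purely cosmetic for unsigned types: the two forms differ exactly when sgs->group_capacity is 0 (which becomes possible once capacity is derived from cpu_power), because group_capacity - 1 then wraps around. A quick demonstration:

#include <stdio.h>

int main(void)
{
	unsigned long sum_nr_running = 3, group_capacity = 0;

	/* Old form: 0 - 1 wraps to ULONG_MAX, so the test never fires. */
	printf("a > cap - 1 : %d\n", sum_nr_running > group_capacity - 1);	/* 0 */
	/* New form: behaves as intended. */
	printf("a + 1 > cap : %d\n", sum_nr_running + 1 > group_capacity);	/* 1 */
	return 0;
}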
@@ -3583,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3583 *imbalance = sds->min_load_per_task; 3529 *imbalance = sds->min_load_per_task;
3584 sds->busiest = sds->group_min; 3530 sds->busiest = sds->group_min;
3585 3531
3586 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3587 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3588 group_first_cpu(sds->group_leader);
3589 }
3590
3591 return 1; 3532 return 1;
3592 3533
3593} 3534}
@@ -3612,6 +3553,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3553#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3554
3614 3555
3556unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3557{
3558 return SCHED_LOAD_SCALE;
3559}
3560
3561unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3562{
3563 return default_scale_freq_power(sd, cpu);
3564}
3565
3566unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3567{
3568 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3569 unsigned long smt_gain = sd->smt_gain;
3570
3571 smt_gain /= weight;
3572
3573 return smt_gain;
3574}
3575
3576unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3577{
3578 return default_scale_smt_power(sd, cpu);
3579}
3580
3581unsigned long scale_rt_power(int cpu)
3582{
3583 struct rq *rq = cpu_rq(cpu);
3584 u64 total, available;
3585
3586 sched_avg_update(rq);
3587
3588 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3589 available = total - rq->rt_avg;
3590
3591 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3592 total = SCHED_LOAD_SCALE;
3593
3594 total >>= SCHED_LOAD_SHIFT;
3595
3596 return div_u64(available, total);
3597}
3598
3599static void update_cpu_power(struct sched_domain *sd, int cpu)
3600{
3601 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3602 unsigned long power = SCHED_LOAD_SCALE;
3603 struct sched_group *sdg = sd->groups;
3604
3605 if (sched_feat(ARCH_POWER))
3606 power *= arch_scale_freq_power(sd, cpu);
3607 else
3608 power *= default_scale_freq_power(sd, cpu);
3609
3610 power >>= SCHED_LOAD_SHIFT;
3611
3612 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3613 if (sched_feat(ARCH_POWER))
3614 power *= arch_scale_smt_power(sd, cpu);
3615 else
3616 power *= default_scale_smt_power(sd, cpu);
3617
3618 power >>= SCHED_LOAD_SHIFT;
3619 }
3620
3621 power *= scale_rt_power(cpu);
3622 power >>= SCHED_LOAD_SHIFT;
3623
3624 if (!power)
3625 power = 1;
3626
3627 sdg->cpu_power = power;
3628}
3629
3630static void update_group_power(struct sched_domain *sd, int cpu)
3631{
3632 struct sched_domain *child = sd->child;
3633 struct sched_group *group, *sdg = sd->groups;
3634 unsigned long power;
3635
3636 if (!child) {
3637 update_cpu_power(sd, cpu);
3638 return;
3639 }
3640
3641 power = 0;
3642
3643 group = child->groups;
3644 do {
3645 power += group->cpu_power;
3646 group = group->next;
3647 } while (group != child->groups);
3648
3649 sdg->cpu_power = power;
3650}
3651
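update_cpu_power() above composes several 10-bit fixed-point factors: starting from SCHED_LOAD_SCALE (1024), each step multiplies by a factor and shifts right by SCHED_LOAD_SHIFT (10), ending with the RT-time factor returned by scale_rt_power(). A worked example of that arithmetic, with invented factor values:

#include <stdio.h>

#define SCALE	1024UL
#define SHIFT	10

int main(void)
{
	unsigned long power = SCALE;
	unsigned long freq_factor = SCALE;		/* no frequency scaling */
	unsigned long smt_factor  = SCALE / 2;		/* two HW threads share the core */
	unsigned long rt_factor   = 768;		/* ~25% of time eaten by RT tasks */

	power = (power * freq_factor) >> SHIFT;		/* 1024 */
	power = (power * smt_factor)  >> SHIFT;		/* 512  */
	power = (power * rt_factor)   >> SHIFT;		/* 384  */
	if (!power)
		power = 1;
	printf("cpu_power = %lu (out of %lu)\n", power, SCALE);
	return 0;
}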
3615/** 3652/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3653 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3617 * @group: sched_group whose statistics are to be updated. 3654 * @group: sched_group whose statistics are to be updated.
@@ -3624,7 +3661,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3661 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3662 * @sgs: variable to hold the statistics for this group.
3626 */ 3663 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3664static inline void update_sg_lb_stats(struct sched_domain *sd,
3665 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3666 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3667 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3668 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3673,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3673 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3674 unsigned long avg_load_per_task;
3637 3675
3638 if (local_group) 3676 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3677 balance_cpu = group_first_cpu(group);
3678 if (balance_cpu == this_cpu)
3679 update_group_power(sd, this_cpu);
3680 }
3640 3681
3641 /* Tally up the load of all CPUs in the group */ 3682 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3683 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3726,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3726 }
3686 3727
3687 /* Adjust by relative CPU power of the group */ 3728 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3729 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3730
3691 3731
3692 /* 3732 /*
@@ -3698,14 +3738,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3738 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3739 * the hierarchy?
3700 */ 3740 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3741 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3742 group->cpu_power;
3703 3743
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3744 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3745 sgs->group_imb = 1;
3706 3746
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3747 sgs->group_capacity =
3708 3748 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3749}
3710 3750
3711/** 3751/**
@@ -3723,9 +3763,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3763 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3764 struct sd_lb_stats *sds)
3725{ 3765{
3766 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3767 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3768 struct sg_lb_stats sgs;
3728 int load_idx; 3769 int load_idx, prefer_sibling = 0;
3770
3771 if (child && child->flags & SD_PREFER_SIBLING)
3772 prefer_sibling = 1;
3729 3773
3730 init_sd_power_savings_stats(sd, sds, idle); 3774 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3775 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3780,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3780 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3781 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3782 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3783 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3784 local_group, cpus, balance, &sgs);
3741 3785
3742 if (local_group && balance && !(*balance)) 3786 if (local_group && balance && !(*balance))
3743 return; 3787 return;
3744 3788
3745 sds->total_load += sgs.group_load; 3789 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3790 sds->total_pwr += group->cpu_power;
3791
3792 /*
3793 * In case the child domain prefers tasks go to siblings
3794 * first, lower the group capacity to one so that we'll try
3795 * and move all the excess tasks away.
3796 */
3797 if (prefer_sibling)
3798 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3799
3748 if (local_group) { 3800 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3801 sds->this_load = sgs.avg_load;
@@ -3763,7 +3815,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3815 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3816 group = group->next;
3765 } while (group != sd->groups); 3817 } while (group != sd->groups);
3766
3767} 3818}
3768 3819
3769/** 3820/**
@@ -3801,28 +3852,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3852 * moving them.
3802 */ 3853 */
3803 3854
3804 pwr_now += sds->busiest->__cpu_power * 3855 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3856 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3857 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3858 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3859 pwr_now /= SCHED_LOAD_SCALE;
3809 3860
3810 /* Amount of load we'd subtract */ 3861 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3862 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3863 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3864 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3865 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 3866 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 3867
3817 /* Amount of load we'd add */ 3868 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 3869 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3870 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 3871 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 3872 sds->this->cpu_power;
3822 else 3873 else
3823 tmp = sg_div_cpu_power(sds->this, 3874 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3875 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 3876 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 3877 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 3878 pwr_move /= SCHED_LOAD_SCALE;
3828 3879
@@ -3857,8 +3908,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 3908 sds->max_load - sds->busiest_load_per_task);
3858 3909
3859 /* How much load to actually move to equalise the imbalance */ 3910 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3911 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3912 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 3913 / SCHED_LOAD_SCALE;
3863 3914
3864 /* 3915 /*
@@ -3988,15 +4039,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4039 int i;
3989 4040
3990 for_each_cpu(i, sched_group_cpus(group)) { 4041 for_each_cpu(i, sched_group_cpus(group)) {
4042 unsigned long power = power_of(i);
4043 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4044 unsigned long wl;
3992 4045
3993 if (!cpumask_test_cpu(i, cpus)) 4046 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4047 continue;
3995 4048
3996 rq = cpu_rq(i); 4049 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4050 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4051 wl /= power;
3998 4052
3999 if (rq->nr_running == 1 && wl > imbalance) 4053 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4054 continue;
4001 4055
4002 if (wl > max_load) { 4056 if (wl > max_load) {
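find_busiest_queue() now expresses each runqueue's load relative to its cpu_power, wl = weighted_cpuload * SCHED_LOAD_SCALE / power, so a weak or RT-loaded CPU with the same raw queue weight looks busier than a full-power one. Example with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long scale = 1024;
	unsigned long raw_load = 2048;		/* e.g. two nice-0 tasks */
	unsigned long full_power = 1024;	/* a whole core to itself */
	unsigned long half_power = 512;		/* SMT sibling or RT-heavy cpu */

	printf("full-power cpu: wl = %lu\n", raw_load * scale / full_power);	/* 2048 */
	printf("half-power cpu: wl = %lu\n", raw_load * scale / half_power);	/* 4096 */
	return 0;
}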
@@ -5257,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
5257#endif 5311#endif
5258} 5312}
5259 5313
5260static void put_prev_task(struct rq *rq, struct task_struct *prev) 5314static void put_prev_task(struct rq *rq, struct task_struct *p)
5261{ 5315{
5262 if (prev->state == TASK_RUNNING) { 5316 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5263 u64 runtime = prev->se.sum_exec_runtime;
5264 5317
5265 runtime -= prev->se.prev_sum_exec_runtime; 5318 update_avg(&p->se.avg_running, runtime);
5266 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5267 5319
5320 if (p->state == TASK_RUNNING) {
5268 /* 5321 /*
5269 * In order to avoid avg_overlap growing stale when we are 5322 * In order to avoid avg_overlap growing stale when we are
5270 * indeed overlapping and hence not getting put to sleep, grow 5323 * indeed overlapping and hence not getting put to sleep, grow
@@ -5274,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5274 * correlates to the amount of cache footprint a task can 5327 * correlates to the amount of cache footprint a task can
5275 * build up. 5328 * build up.
5276 */ 5329 */
5277 update_avg(&prev->se.avg_overlap, runtime); 5330 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5331 update_avg(&p->se.avg_overlap, runtime);
5332 } else {
5333 update_avg(&p->se.avg_running, 0);
5278 } 5334 }
5279 prev->sched_class->put_prev_task(rq, prev); 5335 p->sched_class->put_prev_task(rq, p);
5280} 5336}
5281 5337
5282/* 5338/*
@@ -5325,7 +5381,7 @@ need_resched:
5325 preempt_disable(); 5381 preempt_disable();
5326 cpu = smp_processor_id(); 5382 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5383 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5384 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5385 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5386 switch_count = &prev->nivcsw;
5331 5387
@@ -5349,10 +5405,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5405 switch_count = &prev->nvcsw;
5350 } 5406 }
5351 5407
5352#ifdef CONFIG_SMP 5408 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5409
5357 if (unlikely(!rq->nr_running)) 5410 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5411 idle_balance(cpu, rq);
@@ -5378,6 +5431,8 @@ need_resched_nonpreemptible:
5378 } else 5431 } else
5379 spin_unlock_irq(&rq->lock); 5432 spin_unlock_irq(&rq->lock);
5380 5433
5434 post_schedule(rq);
5435
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5436 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5437 goto need_resched_nonpreemptible;
5383 5438
@@ -5509,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5509 5564
5510#endif /* CONFIG_PREEMPT */ 5565#endif /* CONFIG_PREEMPT */
5511 5566
5512int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5567int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5513 void *key) 5568 void *key)
5514{ 5569{
5515 return try_to_wake_up(curr->private, mode, sync); 5570 return try_to_wake_up(curr->private, mode, wake_flags);
5516} 5571}
5517EXPORT_SYMBOL(default_wake_function); 5572EXPORT_SYMBOL(default_wake_function);
5518 5573
@@ -5526,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
5526 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5581 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5527 */ 5582 */
5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5583static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5529 int nr_exclusive, int sync, void *key) 5584 int nr_exclusive, int wake_flags, void *key)
5530{ 5585{
5531 wait_queue_t *curr, *next; 5586 wait_queue_t *curr, *next;
5532 5587
5533 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5588 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5534 unsigned flags = curr->flags; 5589 unsigned flags = curr->flags;
5535 5590
5536 if (curr->func(curr, mode, sync, key) && 5591 if (curr->func(curr, mode, wake_flags, key) &&
5537 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5538 break; 5593 break;
5539 } 5594 }
@@ -5594,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5594 int nr_exclusive, void *key) 5649 int nr_exclusive, void *key)
5595{ 5650{
5596 unsigned long flags; 5651 unsigned long flags;
5597 int sync = 1; 5652 int wake_flags = WF_SYNC;
5598 5653
5599 if (unlikely(!q)) 5654 if (unlikely(!q))
5600 return; 5655 return;
5601 5656
5602 if (unlikely(!nr_exclusive)) 5657 if (unlikely(!nr_exclusive))
5603 sync = 0; 5658 wake_flags = 0;
5604 5659
5605 spin_lock_irqsave(&q->lock, flags); 5660 spin_lock_irqsave(&q->lock, flags);
5606 __wake_up_common(q, mode, nr_exclusive, sync, key); 5661 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5607 spin_unlock_irqrestore(&q->lock, flags); 5662 spin_unlock_irqrestore(&q->lock, flags);
5608} 5663}
5609EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5664EXPORT_SYMBOL_GPL(__wake_up_sync_key);
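The boolean sync argument threaded through the wake-up paths above becomes a wake_flags bitmask, with WF_SYNC taking the place of sync == 1. A hedged sketch of a custom wait-queue callback under the new signature (illustrative, not from the patch):

static int my_wake_function(wait_queue_t *wait, unsigned mode,
                            int wake_flags, void *key)
{
        if (wake_flags & WF_SYNC) {
                /* synchronous wakeup: the waker expects to sleep soon,
                 * so the scheduler may prefer the waker's CPU */
        }
        return autoremove_wake_function(wait, mode, wake_flags, key);
}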
@@ -6123,17 +6178,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6178 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6179 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6180 struct rq *rq;
6181 int reset_on_fork;
6126 6182
6127 /* may grab non-irq protected spin_locks */ 6183 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6184 BUG_ON(in_interrupt());
6129recheck: 6185recheck:
6130 /* double check policy once rq lock held */ 6186 /* double check policy once rq lock held */
6131 if (policy < 0) 6187 if (policy < 0) {
6188 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6189 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6190 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6191 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6192 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6193
6194 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6195 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6196 policy != SCHED_IDLE)
6197 return -EINVAL;
6198 }
6199
6137 /* 6200 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6201 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6202 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6240,10 @@ recheck:
6177 /* can't change other user's priorities */ 6240 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6241 if (!check_same_owner(p))
6179 return -EPERM; 6242 return -EPERM;
6243
6244 /* Normal users shall not reset the sched_reset_on_fork flag */
6245 if (p->sched_reset_on_fork && !reset_on_fork)
6246 return -EPERM;
6180 } 6247 }
6181 6248
6182 if (user) { 6249 if (user) {
@@ -6220,6 +6287,8 @@ recheck:
6220 if (running) 6287 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6288 p->sched_class->put_prev_task(rq, p);
6222 6289
6290 p->sched_reset_on_fork = reset_on_fork;
6291
6223 oldprio = p->prio; 6292 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6293 __setscheduler(rq, p, policy, param->sched_priority);
6225 6294
@@ -6336,14 +6405,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6405 if (p) {
6337 retval = security_task_getscheduler(p); 6406 retval = security_task_getscheduler(p);
6338 if (!retval) 6407 if (!retval)
6339 retval = p->policy; 6408 retval = p->policy
6409 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6410 }
6341 read_unlock(&tasklist_lock); 6411 read_unlock(&tasklist_lock);
6342 return retval; 6412 return retval;
6343} 6413}
6344 6414
6345/** 6415/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6416 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6417 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6418 * @param: structure containing the RT priority.
6349 */ 6419 */
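Taken together, the hunks above let a caller OR a reset-on-fork request into the policy argument, keep unprivileged users from clearing it, and report it back from sched_getscheduler(). A hedged userspace-style illustration, assuming the SCHED_RESET_ON_FORK value introduced by this series:

#include <sched.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000      /* assumed value */
#endif

/* request FIFO scheduling for the current task without letting children
 * inherit the realtime policy across fork() */
static int make_rt_but_not_children(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        return sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp);
}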
@@ -6571,19 +6641,9 @@ static inline int should_resched(void)
6571 6641
6572static void __cond_resched(void) 6642static void __cond_resched(void)
6573{ 6643{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6644 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6645 schedule();
6576#endif 6646 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6647}
6588 6648
6589int __sched _cond_resched(void) 6649int __sched _cond_resched(void)
@@ -6597,18 +6657,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6657EXPORT_SYMBOL(_cond_resched);
6598 6658
6599/* 6659/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6660 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6661 * call schedule, and on return reacquire the lock.
6602 * 6662 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6663 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6664 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6665 * spin_unlock(), once by hand).
6606 */ 6666 */
6607int cond_resched_lock(spinlock_t *lock) 6667int __cond_resched_lock(spinlock_t *lock)
6608{ 6668{
6609 int resched = should_resched(); 6669 int resched = should_resched();
6610 int ret = 0; 6670 int ret = 0;
6611 6671
6672 lockdep_assert_held(lock);
6673
6612 if (spin_needbreak(lock) || resched) { 6674 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6675 spin_unlock(lock);
6614 if (resched) 6676 if (resched)
@@ -6620,9 +6682,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6682 }
6621 return ret; 6683 return ret;
6622} 6684}
6623EXPORT_SYMBOL(cond_resched_lock); 6685EXPORT_SYMBOL(__cond_resched_lock);
6624 6686
6625int __sched cond_resched_softirq(void) 6687int __sched __cond_resched_softirq(void)
6626{ 6688{
6627 BUG_ON(!in_softirq()); 6689 BUG_ON(!in_softirq());
6628 6690
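With cond_resched_lock() now backed by __cond_resched_lock() (and lockdep verifying that the lock is really held), a typical caller looks like the hedged sketch below; the loop may transparently drop and re-take the lock whenever a reschedule is due (illustrative, not from the patch):

static void drain_list(struct list_head *head, spinlock_t *lock)
{
        spin_lock(lock);
        while (!list_empty(head)) {
                struct list_head *entry = head->next;

                list_del_init(entry);
                /* may release 'lock', schedule, and re-acquire it */
                cond_resched_lock(lock);
        }
        spin_unlock(lock);
}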
@@ -6634,7 +6696,7 @@ int __sched cond_resched_softirq(void)
6634 } 6696 }
6635 return 0; 6697 return 0;
6636} 6698}
6637EXPORT_SYMBOL(cond_resched_softirq); 6699EXPORT_SYMBOL(__cond_resched_softirq);
6638 6700
6639/** 6701/**
6640 * yield - yield the current processor to other threads. 6702 * yield - yield the current processor to other threads.
@@ -6658,11 +6720,13 @@ EXPORT_SYMBOL(yield);
6658 */ 6720 */
6659void __sched io_schedule(void) 6721void __sched io_schedule(void)
6660{ 6722{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6723 struct rq *rq = raw_rq();
6662 6724
6663 delayacct_blkio_start(); 6725 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6726 atomic_inc(&rq->nr_iowait);
6727 current->in_iowait = 1;
6665 schedule(); 6728 schedule();
6729 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6730 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6731 delayacct_blkio_end();
6668} 6732}
@@ -6670,12 +6734,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6734
6671long __sched io_schedule_timeout(long timeout) 6735long __sched io_schedule_timeout(long timeout)
6672{ 6736{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6737 struct rq *rq = raw_rq();
6674 long ret; 6738 long ret;
6675 6739
6676 delayacct_blkio_start(); 6740 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6741 atomic_inc(&rq->nr_iowait);
6742 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6743 ret = schedule_timeout(timeout);
6744 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6745 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6746 delayacct_blkio_end();
6681 return ret; 6747 return ret;
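Both io_schedule() variants now mark the sleeping task with in_iowait, which is what lets the fair class attribute the blocked time to iowait (see the sched_fair.c hunks further down). A hedged sketch of the same pattern in a driver-style wait helper; real callers would normally just use io_schedule_timeout() itself:

static long wait_for_device(long timeout)
{
        long ret;

        current->in_iowait = 1;
        ret = schedule_timeout_uninterruptible(timeout);
        current->in_iowait = 0;
        return ret;
}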
@@ -6992,8 +7058,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7058
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7059 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7060 /* Need help from migration thread: drop lock and wait. */
7061 struct task_struct *mt = rq->migration_thread;
7062
7063 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7064 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7065 wake_up_process(rq->migration_thread);
7066 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7067 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7068 tlb_migrate_finish(p->mm);
6999 return 0; 7069 return 0;
@@ -7051,6 +7121,11 @@ fail:
7051 return ret; 7121 return ret;
7052} 7122}
7053 7123
7124#define RCU_MIGRATION_IDLE 0
7125#define RCU_MIGRATION_NEED_QS 1
7126#define RCU_MIGRATION_GOT_QS 2
7127#define RCU_MIGRATION_MUST_SYNC 3
7128
7054/* 7129/*
7055 * migration_thread - this is a highprio system thread that performs 7130 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7131 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7133,7 @@ fail:
7058 */ 7133 */
7059static int migration_thread(void *data) 7134static int migration_thread(void *data)
7060{ 7135{
7136 int badcpu;
7061 int cpu = (long)data; 7137 int cpu = (long)data;
7062 struct rq *rq; 7138 struct rq *rq;
7063 7139
@@ -7092,8 +7168,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7168 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7169 list_del_init(head->next);
7094 7170
7095 spin_unlock(&rq->lock); 7171 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7172 spin_unlock(&rq->lock);
7173 __migrate_task(req->task, cpu, req->dest_cpu);
7174 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7175 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7176 spin_unlock(&rq->lock);
7177 } else {
7178 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7179 spin_unlock(&rq->lock);
7180 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7181 }
7097 local_irq_enable(); 7182 local_irq_enable();
7098 7183
7099 complete(&req->done); 7184 complete(&req->done);
@@ -7625,7 +7710,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7710 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7711 register_cpu_notifier(&migration_notifier);
7627 7712
7628 return err; 7713 return 0;
7629} 7714}
7630early_initcall(migration_init); 7715early_initcall(migration_init);
7631#endif 7716#endif
@@ -7672,7 +7757,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7757 break;
7673 } 7758 }
7674 7759
7675 if (!group->__cpu_power) { 7760 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7761 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7762 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7763 "set\n");
@@ -7696,9 +7781,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7781 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7782
7698 printk(KERN_CONT " %s", str); 7783 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7784 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7785 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7786 group->cpu_power);
7702 } 7787 }
7703 7788
7704 group = group->next; 7789 group = group->next;
@@ -7763,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd)
7763 } 7848 }
7764 7849
7765 /* Following flags don't use groups */ 7850 /* Following flags don't use groups */
7766 if (sd->flags & (SD_WAKE_IDLE | 7851 if (sd->flags & (SD_WAKE_AFFINE))
7767 SD_WAKE_AFFINE |
7768 SD_WAKE_BALANCE))
7769 return 0; 7852 return 0;
7770 7853
7771 return 1; 7854 return 1;
@@ -7782,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7782 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7865 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7783 return 0; 7866 return 0;
7784 7867
7785 /* Does parent contain flags not in child? */
7786 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7787 if (cflags & SD_WAKE_AFFINE)
7788 pflags &= ~SD_WAKE_BALANCE;
7789 /* Flags needing groups don't count if only 1 group in parent */ 7868 /* Flags needing groups don't count if only 1 group in parent */
7790 if (parent->groups == parent->groups->next) { 7869 if (parent->groups == parent->groups->next) {
7791 pflags &= ~(SD_LOAD_BALANCE | 7870 pflags &= ~(SD_LOAD_BALANCE |
@@ -7841,7 +7920,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 7920 rq->rd = rd;
7842 7921
7843 cpumask_set_cpu(rq->cpu, rd->span); 7922 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7923 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 7924 set_rq_online(rq);
7846 7925
7847 spin_unlock_irqrestore(&rq->lock, flags); 7926 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7983,7 +8062,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8062 continue;
7984 8063
7985 cpumask_clear(sched_group_cpus(sg)); 8064 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8065 sg->cpu_power = 0;
7987 8066
7988 for_each_cpu(j, span) { 8067 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8068 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8170,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8170 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8171};
8093 8172
8173struct s_data {
8174#ifdef CONFIG_NUMA
8175 int sd_allnodes;
8176 cpumask_var_t domainspan;
8177 cpumask_var_t covered;
8178 cpumask_var_t notcovered;
8179#endif
8180 cpumask_var_t nodemask;
8181 cpumask_var_t this_sibling_map;
8182 cpumask_var_t this_core_map;
8183 cpumask_var_t send_covered;
8184 cpumask_var_t tmpmask;
8185 struct sched_group **sched_group_nodes;
8186 struct root_domain *rd;
8187};
8188
8189enum s_alloc {
8190 sa_sched_groups = 0,
8191 sa_rootdomain,
8192 sa_tmpmask,
8193 sa_send_covered,
8194 sa_this_core_map,
8195 sa_this_sibling_map,
8196 sa_nodemask,
8197 sa_sched_group_nodes,
8198#ifdef CONFIG_NUMA
8199 sa_notcovered,
8200 sa_covered,
8201 sa_domainspan,
8202#endif
8203 sa_none,
8204};
8205
8094/* 8206/*
8095 * SMT sched-domains: 8207 * SMT sched-domains:
8096 */ 8208 */
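The s_alloc enum above is ordered so that a partially failed setup can report how far it got; __free_domain_allocs() then falls through from that state and releases exactly what was allocated. A minimal self-contained sketch of the pattern with hypothetical demo structures (not from the patch):

struct demo { void *a; void *b; };

enum demo_alloc { demo_all = 0, demo_b, demo_a, demo_none };

static void demo_free(struct demo *d, enum demo_alloc what)
{
        switch (what) {
        case demo_all:
        case demo_b:
                kfree(d->b);            /* fall through */
        case demo_a:
                kfree(d->a);            /* fall through */
        case demo_none:
                break;
        }
}

static int demo_setup(struct demo *d)
{
        enum demo_alloc state = demo_none;

        d->a = kmalloc(64, GFP_KERNEL);
        if (!d->a)
                goto err;
        state = demo_a;

        d->b = kmalloc(64, GFP_KERNEL);
        if (!d->b)
                goto err;
        return 0;
err:
        demo_free(d, state);
        return -ENOMEM;
}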
@@ -8208,11 +8320,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8320 continue;
8209 } 8321 }
8210 8322
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8323 sg->cpu_power += sd->groups->cpu_power;
8212 } 8324 }
8213 sg = sg->next; 8325 sg = sg->next;
8214 } while (sg != group_head); 8326 } while (sg != group_head);
8215} 8327}
8328
8329static int build_numa_sched_groups(struct s_data *d,
8330 const struct cpumask *cpu_map, int num)
8331{
8332 struct sched_domain *sd;
8333 struct sched_group *sg, *prev;
8334 int n, j;
8335
8336 cpumask_clear(d->covered);
8337 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8338 if (cpumask_empty(d->nodemask)) {
8339 d->sched_group_nodes[num] = NULL;
8340 goto out;
8341 }
8342
8343 sched_domain_node_span(num, d->domainspan);
8344 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8345
8346 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8347 GFP_KERNEL, num);
8348 if (!sg) {
8349 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8350 num);
8351 return -ENOMEM;
8352 }
8353 d->sched_group_nodes[num] = sg;
8354
8355 for_each_cpu(j, d->nodemask) {
8356 sd = &per_cpu(node_domains, j).sd;
8357 sd->groups = sg;
8358 }
8359
8360 sg->cpu_power = 0;
8361 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8362 sg->next = sg;
8363 cpumask_or(d->covered, d->covered, d->nodemask);
8364
8365 prev = sg;
8366 for (j = 0; j < nr_node_ids; j++) {
8367 n = (num + j) % nr_node_ids;
8368 cpumask_complement(d->notcovered, d->covered);
8369 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8370 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8371 if (cpumask_empty(d->tmpmask))
8372 break;
8373 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8374 if (cpumask_empty(d->tmpmask))
8375 continue;
8376 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8377 GFP_KERNEL, num);
8378 if (!sg) {
8379 printk(KERN_WARNING
8380 "Can not alloc domain group for node %d\n", j);
8381 return -ENOMEM;
8382 }
8383 sg->cpu_power = 0;
8384 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8385 sg->next = prev->next;
8386 cpumask_or(d->covered, d->covered, d->tmpmask);
8387 prev->next = sg;
8388 prev = sg;
8389 }
8390out:
8391 return 0;
8392}
8216#endif /* CONFIG_NUMA */ 8393#endif /* CONFIG_NUMA */
8217 8394
8218#ifdef CONFIG_NUMA 8395#ifdef CONFIG_NUMA
@@ -8266,15 +8443,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8443 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pickup more load compared to the group having 8444 * having more cpu_power will pickup more load compared to the group having
8268 * less cpu_power. 8445 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8446 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8447static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8448{
8276 struct sched_domain *child; 8449 struct sched_domain *child;
8277 struct sched_group *group; 8450 struct sched_group *group;
8451 long power;
8452 int weight;
8278 8453
8279 WARN_ON(!sd || !sd->groups); 8454 WARN_ON(!sd || !sd->groups);
8280 8455
@@ -8283,28 +8458,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8458
8284 child = sd->child; 8459 child = sd->child;
8285 8460
8286 sd->groups->__cpu_power = 0; 8461 sd->groups->cpu_power = 0;
8287 8462
8288 /* 8463 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8464 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8465 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8466 /*
8292 * can handle only one task, when there are other idle groups in the 8467 * SMT siblings share the power of a single core.
8293 * same sched domain. 8468 * Usually multiple threads get a better yield out of
8294 */ 8469 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8470 * reflect that in sd->smt_gain.
8296 (child->flags & 8471 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8472 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8473 power *= sd->smt_gain;
8474 power /= weight;
8475 power >>= SCHED_LOAD_SHIFT;
8476 }
8477 sd->groups->cpu_power += power;
8299 return; 8478 return;
8300 } 8479 }
8301 8480
8302 /* 8481 /*
8303 * add cpu_power of each child group to this groups cpu_power 8482 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8483 */
8305 group = child->groups; 8484 group = child->groups;
8306 do { 8485 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8486 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8487 group = group->next;
8309 } while (group != child->groups); 8488 } while (group != child->groups);
8310} 8489}
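For a leaf SMT domain the rewritten init_sched_groups_power() now divides one core's worth of capacity among its siblings, boosted by sd->smt_gain. A hedged worked example, assuming SCHED_LOAD_SCALE = 1024, SCHED_LOAD_SHIFT = 10 and a default smt_gain of 1178 (roughly 15% extra throughput per core):

static unsigned long smt_sibling_power(unsigned long smt_gain, int weight)
{
        unsigned long power = 1024;             /* SCHED_LOAD_SCALE, assumed */

        power *= smt_gain;
        power /= weight;
        return power >> 10;                     /* SCHED_LOAD_SHIFT, assumed */
}

With two siblings this gives 1024 * 1178 / 2 >> 10 = 589 per thread, so the pair together advertises about 1178: somewhat more than one full CPU, far less than two.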
@@ -8371,287 +8550,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8371 request = attr->relax_domain_level; 8550 request = attr->relax_domain_level;
8372 if (request < sd->level) { 8551 if (request < sd->level) {
8373 /* turn off idle balance on this domain */ 8552 /* turn off idle balance on this domain */
8374 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8553 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8375 } else { 8554 } else {
8376 /* turn on idle balance on this domain */ 8555 /* turn on idle balance on this domain */
8377 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8556 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8557 }
8558}
8559
8560static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8561 const struct cpumask *cpu_map)
8562{
8563 switch (what) {
8564 case sa_sched_groups:
8565 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8566 d->sched_group_nodes = NULL;
8567 case sa_rootdomain:
8568 free_rootdomain(d->rd); /* fall through */
8569 case sa_tmpmask:
8570 free_cpumask_var(d->tmpmask); /* fall through */
8571 case sa_send_covered:
8572 free_cpumask_var(d->send_covered); /* fall through */
8573 case sa_this_core_map:
8574 free_cpumask_var(d->this_core_map); /* fall through */
8575 case sa_this_sibling_map:
8576 free_cpumask_var(d->this_sibling_map); /* fall through */
8577 case sa_nodemask:
8578 free_cpumask_var(d->nodemask); /* fall through */
8579 case sa_sched_group_nodes:
8580#ifdef CONFIG_NUMA
8581 kfree(d->sched_group_nodes); /* fall through */
8582 case sa_notcovered:
8583 free_cpumask_var(d->notcovered); /* fall through */
8584 case sa_covered:
8585 free_cpumask_var(d->covered); /* fall through */
8586 case sa_domainspan:
8587 free_cpumask_var(d->domainspan); /* fall through */
8588#endif
8589 case sa_none:
8590 break;
8378 } 8591 }
8379} 8592}
8380 8593
8381/* 8594static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8595 const struct cpumask *cpu_map)
8383 * to the individual cpus
8384 */
8385static int __build_sched_domains(const struct cpumask *cpu_map,
8386 struct sched_domain_attr *attr)
8387{ 8596{
8388 int i, err = -ENOMEM;
8389 struct root_domain *rd;
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8391 tmpmask;
8392#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered;
8394 struct sched_group **sched_group_nodes = NULL;
8395 int sd_allnodes = 0;
8396
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
8398 goto out;
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
8400 goto free_domainspan;
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
8402 goto free_covered;
8403#endif
8404
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415
8416#ifdef CONFIG_NUMA 8597#ifdef CONFIG_NUMA
8417 /* 8598 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8418 * Allocate the per-node list of sched groups 8599 return sa_none;
8419 */ 8600 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8601 return sa_domainspan;
8421 GFP_KERNEL); 8602 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8422 if (!sched_group_nodes) { 8603 return sa_covered;
8604 /* Allocate the per-node list of sched groups */
8605 d->sched_group_nodes = kcalloc(nr_node_ids,
8606 sizeof(struct sched_group *), GFP_KERNEL);
8607 if (!d->sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8608 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8609 return sa_notcovered;
8425 } 8610 }
8426#endif 8611 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8612#endif
8428 rd = alloc_rootdomain(); 8613 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8614 return sa_sched_group_nodes;
8615 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8616 return sa_nodemask;
8617 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8618 return sa_this_sibling_map;
8619 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8620 return sa_this_core_map;
8621 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8622 return sa_send_covered;
8623 d->rd = alloc_rootdomain();
8624 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8625 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8626 return sa_tmpmask;
8432 } 8627 }
8628 return sa_rootdomain;
8629}
8433 8630
8631static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8632 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8633{
8634 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8635#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8636 struct sched_domain *parent;
8436#endif
8437
8438 /*
8439 * Set up domains for cpus specified by the cpu_map.
8440 */
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458 8637
8459 sd = &per_cpu(node_domains, i).sd; 8638 d->sd_allnodes = 0;
8460 SD_INIT(sd, NODE); 8639 if (cpumask_weight(cpu_map) >
8640 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8641 sd = &per_cpu(allnodes_domains, i).sd;
8642 SD_INIT(sd, ALLNODES);
8461 set_domain_attribute(sd, attr); 8643 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8644 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8645 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8646 d->sd_allnodes = 1;
8465 p->child = sd; 8647 }
8466 cpumask_and(sched_domain_span(sd), 8648 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8649
8650 sd = &per_cpu(node_domains, i).sd;
8651 SD_INIT(sd, NODE);
8652 set_domain_attribute(sd, attr);
8653 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8654 sd->parent = parent;
8655 if (parent)
8656 parent->child = sd;
8657 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8658#endif
8659 return sd;
8660}
8469 8661
8470 p = sd; 8662static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8663 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8664 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8665{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8666 struct sched_domain *sd;
8475 sd->parent = p; 8667 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8668 SD_INIT(sd, CPU);
8477 p->child = sd; 8669 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8670 cpumask_copy(sched_domain_span(sd), d->nodemask);
8671 sd->parent = parent;
8672 if (parent)
8673 parent->child = sd;
8674 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8675 return sd;
8676}
8479 8677
8678static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8679 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8680 struct sched_domain *parent, int i)
8681{
8682 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8683#ifdef CONFIG_SCHED_MC
8481 p = sd; 8684 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8685 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8686 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8687 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8688 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8689 parent->child = sd;
8487 sd->parent = p; 8690 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8691#endif
8692 return sd;
8693}
8491 8694
8695static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8696 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8697 struct sched_domain *parent, int i)
8698{
8699 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8700#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8701 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8702 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8703 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8704 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8705 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8706 parent->child = sd;
8499 sd->parent = p; 8707 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8708#endif
8503 } 8709 return sd;
8710}
8504 8711
8712static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8713 const struct cpumask *cpu_map, int cpu)
8714{
8715 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8716#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8717 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8718 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8719 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8720 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8721 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8722 &cpu_to_cpu_group,
8512 8723 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8724 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8725#endif
8518
8519#ifdef CONFIG_SCHED_MC 8726#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8727 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8728 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8729 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8730 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8731 &cpu_to_core_group,
8525 8732 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8733 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8734#endif
8531 8735 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8736 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8737 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8738 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8739 &cpu_to_phys_group,
8536 continue; 8740 d->send_covered, d->tmpmask);
8537 8741 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8742#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8743 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8744 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8745 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8746 break;
8548 send_covered, tmpmask); 8747#endif
8748 default:
8749 break;
8549 } 8750 }
8751}
8550 8752
8551 for (i = 0; i < nr_node_ids; i++) { 8753/*
8552 /* Set up node groups */ 8754 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8755 * to the individual cpus
8554 int j; 8756 */
8555 8757static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8758 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8759{
8558 if (cpumask_empty(nodemask)) { 8760 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8761 struct s_data d;
8560 continue; 8762 struct sched_domain *sd;
8561 } 8763 int i;
8764#ifdef CONFIG_NUMA
8765 d.sd_allnodes = 0;
8766#endif
8562 8767
8563 sched_domain_node_span(i, domainspan); 8768 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8769 if (alloc_state != sa_rootdomain)
8770 goto error;
8771 alloc_state = sa_sched_groups;
8565 8772
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8773 /*
8567 GFP_KERNEL, i); 8774 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8775 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8776 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8777 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8778 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8779
8577 sd = &per_cpu(node_domains, j).sd; 8780 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8781 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8782 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8783 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8784 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8785
8586 for (j = 0; j < nr_node_ids; j++) { 8786 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8787 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8788 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8789 }
8588 8790
8589 cpumask_complement(notcovered, covered); 8791 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8792 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8793 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8794
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8795#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8796 /* Set up node groups */
8597 continue; 8797 if (d.sd_allnodes)
8798 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8799
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8800 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8801 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8802 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8803#endif
8616 8804
8617 /* Calculate CPU power for physical packages and nodes */ 8805 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8806#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8807 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8808 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8809 init_sched_groups_power(i, sd);
8623 } 8810 }
8624#endif 8811#endif
8625#ifdef CONFIG_SCHED_MC 8812#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8813 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8814 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8815 init_sched_groups_power(i, sd);
8630 } 8816 }
8631#endif 8817#endif
8632 8818
8633 for_each_cpu(i, cpu_map) { 8819 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8820 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8821 init_sched_groups_power(i, sd);
8637 } 8822 }
8638 8823
8639#ifdef CONFIG_NUMA 8824#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8825 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8826 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8827
8643 if (sd_allnodes) { 8828 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8829 struct sched_group *sg;
8645 8830
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8831 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8832 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8833 init_numa_sched_groups_power(sg);
8649 } 8834 }
8650#endif 8835#endif
8651 8836
8652 /* Attach the domains */ 8837 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8838 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8839#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8840 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8841#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8843,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 8843#else
8660 sd = &per_cpu(phys_domains, i).sd; 8844 sd = &per_cpu(phys_domains, i).sd;
8661#endif 8845#endif
8662 cpu_attach_domain(sd, rd, i); 8846 cpu_attach_domain(sd, d.rd, i);
8663 } 8847 }
8664 8848
8665 err = 0; 8849 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 8850 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 8851 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 8852
8694#ifdef CONFIG_NUMA
8695error: 8853error:
8696 free_sched_groups(cpu_map, tmpmask); 8854 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 8855 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 8856}
8701 8857
8702static int build_sched_domains(const struct cpumask *cpu_map) 8858static int build_sched_domains(const struct cpumask *cpu_map)
@@ -9304,11 +9460,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9460 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9461 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9462 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9463 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9464 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9465 */
9310 init_tg_cfs_entry(&init_task_group, 9466 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9467 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9468 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9469 root_task_group.se[i]);
9314 9470
@@ -9334,6 +9490,7 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9490#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9491 rq->sd = NULL;
9336 rq->rd = NULL; 9492 rq->rd = NULL;
9493 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9494 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9495 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9496 rq->push_cpu = 0;
@@ -9398,13 +9555,20 @@ void __init sched_init(void)
9398} 9555}
9399 9556
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9557#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9558static inline int preempt_count_equals(int preempt_offset)
9559{
9560 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9561
9562 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9563}
9564
9565void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9566{
9403#ifdef in_atomic 9567#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9568 static unsigned long prev_jiffy; /* ratelimiting */
9405 9569
9406 if ((!in_atomic() && !irqs_disabled()) || 9570 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9571 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9572 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9573 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9574 return;
@@ -10581,3 +10745,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10745 .subsys_id = cpuacct_subsys_id,
10582}; 10746};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10747#endif /* CONFIG_CGROUP_CPUACCT */
10748
10749#ifndef CONFIG_SMP
10750
10751int rcu_expedited_torture_stats(char *page)
10752{
10753 return 0;
10754}
10755EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10756
10757void synchronize_sched_expedited(void)
10758{
10759}
10760EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10761
10762#else /* #ifndef CONFIG_SMP */
10763
10764static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10765static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10766
10767#define RCU_EXPEDITED_STATE_POST -2
10768#define RCU_EXPEDITED_STATE_IDLE -1
10769
10770static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10771
10772int rcu_expedited_torture_stats(char *page)
10773{
10774 int cnt = 0;
10775 int cpu;
10776
10777 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10778 for_each_online_cpu(cpu) {
10779 cnt += sprintf(&page[cnt], " %d:%d",
10780 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10781 }
10782 cnt += sprintf(&page[cnt], "\n");
10783 return cnt;
10784}
10785EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10786
10787static long synchronize_sched_expedited_count;
10788
10789/*
10790 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10791 * approach to force grace period to end quickly. This consumes
10792 * significant time on all CPUs, and is thus not recommended for
10793 * any sort of common-case code.
10794 *
10795 * Note that it is illegal to call this function while holding any
10796 * lock that is acquired by a CPU-hotplug notifier. Failing to
10797 * observe this restriction will result in deadlock.
10798 */
10799void synchronize_sched_expedited(void)
10800{
10801 int cpu;
10802 unsigned long flags;
10803 bool need_full_sync = 0;
10804 struct rq *rq;
10805 struct migration_req *req;
10806 long snap;
10807 int trycount = 0;
10808
10809 smp_mb(); /* ensure prior mod happens before capturing snap. */
10810 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10811 get_online_cpus();
10812 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10813 put_online_cpus();
10814 if (trycount++ < 10)
10815 udelay(trycount * num_online_cpus());
10816 else {
10817 synchronize_sched();
10818 return;
10819 }
10820 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10821 smp_mb(); /* ensure test happens before caller kfree */
10822 return;
10823 }
10824 get_online_cpus();
10825 }
10826 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10827 for_each_online_cpu(cpu) {
10828 rq = cpu_rq(cpu);
10829 req = &per_cpu(rcu_migration_req, cpu);
10830 init_completion(&req->done);
10831 req->task = NULL;
10832 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10833 spin_lock_irqsave(&rq->lock, flags);
10834 list_add(&req->list, &rq->migration_queue);
10835 spin_unlock_irqrestore(&rq->lock, flags);
10836 wake_up_process(rq->migration_thread);
10837 }
10838 for_each_online_cpu(cpu) {
10839 rcu_expedited_state = cpu;
10840 req = &per_cpu(rcu_migration_req, cpu);
10841 rq = cpu_rq(cpu);
10842 wait_for_completion(&req->done);
10843 spin_lock_irqsave(&rq->lock, flags);
10844 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10845 need_full_sync = 1;
10846 req->dest_cpu = RCU_MIGRATION_IDLE;
10847 spin_unlock_irqrestore(&rq->lock, flags);
10848 }
10849 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10850 mutex_unlock(&rcu_sched_expedited_mutex);
10851 put_online_cpus();
10852 if (need_full_sync)
10853 synchronize_sched();
10854}
10855EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10856
10857#endif /* #else #ifndef CONFIG_SMP */
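synchronize_sched_expedited() above buys a fast grace period by queueing dummy requests on every CPU's migration thread instead of waiting out a normal one. A hedged usage sketch with a hypothetical structure (not from the patch); note the restriction in the comment above about CPU-hotplug notifier locks:

struct mytable { int data[16]; };

static struct mytable *active_table;

static void replace_table(struct mytable *newtab)
{
        struct mytable *old = active_table;

        rcu_assign_pointer(active_table, newtab);
        /* every pre-existing preempt-disabled reader has now finished */
        synchronize_sched_expedited();
        kfree(old);
}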
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
127 127
128 /* 128 /*
129 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
130 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
131 */ 134 */
132 if (likely(oldpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
134
135 spin_lock_irqsave(&vec->lock, flags);
136
137 vec->count--;
138 if (!vec->count)
139 clear_bit(oldpri, cp->pri_active);
140 cpumask_clear_cpu(cpu, vec->mask);
141
142 spin_unlock_irqrestore(&vec->lock, flags);
143 }
144
145 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
146 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
147 137
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 144
155 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
156 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
157 159
158 *currpri = newpri; 160 *currpri = newpri;
159} 161}
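The reordering above publishes the CPU at its new priority before retracting the old mapping, so a concurrent cpupri_find() may briefly see the CPU twice but never misses it entirely. A hedged sketch of the ordering with the per-vector locking omitted (illustrative, not from the patch):

static void move_cpu(struct cpupri_vec *newvec, struct cpupri_vec *oldvec,
                     int cpu)
{
        /* add to the new vector first ... */
        cpumask_set_cpu(cpu, newvec->mask);
        newvec->count++;

        /* ... then drop out of the old one */
        oldvec->count--;
        cpumask_clear_cpu(cpu, oldvec->mask);
}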
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
@@ -409,6 +410,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 410 PN(se.wait_max);
410 PN(se.wait_sum); 411 PN(se.wait_sum);
411 P(se.wait_count); 412 P(se.wait_count);
413 PN(se.iowait_sum);
414 P(se.iowait_count);
412 P(sched_info.bkl_count); 415 P(sched_info.bkl_count);
413 P(se.nr_migrations); 416 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 417 P(se.nr_migrations_cold);
@@ -479,6 +482,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 482 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 483 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 484 p->se.wait_count = 0;
485 p->se.iowait_sum = 0;
486 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 487 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 488 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 489 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..10d218ab69f2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
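The latency, minimum granularity and wake-up granularity defaults above all shrink by a factor of four or five, but as the comments note they are still multiplied at boot by 1 + ilog(ncpus), so the effective values grow on larger machines. A hedged sketch of that scaling:

static unsigned int scaled_tunable(unsigned int base_ns)
{
        unsigned int factor = 1 + ilog2(num_online_cpus());

        /* on an 8-CPU box: factor = 4, so the 5 ms base latency behaves
         * like ~20 ms and the 1 ms minimum granularity like ~4 ms */
        return base_ns * factor;
}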
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
537 schedstat_set(se->wait_count, se->wait_count + 1); 545 schedstat_set(se->wait_count, se->wait_count + 1);
538 schedstat_set(se->wait_sum, se->wait_sum + 546 schedstat_set(se->wait_sum, se->wait_sum +
539 rq_of(cfs_rq)->clock - se->wait_start); 547 rq_of(cfs_rq)->clock - se->wait_start);
548#ifdef CONFIG_SCHEDSTATS
549 if (entity_is_task(se)) {
550 trace_sched_stat_wait(task_of(se),
551 rq_of(cfs_rq)->clock - se->wait_start);
552 }
553#endif
540 schedstat_set(se->wait_start, 0); 554 schedstat_set(se->wait_start, 0);
541} 555}
542 556
@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
628 se->sleep_start = 0; 642 se->sleep_start = 0;
629 se->sum_sleep_runtime += delta; 643 se->sum_sleep_runtime += delta;
630 644
631 if (tsk) 645 if (tsk) {
632 account_scheduler_latency(tsk, delta >> 10, 1); 646 account_scheduler_latency(tsk, delta >> 10, 1);
647 trace_sched_stat_sleep(tsk, delta);
648 }
633 } 649 }
634 if (se->block_start) { 650 if (se->block_start) {
635 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 651 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
644 se->sum_sleep_runtime += delta; 660 se->sum_sleep_runtime += delta;
645 661
646 if (tsk) { 662 if (tsk) {
663 if (tsk->in_iowait) {
664 se->iowait_sum += delta;
665 se->iowait_count++;
666 trace_sched_stat_iowait(tsk, delta);
667 }
668
647 /* 669 /*
648 * Blocking time is in units of nanosecs, so shift by 670 * Blocking time is in units of nanosecs, so shift by
649 * 20 to get a milliseconds-range estimation of the 671 * 20 to get a milliseconds-range estimation of the
@@ -689,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
689 711
690 if (!initial) { 712 if (!initial) {
691 /* sleeps upto a single latency don't count. */ 713 /* sleeps upto a single latency don't count. */
692 if (sched_feat(NEW_FAIR_SLEEPERS)) { 714 if (sched_feat(FAIR_SLEEPERS)) {
693 unsigned long thresh = sysctl_sched_latency; 715 unsigned long thresh = sysctl_sched_latency;
694 716
695 /* 717 /*
@@ -703,13 +725,20 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
703 task_of(se)->policy != SCHED_IDLE)) 725 task_of(se)->policy != SCHED_IDLE))
704 thresh = calc_delta_fair(thresh, se); 726 thresh = calc_delta_fair(thresh, se);
705 727
728 /*
729 * Halve their sleep time's effect, to allow
730 * for a gentler effect of sleepers:
731 */
732 if (sched_feat(GENTLE_FAIR_SLEEPERS))
733 thresh >>= 1;
734
706 vruntime -= thresh; 735 vruntime -= thresh;
707 } 736 }
708
709 /* ensure we never gain time by being placed backwards. */
710 vruntime = max_vruntime(se->vruntime, vruntime);
711 } 737 }
712 738
739 /* ensure we never gain time by being placed backwards. */
740 vruntime = max_vruntime(se->vruntime, vruntime);
741
713 se->vruntime = vruntime; 742 se->vruntime = vruntime;
714} 743}
715 744
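With the hunk above, FAIR_SLEEPERS grants a waking task a service deficit of up to sysctl_sched_latency, GENTLE_FAIR_SLEEPERS halves that credit, and the max_vruntime() clamp is now applied unconditionally. A hedged standalone sketch of the arithmetic with made-up numbers (the calc_delta_fair() weighting is deliberately left out):

#include <stdio.h>

int main(void)
{
        /* all numbers here are invented for illustration */
        unsigned long long min_vruntime = 1000000000ULL;  /* cfs_rq->min_vruntime, ns */
        unsigned long long se_vruntime  =  995000000ULL;  /* the waking entity's vruntime */
        unsigned long sched_latency = 20000000UL;          /* 20 ms, example value */
        int gentle = 1;                                    /* GENTLE_FAIR_SLEEPERS enabled */

        unsigned long thresh = sched_latency;
        if (gentle)
                thresh >>= 1;                   /* only half of the sleeper credit */

        unsigned long long vruntime = min_vruntime - thresh;

        /* never gain time by being placed backwards */
        if (se_vruntime > vruntime)
                vruntime = se_vruntime;

        printf("placed at %llu\n", vruntime);   /* 995000000 with these numbers */
        return 0;
}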
@@ -735,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
735 764
736static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 765static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
737{ 766{
738 if (cfs_rq->last == se) 767 if (!se || cfs_rq->last == se)
739 cfs_rq->last = NULL; 768 cfs_rq->last = NULL;
740 769
741 if (cfs_rq->next == se) 770 if (!se || cfs_rq->next == se)
742 cfs_rq->next = NULL; 771 cfs_rq->next = NULL;
743} 772}
744 773
@@ -1040,79 +1069,6 @@ static void yield_task_fair(struct rq *rq)
1040 se->vruntime = rightmost->vruntime + 1; 1069 se->vruntime = rightmost->vruntime + 1;
1041} 1070}
1042 1071
1043/*
1044 * wake_idle() will wake a task on an idle cpu if task->cpu is
1045 * not idle and an idle cpu is available. The span of cpus to
1046 * search starts with cpus closest then further out as needed,
1047 * so we always favor a closer, idle cpu.
1048 * Domains may include CPUs that are not usable for migration,
1049 * hence we need to mask them out (cpu_active_mask)
1050 *
1051 * Returns the CPU we should wake onto.
1052 */
1053#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1054static int wake_idle(int cpu, struct task_struct *p)
1055{
1056 struct sched_domain *sd;
1057 int i;
1058 unsigned int chosen_wakeup_cpu;
1059 int this_cpu;
1060
1061 /*
1062 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1063 * are idle and this is not a kernel thread and this task's affinity
1064 * allows it to be moved to preferred cpu, then just move!
1065 */
1066
1067 this_cpu = smp_processor_id();
1068 chosen_wakeup_cpu =
1069 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1070
1071 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1072 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1073 p->mm && !(p->flags & PF_KTHREAD) &&
1074 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1075 return chosen_wakeup_cpu;
1076
1077 /*
1078 * If it is idle, then it is the best cpu to run this task.
1079 *
1080 * This cpu is also the best, if it has more than one task already.
1081 * Siblings must be also busy(in most cases) as they didn't already
1082 * pickup the extra load from this cpu and hence we need not check
1083 * sibling runqueue info. This will avoid the checks and cache miss
1084 * penalties associated with that.
1085 */
1086 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1087 return cpu;
1088
1089 for_each_domain(cpu, sd) {
1090 if ((sd->flags & SD_WAKE_IDLE)
1091 || ((sd->flags & SD_WAKE_IDLE_FAR)
1092 && !task_hot(p, task_rq(p)->clock, sd))) {
1093 for_each_cpu_and(i, sched_domain_span(sd),
1094 &p->cpus_allowed) {
1095 if (cpu_active(i) && idle_cpu(i)) {
1096 if (i != task_cpu(p)) {
1097 schedstat_inc(p,
1098 se.nr_wakeups_idle);
1099 }
1100 return i;
1101 }
1102 }
1103 } else {
1104 break;
1105 }
1106 }
1107 return cpu;
1108}
1109#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1110static inline int wake_idle(int cpu, struct task_struct *p)
1111{
1112 return cpu;
1113}
1114#endif
1115
1116#ifdef CONFIG_SMP 1072#ifdef CONFIG_SMP
1117 1073
1118#ifdef CONFIG_FAIR_GROUP_SCHED 1074#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1199,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1199 1155
1200#endif 1156#endif
1201 1157
1202static int 1158static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1203wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1204 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1205 int idx, unsigned long load, unsigned long this_load,
1206 unsigned int imbalance)
1207{ 1159{
1208 struct task_struct *curr = this_rq->curr; 1160 struct task_struct *curr = current;
1209 struct task_group *tg; 1161 unsigned long this_load, load;
1210 unsigned long tl = this_load; 1162 int idx, this_cpu, prev_cpu;
1211 unsigned long tl_per_task; 1163 unsigned long tl_per_task;
1164 unsigned int imbalance;
1165 struct task_group *tg;
1212 unsigned long weight; 1166 unsigned long weight;
1213 int balanced; 1167 int balanced;
1214 1168
1215 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1169 idx = sd->wake_idx;
1216 return 0; 1170 this_cpu = smp_processor_id();
1171 prev_cpu = task_cpu(p);
1172 load = source_load(prev_cpu, idx);
1173 this_load = target_load(this_cpu, idx);
1217 1174
1218 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1175 if (sync) {
1219 p->se.avg_overlap > sysctl_sched_migration_cost)) 1176 if (sched_feat(SYNC_LESS) &&
1220 sync = 0; 1177 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1178 p->se.avg_overlap > sysctl_sched_migration_cost))
1179 sync = 0;
1180 } else {
1181 if (sched_feat(SYNC_MORE) &&
1182 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1183 p->se.avg_overlap < sysctl_sched_migration_cost))
1184 sync = 1;
1185 }
1221 1186
1222 /* 1187 /*
1223 * If sync wakeup then subtract the (maximum possible) 1188 * If sync wakeup then subtract the (maximum possible)
@@ -1228,14 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1228 tg = task_group(current); 1193 tg = task_group(current);
1229 weight = current->se.load.weight; 1194 weight = current->se.load.weight;
1230 1195
1231 tl += effective_load(tg, this_cpu, -weight, -weight); 1196 this_load += effective_load(tg, this_cpu, -weight, -weight);
1232 load += effective_load(tg, prev_cpu, 0, -weight); 1197 load += effective_load(tg, prev_cpu, 0, -weight);
1233 } 1198 }
1234 1199
1235 tg = task_group(p); 1200 tg = task_group(p);
1236 weight = p->se.load.weight; 1201 weight = p->se.load.weight;
1237 1202
1238 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1203 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1204
1205 /*
1206 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1207 * due to the sync cause above having dropped this_load to 0, we'll
1208 * always have an imbalance, but there's really nothing you can do
1209 * about that, so that's good too.
1210 *
1211 * Otherwise check if either cpus are near enough in load to allow this
1212 * task to be woken on this_cpu.
1213 */
1214 balanced = !this_load ||
1215 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1239 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1216 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1240 1217
1241 /* 1218 /*
@@ -1249,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1249 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1226 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1250 tl_per_task = cpu_avg_load_per_task(this_cpu); 1227 tl_per_task = cpu_avg_load_per_task(this_cpu);
1251 1228
1252 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1229 if (balanced ||
1253 tl_per_task)) { 1230 (this_load <= load &&
1231 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1254 /* 1232 /*
1255 * This domain has SD_WAKE_AFFINE and 1233 * This domain has SD_WAKE_AFFINE and
1256 * p is cache cold in this domain, and 1234 * p is cache cold in this domain, and
1257 * there is no bad imbalance. 1235 * there is no bad imbalance.
1258 */ 1236 */
1259 schedstat_inc(this_sd, ttwu_move_affine); 1237 schedstat_inc(sd, ttwu_move_affine);
1260 schedstat_inc(p, se.nr_wakeups_affine); 1238 schedstat_inc(p, se.nr_wakeups_affine);
1261 1239
1262 return 1; 1240 return 1;
@@ -1264,67 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1264 return 0; 1242 return 0;
1265} 1243}
1266 1244
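The reworked wake_affine() above now derives its own load figures and imbalance threshold, taking only half of the domain's imbalance_pct margin. A minimal sketch of that comparison with invented load values; the effective_load() group-scheduling terms are omitted for brevity:

#include <stdio.h>

int main(void)
{
        unsigned int imbalance_pct = 125;               /* example domain setting */
        unsigned long this_load = 900, load = 1000;     /* made-up load figures */

        /* wake_affine() uses half of the domain's imbalance margin */
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;      /* 112 */

        /* an idle target always passes; otherwise compare within the margin */
        int balanced = !this_load || 100 * this_load <= imbalance * load;

        printf("imbalance=%u balanced=%d\n", imbalance, balanced);     /* 112, 1 */
        return 0;
}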
1267static int select_task_rq_fair(struct task_struct *p, int sync) 1245/*
1246 * find_idlest_group finds and returns the least busy CPU group within the
1247 * domain.
1248 */
1249static struct sched_group *
1250find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1251 int this_cpu, int load_idx)
1268{ 1252{
1269 struct sched_domain *sd, *this_sd = NULL; 1253 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1270 int prev_cpu, this_cpu, new_cpu; 1254 unsigned long min_load = ULONG_MAX, this_load = 0;
1271 unsigned long load, this_load; 1255 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1272 struct rq *this_rq;
1273 unsigned int imbalance;
1274 int idx;
1275 1256
1276 prev_cpu = task_cpu(p); 1257 do {
1277 this_cpu = smp_processor_id(); 1258 unsigned long load, avg_load;
1278 this_rq = cpu_rq(this_cpu); 1259 int local_group;
1279 new_cpu = prev_cpu; 1260 int i;
1280 1261
1281 if (prev_cpu == this_cpu) 1262 /* Skip over this group if it has no CPUs allowed */
1282 goto out; 1263 if (!cpumask_intersects(sched_group_cpus(group),
1283 /* 1264 &p->cpus_allowed))
1284 * 'this_sd' is the first domain that both 1265 continue;
1285 * this_cpu and prev_cpu are present in: 1266
1286 */ 1267 local_group = cpumask_test_cpu(this_cpu,
1287 for_each_domain(this_cpu, sd) { 1268 sched_group_cpus(group));
1288 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1269
1289 this_sd = sd; 1270 /* Tally up the load of all CPUs in the group */
1290 break; 1271 avg_load = 0;
1272
1273 for_each_cpu(i, sched_group_cpus(group)) {
1274 /* Bias balancing toward cpus of our domain */
1275 if (local_group)
1276 load = source_load(i, load_idx);
1277 else
1278 load = target_load(i, load_idx);
1279
1280 avg_load += load;
1281 }
1282
1283 /* Adjust by relative CPU power of the group */
1284 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1285
1286 if (local_group) {
1287 this_load = avg_load;
1288 this = group;
1289 } else if (avg_load < min_load) {
1290 min_load = avg_load;
1291 idlest = group;
1292 }
1293 } while (group = group->next, group != sd->groups);
1294
1295 if (!idlest || 100*this_load < imbalance*min_load)
1296 return NULL;
1297 return idlest;
1298}
1299
1300/*
1301 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1302 */
1303static int
1304find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1305{
1306 unsigned long load, min_load = ULONG_MAX;
1307 int idlest = -1;
1308 int i;
1309
1310 /* Traverse only the allowed CPUs */
1311 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1312 load = weighted_cpuload(i);
1313
1314 if (load < min_load || (load == min_load && i == this_cpu)) {
1315 min_load = load;
1316 idlest = i;
1291 } 1317 }
1292 } 1318 }
1293 1319
1294 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1320 return idlest;
1295 goto out; 1321}
1296 1322
1297 /* 1323/*
1298 * Check for affine wakeup and passive balancing possibilities. 1324 * sched_balance_self: balance the current task (running on cpu) in domains
1299 */ 1325 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1300 if (!this_sd) 1326 * SD_BALANCE_EXEC.
1327 *
1328 * Balance, ie. select the least loaded group.
1329 *
1330 * Returns the target CPU number, or the same CPU if no balancing is needed.
1331 *
1332 * preempt must be disabled.
1333 */
1334static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1335{
1336 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1337 int cpu = smp_processor_id();
1338 int prev_cpu = task_cpu(p);
1339 int new_cpu = cpu;
1340 int want_affine = 0;
1341 int want_sd = 1;
1342 int sync = wake_flags & WF_SYNC;
1343
1344 if (sd_flag & SD_BALANCE_WAKE) {
1345 if (sched_feat(AFFINE_WAKEUPS))
1346 want_affine = 1;
1347 new_cpu = prev_cpu;
1348 }
1349
1350 rcu_read_lock();
1351 for_each_domain(cpu, tmp) {
1352 /*
1353 * If power savings logic is enabled for a domain, see if we
1354 * are not overloaded, if so, don't balance wider.
1355 */
1356 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1357 unsigned long power = 0;
1358 unsigned long nr_running = 0;
1359 unsigned long capacity;
1360 int i;
1361
1362 for_each_cpu(i, sched_domain_span(tmp)) {
1363 power += power_of(i);
1364 nr_running += cpu_rq(i)->cfs.nr_running;
1365 }
1366
1367 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1368
1369 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1370 nr_running /= 2;
1371
1372 if (nr_running < capacity)
1373 want_sd = 0;
1374 }
1375
1376 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1377 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1378
1379 affine_sd = tmp;
1380 want_affine = 0;
1381 }
1382
1383 if (!want_sd && !want_affine)
1384 break;
1385
1386 if (!(tmp->flags & sd_flag))
1387 continue;
1388
1389 if (want_sd)
1390 sd = tmp;
1391 }
1392
1393 if (sched_feat(LB_SHARES_UPDATE)) {
1394 /*
1395 * Pick the largest domain to update shares over
1396 */
1397 tmp = sd;
1398 if (affine_sd && (!tmp ||
1399 cpumask_weight(sched_domain_span(affine_sd)) >
1400 cpumask_weight(sched_domain_span(sd))))
1401 tmp = affine_sd;
1402
1403 if (tmp)
1404 update_shares(tmp);
1405 }
1406
1407 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1408 new_cpu = cpu;
1301 goto out; 1409 goto out;
1410 }
1302 1411
1303 idx = this_sd->wake_idx; 1412 while (sd) {
1413 int load_idx = sd->forkexec_idx;
1414 struct sched_group *group;
1415 int weight;
1304 1416
1305 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1417 if (!(sd->flags & sd_flag)) {
1418 sd = sd->child;
1419 continue;
1420 }
1306 1421
1307 load = source_load(prev_cpu, idx); 1422 if (sd_flag & SD_BALANCE_WAKE)
1308 this_load = target_load(this_cpu, idx); 1423 load_idx = sd->wake_idx;
1424
1425 group = find_idlest_group(sd, p, cpu, load_idx);
1426 if (!group) {
1427 sd = sd->child;
1428 continue;
1429 }
1309 1430
1310 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1431 new_cpu = find_idlest_cpu(group, p, cpu);
1311 load, this_load, imbalance)) 1432 if (new_cpu == -1 || new_cpu == cpu) {
1312 return this_cpu; 1433 /* Now try balancing at a lower domain level of cpu */
1434 sd = sd->child;
1435 continue;
1436 }
1313 1437
1314 /* 1438 /* Now try balancing at a lower domain level of new_cpu */
1315 * Start passive balancing when half the imbalance_pct 1439 cpu = new_cpu;
1316 * limit is reached. 1440 weight = cpumask_weight(sched_domain_span(sd));
1317 */ 1441 sd = NULL;
1318 if (this_sd->flags & SD_WAKE_BALANCE) { 1442 for_each_domain(cpu, tmp) {
1319 if (imbalance*this_load <= 100*load) { 1443 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1320 schedstat_inc(this_sd, ttwu_move_balance); 1444 break;
1321 schedstat_inc(p, se.nr_wakeups_passive); 1445 if (tmp->flags & sd_flag)
1322 return this_cpu; 1446 sd = tmp;
1323 } 1447 }
1448 /* while loop will break here if sd == NULL */
1324 } 1449 }
1325 1450
1326out: 1451out:
1327 return wake_idle(new_cpu, p); 1452 rcu_read_unlock();
1453 return new_cpu;
1328} 1454}
1329#endif /* CONFIG_SMP */ 1455#endif /* CONFIG_SMP */
1330 1456
@@ -1437,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se)
1437/* 1563/*
1438 * Preempt the current task with a newly woken task if needed: 1564 * Preempt the current task with a newly woken task if needed:
1439 */ 1565 */
1440static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1566static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1441{ 1567{
1442 struct task_struct *curr = rq->curr; 1568 struct task_struct *curr = rq->curr;
1443 struct sched_entity *se = &curr->se, *pse = &p->se; 1569 struct sched_entity *se = &curr->se, *pse = &p->se;
1444 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1570 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1571 int sync = wake_flags & WF_SYNC;
1445 1572
1446 update_curr(cfs_rq); 1573 update_curr(cfs_rq);
1447 1574
@@ -1467,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1467 */ 1594 */
1468 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1595 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1469 set_last_buddy(se); 1596 set_last_buddy(se);
1470 set_next_buddy(pse); 1597 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1598 set_next_buddy(pse);
1471 1599
1472 /* 1600 /*
1473 * We can come here with TIF_NEED_RESCHED already set from new task 1601 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1489,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1489 return; 1617 return;
1490 } 1618 }
1491 1619
1492 if (!sched_feat(WAKEUP_PREEMPT)) 1620 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1493 return; 1621 (sched_feat(WAKEUP_OVERLAP) &&
1494 1622 (se->avg_overlap < sysctl_sched_migration_cost &&
1495 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1623 pse->avg_overlap < sysctl_sched_migration_cost))) {
1496 (se->avg_overlap < sysctl_sched_migration_cost &&
1497 pse->avg_overlap < sysctl_sched_migration_cost))) {
1498 resched_task(curr); 1624 resched_task(curr);
1499 return; 1625 return;
1500 } 1626 }
1501 1627
1628 if (sched_feat(WAKEUP_RUNNING)) {
1629 if (pse->avg_running < se->avg_running) {
1630 set_next_buddy(pse);
1631 resched_task(curr);
1632 return;
1633 }
1634 }
1635
1636 if (!sched_feat(WAKEUP_PREEMPT))
1637 return;
1638
1502 find_matching_se(&se, &pse); 1639 find_matching_se(&se, &pse);
1503 1640
1504 BUG_ON(!pse); 1641 BUG_ON(!pse);
@@ -1521,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1521 /* 1658 /*
1522 * If se was a buddy, clear it so that it will have to earn 1659 * If se was a buddy, clear it so that it will have to earn
1523 * the favour again. 1660 * the favour again.
1661 *
1662 * If se was not a buddy, clear the buddies because neither
 1663 * was eligible to run, let them earn it again.
1664 *
1665 * IOW. unconditionally clear buddies.
1524 */ 1666 */
1525 __clear_buddies(cfs_rq, se); 1667 __clear_buddies(cfs_rq, NULL);
1526 set_next_entity(cfs_rq, se); 1668 set_next_entity(cfs_rq, se);
1527 cfs_rq = group_cfs_rq(se); 1669 cfs_rq = group_cfs_rq(se);
1528 } while (cfs_rq); 1670 } while (cfs_rq);
@@ -1721,6 +1863,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1721 sched_info_queued(p); 1863 sched_info_queued(p);
1722 1864
1723 update_curr(cfs_rq); 1865 update_curr(cfs_rq);
1866 if (curr)
1867 se->vruntime = curr->vruntime;
1724 place_entity(cfs_rq, se, 1); 1868 place_entity(cfs_rq, se, 1);
1725 1869
1726 /* 'curr' will be NULL if the child belongs to a different group */ 1870 /* 'curr' will be NULL if the child belongs to a different group */
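Taken together, the new select_task_rq_fair() walks the sched-domain hierarchy top-down: pick the least loaded group at the widest domain carrying the requested balance flag, pick the idlest CPU inside it, then continue from a narrower domain around that CPU. The toy model below sketches only that descent; the domain structure, flags and idlest_cpu values are invented stand-ins, not the kernel's find_idlest_group()/find_idlest_cpu() machinery:

#include <stdio.h>

struct domain {
        struct domain *child;
        int flags;              /* which balance operations this level handles */
        int idlest_cpu;         /* pretend result of the group/cpu search */
};

static int select_cpu(struct domain *sd, int cpu, int flag)
{
        while (sd) {
                if (!(sd->flags & flag) ||
                    sd->idlest_cpu < 0 || sd->idlest_cpu == cpu) {
                        sd = sd->child;         /* nothing better here, go narrower */
                        continue;
                }
                cpu = sd->idlest_cpu;           /* move toward the less loaded CPU */
                sd = sd->child;                 /* the kernel re-derives the domain of new_cpu */
        }
        return cpu;
}

int main(void)
{
        struct domain mc   = { .child = NULL, .flags = 1, .idlest_cpu = 3 };
        struct domain node = { .child = &mc,  .flags = 1, .idlest_cpu = 2 };

        printf("target cpu = %d\n", select_cpu(&node, 0, 1));   /* prints 3 */
        return 0;
}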
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effective
 17 * longer period, and lighter tasks an effective shorter period they
18 * are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
 62 * Use the SYNC wakeup hint, pipes and the like use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
 89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
 102 * Consider buddies to be cache hot, decreases the likelihood of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
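sched_features.h is an X-macro file: it is included several times with different definitions of SCHED_FEAT(), once to enumerate the feature bits, once to build the default enable mask, and (with CONFIG_SCHED_DEBUG) once more for the names exported through debugfs. A stripped-down userspace sketch of the same pattern, with invented feature names and macro bodies that only approximate the kernel's:

#include <stdio.h>

#define FEATURES(F)                     \
        F(FAIR_SLEEPERS, 1)             \
        F(GENTLE_FAIR_SLEEPERS, 1)      \
        F(NEXT_BUDDY, 0)

/* pass 1: give every feature a bit number */
enum {
#define F_ENUM(name, on) FEAT_##name,
        FEATURES(F_ENUM)
#undef F_ENUM
};

/* pass 2: build the default enabled-mask from the same list */
static unsigned int features =
#define F_MASK(name, on) ((on) << FEAT_##name) |
        FEATURES(F_MASK)
#undef F_MASK
        0;

#define feat(name) (features & (1u << FEAT_##name))

int main(void)
{
        printf("GENTLE_FAIR_SLEEPERS=%d NEXT_BUDDY=%d\n",
               !!feat(GENTLE_FAIR_SLEEPERS), !!feat(NEXT_BUDDY));   /* 1 0 */
        return 0;
}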
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..a8b448af004b 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
 11 return task_cpu(p); /* IDLE tasks are never migrated */ 11
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..13de7126a6ab 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
16{ 19{
17 return rt_rq->rq; 20 return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
26 29
27#define rt_entity_is_task(rt_se) (1) 30#define rt_entity_is_task(rt_se) (1)
28 31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
30{ 38{
31 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
128 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
129} 137}
130 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
131#else 144#else
132 145
133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
602 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
603 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
604 617
618 sched_rt_avg_update(rq, delta_exec);
619
605 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
606 return; 621 return;
607 622
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
874 889
875 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
876 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
877
878 inc_cpu_load(rq, p->se.load.weight);
879} 892}
880 893
881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
886 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
887 900
888 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
889
890 dec_cpu_load(rq, p->se.load.weight);
891} 902}
892 903
893/* 904/*
@@ -927,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
927#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
928static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
929 940
930static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
931{ 942{
932 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
933 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
934 /* 948 /*
935 * If the current task is an RT task, then 949 * If the current task is an RT task, then
936 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -988,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
988/* 1002/*
989 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
990 */ 1004 */
991static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
992{ 1006{
993 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
994 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1064,6 +1078,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1064 if (p) 1078 if (p)
1065 dequeue_pushable_task(rq, p); 1079 dequeue_pushable_task(rq, p);
1066 1080
1081#ifdef CONFIG_SMP
1082 /*
1083 * We detect this state here so that we can avoid taking the RQ
1084 * lock again later if there is no need to push
1085 */
1086 rq->post_schedule = has_pushable_tasks(rq);
1087#endif
1088
1067 return p; 1089 return p;
1068} 1090}
1069 1091
@@ -1162,13 +1184,6 @@ static int find_lowest_rq(struct task_struct *task)
1162 return -1; /* No targets found */ 1184 return -1; /* No targets found */
1163 1185
1164 /* 1186 /*
1165 * Only consider CPUs that are usable for migration.
1166 * I guess we might want to change cpupri_find() to ignore those
1167 * in the first place.
1168 */
1169 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171 /*
1172 * At this point we have built a mask of cpus representing the 1187 * At this point we have built a mask of cpus representing the
1173 * lowest priority tasks in the system. Now we want to elect 1188 * lowest priority tasks in the system. Now we want to elect
1174 * the best one based on our affinity and topology. 1189 * the best one based on our affinity and topology.
@@ -1262,11 +1277,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1262 return lowest_rq; 1277 return lowest_rq;
1263} 1278}
1264 1279
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267 return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq) 1280static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{ 1281{
1272 struct task_struct *p; 1282 struct task_struct *p;
@@ -1466,23 +1476,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1466 pull_rt_task(rq); 1476 pull_rt_task(rq);
1467} 1477}
1468 1478
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474 return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq) 1479static void post_schedule_rt(struct rq *rq)
1478{ 1480{
1479 /*
1480 * This is only called if needs_post_schedule_rt() indicates that
1481 * we need to push tasks away
1482 */
1483 spin_lock_irq(&rq->lock);
1484 push_rt_tasks(rq); 1481 push_rt_tasks(rq);
1485 spin_unlock_irq(&rq->lock);
1486} 1482}
1487 1483
1488/* 1484/*
@@ -1758,7 +1754,6 @@ static const struct sched_class rt_sched_class = {
1758 .rq_online = rq_online_rt, 1754 .rq_online = rq_online_rt,
1759 .rq_offline = rq_offline_rt, 1755 .rq_offline = rq_offline_rt,
1760 .pre_schedule = pre_schedule_rt, 1756 .pre_schedule = pre_schedule_rt,
1761 .needs_post_schedule = needs_post_schedule_rt,
1762 .post_schedule = post_schedule_rt, 1757 .post_schedule = post_schedule_rt,
1763 .task_wake_up = task_wake_up_rt, 1758 .task_wake_up = task_wake_up_rt,
1764 .switched_from = switched_from_rt, 1759 .switched_from = switched_from_rt,
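The dropped needs_post_schedule hook is replaced by a flag: pick_next_task_rt() records whether pushable tasks exist in rq->post_schedule while the runqueue lock is already held, so the post-schedule path can skip retaking the lock when there is nothing to push. The sketch below shows just that pattern; the structure and function names are illustrative, not scheduler code:

#include <stdio.h>

struct runqueue {
        int nr_pushable;        /* RT tasks that could run elsewhere */
        int post_schedule;      /* decided while the rq lock is already held */
};

static void pick_next(struct runqueue *rq)
{
        /* in the scheduler this runs under rq->lock anyway, so it is free */
        rq->post_schedule = rq->nr_pushable > 0;
}

static void post_schedule(struct runqueue *rq)
{
        if (!rq->post_schedule)
                return;                 /* skip retaking the lock entirely */
        /* lock, push the pushable tasks, unlock (elided here) */
        printf("pushing %d task(s)\n", rq->nr_pushable);
        rq->post_schedule = 0;
}

int main(void)
{
        struct runqueue rq = { .nr_pushable = 2 };

        pick_next(&rq);
        post_schedule(&rq);
        return 0;
}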
diff --git a/kernel/smp.c b/kernel/smp.c
index 94188b8ecc33..8e218500ab14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -177,6 +177,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 177 int cpu = get_cpu();
178 178
179 /* 179 /*
180 * Shouldn't receive this interrupt on a cpu that is not yet online.
181 */
182 WARN_ON_ONCE(!cpu_online(cpu));
183
184 /*
180 * Ensure entry is visible on call_function_queue after we have 185 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 186 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 187 * If we don't have this, then we may miss an entry on the list
@@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 235 unsigned int data_flags;
231 LIST_HEAD(list); 236 LIST_HEAD(list);
232 237
238 /*
239 * Shouldn't receive this interrupt on a cpu that is not yet online.
240 */
241 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
242
233 spin_lock(&q->lock); 243 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 244 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 245 spin_unlock(&q->lock);
@@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 295 */
286 this_cpu = get_cpu(); 296 this_cpu = get_cpu();
287 297
288 /* Can deadlock when called with interrupts disabled */ 298 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 299 * Can deadlock when called with interrupts disabled.
 300 * We allow cpus that are not yet online though, as no one else can
301 * send smp call function interrupt to this cpu and as such deadlocks
302 * can't happen.
303 */
304 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
305 && !oops_in_progress);
290 306
291 if (cpu == this_cpu) { 307 if (cpu == this_cpu) {
292 local_irq_save(flags); 308 local_irq_save(flags);
@@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 345{
330 csd_lock(data); 346 csd_lock(data);
331 347
332 /* Can deadlock when called with interrupts disabled */ 348 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 349 * Can deadlock when called with interrupts disabled.
 350 * We allow cpus that are not yet online though, as no one else can
351 * send smp call function interrupt to this cpu and as such deadlocks
352 * can't happen.
353 */
354 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
355 && !oops_in_progress);
334 356
335 generic_exec_single(cpu, data, wait); 357 generic_exec_single(cpu, data, wait);
336} 358}
@@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 387 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 388 int cpu, next_cpu, this_cpu = smp_processor_id();
367 389
368 /* Can deadlock when called with interrupts disabled */ 390 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 391 * Can deadlock when called with interrupts disabled.
 392 * We allow cpus that are not yet online though, as no one else can
393 * send smp call function interrupt to this cpu and as such deadlocks
394 * can't happen.
395 */
396 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
397 && !oops_in_progress);
370 398
371 /* So, what's a CPU they want? Ignoring this one. */ 399 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 400 cpu = cpumask_first_and(mask, cpu_online_mask);
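The relaxed WARN_ON_ONCE() in the hunks above still trips on the classic deadlock (sending an IPI with interrupts disabled) but tolerates a CPU that is not yet online, since nothing can target it with a call-function IPI. A small truth-table sketch of the condition, outside the kernel:

#include <stdio.h>

/* Mirrors only the boolean logic of the new warning; not kernel code. */
static int should_warn(int cpu_online, int irqs_disabled, int oops_in_progress)
{
        return cpu_online && irqs_disabled && !oops_in_progress;
}

int main(void)
{
        /* online CPU, IRQs off: possible deadlock, warn */
        printf("%d\n", should_warn(1, 1, 0));   /* 1 */
        /* CPU still booting, IRQs off: nobody can IPI it, stay quiet */
        printf("%d\n", should_warn(0, 1, 0));   /* 0 */
        /* oops in progress: warnings suppressed */
        printf("%d\n", should_warn(1, 1, 1));   /* 0 */
        return 0;
}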
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a0485..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
722 cond_resched(); 722 cond_resched();
723 preempt_disable(); 723 preempt_disable();
724 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
725 } 725 }
726 preempt_enable(); 726 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
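Each generic lock function is now guarded by #ifndef, so a configuration that defines the corresponding macro (for example to inline the operation or supply an arch-specific version) compiles the out-of-line body away, leaving only a call into the __spin_lock()-style helpers. The guard pattern in isolation, with an invented name standing in for the real lock ops:

#include <stdio.h>

/*
 * A config or arch header that wants its own implementation does, roughly:
 *     #define do_lock do_lock
 *     static inline void do_lock(void) { ... optimized version ... }
 * before this file is compiled; the generic body below then drops out.
 * "do_lock" is a made-up name, not one of the kernel's lock functions.
 */
#ifndef do_lock
static void do_lock(void)
{
        puts("generic out-of-line implementation");
}
#endif

int main(void)
{
        do_lock();
        return 0;
}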
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd0..1a631ba684a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,7 +49,6 @@
49#include <linux/acpi.h> 49#include <linux/acpi.h>
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
53#include <linux/slow-work.h> 52#include <linux/slow-work.h>
54#include <linux/perf_counter.h> 53#include <linux/perf_counter.h>
55 54
@@ -92,6 +91,9 @@ extern int sysctl_nr_trim_pages;
92#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
93extern int rcutorture_runnable; 92extern int rcutorture_runnable;
94#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled;
96#endif
95 97
96/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
97#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -246,6 +248,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
246#endif 248#endif
247 249
248static struct ctl_table kern_table[] = { 250static struct ctl_table kern_table[] = {
251 {
252 .ctl_name = CTL_UNNUMBERED,
253 .procname = "sched_child_runs_first",
254 .data = &sysctl_sched_child_runs_first,
255 .maxlen = sizeof(unsigned int),
256 .mode = 0644,
257 .proc_handler = &proc_dointvec,
258 },
249#ifdef CONFIG_SCHED_DEBUG 259#ifdef CONFIG_SCHED_DEBUG
250 { 260 {
251 .ctl_name = CTL_UNNUMBERED, 261 .ctl_name = CTL_UNNUMBERED,
@@ -300,14 +310,6 @@ static struct ctl_table kern_table[] = {
300 }, 310 },
301 { 311 {
302 .ctl_name = CTL_UNNUMBERED, 312 .ctl_name = CTL_UNNUMBERED,
303 .procname = "sched_child_runs_first",
304 .data = &sysctl_sched_child_runs_first,
305 .maxlen = sizeof(unsigned int),
306 .mode = 0644,
307 .proc_handler = &proc_dointvec,
308 },
309 {
310 .ctl_name = CTL_UNNUMBERED,
311 .procname = "sched_features", 313 .procname = "sched_features",
312 .data = &sysctl_sched_features, 314 .data = &sysctl_sched_features,
313 .maxlen = sizeof(unsigned int), 315 .maxlen = sizeof(unsigned int),
@@ -332,6 +334,14 @@ static struct ctl_table kern_table[] = {
332 }, 334 },
333 { 335 {
334 .ctl_name = CTL_UNNUMBERED, 336 .ctl_name = CTL_UNNUMBERED,
337 .procname = "sched_time_avg",
338 .data = &sysctl_sched_time_avg,
339 .maxlen = sizeof(unsigned int),
340 .mode = 0644,
341 .proc_handler = &proc_dointvec,
342 },
343 {
344 .ctl_name = CTL_UNNUMBERED,
335 .procname = "timer_migration", 345 .procname = "timer_migration",
336 .data = &sysctl_timer_migration, 346 .data = &sysctl_timer_migration,
337 .maxlen = sizeof(unsigned int), 347 .maxlen = sizeof(unsigned int),
@@ -990,7 +1000,16 @@ static struct ctl_table kern_table[] = {
990 .proc_handler = &proc_dointvec, 1000 .proc_handler = &proc_dointvec,
991 }, 1001 },
992#endif 1002#endif
993 1003#ifdef CONFIG_BLOCK
1004 {
1005 .ctl_name = CTL_UNNUMBERED,
1006 .procname = "blk_iopoll",
1007 .data = &blk_iopoll_enabled,
1008 .maxlen = sizeof(int),
1009 .mode = 0644,
1010 .proc_handler = &proc_dointvec,
1011 },
1012#endif
994/* 1013/*
995 * NOTE: do not add new entries to this table unless you have read 1014 * NOTE: do not add new entries to this table unless you have read
996 * Documentation/sysctl/ctl_unnumbered.txt 1015 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
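set_normalized_timespec() now accepts an s64 nanosecond count, and the empty gcc-style asm() keeps the compiler from collapsing the normalization loop into a 64-bit division or modulo. A standalone copy of the same loop for experimentation; the struct and field names deliberately differ from struct timespec:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000LL

struct ts { long long sec; long long nsec; };   /* stand-in, not struct timespec */

static void normalize(struct ts *t, long long sec, long long nsec)
{
        while (nsec >= NSEC_PER_SEC) {
                /* the empty asm keeps gcc from rewriting the loop as a modulo */
                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        while (nsec < 0) {
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        t->sec = sec;
        t->nsec = nsec;
}

int main(void)
{
        struct ts t;

        normalize(&t, 5, 2500000000LL);                 /* 5 s plus 2.5 s of nanoseconds */
        printf("%lld.%09lld\n", t.sec, t.nsec);         /* 7.500000000 */
        return 0;
}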
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7466cb811251..09113347d328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
165 154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 mutex_lock(&clocksource_mutex);
300 398
301 list_for_each_entry(cs, &clocksource_list, list) { 399 list_for_each_entry(cs, &clocksource_list, list)
302 if (cs->resume) 400 if (cs->resume)
303 cs->resume(); 401 cs->resume();
304 }
305 402
306 clocksource_resume_watchdog(); 403 clocksource_resume_watchdog();
307 404
308 spin_unlock_irqrestore(&clocksource_lock, flags); 405 mutex_unlock(&clocksource_mutex);
309} 406}
310 407
311/** 408/**
@@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 417 clocksource_resume_watchdog();
321} 418}
322 419
420#ifdef CONFIG_GENERIC_TIME
421
323/** 422/**
324 * clocksource_get_next - Returns the selected clocksource 423 * clocksource_select - Select the best clocksource available
424 *
425 * Private function. Must hold clocksource_mutex when called.
325 * 426 *
427 * Select the clocksource with the best rating, or the clocksource,
428 * which is selected by userspace override.
326 */ 429 */
327struct clocksource *clocksource_get_next(void) 430static void clocksource_select(void)
328{ 431{
329 unsigned long flags; 432 struct clocksource *best, *cs;
330 433
331 spin_lock_irqsave(&clocksource_lock, flags); 434 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 435 return;
333 curr_clocksource = next_clocksource; 436 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 437 best = list_first_entry(&clocksource_list, struct clocksource, list);
438 /* Check for the override clocksource. */
439 list_for_each_entry(cs, &clocksource_list, list) {
440 if (strcmp(cs->name, override_name) != 0)
441 continue;
442 /*
443 * Check to make sure we don't switch to a non-highres
444 * capable clocksource if the tick code is in oneshot
445 * mode (highres or nohz)
446 */
447 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
448 tick_oneshot_mode_active()) {
449 /* Override clocksource cannot be used. */
450 printk(KERN_WARNING "Override clocksource %s is not "
451 "HRT compatible. Cannot switch while in "
452 "HRT/NOHZ mode\n", cs->name);
453 override_name[0] = 0;
454 } else
455 /* Override clocksource can be used. */
456 best = cs;
457 break;
458 }
459 if (curr_clocksource != best) {
460 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
461 curr_clocksource = best;
462 timekeeping_notify(curr_clocksource);
335 } 463 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 464}
340 465
341/** 466#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 467
343 * 468static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 469
470#endif
471
472/*
473 * clocksource_done_booting - Called near the end of core bootup
345 * 474 *
346 * Select the clocksource with the best rating, or the clocksource, 475 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 476 * We use fs_initcall because we want this to start before
477 * device_initcall but after subsys_initcall.
348 */ 478 */
349static struct clocksource *select_clocksource(void) 479static int __init clocksource_done_booting(void)
350{ 480{
351 struct clocksource *next; 481 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 482
362 if (next == curr_clocksource) 483 /*
363 return NULL; 484 * Run the watchdog first to eliminate unstable clock sources
485 */
486 clocksource_watchdog_kthread(NULL);
364 487
365 return next; 488 mutex_lock(&clocksource_mutex);
489 clocksource_select();
490 mutex_unlock(&clocksource_mutex);
491 return 0;
366} 492}
493fs_initcall(clocksource_done_booting);
367 494
368/* 495/*
369 * Enqueue the clocksource sorted by rating 496 * Enqueue the clocksource sorted by rating
370 */ 497 */
371static int clocksource_enqueue(struct clocksource *c) 498static void clocksource_enqueue(struct clocksource *cs)
372{ 499{
373 struct list_head *tmp, *entry = &clocksource_list; 500 struct list_head *entry = &clocksource_list;
501 struct clocksource *tmp;
374 502
375 list_for_each(tmp, &clocksource_list) { 503 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
381 /* Keep track of the place, where to insert */ 504 /* Keep track of the place, where to insert */
382 if (cs->rating >= c->rating) 505 if (tmp->rating >= cs->rating)
383 entry = tmp; 506 entry = &tmp->list;
384 } 507 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 508}
393 509
394/** 510/**
@@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 513 *
398 * Returns -EBUSY if registration fails, zero otherwise. 514 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 515 */
400int clocksource_register(struct clocksource *c) 516int clocksource_register(struct clocksource *cs)
401{ 517{
402 unsigned long flags; 518 mutex_lock(&clocksource_mutex);
403 int ret; 519 clocksource_enqueue(cs);
404 520 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 521 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 522 mutex_unlock(&clocksource_mutex);
407 if (!ret) 523 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 524}
414EXPORT_SYMBOL(clocksource_register); 525EXPORT_SYMBOL(clocksource_register);
415 526
527static void __clocksource_change_rating(struct clocksource *cs, int rating)
528{
529 list_del(&cs->list);
530 cs->rating = rating;
531 clocksource_enqueue(cs);
532 clocksource_select();
533}
534
416/** 535/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 536 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 537 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 538void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 539{
422 unsigned long flags; 540 mutex_lock(&clocksource_mutex);
423 541 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 542 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 543}
544EXPORT_SYMBOL(clocksource_change_rating);
431 545
432/** 546/**
433 * clocksource_unregister - remove a registered clocksource 547 * clocksource_unregister - remove a registered clocksource
434 */ 548 */
435void clocksource_unregister(struct clocksource *cs) 549void clocksource_unregister(struct clocksource *cs)
436{ 550{
437 unsigned long flags; 551 mutex_lock(&clocksource_mutex);
438 552 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 553 list_del(&cs->list);
441 if (clocksource_override == cs) 554 clocksource_select();
442 clocksource_override = NULL; 555 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 556}
557EXPORT_SYMBOL(clocksource_unregister);
446 558
447#ifdef CONFIG_SYSFS 559#ifdef CONFIG_SYSFS
448/** 560/**
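The rewritten watchdog above no longer demotes a bad clocksource from timer context: clocksource_watchdog() only converts both cycle deltas to nanoseconds with clocksource_cyc2ns(), flags a clocksource whose interval disagrees with the watchdog's by more than WATCHDOG_THRESHOLD as CLOCK_SOURCE_UNSTABLE, and defers the rating change to a kthread launched from a workqueue, so clocksource_mutex is only ever taken in process context. A self-contained model of the per-clocksource comparison, with assumed field names:

#include <stdint.h>

#define NSEC_PER_SEC            1000000000LL
#define WATCHDOG_THRESHOLD      (NSEC_PER_SEC >> 4)     /* 62.5 ms */

struct clk { uint64_t mask, wd_last; uint32_t mult, shift; };

static inline int64_t cyc2ns(uint64_t delta, uint32_t mult, uint32_t shift)
{
        return (int64_t)((delta * mult) >> shift);      /* clocksource_cyc2ns() */
}

/* Returns nonzero if the watched clock drifted past the threshold
 * relative to the watchdog clock over the last interval. */
static int check_deviation(struct clk *cs, uint64_t csnow,
                           struct clk *wd, uint64_t wdnow)
{
        int64_t cs_nsec = cyc2ns((csnow - cs->wd_last) & cs->mask,
                                 cs->mult, cs->shift);
        int64_t wd_nsec = cyc2ns((wdnow - wd->wd_last) & wd->mask,
                                 wd->mult, wd->shift);
        int64_t delta = cs_nsec - wd_nsec;

        cs->wd_last = csnow;
        wd->wd_last = wdnow;
        return delta > WATCHDOG_THRESHOLD || delta < -WATCHDOG_THRESHOLD;
}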
@@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 570{
459 ssize_t count = 0; 571 ssize_t count = 0;
460 572
461 spin_lock_irq(&clocksource_lock); 573 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 574 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 575 mutex_unlock(&clocksource_mutex);
464 576
465 return count; 577 return count;
466} 578}
@@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 590 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 591 const char *buf, size_t count)
480{ 592{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 593 size_t ret = count;
483 int len;
484 594
485 /* strings from sysfs write are not 0 terminated! */ 595 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 596 if (count >= sizeof(override_name))
@@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 600 if (buf[count-1] == '\n')
491 count--; 601 count--;
492 602
493 spin_lock_irq(&clocksource_lock); 603 mutex_lock(&clocksource_mutex);
494 604
495 if (count > 0) 605 if (count > 0)
496 memcpy(override_name, buf, count); 606 memcpy(override_name, buf, count);
497 override_name[count] = 0; 607 override_name[count] = 0;
608 clocksource_select();
498 609
499 len = strlen(override_name); 610 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() && ovr &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 611
532 return ret; 612 return ret;
533} 613}
@@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 627 struct clocksource *src;
548 ssize_t count = 0; 628 ssize_t count = 0;
549 629
550 spin_lock_irq(&clocksource_lock); 630 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 631 list_for_each_entry(src, &clocksource_list, list) {
552 /* 632 /*
553 * Don't show non-HRES clocksource if the tick code is 633 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 639 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 640 "%s ", src->name);
561 } 641 }
562 spin_unlock_irq(&clocksource_lock); 642 mutex_unlock(&clocksource_mutex);
563 643
564 count += snprintf(buf + count, 644 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 645 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 694 */
615static int __init boot_override_clocksource(char* str) 695static int __init boot_override_clocksource(char* str)
616{ 696{
617 unsigned long flags; 697 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 698 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 699 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 700 mutex_unlock(&clocksource_mutex);
622 return 1; 701 return 1;
623} 702}
624 703
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
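clocksource_default_clock() is defined __weak so that the jiffies clocksource is only the fallback: an architecture with a better boot-time clock overrides it by providing a strong definition of the same symbol, and the linker prefers that automatically. The pattern, with hypothetical names and the two definitions living in separate objects:

/* generic.c: weak fallback used when nothing overrides it */
int __attribute__((weak)) boot_clock_rating(void)
{
        return 1;       /* "jiffies"-grade clock */
}

/* arch.c (separate translation unit): a strong definition with the
 * same name wins at link time, no registration call needed. */
int boot_clock_rating(void)
{
        return 400;     /* arch-specific high-resolution clock */
}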
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
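The leap-second step itself moves out of ntp.c: instead of poking xtime and wall_to_monotonic directly, the NTP state machine now calls timekeeping_leap_insert(), which, with xtime_lock already held by the caller, adjusts the two fields in opposite directions and refreshes the vsyscall data itself. The invariant it preserves, as a standalone model with assumed globals:

struct ts { long long sec; long nsec; };

static struct ts xtime;                 /* wall clock */
static struct ts wall_to_monotonic;     /* monotonic = wall + this */

static void leap_insert(int leapsecond) /* -1 inserts, +1 deletes */
{
        xtime.sec += leapsecond;
        wall_to_monotonic.sec -= leapsecond;
        /* xtime + wall_to_monotonic is unchanged, so CLOCK_MONOTONIC
         * never observes the leap second. */
}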
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/tick.h> 20#include <linux/tick.h>
21#include <linux/stop_machine.h>
22
23/* Structure holding internal timekeeping values. */
24struct timekeeper {
25 /* Current clocksource used for timekeeping. */
26 struct clocksource *clock;
27 /* The shift value of the current clocksource. */
28 int shift;
29
30 /* Number of clock cycles in one NTP interval. */
31 cycle_t cycle_interval;
32 /* Number of clock shifted nano seconds in one NTP interval. */
33 u64 xtime_interval;
34 /* Raw nano seconds accumulated per NTP interval. */
35 u32 raw_interval;
36
37 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
38 u64 xtime_nsec;
39 /* Difference between accumulated time and NTP time in ntp
40 * shifted nano seconds. */
41 s64 ntp_error;
42 /* Shift conversion between clock shifted nano seconds and
43 * ntp shifted nano seconds. */
44 int ntp_error_shift;
45 /* NTP adjusted clock multiplier */
46 u32 mult;
47};
48
49struct timekeeper timekeeper;
50
51/**
52 * timekeeper_setup_internals - Set up internals to use clocksource clock.
53 *
54 * @clock: Pointer to clocksource.
55 *
56 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
57 * pair and interval request.
58 *
59 * Unless you're the timekeeping code, you should not be using this!
60 */
61static void timekeeper_setup_internals(struct clocksource *clock)
62{
63 cycle_t interval;
64 u64 tmp;
65
66 timekeeper.clock = clock;
67 clock->cycle_last = clock->read(clock);
21 68
69 /* Do the ns -> cycle conversion first, using original mult */
70 tmp = NTP_INTERVAL_LENGTH;
71 tmp <<= clock->shift;
72 tmp += clock->mult/2;
73 do_div(tmp, clock->mult);
74 if (tmp == 0)
75 tmp = 1;
76
77 interval = (cycle_t) tmp;
78 timekeeper.cycle_interval = interval;
79
80 /* Go back from cycles -> shifted ns */
81 timekeeper.xtime_interval = (u64) interval * clock->mult;
82 timekeeper.raw_interval =
83 ((u64) interval * clock->mult) >> clock->shift;
84
85 timekeeper.xtime_nsec = 0;
86 timekeeper.shift = clock->shift;
87
88 timekeeper.ntp_error = 0;
89 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
90
91 /*
92 * The timekeeper keeps its own mult values for the currently
93 * active clocksource. These value will be adjusted via NTP
94 * to counteract clock drifting.
95 */
96 timekeeper.mult = clock->mult;
97}
98
99/* Timekeeper helper functions. */
100static inline s64 timekeeping_get_ns(void)
101{
102 cycle_t cycle_now, cycle_delta;
103 struct clocksource *clock;
104
105 /* read clocksource: */
106 clock = timekeeper.clock;
107 cycle_now = clock->read(clock);
108
109 /* calculate the delta since the last update_wall_time: */
110 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
111
112 /* return delta convert to nanoseconds using ntp adjusted mult. */
113 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
114 timekeeper.shift);
115}
116
117static inline s64 timekeeping_get_ns_raw(void)
118{
119 cycle_t cycle_now, cycle_delta;
120 struct clocksource *clock;
121
122 /* read clocksource: */
123 clock = timekeeper.clock;
124 cycle_now = clock->read(clock);
125
126 /* calculate the delta since the last update_wall_time: */
127 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
128
129 /* return delta convert to nanoseconds using ntp adjusted mult. */
130 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
131}
22 132
23/* 133/*
24 * This read-write spinlock protects us from races in SMP while 134 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 154 */
45struct timespec xtime __attribute__ ((aligned (16))); 155struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 156struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 157static struct timespec total_sleep_time;
158
159/*
160 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
161 */
162struct timespec raw_time;
48 163
49/* flag for if timekeeping is suspended */ 164/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 165int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 171 timespec_add_ns(&xtime_cache, nsec);
57} 172}
58 173
59struct clocksource *clock; 174/* must hold xtime_lock */
60 175void timekeeping_leap_insert(int leapsecond)
176{
177 xtime.tv_sec += leapsecond;
178 wall_to_monotonic.tv_sec -= leapsecond;
179 update_vsyscall(&xtime, timekeeper.clock);
180}
61 181
62#ifdef CONFIG_GENERIC_TIME 182#ifdef CONFIG_GENERIC_TIME
183
63/** 184/**
64 * clocksource_forward_now - update clock to the current time 185 * timekeeping_forward_now - update clock to the current time
65 * 186 *
66 * Forward the current clock to update its state since the last call to 187 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 188 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 189 * as it avoids having to deal with this time offset explicitly.
69 */ 190 */
70static void clocksource_forward_now(void) 191static void timekeeping_forward_now(void)
71{ 192{
72 cycle_t cycle_now, cycle_delta; 193 cycle_t cycle_now, cycle_delta;
194 struct clocksource *clock;
73 s64 nsec; 195 s64 nsec;
74 196
75 cycle_now = clocksource_read(clock); 197 clock = timekeeper.clock;
198 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 200 clock->cycle_last = cycle_now;
78 201
79 nsec = cyc2ns(clock, cycle_delta); 202 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
203 timekeeper.shift);
80 204
81 /* If arch requires, add in gettimeoffset() */ 205 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 206 nsec += arch_gettimeoffset();
83 207
84 timespec_add_ns(&xtime, nsec); 208 timespec_add_ns(&xtime, nsec);
85 209
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 210 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 211 timespec_add_ns(&raw_time, nsec);
88} 212}
89 213
90/** 214/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
95 */ 219 */
96void getnstimeofday(struct timespec *ts) 220void getnstimeofday(struct timespec *ts)
97{ 221{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 222 unsigned long seq;
100 s64 nsecs; 223 s64 nsecs;
101 224
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 228 seq = read_seqbegin(&xtime_lock);
106 229
107 *ts = xtime; 230 *ts = xtime;
108 231 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 232
118 /* If arch requires, add in gettimeoffset() */ 233 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 234 nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
125 240
126EXPORT_SYMBOL(getnstimeofday); 241EXPORT_SYMBOL(getnstimeofday);
127 242
243ktime_t ktime_get(void)
244{
245 unsigned int seq;
246 s64 secs, nsecs;
247
248 WARN_ON(timekeeping_suspended);
249
250 do {
251 seq = read_seqbegin(&xtime_lock);
252 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
253 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
254 nsecs += timekeeping_get_ns();
255
256 } while (read_seqretry(&xtime_lock, seq));
257 /*
258 * Use ktime_set/ktime_add_ns to create a proper ktime on
259 * 32-bit architectures without CONFIG_KTIME_SCALAR.
260 */
261 return ktime_add_ns(ktime_set(secs, 0), nsecs);
262}
263EXPORT_SYMBOL_GPL(ktime_get);
264
265/**
266 * ktime_get_ts - get the monotonic clock in timespec format
267 * @ts: pointer to timespec variable
268 *
269 * The function calculates the monotonic clock from the realtime
270 * clock and the wall_to_monotonic offset and stores the result
271 * in normalized timespec format in the variable pointed to by @ts.
272 */
273void ktime_get_ts(struct timespec *ts)
274{
275 struct timespec tomono;
276 unsigned int seq;
277 s64 nsecs;
278
279 WARN_ON(timekeeping_suspended);
280
281 do {
282 seq = read_seqbegin(&xtime_lock);
283 *ts = xtime;
284 tomono = wall_to_monotonic;
285 nsecs = timekeeping_get_ns();
286
287 } while (read_seqretry(&xtime_lock, seq));
288
289 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
290 ts->tv_nsec + tomono.tv_nsec + nsecs);
291}
292EXPORT_SYMBOL_GPL(ktime_get_ts);
293
128/** 294/**
129 * do_gettimeofday - Returns the time of day in a timeval 295 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 296 * @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
157 323
158 write_seqlock_irqsave(&xtime_lock, flags); 324 write_seqlock_irqsave(&xtime_lock, flags);
159 325
160 clocksource_forward_now(); 326 timekeeping_forward_now();
161 327
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 328 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 329 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
167 333
168 update_xtime_cache(0); 334 update_xtime_cache(0);
169 335
170 clock->error = 0; 336 timekeeper.ntp_error = 0;
171 ntp_clear(); 337 ntp_clear();
172 338
173 update_vsyscall(&xtime, clock); 339 update_vsyscall(&xtime, timekeeper.clock);
174 340
175 write_sequnlock_irqrestore(&xtime_lock, flags); 341 write_sequnlock_irqrestore(&xtime_lock, flags);
176 342
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 353 *
188 * Accumulates current time interval and initializes new clocksource 354 * Accumulates current time interval and initializes new clocksource
189 */ 355 */
190static void change_clocksource(void) 356static int change_clocksource(void *data)
191{ 357{
192 struct clocksource *new, *old; 358 struct clocksource *new, *old;
193 359
194 new = clocksource_get_next(); 360 new = (struct clocksource *) data;
361
362 timekeeping_forward_now();
363 if (!new->enable || new->enable(new) == 0) {
364 old = timekeeper.clock;
365 timekeeper_setup_internals(new);
366 if (old->disable)
367 old->disable(old);
368 }
369 return 0;
370}
195 371
196 if (clock == new) 372/**
373 * timekeeping_notify - Install a new clock source
374 * @clock: pointer to the clock source
375 *
376 * This function is called from clocksource.c after a new, better clock
377 * source has been registered. The caller holds the clocksource_mutex.
378 */
379void timekeeping_notify(struct clocksource *clock)
380{
381 if (timekeeper.clock == clock)
197 return; 382 return;
383 stop_machine(change_clocksource, clock, NULL);
384 tick_clock_notify();
385}
198 386
199 clocksource_forward_now(); 387#else /* GENERIC_TIME */
200 388
201 if (clocksource_enable(new)) 389static inline void timekeeping_forward_now(void) { }
202 return;
203 390
204 new->raw_time = clock->raw_time; 391/**
205 old = clock; 392 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 393 *
207 clocksource_disable(old); 394 * returns the time in ktime_t format
395 */
396ktime_t ktime_get(void)
397{
398 struct timespec now;
208 399
209 clock->cycle_last = 0; 400 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 401
215 tick_clock_notify(); 402 return timespec_to_ktime(now);
403}
404EXPORT_SYMBOL_GPL(ktime_get);
216 405
217 /* 406/**
218 * We're holding xtime lock and waking up klogd would deadlock 407 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 408 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 409 *
221 clock->name); 410 * The function calculates the monotonic clock from the realtime
222 */ 411 * clock and the wall_to_monotonic offset and stores the result
412 * in normalized timespec format in the variable pointed to by @ts.
413 */
414void ktime_get_ts(struct timespec *ts)
415{
416 struct timespec tomono;
417 unsigned long seq;
418
419 do {
420 seq = read_seqbegin(&xtime_lock);
421 getnstimeofday(ts);
422 tomono = wall_to_monotonic;
423
424 } while (read_seqretry(&xtime_lock, seq));
425
426 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
427 ts->tv_nsec + tomono.tv_nsec);
223} 428}
224#else 429EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 430
226static inline void change_clocksource(void) { } 431#endif /* !GENERIC_TIME */
227#endif 432
433/**
434 * ktime_get_real - get the real (wall-) time in ktime_t format
435 *
436 * returns the time in ktime_t format
437 */
438ktime_t ktime_get_real(void)
439{
440 struct timespec now;
441
442 getnstimeofday(&now);
443
444 return timespec_to_ktime(now);
445}
446EXPORT_SYMBOL_GPL(ktime_get_real);
228 447
229/** 448/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 449 * getrawmonotonic - Returns the raw monotonic time in a timespec
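Switching clocks is no longer done lazily from update_wall_time(): timekeeping_notify() hands the new clocksource to change_clocksource() through stop_machine(), so every CPU is parked while timekeeping_forward_now() closes out the old clock and timekeeper_setup_internals() installs the new one. The shape of that call as a sketch; the swap helper is hypothetical and kernel context (linux/stop_machine.h) is assumed.

static int do_swap_clock(void *data)
{
        struct clocksource *new = data;

        /* Runs while all other CPUs are held in a known quiescent state,
         * so no reader can sample half-updated timekeeper fields. */
        install_clock(new);             /* hypothetical helper */
        return 0;
}

static void swap_clock(struct clocksource *new)
{
        stop_machine(do_swap_clock, new, NULL);
}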
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 455{
237 unsigned long seq; 456 unsigned long seq;
238 s64 nsecs; 457 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 458
241 do { 459 do {
242 seq = read_seqbegin(&xtime_lock); 460 seq = read_seqbegin(&xtime_lock);
243 461 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 462 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 463
255 } while (read_seqretry(&xtime_lock, seq)); 464 } while (read_seqretry(&xtime_lock, seq));
256 465
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
270 do { 479 do {
271 seq = read_seqbegin(&xtime_lock); 480 seq = read_seqbegin(&xtime_lock);
272 481
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 482 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 483
275 } while (read_seqretry(&xtime_lock, seq)); 484 } while (read_seqretry(&xtime_lock, seq));
276 485
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
278} 487}
279 488
280/** 489/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 490 * read_persistent_clock - Return time from the persistent clock.
282 * 491 *
283 * Weak dummy function for arches that do not yet support it. 492 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 493 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 494 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 495 *
287 * XXX - Do be sure to remove it once all arches implement it. 496 * XXX - Do be sure to remove it once all arches implement it.
288 */ 497 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 498void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 499{
291 return 0; 500 ts->tv_sec = 0;
501 ts->tv_nsec = 0;
502}
503
504/**
505 * read_boot_clock - Return time of the system start.
506 *
507 * Weak dummy function for arches that do not yet support it.
508 * Function to read the exact time the system has been started.
509 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
510 *
511 * XXX - Do be sure to remove it once all arches implement it.
512 */
513void __attribute__((weak)) read_boot_clock(struct timespec *ts)
514{
515 ts->tv_sec = 0;
516 ts->tv_nsec = 0;
292} 517}
293 518
294/* 519/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 521 */
297void __init timekeeping_init(void) 522void __init timekeeping_init(void)
298{ 523{
524 struct clocksource *clock;
299 unsigned long flags; 525 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 526 struct timespec now, boot;
527
528 read_persistent_clock(&now);
529 read_boot_clock(&boot);
301 530
302 write_seqlock_irqsave(&xtime_lock, flags); 531 write_seqlock_irqsave(&xtime_lock, flags);
303 532
304 ntp_init(); 533 ntp_init();
305 534
306 clock = clocksource_get_next(); 535 clock = clocksource_default_clock();
307 clocksource_enable(clock); 536 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 537 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 538 timekeeper_setup_internals(clock);
310 539
311 xtime.tv_sec = sec; 540 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 541 xtime.tv_nsec = now.tv_nsec;
542 raw_time.tv_sec = 0;
543 raw_time.tv_nsec = 0;
544 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
545 boot.tv_sec = xtime.tv_sec;
546 boot.tv_nsec = xtime.tv_nsec;
547 }
313 set_normalized_timespec(&wall_to_monotonic, 548 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 549 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 550 update_xtime_cache(0);
316 total_sleep_time = 0; 551 total_sleep_time.tv_sec = 0;
552 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 553 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 554}
319 555
320/* time in seconds when suspend began */ 556/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 557static struct timespec timekeeping_suspend_time;
322 558
323/** 559/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 560 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 567static int timekeeping_resume(struct sys_device *dev)
332{ 568{
333 unsigned long flags; 569 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 570 struct timespec ts;
571
572 read_persistent_clock(&ts);
335 573
336 clocksource_resume(); 574 clocksource_resume();
337 575
338 write_seqlock_irqsave(&xtime_lock, flags); 576 write_seqlock_irqsave(&xtime_lock, flags);
339 577
340 if (now && (now > timekeeping_suspend_time)) { 578 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 579 ts = timespec_sub(ts, timekeeping_suspend_time);
342 580 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 581 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 582 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 583 }
347 update_xtime_cache(0); 584 update_xtime_cache(0);
348 /* re-base the last cycle value */ 585 /* re-base the last cycle value */
349 clock->cycle_last = 0; 586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 587 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 588 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 589 write_sequnlock_irqrestore(&xtime_lock, flags);
354 590
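Suspend/resume bookkeeping moves from whole seconds to full timespecs: read_persistent_clock() fills a timespec, and the slept interval is applied with timespec_sub()/timespec_add_safe(), so sub-second sleep time is no longer discarded and total_sleep_time keeps nanosecond resolution. The normalization behind those helpers, as a standalone model:

#define NSEC_PER_SEC 1000000000L

struct ts { long long sec; long nsec; };

static struct ts ts_sub(struct ts a, struct ts b)       /* timespec_sub() */
{
        struct ts r = { a.sec - b.sec, a.nsec - b.nsec };

        if (r.nsec < 0) {
                r.nsec += NSEC_PER_SEC;
                r.sec--;
        }
        return r;
}

static struct ts ts_add(struct ts a, struct ts b)       /* timespec_add_safe(),
                                                           minus overflow clamping */
{
        struct ts r = { a.sec + b.sec, a.nsec + b.nsec };

        if (r.nsec >= NSEC_PER_SEC) {
                r.nsec -= NSEC_PER_SEC;
                r.sec++;
        }
        return r;
}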
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 602{
367 unsigned long flags; 603 unsigned long flags;
368 604
369 timekeeping_suspend_time = read_persistent_clock(); 605 read_persistent_clock(&timekeeping_suspend_time);
370 606
371 write_seqlock_irqsave(&xtime_lock, flags); 607 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 608 timekeeping_forward_now();
373 timekeeping_suspended = 1; 609 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 610 write_sequnlock_irqrestore(&xtime_lock, flags);
375 611
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 640 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 641 * to compensate for late or lost adjustments.
406 */ 642 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 643static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 644 s64 *offset)
409{ 645{
410 s64 tick_error, i; 646 s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 656 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 657 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 658 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 659 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 660 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 661 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 662 error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 665 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 666 * remove the single look ahead already included in the error.
431 */ 667 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 668 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 669 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 670 error = ((error - tick_error) >> look_ahead) + tick_error;
435 671
436 /* Finally calculate the adjustment shift value. */ 672 /* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 691 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 692 * for other values we can do a bit more work.
457 */ 693 */
458static void clocksource_adjust(s64 offset) 694static void timekeeping_adjust(s64 offset)
459{ 695{
460 s64 error, interval = clock->cycle_interval; 696 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 697 int adj;
462 698
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 699 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 700 if (error > interval) {
465 error >>= 2; 701 error >>= 2;
466 if (likely(error <= interval)) 702 if (likely(error <= interval))
467 adj = 1; 703 adj = 1;
468 else 704 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 705 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 706 } else if (error < -interval) {
471 error >>= 2; 707 error >>= 2;
472 if (likely(error >= -interval)) { 708 if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 710 interval = -interval;
475 offset = -offset; 711 offset = -offset;
476 } else 712 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 713 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 714 } else
479 return; 715 return;
480 716
481 clock->mult += adj; 717 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 718 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 719 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 720 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 721 timekeeper.ntp_error_shift;
486} 722}
487 723
488/** 724/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
492 */ 728 */
493void update_wall_time(void) 729void update_wall_time(void)
494{ 730{
731 struct clocksource *clock;
495 cycle_t offset; 732 cycle_t offset;
733 u64 nsecs;
496 734
497 /* Make sure we're fully resumed: */ 735 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 736 if (unlikely(timekeeping_suspended))
499 return; 737 return;
500 738
739 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 740#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 741 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 742#else
504 offset = clock->cycle_interval; 743 offset = timekeeper.cycle_interval;
505#endif 744#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 745 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 746
508 /* normally this loop will run just once, however in the 747 /* normally this loop will run just once, however in the
509 * case of lost or late ticks, it will accumulate correctly. 748 * case of lost or late ticks, it will accumulate correctly.
510 */ 749 */
511 while (offset >= clock->cycle_interval) { 750 while (offset >= timekeeper.cycle_interval) {
751 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
752
512 /* accumulate one interval */ 753 /* accumulate one interval */
513 offset -= clock->cycle_interval; 754 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 755 clock->cycle_last += timekeeper.cycle_interval;
515 756
516 clock->xtime_nsec += clock->xtime_interval; 757 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 758 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 759 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 760 xtime.tv_sec++;
520 second_overflow(); 761 second_overflow();
521 } 762 }
522 763
523 clock->raw_time.tv_nsec += clock->raw_interval; 764 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 765 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 766 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 767 raw_time.tv_sec++;
527 } 768 }
528 769
529 /* accumulate error between NTP and clock interval */ 770 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 771 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 772 timekeeper.ntp_error -= timekeeper.xtime_interval <<
773 timekeeper.ntp_error_shift;
532 } 774 }
533 775
534 /* correct the clock when NTP error is too big */ 776 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 777 timekeeping_adjust(offset);
536 778
537 /* 779 /*
538 * Since in the loop above, we accumulate any amount of time 780 * Since in the loop above, we accumulate any amount of time
539 * in xtime_nsec over a second into xtime.tv_sec, its possible for 781 * in xtime_nsec over a second into xtime.tv_sec, its possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 782 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 783 * slightly speeding the clocksource up in timekeeping_adjust(),
542 * its possible the required corrective factor to xtime_nsec could 784 * its possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 785 * cause it to underflow.
544 * 786 *
@@ -550,24 +792,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 792 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 793 * xtime_nsec is not as small.
552 */ 794 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 795 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 796 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 797 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 798 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 799 }
558 800
559 /* store full nanoseconds into xtime after rounding it up and 801 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 802 * add the remainder to the error difference.
561 */ 803 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 804 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 805 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 806 timekeeper.ntp_error += timekeeper.xtime_nsec <<
807 timekeeper.ntp_error_shift;
565 808
566 update_xtime_cache(cyc2ns(clock, offset)); 809 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
810 update_xtime_cache(nsecs);
567 811
568 /* check to see if there is a new clocksource to use */ 812 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 813 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 814}
572 815
573/** 816/**
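The hunks above fold the per-clocksource NTP bookkeeping into the static timekeeper, replacing the repeated (NTP_SCALE_SHIFT - clock->shift) with a precomputed timekeeper.ntp_error_shift. A minimal userspace sketch of that shifted fixed-point accumulation; the 4 ms tick and the variable values are illustrative assumptions, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

#define NTP_SCALE_SHIFT 32

int main(void)
{
	unsigned int shift = 22;			/* example clocksource shift */
	unsigned int ntp_error_shift = NTP_SCALE_SHIFT - shift;
	int64_t ntp_error = 0;				/* ns scaled by 2^NTP_SCALE_SHIFT */
	int64_t tick_length = 4000000LL << NTP_SCALE_SHIFT;	/* ideal 4 ms tick */
	int64_t xtime_interval = 3999999LL << shift;	/* what one cycle_interval adds */
	int tick;

	for (tick = 0; tick < 5; tick++) {
		/* same two steps as the accumulation loop in update_wall_time():
		 * shifting by ntp_error_shift puts both terms in the same scale */
		ntp_error += tick_length;
		ntp_error -= xtime_interval << ntp_error_shift;
	}

	printf("accumulated error: %lld ns\n",
	       (long long)(ntp_error >> NTP_SCALE_SHIFT));	/* prints 5 */
	return 0;
}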
@@ -583,9 +826,12 @@ void update_wall_time(void)
583 */ 826 */
584void getboottime(struct timespec *ts) 827void getboottime(struct timespec *ts)
585{ 828{
586 set_normalized_timespec(ts, 829 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 830 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 831 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
832 };
833
834 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 835}
590 836
591/** 837/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
594 */ 840 */
595void monotonic_to_bootbased(struct timespec *ts) 841void monotonic_to_bootbased(struct timespec *ts)
596{ 842{
597 ts->tv_sec += total_sleep_time; 843 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 844}
599 845
600unsigned long get_seconds(void) 846unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
603} 849}
604EXPORT_SYMBOL(get_seconds); 850EXPORT_SYMBOL(get_seconds);
605 851
852struct timespec __current_kernel_time(void)
853{
854 return xtime_cache;
855}
606 856
607struct timespec current_kernel_time(void) 857struct timespec current_kernel_time(void)
608{ 858{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
618 return now; 868 return now;
619} 869}
620EXPORT_SYMBOL(current_kernel_time); 870EXPORT_SYMBOL(current_kernel_time);
871
872struct timespec get_monotonic_coarse(void)
873{
874 struct timespec now, mono;
875 unsigned long seq;
876
877 do {
878 seq = read_seqbegin(&xtime_lock);
879
880 now = xtime_cache;
881 mono = wall_to_monotonic;
882 } while (read_seqretry(&xtime_lock, seq));
883
884 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
885 now.tv_nsec + mono.tv_nsec);
886 return now;
887}
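get_monotonic_coarse() above uses the standard seqlock read pattern: snapshot xtime_cache and wall_to_monotonic, and retry if a writer touched xtime_lock in between. A single-threaded toy of just the retry loop; the demo_* names are invented and real seqlocks also need memory barriers, omitted here:

#include <stdio.h>

static unsigned demo_seq;		/* even: no writer, odd: write in progress */
static long demo_sec, demo_nsec;	/* the data being protected */

static unsigned demo_read_begin(void)		{ return demo_seq; }
static int demo_read_retry(unsigned start)	{ return (start & 1) || demo_seq != start; }

int main(void)
{
	unsigned seq;
	long sec, nsec;

	do {
		seq = demo_read_begin();
		sec = demo_sec;		/* copy the snapshot... */
		nsec = demo_nsec;
	} while (demo_read_retry(seq));	/* ...and retry if it could be torn */

	printf("%ld.%09ld\n", sec, nsec);
	return 0;
}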
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..bbb51074680e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -72,6 +72,7 @@ struct tvec_base {
72 spinlock_t lock; 72 spinlock_t lock;
73 struct timer_list *running_timer; 73 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 74 unsigned long timer_jiffies;
75 unsigned long next_timer;
75 struct tvec_root tv1; 76 struct tvec_root tv1;
76 struct tvec tv2; 77 struct tvec tv2;
77 struct tvec tv3; 78 struct tvec tv3;
@@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 623
623 if (timer_pending(timer)) { 624 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 625 detach_timer(timer, 0);
626 if (timer->expires == base->next_timer &&
627 !tbase_get_deferrable(timer->base))
628 base->next_timer = base->timer_jiffies;
625 ret = 1; 629 ret = 1;
626 } else { 630 } else {
627 if (pending_only) 631 if (pending_only)
@@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 667 }
664 668
665 timer->expires = expires; 669 timer->expires = expires;
670 if (time_before(timer->expires, base->next_timer) &&
671 !tbase_get_deferrable(timer->base))
672 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 673 internal_add_timer(base, timer);
667 674
668out_unlock: 675out_unlock:
@@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
781 spin_lock_irqsave(&base->lock, flags); 788 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 789 timer_set_base(timer, base);
783 debug_timer_activate(timer); 790 debug_timer_activate(timer);
791 if (time_before(timer->expires, base->next_timer) &&
792 !tbase_get_deferrable(timer->base))
793 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 794 internal_add_timer(base, timer);
785 /* 795 /*
786 * Check whether the other CPU is idle and needs to be 796 * Check whether the other CPU is idle and needs to be
@@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 827 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 828 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 829 detach_timer(timer, 1);
830 if (timer->expires == base->next_timer &&
831 !tbase_get_deferrable(timer->base))
832 base->next_timer = base->timer_jiffies;
820 ret = 1; 833 ret = 1;
821 } 834 }
822 spin_unlock_irqrestore(&base->lock, flags); 835 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 863 ret = 0;
851 if (timer_pending(timer)) { 864 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 865 detach_timer(timer, 1);
866 if (timer->expires == base->next_timer &&
867 !tbase_get_deferrable(timer->base))
868 base->next_timer = base->timer_jiffies;
853 ret = 1; 869 ret = 1;
854 } 870 }
855out: 871out:
@@ -1007,8 +1023,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1023#ifdef CONFIG_NO_HZ
1008/* 1024/*
1009 * Find out when the next timer event is due to happen. This 1025 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1026 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1027 * This function needs to be called with interrupts disabled.
1012 */ 1028 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1029static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1030{
@@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1150 unsigned long expires;
1135 1151
1136 spin_lock(&base->lock); 1152 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1153 if (time_before_eq(base->next_timer, base->timer_jiffies))
1154 base->next_timer = __next_timer_interrupt(base);
1155 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1156 spin_unlock(&base->lock);
1139 1157
1140 if (time_before_eq(expires, now)) 1158 if (time_before_eq(expires, now))
@@ -1156,8 +1174,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1174 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1175 account_process_tick(p, user_tick);
1158 run_local_timers(); 1176 run_local_timers();
1159 if (rcu_pending(cpu)) 1177 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1178 printk_tick();
1162 scheduler_tick(); 1179 scheduler_tick();
1163 run_posix_cpu_timers(p); 1180 run_posix_cpu_timers(p);
@@ -1523,6 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1523 INIT_LIST_HEAD(base->tv1.vec + j); 1540 INIT_LIST_HEAD(base->tv1.vec + j);
1524 1541
1525 base->timer_jiffies = jiffies; 1542 base->timer_jiffies = jiffies;
1543 base->next_timer = base->timer_jiffies;
1526 return 0; 1544 return 0;
1527} 1545}
1528 1546
@@ -1535,6 +1553,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1535 timer = list_first_entry(head, struct timer_list, entry); 1553 timer = list_first_entry(head, struct timer_list, entry);
1536 detach_timer(timer, 0); 1554 detach_timer(timer, 0);
1537 timer_set_base(timer, new_base); 1555 timer_set_base(timer, new_base);
1556 if (time_before(timer->expires, new_base->next_timer) &&
1557 !tbase_get_deferrable(timer->base))
1558 new_base->next_timer = timer->expires;
1538 internal_add_timer(new_base, timer); 1559 internal_add_timer(new_base, timer);
1539 } 1560 }
1540} 1561}
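Most of the timer.c hunks above maintain one invariant: base->next_timer caches the earliest non-deferrable expiry, kept current by the add/remove paths and recomputed by a full wheel scan only once it falls into the past. A rough standalone sketch of that caching idea; time_before_eq is re-defined for the example and the demo_* names are invented:

#include <stdio.h>

#define time_before_eq(a, b)	((long)((a) - (b)) <= 0)

struct demo_base {
	unsigned long timer_jiffies;	/* last jiffy this base processed */
	unsigned long next_timer;	/* cached earliest pending expiry */
};

/* stand-in for __next_timer_interrupt(): pretend a full scan of the
 * timer wheel finds an event 10 jiffies out */
static unsigned long demo_scan_wheel(struct demo_base *base)
{
	return base->timer_jiffies + 10;
}

static unsigned long demo_next_event(struct demo_base *base, unsigned long now)
{
	/* rescan only when the cache is stale; adders keep it up to date */
	if (time_before_eq(base->next_timer, base->timer_jiffies))
		base->next_timer = demo_scan_wheel(base);

	return time_before_eq(base->next_timer, now) ? now : base->next_timer;
}

int main(void)
{
	struct demo_base base = { .timer_jiffies = 1000, .next_timer = 1000 };

	printf("%lu\n", demo_next_event(&base, 1000));	/* scans: 1010 */
	printf("%lu\n", demo_next_event(&base, 1001));	/* cached: 1010 */
	return 0;
}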
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 019f380fd764..e71634604400 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_FTRACE_SYSCALLS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -60,9 +70,14 @@ config EVENT_TRACING
60 bool 70 bool
61 71
62config CONTEXT_SWITCH_TRACER 72config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool 73 bool
65 74
75config RING_BUFFER_ALLOW_SWAP
76 bool
77 help
78 Allow the use of ring_buffer_swap_cpu.
79 Adds a very slight overhead to tracing when enabled.
80
66# All tracer options should select GENERIC_TRACER. For those options that are 81# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING. 82# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
@@ -147,6 +162,7 @@ config IRQSOFF_TRACER
147 select TRACE_IRQFLAGS 162 select TRACE_IRQFLAGS
148 select GENERIC_TRACER 163 select GENERIC_TRACER
149 select TRACER_MAX_TRACE 164 select TRACER_MAX_TRACE
165 select RING_BUFFER_ALLOW_SWAP
150 help 166 help
151 This option measures the time spent in irqs-off critical 167 This option measures the time spent in irqs-off critical
152 sections, with microsecond accuracy. 168 sections, with microsecond accuracy.
@@ -168,6 +184,7 @@ config PREEMPT_TRACER
168 depends on PREEMPT 184 depends on PREEMPT
169 select GENERIC_TRACER 185 select GENERIC_TRACER
170 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP
171 help 188 help
172 This option measures the time spent in preemption off critical 189 This option measures the time spent in preemption off critical
173 sections, with microsecond accuracy. 190 sections, with microsecond accuracy.
@@ -211,7 +228,7 @@ config ENABLE_DEFAULT_TRACERS
211 228
212config FTRACE_SYSCALLS 229config FTRACE_SYSCALLS
213 bool "Trace syscalls" 230 bool "Trace syscalls"
214 depends on HAVE_FTRACE_SYSCALLS 231 depends on HAVE_SYSCALL_TRACEPOINTS
215 select GENERIC_TRACER 232 select GENERIC_TRACER
216 select KALLSYMS 233 select KALLSYMS
217 help 234 help
@@ -462,6 +479,18 @@ config FTRACE_STARTUP_TEST
462 functioning properly. It will do tests on all the configured 479 functioning properly. It will do tests on all the configured
463 tracers of ftrace. 480 tracers of ftrace.
464 481
482config EVENT_TRACE_TEST_SYSCALLS
483 bool "Run selftest on syscall events"
484 depends on FTRACE_STARTUP_TEST
485 help
486 This option will also enable testing every syscall event.
487 It only enables the event and disables it and runs various loads
488 with the event enabled. This adds a bit more time for kernel boot
489 up since it runs this on every system call defined.
490
491 TBD - enable a way to actually call the syscalls as we test their
492 events
493
465config MMIOTRACE 494config MMIOTRACE
466 bool "Memory mapped IO tracing" 495 bool "Memory mapped IO tracing"
467 depends on HAVE_MMIOTRACE_SUPPORT && PCI 496 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7a34cb563fec..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -65,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
65{ 65{
66 struct blk_io_trace *t; 66 struct blk_io_trace *t;
67 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
68 int pc = 0; 69 int pc = 0;
69 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
70 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
71 72
72 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
73 pc = preempt_count(); 75 pc = preempt_count();
74 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
75 sizeof(*t) + len, 77 sizeof(*t) + len,
76 0, pc); 78 0, pc);
77 if (!event) 79 if (!event)
@@ -96,7 +98,7 @@ record_it:
96 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
97 99
98 if (blk_tracer) 100 if (blk_tracer)
99 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
100 } 102 }
101} 103}
102 104
@@ -179,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
179{ 181{
180 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
181 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
182 struct blk_io_trace *t; 185 struct blk_io_trace *t;
183 unsigned long flags = 0; 186 unsigned long flags = 0;
184 unsigned long *sequence; 187 unsigned long *sequence;
@@ -204,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
204 if (blk_tracer) { 207 if (blk_tracer) {
205 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
206 209
210 buffer = blk_tr->buffer;
207 pc = preempt_count(); 211 pc = preempt_count();
208 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
209 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
210 0, pc); 214 0, pc);
211 if (!event) 215 if (!event)
@@ -252,7 +256,7 @@ record_it:
252 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
253 257
254 if (blk_tracer) { 258 if (blk_tracer) {
255 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
256 return; 260 return;
257 } 261 }
258 } 262 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 25edd5cc5935..cc615f84751b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1016,71 +1016,35 @@ static int
1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1017{ 1017{
1018 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1019 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1020 1020
1021 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1022 1022
1023 ip = rec->ip;
1024
1025 /* 1023 /*
1026 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1027 * it is not enabled then do nothing. 1025 * then disable it.
1028 * 1026 *
1029 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1030 * it is enabled then disable it.
1031 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1032 */ 1031 */
1033 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1034 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1035 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1036 else 1035 }
1037 return 0;
1038
1039 } else if (ftrace_filtered && enable) {
1040 /*
1041 * Filtering is on:
1042 */
1043
1044 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1045
1046 /* Record is filtered and enabled, do nothing */
1047 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1048 return 0;
1049
1050 /* Record is not filtered or enabled, do nothing */
1051 if (!fl)
1052 return 0;
1053
1054 /* Record is not filtered but enabled, disable it */
1055 if (fl == FTRACE_FL_ENABLED)
1056 rec->flags &= ~FTRACE_FL_ENABLED;
1057 else
1058 /* Otherwise record is filtered but not enabled, enable it */
1059 rec->flags |= FTRACE_FL_ENABLED;
1060 } else {
1061 /* Disable or not filtered */
1062
1063 if (enable) {
1064 /* if record is enabled, do nothing */
1065 if (rec->flags & FTRACE_FL_ENABLED)
1066 return 0;
1067
1068 rec->flags |= FTRACE_FL_ENABLED;
1069
1070 } else {
1071 1036
1072 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1073 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1074 return 0; 1039 return 0;
1075 1040
1076 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1077 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1078 } 1044 }
1079 1045
1080 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1081 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1082 else
1083 return ftrace_make_nop(NULL, rec, ftrace_addr);
1084} 1048}
1085 1049
1086static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
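The rewritten __ftrace_replace_code() above reduces the old nest of special cases to: compute the ENABLED flag the record should carry, compare it with the flag it does carry, and patch the call site only on a mismatch. A compact userspace sketch of that pattern; the flag values and demo_* names are made up for illustration:

#include <stdio.h>

#define FL_ENABLED	0x1
#define FL_FILTER	0x2
#define FL_NOTRACE	0x4

/* returns +1 when the record must be enabled, -1 when it must be
 * disabled, and 0 when its state is already correct */
static int demo_replace(unsigned int *flags, int enable, int filtered)
{
	unsigned int flag = 0;

	if (enable && !(*flags & FL_NOTRACE) &&
	    (!filtered || (*flags & FL_FILTER)))
		flag = FL_ENABLED;

	if ((*flags & FL_ENABLED) == flag)
		return 0;

	*flags ^= FL_ENABLED;
	return flag ? 1 : -1;
}

int main(void)
{
	unsigned int rec = FL_FILTER;	/* filtered, currently disabled */

	printf("%d\n", demo_replace(&rec, 1, 1));	/* 1: enable it */
	printf("%d\n", demo_replace(&rec, 1, 1));	/* 0: nothing to do */
	printf("%d\n", demo_replace(&rec, 0, 1));	/* -1: disable it */
	return 0;
}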
@@ -1359,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1359 1323
1360enum { 1324enum {
1361 FTRACE_ITER_FILTER = (1 << 0), 1325 FTRACE_ITER_FILTER = (1 << 0),
1362 FTRACE_ITER_CONT = (1 << 1), 1326 FTRACE_ITER_NOTRACE = (1 << 1),
1363 FTRACE_ITER_NOTRACE = (1 << 2), 1327 FTRACE_ITER_FAILURES = (1 << 2),
1364 FTRACE_ITER_FAILURES = (1 << 3), 1328 FTRACE_ITER_PRINTALL = (1 << 3),
1365 FTRACE_ITER_PRINTALL = (1 << 4), 1329 FTRACE_ITER_HASH = (1 << 4),
1366 FTRACE_ITER_HASH = (1 << 5),
1367}; 1330};
1368 1331
1369#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1332#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1373,9 +1336,7 @@ struct ftrace_iterator {
1373 int hidx; 1336 int hidx;
1374 int idx; 1337 int idx;
1375 unsigned flags; 1338 unsigned flags;
1376 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1339 struct trace_parser parser;
1377 unsigned buffer_idx;
1378 unsigned filtered;
1379}; 1340};
1380 1341
1381static void * 1342static void *
@@ -1438,18 +1399,13 @@ static int t_hash_show(struct seq_file *m, void *v)
1438{ 1399{
1439 struct ftrace_func_probe *rec; 1400 struct ftrace_func_probe *rec;
1440 struct hlist_node *hnd = v; 1401 struct hlist_node *hnd = v;
1441 char str[KSYM_SYMBOL_LEN];
1442 1402
1443 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1403 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1444 1404
1445 if (rec->ops->print) 1405 if (rec->ops->print)
1446 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1406 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1447 1407
1448 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1408 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1449 seq_printf(m, "%s:", str);
1450
1451 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1452 seq_printf(m, "%s", str);
1453 1409
1454 if (rec->data) 1410 if (rec->data)
1455 seq_printf(m, ":%p", rec->data); 1411 seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1503,6 @@ static int t_show(struct seq_file *m, void *v)
1547{ 1503{
1548 struct ftrace_iterator *iter = m->private; 1504 struct ftrace_iterator *iter = m->private;
1549 struct dyn_ftrace *rec = v; 1505 struct dyn_ftrace *rec = v;
1550 char str[KSYM_SYMBOL_LEN];
1551 1506
1552 if (iter->flags & FTRACE_ITER_HASH) 1507 if (iter->flags & FTRACE_ITER_HASH)
1553 return t_hash_show(m, v); 1508 return t_hash_show(m, v);
@@ -1560,9 +1515,7 @@ static int t_show(struct seq_file *m, void *v)
1560 if (!rec) 1515 if (!rec)
1561 return 0; 1516 return 0;
1562 1517
1563 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1518 seq_printf(m, "%ps\n", (void *)rec->ip);
1564
1565 seq_printf(m, "%s\n", str);
1566 1519
1567 return 0; 1520 return 0;
1568} 1521}
@@ -1601,17 +1554,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1601 return ret; 1554 return ret;
1602} 1555}
1603 1556
1604int ftrace_avail_release(struct inode *inode, struct file *file)
1605{
1606 struct seq_file *m = (struct seq_file *)file->private_data;
1607 struct ftrace_iterator *iter = m->private;
1608
1609 seq_release(inode, file);
1610 kfree(iter);
1611
1612 return 0;
1613}
1614
1615static int 1557static int
1616ftrace_failures_open(struct inode *inode, struct file *file) 1558ftrace_failures_open(struct inode *inode, struct file *file)
1617{ 1559{
@@ -1660,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1660 if (!iter) 1602 if (!iter)
1661 return -ENOMEM; 1603 return -ENOMEM;
1662 1604
1605 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1606 kfree(iter);
1607 return -ENOMEM;
1608 }
1609
1663 mutex_lock(&ftrace_regex_lock); 1610 mutex_lock(&ftrace_regex_lock);
1664 if ((file->f_mode & FMODE_WRITE) && 1611 if ((file->f_mode & FMODE_WRITE) &&
1665 (file->f_flags & O_TRUNC)) 1612 (file->f_flags & O_TRUNC))
@@ -2115,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2115 int i, len = 0; 2062 int i, len = 0;
2116 char *search; 2063 char *search;
2117 2064
2118 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2065 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2119 glob = NULL; 2066 glob = NULL;
2120 else { 2067 else if (glob) {
2121 int not; 2068 int not;
2122 2069
2123 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2252,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2252 size_t cnt, loff_t *ppos, int enable) 2199 size_t cnt, loff_t *ppos, int enable)
2253{ 2200{
2254 struct ftrace_iterator *iter; 2201 struct ftrace_iterator *iter;
2255 char ch; 2202 struct trace_parser *parser;
2256 size_t read = 0; 2203 ssize_t ret, read;
2257 ssize_t ret;
2258 2204
2259 if (!cnt || cnt < 0) 2205 if (!cnt || cnt < 0)
2260 return 0; 2206 return 0;
@@ -2267,73 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2267 } else 2213 } else
2268 iter = file->private_data; 2214 iter = file->private_data;
2269 2215
2270 if (!*ppos) { 2216 parser = &iter->parser;
2271 iter->flags &= ~FTRACE_ITER_CONT; 2217 read = trace_get_user(parser, ubuf, cnt, ppos);
2272 iter->buffer_idx = 0;
2273 }
2274
2275 ret = get_user(ch, ubuf++);
2276 if (ret)
2277 goto out;
2278 read++;
2279 cnt--;
2280
2281 /*
2282 * If the parser haven't finished with the last write,
2283 * continue reading the user input without skipping spaces.
2284 */
2285 if (!(iter->flags & FTRACE_ITER_CONT)) {
2286 /* skip white space */
2287 while (cnt && isspace(ch)) {
2288 ret = get_user(ch, ubuf++);
2289 if (ret)
2290 goto out;
2291 read++;
2292 cnt--;
2293 }
2294 2218
2295 /* only spaces were written */ 2219 if (trace_parser_loaded(parser) &&
2296 if (isspace(ch)) { 2220 !trace_parser_cont(parser)) {
2297 *ppos += read; 2221 ret = ftrace_process_regex(parser->buffer,
2298 ret = read; 2222 parser->idx, enable);
2299 goto out;
2300 }
2301
2302 iter->buffer_idx = 0;
2303 }
2304
2305 while (cnt && !isspace(ch)) {
2306 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2307 iter->buffer[iter->buffer_idx++] = ch;
2308 else {
2309 ret = -EINVAL;
2310 goto out;
2311 }
2312 ret = get_user(ch, ubuf++);
2313 if (ret) 2223 if (ret)
2314 goto out; 2224 goto out;
2315 read++;
2316 cnt--;
2317 }
2318 2225
2319 if (isspace(ch)) { 2226 trace_parser_clear(parser);
2320 iter->filtered++;
2321 iter->buffer[iter->buffer_idx] = 0;
2322 ret = ftrace_process_regex(iter->buffer,
2323 iter->buffer_idx, enable);
2324 if (ret)
2325 goto out;
2326 iter->buffer_idx = 0;
2327 } else {
2328 iter->flags |= FTRACE_ITER_CONT;
2329 iter->buffer[iter->buffer_idx++] = ch;
2330 } 2227 }
2331 2228
2332 *ppos += read;
2333 ret = read; 2229 ret = read;
2334 out:
2335 mutex_unlock(&ftrace_regex_lock);
2336 2230
2231 mutex_unlock(&ftrace_regex_lock);
2232out:
2337 return ret; 2233 return ret;
2338} 2234}
2339 2235
@@ -2438,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2438{ 2334{
2439 struct seq_file *m = (struct seq_file *)file->private_data; 2335 struct seq_file *m = (struct seq_file *)file->private_data;
2440 struct ftrace_iterator *iter; 2336 struct ftrace_iterator *iter;
2337 struct trace_parser *parser;
2441 2338
2442 mutex_lock(&ftrace_regex_lock); 2339 mutex_lock(&ftrace_regex_lock);
2443 if (file->f_mode & FMODE_READ) { 2340 if (file->f_mode & FMODE_READ) {
@@ -2447,10 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2447 } else 2344 } else
2448 iter = file->private_data; 2345 iter = file->private_data;
2449 2346
2450 if (iter->buffer_idx) { 2347 parser = &iter->parser;
2451 iter->filtered++; 2348 if (trace_parser_loaded(parser)) {
2452 iter->buffer[iter->buffer_idx] = 0; 2349 parser->buffer[parser->idx] = 0;
2453 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2350 ftrace_match_records(parser->buffer, parser->idx, enable);
2454 } 2351 }
2455 2352
2456 mutex_lock(&ftrace_lock); 2353 mutex_lock(&ftrace_lock);
@@ -2458,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2458 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2355 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2459 mutex_unlock(&ftrace_lock); 2356 mutex_unlock(&ftrace_lock);
2460 2357
2358 trace_parser_put(parser);
2461 kfree(iter); 2359 kfree(iter);
2360
2462 mutex_unlock(&ftrace_regex_lock); 2361 mutex_unlock(&ftrace_regex_lock);
2463 return 0; 2362 return 0;
2464} 2363}
@@ -2479,14 +2378,14 @@ static const struct file_operations ftrace_avail_fops = {
2479 .open = ftrace_avail_open, 2378 .open = ftrace_avail_open,
2480 .read = seq_read, 2379 .read = seq_read,
2481 .llseek = seq_lseek, 2380 .llseek = seq_lseek,
2482 .release = ftrace_avail_release, 2381 .release = seq_release_private,
2483}; 2382};
2484 2383
2485static const struct file_operations ftrace_failures_fops = { 2384static const struct file_operations ftrace_failures_fops = {
2486 .open = ftrace_failures_open, 2385 .open = ftrace_failures_open,
2487 .read = seq_read, 2386 .read = seq_read,
2488 .llseek = seq_lseek, 2387 .llseek = seq_lseek,
2489 .release = ftrace_avail_release, 2388 .release = seq_release_private,
2490}; 2389};
2491 2390
2492static const struct file_operations ftrace_filter_fops = { 2391static const struct file_operations ftrace_filter_fops = {
@@ -2548,7 +2447,6 @@ static void g_stop(struct seq_file *m, void *p)
2548static int g_show(struct seq_file *m, void *v) 2447static int g_show(struct seq_file *m, void *v)
2549{ 2448{
2550 unsigned long *ptr = v; 2449 unsigned long *ptr = v;
2551 char str[KSYM_SYMBOL_LEN];
2552 2450
2553 if (!ptr) 2451 if (!ptr)
2554 return 0; 2452 return 0;
@@ -2558,9 +2456,7 @@ static int g_show(struct seq_file *m, void *v)
2558 return 0; 2456 return 0;
2559 } 2457 }
2560 2458
2561 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2459 seq_printf(m, "%ps\n", (void *)*ptr);
2562
2563 seq_printf(m, "%s\n", str);
2564 2460
2565 return 0; 2461 return 0;
2566} 2462}
@@ -2663,12 +2559,10 @@ static ssize_t
2663ftrace_graph_write(struct file *file, const char __user *ubuf, 2559ftrace_graph_write(struct file *file, const char __user *ubuf,
2664 size_t cnt, loff_t *ppos) 2560 size_t cnt, loff_t *ppos)
2665{ 2561{
2666 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2562 struct trace_parser parser;
2667 unsigned long *array; 2563 unsigned long *array;
2668 size_t read = 0; 2564 size_t read = 0;
2669 ssize_t ret; 2565 ssize_t ret;
2670 int index = 0;
2671 char ch;
2672 2566
2673 if (!cnt || cnt < 0) 2567 if (!cnt || cnt < 0)
2674 return 0; 2568 return 0;
@@ -2686,51 +2580,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2686 } else 2580 } else
2687 array = file->private_data; 2581 array = file->private_data;
2688 2582
2689 ret = get_user(ch, ubuf++); 2583 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2690 if (ret) 2584 ret = -ENOMEM;
2691 goto out; 2585 goto out;
2692 read++;
2693 cnt--;
2694
2695 /* skip white space */
2696 while (cnt && isspace(ch)) {
2697 ret = get_user(ch, ubuf++);
2698 if (ret)
2699 goto out;
2700 read++;
2701 cnt--;
2702 } 2586 }
2703 2587
2704 if (isspace(ch)) { 2588 read = trace_get_user(&parser, ubuf, cnt, ppos);
2705 *ppos += read;
2706 ret = read;
2707 goto out;
2708 }
2709 2589
2710 while (cnt && !isspace(ch)) { 2590 if (trace_parser_loaded((&parser))) {
2711 if (index < FTRACE_BUFF_MAX) 2591 parser.buffer[parser.idx] = 0;
2712 buffer[index++] = ch; 2592
2713 else { 2593 /* we allow only one expression at a time */
2714 ret = -EINVAL; 2594 ret = ftrace_set_func(array, &ftrace_graph_count,
2715 goto out; 2595 parser.buffer);
2716 }
2717 ret = get_user(ch, ubuf++);
2718 if (ret) 2596 if (ret)
2719 goto out; 2597 goto out;
2720 read++;
2721 cnt--;
2722 } 2598 }
2723 buffer[index] = 0;
2724
2725 /* we allow only one expression at a time */
2726 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2727 if (ret)
2728 goto out;
2729
2730 file->f_pos += read;
2731 2599
2732 ret = read; 2600 ret = read;
2733 out: 2601 out:
2602 trace_parser_put(&parser);
2734 mutex_unlock(&graph_lock); 2603 mutex_unlock(&graph_lock);
2735 2604
2736 return ret; 2605 return ret;
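Both ftrace_regex_write() and ftrace_graph_write() above now lean on trace_parser, whose essential job is accumulating a token that may arrive split across several write() calls and only handing it over once whitespace terminates it. A small userspace approximation of that behaviour; this is not the kernel API, the demo_* names are invented, and the fixed-size buffer is an assumption:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

struct demo_parser {
	char buffer[64];
	size_t idx;
	int cont;			/* token continues into the next write */
};

static void demo_process(const char *tok) { printf("token: %s\n", tok); }

static void demo_write(struct demo_parser *p, const char *buf)
{
	size_t i = 0, cnt = strlen(buf);

	if (!p->cont)			/* skip leading whitespace of a new token */
		while (i < cnt && isspace((unsigned char)buf[i]))
			i++;

	for (; i < cnt; i++) {
		if (isspace((unsigned char)buf[i])) {
			p->buffer[p->idx] = '\0';
			if (p->idx)
				demo_process(p->buffer);
			p->idx = 0;
			p->cont = 0;	/* token finished inside this write */
		} else if (p->idx < sizeof(p->buffer) - 1) {
			p->buffer[p->idx++] = buf[i];
			p->cont = 1;	/* may still be continued next write */
		}
	}
}

int main(void)
{
	struct demo_parser p = { .idx = 0 };

	demo_write(&p, "do_fork sys_");	/* prints: token: do_fork */
	demo_write(&p, "open \n");	/* prints: token: sys_open */
	return 0;
}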
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu(cpu, cpu_possible_mask) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The two other following provide a more minimalistic output */ 335/* The two other following provide a more minimalistic output */
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
348 /* Node */ 390 /* Node and call site*/
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
404 /* Skip node */ 441 /* Skip node and print call site*/
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (!register_tracer(&kmem_tracer)) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
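The kmemtrace rework above drops the type switch from the print path and instead registers one struct trace_event per entry type, each carrying its own text and binary printers. A toy sketch of that register-and-dispatch pattern; purely illustrative, the demo_* names are invented and the return-0-on-failure convention only loosely mirrors register_ftrace_event():

#include <stdio.h>

struct demo_event {
	int type;
	void (*print)(void);
};

#define DEMO_MAX_EVENTS 8
static struct demo_event *demo_events[DEMO_MAX_EVENTS];

static int demo_register_event(struct demo_event *ev)
{
	if (ev->type <= 0 || ev->type >= DEMO_MAX_EVENTS || demo_events[ev->type])
		return 0;		/* 0 means "could not register" */
	demo_events[ev->type] = ev;
	return ev->type;
}

static void demo_print_alloc(void) { puts("alloc event"); }
static void demo_print_free(void)  { puts("free event"); }

static struct demo_event demo_alloc = { .type = 1, .print = demo_print_alloc };
static struct demo_event demo_free  = { .type = 2, .print = demo_print_free };

int main(void)
{
	if (!demo_register_event(&demo_alloc) || !demo_register_event(&demo_free)) {
		puts("could not register demo events");
		return 1;
	}

	/* the print path dispatches through the registered ops, no switch */
	demo_events[1]->print();
	demo_events[2]->print();
	return 0;
}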
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a330513d96ce..6eef38923b07 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -218,17 +218,12 @@ enum {
218 218
219static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
220{ 220{
221 return event->type_len == RINGBUF_TYPE_PADDING 221 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
222 && event->time_delta == 0;
223}
224
225static inline int rb_discarded_event(struct ring_buffer_event *event)
226{
227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
228} 222}
229 223
230static void rb_event_set_padding(struct ring_buffer_event *event) 224static void rb_event_set_padding(struct ring_buffer_event *event)
231{ 225{
226 /* padding has a NULL time_delta */
232 event->type_len = RINGBUF_TYPE_PADDING; 227 event->type_len = RINGBUF_TYPE_PADDING;
233 event->time_delta = 0; 228 event->time_delta = 0;
234} 229}
@@ -322,6 +317,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 317 unsigned char data[]; /* data of buffer page */
323}; 318};
324 319
320/*
321 * Note, the buffer_page list must be first. The buffer pages
322 * are allocated in cache lines, which means that each buffer
323 * page will be at the beginning of a cache line, and thus
324 * the least significant bits will be zero. We use this to
325 * add flags in the list struct pointers, to make the ring buffer
326 * lockless.
327 */
325struct buffer_page { 328struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 329 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 330 local_t write; /* index for next write */
@@ -330,6 +333,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 333 struct buffer_data_page *page; /* Actual data page */
331}; 334};
332 335
336/*
337 * The buffer page counters, write and entries, must be reset
338 * atomically when crossing page boundaries. To synchronize this
339 * update, two counters are inserted into the number. One is
340 * the actual counter for the write position or count on the page.
341 *
342 * The other is a counter of updaters. Before an update happens
343 * the update partition of the counter is incremented. This will
344 * allow the updater to update the counter atomically.
345 *
346 * The counter is 20 bits, and the state data is 12.
347 */
348#define RB_WRITE_MASK 0xfffff
349#define RB_WRITE_INTCNT (1 << 20)
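A throwaway illustration of the split described in the comment above: the low 20 bits of the word hold the write offset, everything above them counts in-flight updaters, so an interrupting writer can announce itself without disturbing the offset. Plain unsigned long stands in for local_t and the values are arbitrary:

#include <stdio.h>

#define DEMO_WRITE_MASK		0xfffffUL
#define DEMO_WRITE_INTCNT	(1UL << 20)

int main(void)
{
	unsigned long write = 17;	/* current write offset on the page */

	write += DEMO_WRITE_INTCNT;	/* an interrupting writer announces itself */
	write += 3;			/* ...and then reserves 3 more bytes */

	printf("offset=%lu updaters=%lu\n",
	       write & DEMO_WRITE_MASK, write >> 20);	/* offset=20 updaters=1 */
	return 0;
}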
350
333static void rb_init_page(struct buffer_data_page *bpage) 351static void rb_init_page(struct buffer_data_page *bpage)
334{ 352{
335 local_set(&bpage->commit, 0); 353 local_set(&bpage->commit, 0);
@@ -403,21 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 421struct ring_buffer_per_cpu {
404 int cpu; 422 int cpu;
405 struct ring_buffer *buffer; 423 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 424 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 425 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 426 struct lock_class_key lock_key;
409 struct list_head pages; 427 struct list_head *pages;
410 struct buffer_page *head_page; /* read from head */ 428 struct buffer_page *head_page; /* read from head */
411 struct buffer_page *tail_page; /* write to tail */ 429 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 430 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 431 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 432 local_t commit_overrun;
415 unsigned long commit_overrun; 433 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 434 local_t entries;
419 local_t committing; 435 local_t committing;
420 local_t commits; 436 local_t commits;
437 unsigned long read;
421 u64 write_stamp; 438 u64 write_stamp;
422 u64 read_stamp; 439 u64 read_stamp;
423 atomic_t record_disabled; 440 atomic_t record_disabled;
@@ -450,14 +467,19 @@ struct ring_buffer_iter {
450}; 467};
451 468
452/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 469/* buffer may be either ring_buffer or ring_buffer_per_cpu */
453#define RB_WARN_ON(buffer, cond) \ 470#define RB_WARN_ON(b, cond) \
454 ({ \ 471 ({ \
455 int _____ret = unlikely(cond); \ 472 int _____ret = unlikely(cond); \
456 if (_____ret) { \ 473 if (_____ret) { \
457 atomic_inc(&buffer->record_disabled); \ 474 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
458 WARN_ON(1); \ 475 struct ring_buffer_per_cpu *__b = \
459 } \ 476 (void *)b; \
460 _____ret; \ 477 atomic_inc(&__b->buffer->record_disabled); \
478 } else \
479 atomic_inc(&b->record_disabled); \
480 WARN_ON(1); \
481 } \
482 _____ret; \
461 }) 483 })
462 484
463/* Up this if you want to test the TIME_EXTENTS and normalization */ 485/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -489,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 511}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 513
514/*
515 * Making the ring buffer lockless makes things tricky.
 516 * Writes only happen on the CPU that they are on, and they
 517 * only need to worry about interrupts. Reads, however, can
 518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
 578 * You can see that it is legitimate for the previous pointer of
 579 * the head (or any page) not to point back to itself, but only
 580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
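rb_list_head() above is the masking half of the scheme from the big comment: buffer pages are allocated cache-line aligned, so the two low bits of a list pointer are guaranteed zero and can carry the HEAD/UPDATE flags instead. A self-contained userspace sketch of that tagging trick, with invented demo_* names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_FLAG_MASK	3UL
#define DEMO_HEAD	1UL

struct demo_node { struct demo_node *next; long payload; };

static struct demo_node *demo_ptr(struct demo_node *tagged)
{
	return (struct demo_node *)((uintptr_t)tagged & ~DEMO_FLAG_MASK);
}

static unsigned long demo_flags(struct demo_node *tagged)
{
	return (uintptr_t)tagged & DEMO_FLAG_MASK;
}

int main(void)
{
	struct demo_node b = { NULL, 2 };
	struct demo_node a = { NULL, 1 };

	/* link a -> b and mark b as the head page via the low bit */
	a.next = (struct demo_node *)((uintptr_t)&b | DEMO_HEAD);

	printf("next payload: %ld, flags: %lu\n",
	       demo_ptr(a.next)->payload, demo_flags(a.next));
	return 0;
}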
602
603/*
 604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
 611static inline int
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
 628 * The unique thing about the reader page is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
 677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = cmpxchg((unsigned long *)&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
798
799 return ret == val;
800}
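rb_head_page_replace() above only installs the reader's page if the tagged prev->next word still holds exactly the value it expects (the old head pointer with the HEAD bit set); a concurrent writer that moved the head makes the cmpxchg fail. A toy of just that step, using the GCC __sync builtin in place of the kernel's cmpxchg(), with invented demo names:

#include <stdint.h>
#include <stdio.h>

#define DEMO_HEAD 1UL

int main(void)
{
	long pages[2];
	/* prev->next currently points at pages[0] with the HEAD bit set */
	uintptr_t next = (uintptr_t)&pages[0] | DEMO_HEAD;
	uintptr_t expect = (uintptr_t)&pages[0] | DEMO_HEAD;
	uintptr_t newval = (uintptr_t)&pages[1];

	/* succeeds only if no writer moved the head underneath us */
	uintptr_t old = __sync_val_compare_and_swap(&next, expect, newval);

	printf("swap %s\n", old == expect ? "succeeded" : "lost the race");
	return 0;
}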
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. In which case, we
847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
858 * No need to worry about races with clearing out the commit.
 859 * It can only increment when a commit takes place. But that
 860 * only happens in the outermost nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
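
A rough sketch of the split-counter idea used by rb_tail_page_update(): the low bits of ->write hold the byte index into the page and the high bits count tail-move attempts, so a value sampled before an interrupt moved the tail can never win a later cmpxchg. The mask width and WRITE_* values below are illustrative, not the kernel's definitions.

#include <stdio.h>

#define WRITE_MASK   ((1UL << 20) - 1)   /* low 20 bits: write index      */
#define WRITE_INTCNT (1UL << 20)         /* everything above: counter     */

int main(void)
{
    unsigned long write = 123;                    /* index 123, counter 0  */

    write += WRITE_INTCNT;                        /* tail move bumps it    */
    printf("index=%lu counter=%lu\n",
           write & WRITE_MASK, write >> 20);      /* index=123 counter=1   */

    /* Zeroing only the index keeps the counter, which is what the
     * local_cmpxchg() calls above do when they reset the next page: */
    write &= ~WRITE_MASK;
    printf("index=%lu counter=%lu\n",
           write & WRITE_MASK, write >> 20);      /* index=0 counter=1     */
    return 0;
}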
873
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
886 * rb_check_list - make sure a pointer to a list has the last bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
492/** 898/**
493 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
498 */ 904 */
499static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
500{ 906{
501 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
503 909
910 rb_head_page_deactivate(cpu_buffer);
911
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 913 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 915 return -1;
508 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
509 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
515 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
516 } 929 }
517 930
931 rb_head_page_activate(cpu_buffer);
932
518 return 0; 933 return 0;
519} 934}
520 935
521static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
522 unsigned nr_pages) 937 unsigned nr_pages)
523{ 938{
524 struct list_head *head = &cpu_buffer->pages;
525 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
526 unsigned long addr; 940 unsigned long addr;
527 LIST_HEAD(pages); 941 LIST_HEAD(pages);
528 unsigned i; 942 unsigned i;
529 943
944 WARN_ON(!nr_pages);
945
530 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
531 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
532 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
533 if (!bpage) 949 if (!bpage)
534 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
535 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
536 955
537 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
541 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
542 } 961 }
543 962
544 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
545 970
546 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
547 972
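The splice above leaves a ring made only of real pages; cpu_buffer->pages then points at an arbitrary page rather than at a dedicated list head, exactly as the new comment says. A minimal userspace sketch of that construction (hand-rolled nodes instead of list.h, names invented for the example):

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next, *prev; int id; };

/* like list_add_tail(): insert n just before pos */
static void insert_before(struct node *pos, struct node *n)
{
    n->prev = pos->prev;
    n->next = pos;
    pos->prev->next = n;
    pos->prev = n;
}

/* like list_del(): drop n out of the ring */
static void unlink_node(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

int main(void)
{
    struct node head = { &head, &head, -1 };      /* temporary list head  */
    struct node *pages, *p;

    for (int i = 0; i < 4; i++) {
        struct node *n = malloc(sizeof(*n));
        n->id = i;
        insert_before(&head, n);
    }

    pages = head.next;                            /* first real page      */
    unlink_node(&head);                           /* ring now has no head */

    p = pages;
    do {
        printf("page %d\n", p->id);
        p = p->next;
    } while (p != pages);
    return 0;
}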
@@ -573,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
573 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
576 INIT_LIST_HEAD(&cpu_buffer->pages);
577 1001
578 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
579 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
580 if (!bpage) 1004 if (!bpage)
581 goto fail_free_buffer; 1005 goto fail_free_buffer;
582 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
583 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
584 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
585 if (!addr) 1011 if (!addr)
@@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
594 goto fail_free_reader; 1020 goto fail_free_reader;
595 1021
596 cpu_buffer->head_page 1022 cpu_buffer->head_page
597 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
598 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
599 1025
1026 rb_head_page_activate(cpu_buffer);
1027
600 return cpu_buffer; 1028 return cpu_buffer;
601 1029
602 fail_free_reader: 1030 fail_free_reader:
@@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
609 1037
610static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
611{ 1039{
612 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
613 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
614 1042
615 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
616 1044
617 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
618 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
619 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
620 } 1054 }
1055
621 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
622} 1057}
623 1058
@@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
760 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
761 synchronize_sched(); 1196 synchronize_sched();
762 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
763 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
764 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
765 return; 1202 return;
766 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
767 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
768 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
769 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
770 } 1207 }
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1209 return;
773 1210
774 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
790 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
791 synchronize_sched(); 1228 synchronize_sched();
792 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
793 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
794 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
795 return; 1235 return;
796 p = pages->next; 1236 p = pages->next;
797 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
798 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
799 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
800 } 1240 }
801 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
802 1243
803 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
804 1245
@@ -949,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
949} 1390}
950 1391
951static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
952rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
953{
954 return __rb_page_index(cpu_buffer->head_page,
955 cpu_buffer->head_page->read);
956}
957
958static inline struct ring_buffer_event *
959rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
960{ 1394{
961 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
962} 1396}
963 1397
964static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
965{ 1399{
966 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
967} 1401}
968 1402
969static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
971 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
972} 1406}
973 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
974/* Size is determined by what has been commited */ 1413/* Size is determined by what has been commited */
975static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
976{ 1415{
@@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
983 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
984} 1423}
985 1424
986static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
987{
988 return rb_page_commit(cpu_buffer->head_page);
989}
990
991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
992 struct buffer_page **bpage)
993{
994 struct list_head *p = (*bpage)->list.next;
995
996 if (p == &cpu_buffer->pages)
997 p = p->next;
998
999 *bpage = list_entry(p, struct buffer_page, list);
1000}
1001
1002static inline unsigned 1425static inline unsigned
1003rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1004{ 1427{
@@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1024static void 1447static void
1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1026{ 1449{
1450 unsigned long max_count;
1451
1027 /* 1452 /*
1028 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1029 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1033 * assign the commit to the tail. 1458 * assign the commit to the tail.
1034 */ 1459 */
1035 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1036 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1037 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1038 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1039 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1040 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1041 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1044 } 1476 }
1045 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1046 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1047 cpu_buffer->commit_page->page->commit = 1479
1048 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1049 barrier(); 1485 barrier();
1050 } 1486 }
1051 1487
@@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1078 * to the head page instead of next. 1514 * to the head page instead of next.
1079 */ 1515 */
1080 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1081 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1082 else 1518 else
1083 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1084 1520
@@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1122 } 1558 }
1123} 1559}
1124 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
1618 * This is an interrupt that interrupt the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before.
1652 * otherwise we are an interrupt, and only
1653 * want the outer most commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
 1691 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
1701 /*
1702 * If this was the outer most commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
1717
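The switch above consumes the result of the rb_head_page_set_*() helpers. Below is a compact sketch of that kind of operation in the same tagged-pointer model as the earlier example (plain C11 atomics again; the PAGE_* values, set_state() and the fake page address are illustrative): flip only the two flag bits from an expected old state to a new one and report what was actually found, so the caller can tell "we did it", "we nested inside another move", "already done" and "the reader moved the page" apart.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK   3UL
#define PAGE_NORMAL 0UL
#define PAGE_HEAD   1UL
#define PAGE_UPDATE 2UL
#define PAGE_MOVED  4UL   /* "the pointer itself changed" result */

/* Flip only the flag bits of a tagged pointer from old_state to new_state,
 * provided it still points at 'target'; report what was actually found. */
static unsigned long set_state(_Atomic uintptr_t *word, uintptr_t target,
                               unsigned long old_state, unsigned long new_state)
{
    uintptr_t expect = target | old_state;
    uintptr_t want   = target | new_state;

    if (atomic_compare_exchange_strong(word, &expect, want))
        return old_state;                 /* we performed the transition */
    if ((expect & ~FLAG_MASK) != target)
        return PAGE_MOVED;                /* a reader swapped the page   */
    return expect & FLAG_MASK;            /* someone else's state        */
}

int main(void)
{
    uintptr_t head_page = 0x1000;         /* fake, suitably aligned      */
    _Atomic uintptr_t next;

    atomic_init(&next, head_page | PAGE_HEAD);

    /* A writer reaches the head page and tries HEAD -> UPDATE: */
    switch (set_state(&next, head_page, PAGE_HEAD, PAGE_UPDATE)) {
    case PAGE_HEAD:   printf("we own the update\n");            break;
    case PAGE_UPDATE: printf("nested inside another update\n"); break;
    case PAGE_NORMAL: printf("an interrupt already moved it\n"); break;
    case PAGE_MOVED:  printf("reader moved the page, retry\n"); break;
    }
    return 0;
}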
1125static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1126{ 1719{
1127 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1185,9 +1778,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1185 event->type_len = RINGBUF_TYPE_PADDING; 1778 event->type_len = RINGBUF_TYPE_PADDING;
1186 /* time delta must be non zero */ 1779 /* time delta must be non zero */
1187 event->time_delta = 1; 1780 event->time_delta = 1;
1188 /* Account for this as an entry */
1189 local_inc(&tail_page->entries);
1190 local_inc(&cpu_buffer->entries);
1191 1781
1192 /* Set write to end of buffer */ 1782 /* Set write to end of buffer */
1193 length = (tail + length) - BUF_PAGE_SIZE; 1783 length = (tail + length) - BUF_PAGE_SIZE;
@@ -1200,96 +1790,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1200 struct buffer_page *commit_page, 1790 struct buffer_page *commit_page,
1201 struct buffer_page *tail_page, u64 *ts) 1791 struct buffer_page *tail_page, u64 *ts)
1202{ 1792{
1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer; 1793 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false; 1794 struct buffer_page *next_page;
1206 unsigned long flags; 1795 int ret;
1207 1796
1208 next_page = tail_page; 1797 next_page = tail_page;
1209 1798
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1231 goto out_reset;
1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1235
1236 lock_taken = true;
1237
1238 rb_inc_page(cpu_buffer, &next_page); 1799 rb_inc_page(cpu_buffer, &next_page);
1239 1800
1240 head_page = cpu_buffer->head_page;
1241 reader_page = cpu_buffer->reader_page;
1242
1243 /* we grabbed the lock before incrementing */
1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1245 goto out_reset;
1246
1247 /* 1801 /*
1248 * If for some reason, we had an interrupt storm that made 1802 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn 1803 * it all the way around the buffer, bail, and warn
1250 * about it. 1804 * about it.
1251 */ 1805 */
1252 if (unlikely(next_page == commit_page)) { 1806 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++; 1807 local_inc(&cpu_buffer->commit_overrun);
1254 goto out_reset; 1808 goto out_reset;
1255 } 1809 }
1256 1810
1257 if (next_page == head_page) { 1811 /*
1258 if (!(buffer->flags & RB_FL_OVERWRITE)) 1812 * This is where the fun begins!
1259 goto out_reset; 1813 *
1260 1814 * We are fighting against races between a reader that
1261 /* tail_page has not moved yet? */ 1815 * could be on another CPU trying to swap its reader
1262 if (tail_page == cpu_buffer->tail_page) { 1816 * page with the buffer head.
1263 /* count overflows */ 1817 *
1264 cpu_buffer->overrun += 1818 * We are also fighting against interrupts coming in and
1265 local_read(&head_page->entries); 1819 * moving the head or tail on us as well.
1820 *
1821 * If the next page is the head page then we have filled
1822 * the buffer, unless the commit page is still on the
1823 * reader page.
1824 */
1825 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1266 1826
1267 rb_inc_page(cpu_buffer, &head_page); 1827 /*
1268 cpu_buffer->head_page = head_page; 1828 * If the commit is not on the reader page, then
1269 cpu_buffer->head_page->read = 0; 1829 * move the header page.
1830 */
1831 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832 /*
1833 * If we are not in overwrite mode,
1834 * this is easy, just stop here.
1835 */
1836 if (!(buffer->flags & RB_FL_OVERWRITE))
1837 goto out_reset;
1838
1839 ret = rb_handle_head_page(cpu_buffer,
1840 tail_page,
1841 next_page);
1842 if (ret < 0)
1843 goto out_reset;
1844 if (ret)
1845 goto out_again;
1846 } else {
1847 /*
1848 * We need to be careful here too. The
1849 * commit page could still be on the reader
1850 * page. We could have a small buffer, and
1851 * have filled up the buffer with events
1852 * from interrupts and such, and wrapped.
1853 *
 1854 * Note, if the tail page is also on the
1855 * reader_page, we let it move out.
1856 */
1857 if (unlikely((cpu_buffer->commit_page !=
1858 cpu_buffer->tail_page) &&
1859 (cpu_buffer->commit_page ==
1860 cpu_buffer->reader_page))) {
1861 local_inc(&cpu_buffer->commit_overrun);
1862 goto out_reset;
1863 }
1270 } 1864 }
1271 } 1865 }
1272 1866
1273 /* 1867 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1274 * If the tail page is still the same as what we think 1868 if (ret) {
1275 * it is, then it is up to us to update the tail 1869 /*
1276 * pointer. 1870 * Nested commits always have zero deltas, so
1277 */ 1871 * just reread the time stamp
1278 if (tail_page == cpu_buffer->tail_page) { 1872 */
1279 local_set(&next_page->write, 0);
1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts; 1874 next_page->page->time_stamp = *ts;
1287 } 1875 }
1288 1876
1289 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1877 out_again:
1290 1878
1291 __raw_spin_unlock(&cpu_buffer->lock); 1879 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1292 local_irq_restore(flags);
1293 1880
1294 /* fail and let the caller try again */ 1881 /* fail and let the caller try again */
1295 return ERR_PTR(-EAGAIN); 1882 return ERR_PTR(-EAGAIN);
@@ -1298,9 +1885,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1298 /* reset write */ 1885 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1886 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1300 1887
1301 if (likely(lock_taken))
1302 __raw_spin_unlock(&cpu_buffer->lock);
1303 local_irq_restore(flags);
1304 return NULL; 1888 return NULL;
1305} 1889}
1306 1890
@@ -1317,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1317 barrier(); 1901 barrier();
1318 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904
1905 /* set write to only the index of the write */
1906 write &= RB_WRITE_MASK;
1320 tail = write - length; 1907 tail = write - length;
1321 1908
1322 /* See if we shot pass the end of this buffer page */ 1909 /* See if we shot pass the end of this buffer page */
@@ -1361,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1361 bpage = cpu_buffer->tail_page; 1948 bpage = cpu_buffer->tail_page;
1362 1949
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1950 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951 unsigned long write_mask =
1952 local_read(&bpage->write) & ~RB_WRITE_MASK;
1364 /* 1953 /*
1365 * This is on the tail page. It is possible that 1954 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page 1955 * a write could come in and move the tail page
1367 * and write to the next page. That is fine 1956 * and write to the next page. That is fine
1368 * because we just shorten what is on this page. 1957 * because we just shorten what is on this page.
1369 */ 1958 */
1959 old_index += write_mask;
1960 new_index += write_mask;
1370 index = local_cmpxchg(&bpage->write, old_index, new_index); 1961 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index) 1962 if (index == old_index)
1372 return 1; 1963 return 1;
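
A sketch of the rollback rb_try_to_discard() performs, using ordinary C11 atomics and an illustrative 20-bit index mask: because the live ->write word also carries the interrupt counter in its high bits, both the expected and the new index have to be re-tagged with that counter, otherwise the compare can never match.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define WRITE_MASK ((1UL << 20) - 1)   /* low bits: index into the page */

static bool try_discard(_Atomic unsigned long *write,
                        unsigned long end_index, unsigned long start_index)
{
    unsigned long cur     = atomic_load(write);
    unsigned long counter = cur & ~WRITE_MASK;      /* interrupt counter  */
    unsigned long expect  = counter + end_index;    /* value after write  */
    unsigned long want    = counter + start_index;  /* value rolled back  */

    return atomic_compare_exchange_strong(write, &expect, want);
}

int main(void)
{
    _Atomic unsigned long write;

    atomic_init(&write, (3UL << 20) + 200);  /* counter 3, index 200      */

    /* the event to discard occupies bytes 160..200 of the page */
    printf("discard %s\n", try_discard(&write, 200, 160) ? "ok" : "lost race");
    printf("index now %lu\n", atomic_load(&write) & WRITE_MASK);  /* 160  */
    return 0;
}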
@@ -1482,7 +2073,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1482} 2073}
1483 2074
1484static struct ring_buffer_event * 2075static struct ring_buffer_event *
1485rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2076rb_reserve_next_event(struct ring_buffer *buffer,
2077 struct ring_buffer_per_cpu *cpu_buffer,
1486 unsigned long length) 2078 unsigned long length)
1487{ 2079{
1488 struct ring_buffer_event *event; 2080 struct ring_buffer_event *event;
@@ -1492,6 +2084,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1492 2084
1493 rb_start_commit(cpu_buffer); 2085 rb_start_commit(cpu_buffer);
1494 2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088 /*
2089 * Due to the ability to swap a cpu buffer from a buffer
2090 * it is possible it was swapped before we committed.
2091 * (committing stops a swap). We check for it here and
2092 * if it happened, we have to fail the write.
2093 */
2094 barrier();
2095 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096 local_dec(&cpu_buffer->committing);
2097 local_dec(&cpu_buffer->commits);
2098 return NULL;
2099 }
2100#endif
2101
1495 length = rb_calculate_event_length(length); 2102 length = rb_calculate_event_length(length);
1496 again: 2103 again:
1497 /* 2104 /*
@@ -1652,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1652 if (length > BUF_MAX_DATA_SIZE) 2259 if (length > BUF_MAX_DATA_SIZE)
1653 goto out; 2260 goto out;
1654 2261
1655 event = rb_reserve_next_event(cpu_buffer, length); 2262 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1656 if (!event) 2263 if (!event)
1657 goto out; 2264 goto out;
1658 2265
@@ -1675,18 +2282,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1675} 2282}
1676EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1677 2284
1678static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1679 struct ring_buffer_event *event) 2287 struct ring_buffer_event *event)
1680{ 2288{
1681 local_inc(&cpu_buffer->entries);
1682
1683 /* 2289 /*
1684 * The event first in the commit queue updates the 2290 * The event first in the commit queue updates the
1685 * time stamp. 2291 * time stamp.
1686 */ 2292 */
1687 if (rb_event_is_commit(cpu_buffer, event)) 2293 if (rb_event_is_commit(cpu_buffer, event))
1688 cpu_buffer->write_stamp += event->time_delta; 2294 cpu_buffer->write_stamp += event->time_delta;
2295}
1689 2296
2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2298 struct ring_buffer_event *event)
2299{
2300 local_inc(&cpu_buffer->entries);
2301 rb_update_write_stamp(cpu_buffer, event);
1690 rb_end_commit(cpu_buffer); 2302 rb_end_commit(cpu_buffer);
1691} 2303}
1692 2304
@@ -1733,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1733 event->time_delta = 1; 2345 event->time_delta = 1;
1734} 2346}
1735 2347
1736/** 2348/*
1737 * ring_buffer_event_discard - discard any event in the ring buffer 2349 * Decrement the entries to the page that an event is on.
1738 * @event: the event to discard 2350 * The event does not even need to exist, only the pointer
1739 * 2351 * to the page it is on. This may only be called before the commit
1740 * Sometimes a event that is in the ring buffer needs to be ignored. 2352 * takes place.
1741 * This function lets the user discard an event in the ring buffer
1742 * and then that event will not be read later.
1743 *
1744 * Note, it is up to the user to be careful with this, and protect
1745 * against races. If the user discards an event that has been consumed
1746 * it is possible that it could corrupt the ring buffer.
1747 */ 2353 */
1748void ring_buffer_event_discard(struct ring_buffer_event *event) 2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356 struct ring_buffer_event *event)
1749{ 2357{
1750 rb_event_discard(event); 2358 unsigned long addr = (unsigned long)event;
2359 struct buffer_page *bpage = cpu_buffer->commit_page;
2360 struct buffer_page *start;
2361
2362 addr &= PAGE_MASK;
2363
2364 /* Do the likely case first */
2365 if (likely(bpage->page == (void *)addr)) {
2366 local_dec(&bpage->entries);
2367 return;
2368 }
2369
2370 /*
2371 * Because the commit page may be on the reader page we
2372 * start with the next page and check the end loop there.
2373 */
2374 rb_inc_page(cpu_buffer, &bpage);
2375 start = bpage;
2376 do {
2377 if (bpage->page == (void *)addr) {
2378 local_dec(&bpage->entries);
2379 return;
2380 }
2381 rb_inc_page(cpu_buffer, &bpage);
2382 } while (bpage != start);
2383
2384 /* commit not part of this buffer?? */
2385 RB_WARN_ON(cpu_buffer, 1);
1751} 2386}
1752EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1753 2387
1754/** 2388/**
1755 * ring_buffer_commit_discard - discard an event that has not been committed 2389 * ring_buffer_commit_discard - discard an event that has not been committed
1756 * @buffer: the ring buffer 2390 * @buffer: the ring buffer
1757 * @event: non committed event to discard 2391 * @event: non committed event to discard
1758 * 2392 *
1759 * This is similar to ring_buffer_event_discard but must only be 2393 * Sometimes an event that is in the ring buffer needs to be ignored.
1760 * performed on an event that has not been committed yet. The difference 2394 * This function lets the user discard an event in the ring buffer
1761 * is that this will also try to free the event from the ring buffer 2395 * and then that event will not be read later.
2396 *
 2397 * This function only works if it is called before the item has been
2398 * committed. It will try to free the event from the ring buffer
1762 * if another event has not been added behind it. 2399 * if another event has not been added behind it.
1763 * 2400 *
1764 * If another event has been added behind it, it will set the event 2401 * If another event has been added behind it, it will set the event
@@ -1786,14 +2423,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1786 */ 2423 */
1787 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 2424 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1788 2425
2426 rb_decrement_entry(cpu_buffer, event);
1789 if (rb_try_to_discard(cpu_buffer, event)) 2427 if (rb_try_to_discard(cpu_buffer, event))
1790 goto out; 2428 goto out;
1791 2429
1792 /* 2430 /*
1793 * The commit is still visible by the reader, so we 2431 * The commit is still visible by the reader, so we
1794 * must increment entries. 2432 * must still update the timestamp.
1795 */ 2433 */
1796 local_inc(&cpu_buffer->entries); 2434 rb_update_write_stamp(cpu_buffer, event);
1797 out: 2435 out:
1798 rb_end_commit(cpu_buffer); 2436 rb_end_commit(cpu_buffer);
1799 2437
@@ -1854,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1854 if (length > BUF_MAX_DATA_SIZE) 2492 if (length > BUF_MAX_DATA_SIZE)
1855 goto out; 2493 goto out;
1856 2494
1857 event = rb_reserve_next_event(cpu_buffer, length); 2495 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1858 if (!event) 2496 if (!event)
1859 goto out; 2497 goto out;
1860 2498
@@ -1875,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1875static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1876{ 2514{
1877 struct buffer_page *reader = cpu_buffer->reader_page; 2515 struct buffer_page *reader = cpu_buffer->reader_page;
1878 struct buffer_page *head = cpu_buffer->head_page; 2516 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1879 struct buffer_page *commit = cpu_buffer->commit_page; 2517 struct buffer_page *commit = cpu_buffer->commit_page;
1880 2518
2519 /* In case of error, head will be NULL */
2520 if (unlikely(!head))
2521 return 1;
2522
1881 return reader->read == rb_page_commit(reader) && 2523 return reader->read == rb_page_commit(reader) &&
1882 (commit == reader || 2524 (commit == reader ||
1883 (commit == head && 2525 (commit == head &&
@@ -1968,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1968 return 0; 2610 return 0;
1969 2611
1970 cpu_buffer = buffer->buffers[cpu]; 2612 cpu_buffer = buffer->buffers[cpu];
1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2613 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1972 - cpu_buffer->read; 2614 - cpu_buffer->read;
1973 2615
1974 return ret; 2616 return ret;
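
A small worked example of the bookkeeping behind ring_buffer_entries_cpu() (numbers invented; the assumed meaning is that 'entries' counts events committed, 'overrun' counts events lost to overwrite, and 'read' counts events already consumed):

#include <stdio.h>

int main(void)
{
    unsigned long entries = 1500;  /* events committed on this CPU         */
    unsigned long overrun = 300;   /* events overwritten before being read */
    unsigned long read    = 200;   /* events already consumed by a reader  */

    printf("readable now: %lu\n", (entries - overrun) - read);   /* 1000  */
    return 0;
}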
@@ -1989,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1989 return 0; 2631 return 0;
1990 2632
1991 cpu_buffer = buffer->buffers[cpu]; 2633 cpu_buffer = buffer->buffers[cpu];
1992 ret = cpu_buffer->overrun; 2634 ret = local_read(&cpu_buffer->overrun);
1993 2635
1994 return ret; 2636 return ret;
1995} 2637}
1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1997 2639
1998/** 2640/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
2001 * @cpu: The per CPU buffer to get the number of overruns from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer 2642 * @buffer: The ring buffer
2021 * @cpu: The per CPU buffer to get the number of overruns from 2643 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2030,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2030 return 0; 2652 return 0;
2031 2653
2032 cpu_buffer = buffer->buffers[cpu]; 2654 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun; 2655 ret = local_read(&cpu_buffer->commit_overrun);
2034 2656
2035 return ret; 2657 return ret;
2036} 2658}
@@ -2053,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2053 for_each_buffer_cpu(buffer, cpu) { 2675 for_each_buffer_cpu(buffer, cpu) {
2054 cpu_buffer = buffer->buffers[cpu]; 2676 cpu_buffer = buffer->buffers[cpu];
2055 entries += (local_read(&cpu_buffer->entries) - 2677 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read; 2678 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2057 } 2679 }
2058 2680
2059 return entries; 2681 return entries;
@@ -2076,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2076 /* if you care about this being correct, lock the buffer */ 2698 /* if you care about this being correct, lock the buffer */
2077 for_each_buffer_cpu(buffer, cpu) { 2699 for_each_buffer_cpu(buffer, cpu) {
2078 cpu_buffer = buffer->buffers[cpu]; 2700 cpu_buffer = buffer->buffers[cpu];
2079 overruns += cpu_buffer->overrun; 2701 overruns += local_read(&cpu_buffer->overrun);
2080 } 2702 }
2081 2703
2082 return overruns; 2704 return overruns;
@@ -2089,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2089 2711
2090 /* Iterator usage is expected to have record disabled */ 2712 /* Iterator usage is expected to have record disabled */
2091 if (list_empty(&cpu_buffer->reader_page->list)) { 2713 if (list_empty(&cpu_buffer->reader_page->list)) {
2092 iter->head_page = cpu_buffer->head_page; 2714 iter->head_page = rb_set_head_page(cpu_buffer);
2093 iter->head = cpu_buffer->head_page->read; 2715 if (unlikely(!iter->head_page))
2716 return;
2717 iter->head = iter->head_page->read;
2094 } else { 2718 } else {
2095 iter->head_page = cpu_buffer->reader_page; 2719 iter->head_page = cpu_buffer->reader_page;
2096 iter->head = cpu_buffer->reader_page->read; 2720 iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 struct buffer_page *reader = NULL; 2831 struct buffer_page *reader = NULL;
2208 unsigned long flags; 2832 unsigned long flags;
2209 int nr_loops = 0; 2833 int nr_loops = 0;
2834 int ret;
2210 2835
2211 local_irq_save(flags); 2836 local_irq_save(flags);
2212 __raw_spin_lock(&cpu_buffer->lock); 2837 __raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2240 goto out; 2865 goto out;
2241 2866
2242 /* 2867 /*
2243 * Splice the empty reader page into the list around the head.
2244 * Reset the reader page to size zero. 2868 * Reset the reader page to size zero.
2245 */ 2869 */
2870 local_set(&cpu_buffer->reader_page->write, 0);
2871 local_set(&cpu_buffer->reader_page->entries, 0);
2872 local_set(&cpu_buffer->reader_page->page->commit, 0);
2246 2873
2247 reader = cpu_buffer->head_page; 2874 spin:
2875 /*
2876 * Splice the empty reader page into the list around the head.
2877 */
2878 reader = rb_set_head_page(cpu_buffer);
2248 cpu_buffer->reader_page->list.next = reader->list.next; 2879 cpu_buffer->reader_page->list.next = reader->list.next;
2249 cpu_buffer->reader_page->list.prev = reader->list.prev; 2880 cpu_buffer->reader_page->list.prev = reader->list.prev;
2250 2881
2251 local_set(&cpu_buffer->reader_page->write, 0); 2882 /*
2252 local_set(&cpu_buffer->reader_page->entries, 0); 2883 * cpu_buffer->pages just needs to point to the buffer, it
2253 local_set(&cpu_buffer->reader_page->page->commit, 0); 2884 * has no specific buffer page to point to. Lets move it out
2885 * of our way so we don't accidently swap it.
2886 */
2887 cpu_buffer->pages = reader->list.prev;
2254 2888
2255 /* Make the reader page now replace the head */ 2889 /* The reader page will be pointing to the new head */
2256 reader->list.prev->next = &cpu_buffer->reader_page->list; 2890 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2257 reader->list.next->prev = &cpu_buffer->reader_page->list;
2258 2891
2259 /* 2892 /*
2260 * If the tail is on the reader, then we must set the head 2893 * Here's the tricky part.
2261 * to the inserted page, otherwise we set it one before. 2894 *
2895 * We need to move the pointer past the header page.
2896 * But we can only do that if a writer is not currently
2897 * moving it. The page before the header page has the
2898 * flag bit '1' set if it is pointing to the page we want.
 2899 * But if the writer is in the process of moving it
 2900 * then it will be '2' or already moved '0'.
2262 */ 2901 */
2263 cpu_buffer->head_page = cpu_buffer->reader_page;
2264 2902
2265 if (cpu_buffer->commit_page != reader) 2903 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2266 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2904
2905 /*
2906 * If we did not convert it, then we must try again.
2907 */
2908 if (!ret)
2909 goto spin;
2910
2911 /*
2912 * Yeah! We succeeded in replacing the page.
2913 *
2914 * Now make the new head point back to the reader page.
2915 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2267 2918
2268 /* Finally update the reader page to the new head */ 2919 /* Finally update the reader page to the new head */
2269 cpu_buffer->reader_page = reader; 2920 cpu_buffer->reader_page = reader;
@@ -2292,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2292 2943
2293 event = rb_reader_event(cpu_buffer); 2944 event = rb_reader_event(cpu_buffer);
2294 2945
2295 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2946 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2296 || rb_discarded_event(event))
2297 cpu_buffer->read++; 2947 cpu_buffer->read++;
2298 2948
2299 rb_update_read_stamp(cpu_buffer, event); 2949 rb_update_read_stamp(cpu_buffer, event);
@@ -2347,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2347} 2997}
2348 2998
2349static struct ring_buffer_event * 2999static struct ring_buffer_event *
2350rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3000rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
2351{ 3001{
2352 struct ring_buffer_per_cpu *cpu_buffer;
2353 struct ring_buffer_event *event; 3002 struct ring_buffer_event *event;
2354 struct buffer_page *reader; 3003 struct buffer_page *reader;
2355 int nr_loops = 0; 3004 int nr_loops = 0;
2356 3005
2357 cpu_buffer = buffer->buffers[cpu];
2358
2359 again: 3006 again:
2360 /* 3007 /*
2361 * We repeat when a timestamp is encountered. It is possible 3008 * We repeat when a timestamp is encountered. It is possible
@@ -2399,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2399 case RINGBUF_TYPE_DATA: 3046 case RINGBUF_TYPE_DATA:
2400 if (ts) { 3047 if (ts) {
2401 *ts = cpu_buffer->read_stamp + event->time_delta; 3048 *ts = cpu_buffer->read_stamp + event->time_delta;
2402 ring_buffer_normalize_time_stamp(buffer, 3049 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
2403 cpu_buffer->cpu, ts); 3050 cpu_buffer->cpu, ts);
2404 } 3051 }
2405 return event; 3052 return event;
@@ -2518,17 +3165,15 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2518 local_irq_save(flags); 3165 local_irq_save(flags);
2519 if (dolock) 3166 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock); 3167 spin_lock(&cpu_buffer->reader_lock);
2521 event = rb_buffer_peek(buffer, cpu, ts); 3168 event = rb_buffer_peek(cpu_buffer, ts);
2522 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3169 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2523 rb_advance_reader(cpu_buffer); 3170 rb_advance_reader(cpu_buffer);
2524 if (dolock) 3171 if (dolock)
2525 spin_unlock(&cpu_buffer->reader_lock); 3172 spin_unlock(&cpu_buffer->reader_lock);
2526 local_irq_restore(flags); 3173 local_irq_restore(flags);
2527 3174
2528 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3175 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2529 cpu_relax();
2530 goto again; 3176 goto again;
2531 }
2532 3177
2533 return event; 3178 return event;
2534} 3179}
@@ -2553,10 +3198,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2553 event = rb_iter_peek(iter, ts); 3198 event = rb_iter_peek(iter, ts);
2554 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3199 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2555 3200
2556 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3201 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2557 cpu_relax();
2558 goto again; 3202 goto again;
2559 }
2560 3203
2561 return event; 3204 return event;
2562} 3205}
@@ -2591,7 +3234,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2591 if (dolock) 3234 if (dolock)
2592 spin_lock(&cpu_buffer->reader_lock); 3235 spin_lock(&cpu_buffer->reader_lock);
2593 3236
2594 event = rb_buffer_peek(buffer, cpu, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts);
2595 if (event) 3238 if (event)
2596 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
2597 3240
@@ -2602,10 +3245,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2602 out: 3245 out:
2603 preempt_enable(); 3246 preempt_enable();
2604 3247
2605 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3248 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2606 cpu_relax();
2607 goto again; 3249 goto again;
2608 }
2609 3250
2610 return event; 3251 return event;
2611} 3252}
@@ -2685,21 +3326,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2685 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3326 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2686 unsigned long flags; 3327 unsigned long flags;
2687 3328
2688 again:
2689 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3329 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3330 again:
2690 event = rb_iter_peek(iter, ts); 3331 event = rb_iter_peek(iter, ts);
2691 if (!event) 3332 if (!event)
2692 goto out; 3333 goto out;
2693 3334
3335 if (event->type_len == RINGBUF_TYPE_PADDING)
3336 goto again;
3337
2694 rb_advance_iter(iter); 3338 rb_advance_iter(iter);
2695 out: 3339 out:
2696 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3340 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2697 3341
2698 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2699 cpu_relax();
2700 goto again;
2701 }
2702
2703 return event; 3342 return event;
2704} 3343}
2705EXPORT_SYMBOL_GPL(ring_buffer_read); 3344EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2717,8 +3356,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2717static void 3356static void
2718rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3357rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2719{ 3358{
3359 rb_head_page_deactivate(cpu_buffer);
3360
2720 cpu_buffer->head_page 3361 cpu_buffer->head_page
2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3362 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2722 local_set(&cpu_buffer->head_page->write, 0); 3363 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0); 3364 local_set(&cpu_buffer->head_page->entries, 0);
2724 local_set(&cpu_buffer->head_page->page->commit, 0); 3365 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3375,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2734 local_set(&cpu_buffer->reader_page->page->commit, 0); 3375 local_set(&cpu_buffer->reader_page->page->commit, 0);
2735 cpu_buffer->reader_page->read = 0; 3376 cpu_buffer->reader_page->read = 0;
2736 3377
2737 cpu_buffer->nmi_dropped = 0; 3378 local_set(&cpu_buffer->commit_overrun, 0);
2738 cpu_buffer->commit_overrun = 0; 3379 local_set(&cpu_buffer->overrun, 0);
2739 cpu_buffer->overrun = 0;
2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0); 3380 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0); 3381 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0); 3382 local_set(&cpu_buffer->commits, 0);
3383 cpu_buffer->read = 0;
2744 3384
2745 cpu_buffer->write_stamp = 0; 3385 cpu_buffer->write_stamp = 0;
2746 cpu_buffer->read_stamp = 0; 3386 cpu_buffer->read_stamp = 0;
3387
3388 rb_head_page_activate(cpu_buffer);
2747} 3389}
2748 3390
2749/** 3391/**
@@ -2763,12 +3405,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2763 3405
2764 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3406 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2765 3407
3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409 goto out;
3410
2766 __raw_spin_lock(&cpu_buffer->lock); 3411 __raw_spin_lock(&cpu_buffer->lock);
2767 3412
2768 rb_reset_cpu(cpu_buffer); 3413 rb_reset_cpu(cpu_buffer);
2769 3414
2770 __raw_spin_unlock(&cpu_buffer->lock); 3415 __raw_spin_unlock(&cpu_buffer->lock);
2771 3416
3417 out:
2772 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2773 3419
2774 atomic_dec(&cpu_buffer->record_disabled); 3420 atomic_dec(&cpu_buffer->record_disabled);
@@ -2851,6 +3497,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2851} 3497}
2852EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3498EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2853 3499
3500#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2854/** 3501/**
2855 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3502 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2856 * @buffer_a: One buffer to swap with 3503 * @buffer_a: One buffer to swap with
@@ -2905,20 +3552,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2905 atomic_inc(&cpu_buffer_a->record_disabled); 3552 atomic_inc(&cpu_buffer_a->record_disabled);
2906 atomic_inc(&cpu_buffer_b->record_disabled); 3553 atomic_inc(&cpu_buffer_b->record_disabled);
2907 3554
3555 ret = -EBUSY;
3556 if (local_read(&cpu_buffer_a->committing))
3557 goto out_dec;
3558 if (local_read(&cpu_buffer_b->committing))
3559 goto out_dec;
3560
2908 buffer_a->buffers[cpu] = cpu_buffer_b; 3561 buffer_a->buffers[cpu] = cpu_buffer_b;
2909 buffer_b->buffers[cpu] = cpu_buffer_a; 3562 buffer_b->buffers[cpu] = cpu_buffer_a;
2910 3563
2911 cpu_buffer_b->buffer = buffer_a; 3564 cpu_buffer_b->buffer = buffer_a;
2912 cpu_buffer_a->buffer = buffer_b; 3565 cpu_buffer_a->buffer = buffer_b;
2913 3566
3567 ret = 0;
3568
3569out_dec:
2914 atomic_dec(&cpu_buffer_a->record_disabled); 3570 atomic_dec(&cpu_buffer_a->record_disabled);
2915 atomic_dec(&cpu_buffer_b->record_disabled); 3571 atomic_dec(&cpu_buffer_b->record_disabled);
2916
2917 ret = 0;
2918out: 3572out:
2919 return ret; 3573 return ret;
2920} 3574}
2921EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3575EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3576#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2922 3577
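Sketch of the swap/commit interlock added here, reduced to its two checks (plain C11 atomics; the struct and field names are invented for the illustration): ring_buffer_swap_cpu() refuses to swap while either side has a commit in flight, and rb_reserve_next_event() backs out of a commit whose per-cpu buffer no longer belongs to the buffer it was called with.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct top;
struct cpu_buf { _Atomic int committing; struct top *_Atomic owner; };
struct top     { struct cpu_buf *cpu; };

/* ring_buffer_swap_cpu() side: refuse while a commit is in flight */
static bool swap_cpu(struct top *a, struct top *b)
{
    struct cpu_buf *ca = a->cpu, *cb = b->cpu;

    if (atomic_load(&ca->committing) || atomic_load(&cb->committing))
        return false;                       /* -EBUSY in the real code    */

    a->cpu = cb;
    b->cpu = ca;
    atomic_store(&ca->owner, b);
    atomic_store(&cb->owner, a);
    return true;
}

/* rb_reserve_next_event() side: back out if the swap already happened */
static bool start_commit(struct top *buffer, struct cpu_buf *c)
{
    atomic_fetch_add(&c->committing, 1);
    if (atomic_load(&c->owner) != buffer) {
        atomic_fetch_sub(&c->committing, 1);
        return false;
    }
    return true;
}

int main(void)
{
    struct cpu_buf c0, c1;
    struct top live = { &c0 }, max = { &c1 };

    atomic_init(&c0.committing, 0); atomic_init(&c0.owner, &live);
    atomic_init(&c1.committing, 0); atomic_init(&c1.owner, &max);

    printf("swap: %s\n", swap_cpu(&live, &max) ? "ok" : "busy");

    /* a stale commit that still holds c0 but was issued against 'live'
     * now notices the ownership change and bails: */
    printf("stale commit: %s\n", start_commit(&live, &c0) ? "ok" : "rejected");
    return 0;
}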
2923/** 3578/**
2924 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3579 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3091,7 +3746,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3091 read = 0; 3746 read = 0;
3092 } else { 3747 } else {
3093 /* update the entry counter */ 3748 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries); 3749 cpu_buffer->read += rb_page_entries(reader);
3095 3750
3096 /* swap the pages */ 3751 /* swap the pages */
3097 rb_init_page(bpage); 3752 rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8c358395d338..fd52a19dd172 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,14 +43,11 @@
43 43
44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
45 45
46unsigned long __read_mostly tracing_max_latency;
47unsigned long __read_mostly tracing_thresh;
48
49/* 46/*
50 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
51 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
52 */ 49 */
53static int ring_buffer_expanded; 50int ring_buffer_expanded;
54 51
55/* 52/*
56 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -64,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
64/* 61/*
65 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
66 */ 63 */
67static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
68 65
69/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
70static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
89 */ 86 */
90static int tracing_disabled = 1; 87static int tracing_disabled = 1;
91 88
92static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
93 90
94static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
95{ 92{
@@ -172,10 +169,11 @@ static struct trace_array global_trace;
172 169
173static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
174 171
175int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
176 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
177{ 175{
178 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
179} 177}
180EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
181 179
@@ -266,6 +264,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
266 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 264 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
267 TRACE_ITER_GRAPH_TIME; 265 TRACE_ITER_GRAPH_TIME;
268 266
267static int trace_stop_count;
268static DEFINE_SPINLOCK(tracing_start_lock);
269
269/** 270/**
270 * trace_wake_up - wake up tasks waiting for trace input 271 * trace_wake_up - wake up tasks waiting for trace input
271 * 272 *
@@ -323,49 +324,125 @@ static const char *trace_options[] = {
323 "printk-msg-only", 324 "printk-msg-only",
324 "context-info", 325 "context-info",
325 "latency-format", 326 "latency-format",
326 "global-clock",
327 "sleep-time", 327 "sleep-time",
328 "graph-time", 328 "graph-time",
329 NULL 329 NULL
330}; 330};
331 331
332static struct {
333 u64 (*func)(void);
334 const char *name;
335} trace_clocks[] = {
336 { trace_clock_local, "local" },
337 { trace_clock_global, "global" },
338};
339
340int trace_clock_id;
341
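The trace_clocks table above pairs a clock name with the function that produces timestamps, and trace_clock_id records which entry is currently in use. A standalone sketch of that kind of name-to-function lookup (placeholder clock functions and return values, not the kernel's):

#include <stdio.h>
#include <string.h>

typedef unsigned long long u64;

static u64 clock_local(void)  { return 1000; }   /* placeholder values */
static u64 clock_global(void) { return 2000; }

static struct {
    u64 (*func)(void);
    const char *name;
} clocks[] = {
    { clock_local,  "local"  },
    { clock_global, "global" },
};

int main(void)
{
    const char *want = "global";

    for (unsigned i = 0; i < sizeof(clocks) / sizeof(clocks[0]); i++) {
        if (strcmp(clocks[i].name, want) == 0) {
            printf("clock %u (%s) -> %llu\n", i, want, clocks[i].func());
            break;
        }
    }
    return 0;
}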
332/* 342/*
333 * ftrace_max_lock is used to protect the swapping of buffers 343 * trace_parser_get_init - gets the buffer for trace parser
334 * when taking a max snapshot. The buffers themselves are
335 * protected by per_cpu spinlocks. But the action of the swap
336 * needs its own lock.
337 *
338 * This is defined as a raw_spinlock_t in order to help
339 * with performance when lockdep debugging is enabled.
340 */ 344 */
341static raw_spinlock_t ftrace_max_lock = 345int trace_parser_get_init(struct trace_parser *parser, int size)
342 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 346{
347 memset(parser, 0, sizeof(*parser));
348
349 parser->buffer = kmalloc(size, GFP_KERNEL);
350 if (!parser->buffer)
351 return 1;
352
353 parser->size = size;
354 return 0;
355}
343 356
344/* 357/*
345 * Copy the new maximum trace into the separate maximum-trace 358 * trace_parser_put - frees the buffer for trace parser
346 * structure. (this way the maximum trace is permanently saved,
347 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
348 */ 359 */
349static void 360void trace_parser_put(struct trace_parser *parser)
350__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
351{ 361{
352 struct trace_array_cpu *data = tr->data[cpu]; 362 kfree(parser->buffer);
363}
353 364
354 max_tr.cpu = cpu; 365/*
355 max_tr.time_start = data->preempt_timestamp; 366 * trace_get_user - reads the user input string separated by space
367 * (matched by isspace(ch))
368 *
369 * For each string found the 'struct trace_parser' is updated,
370 * and the function returns.
371 *
372 * Returns number of bytes read.
373 *
374 * See kernel/trace/trace.h for 'struct trace_parser' details.
375 */
376int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
377 size_t cnt, loff_t *ppos)
378{
379 char ch;
380 size_t read = 0;
381 ssize_t ret;
356 382
357 data = max_tr.data[cpu]; 383 if (!*ppos)
358 data->saved_latency = tracing_max_latency; 384 trace_parser_clear(parser);
359 385
360 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 386 ret = get_user(ch, ubuf++);
361 data->pid = tsk->pid; 387 if (ret)
362 data->uid = task_uid(tsk); 388 goto out;
363 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
364 data->policy = tsk->policy;
365 data->rt_priority = tsk->rt_priority;
366 389
367 /* record this tasks comm */ 390 read++;
368 tracing_record_cmdline(tsk); 391 cnt--;
392
393 /*
394 * The parser is not finished with the last write,
395 * continue reading the user input without skipping spaces.
396 */
397 if (!parser->cont) {
398 /* skip white space */
399 while (cnt && isspace(ch)) {
400 ret = get_user(ch, ubuf++);
401 if (ret)
402 goto out;
403 read++;
404 cnt--;
405 }
406
407 /* only spaces were written */
408 if (isspace(ch)) {
409 *ppos += read;
410 ret = read;
411 goto out;
412 }
413
414 parser->idx = 0;
415 }
416
417 /* read the non-space input */
418 while (cnt && !isspace(ch)) {
419 if (parser->idx < parser->size)
420 parser->buffer[parser->idx++] = ch;
421 else {
422 ret = -EINVAL;
423 goto out;
424 }
425 ret = get_user(ch, ubuf++);
426 if (ret)
427 goto out;
428 read++;
429 cnt--;
430 }
431
432 /* We either got finished input or we have to wait for another call. */
433 if (isspace(ch)) {
434 parser->buffer[parser->idx] = 0;
435 parser->cont = false;
436 } else {
437 parser->cont = true;
438 parser->buffer[parser->idx++] = ch;
439 }
440
441 *ppos += read;
442 ret = read;
443
444out:
445 return ret;
369} 446}
370 447
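[editor's sketch] The new trace_get_user() pulls one whitespace-delimited token per call out of the user buffer and uses parser->cont to remember when a token was split across two writes. The following is a minimal userspace sketch of that same state machine; the struct name tok_parser and the helper tok_feed() are invented for the example, and a plain char array stands in for get_user().

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

struct tok_parser {
	char	buf[64];	/* collected token */
	size_t	idx;		/* bytes collected so far */
	bool	cont;		/* token continues in the next chunk */
};

/* Feed one chunk; returns bytes consumed, or -1 if the token overflows. */
static int tok_feed(struct tok_parser *p, const char *in, size_t cnt)
{
	size_t read = 0;

	/* Unless resuming a split token, skip leading whitespace. */
	if (!p->cont) {
		while (read < cnt && isspace((unsigned char)in[read]))
			read++;
		if (read == cnt)
			return (int)read;	/* only spaces in this chunk */
		p->idx = 0;
	}

	/* Copy the non-space run into the parser buffer. */
	while (read < cnt && !isspace((unsigned char)in[read])) {
		if (p->idx >= sizeof(p->buf) - 1)
			return -1;
		p->buf[p->idx++] = in[read++];
	}

	if (read < cnt) {		/* hit whitespace: token complete */
		p->buf[p->idx] = '\0';
		p->cont = false;
		printf("token: %s\n", p->buf);
	} else {			/* chunk ended mid-token */
		p->cont = true;
	}
	return (int)read;
}

int main(void)
{
	struct tok_parser p = { .cont = false };

	tok_feed(&p, "  set_ftr", 9);		/* partial token, cont stays set */
	tok_feed(&p, "ace_filter ", 11);	/* completes "set_ftrace_filter" */
	return 0;
}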
371ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 448ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
@@ -411,6 +488,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
411 return cnt; 488 return cnt;
412} 489}
413 490
491/*
492 * ftrace_max_lock is used to protect the swapping of buffers
493 * when taking a max snapshot. The buffers themselves are
494 * protected by per_cpu spinlocks. But the action of the swap
495 * needs its own lock.
496 *
497 * This is defined as a raw_spinlock_t in order to help
498 * with performance when lockdep debugging is enabled.
499 *
500 * It is also used in other places outside the update_max_tr
501 * so it needs to be defined outside of the
502 * CONFIG_TRACER_MAX_TRACE.
503 */
504static raw_spinlock_t ftrace_max_lock =
505 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
506
507#ifdef CONFIG_TRACER_MAX_TRACE
508unsigned long __read_mostly tracing_max_latency;
509unsigned long __read_mostly tracing_thresh;
510
511/*
512 * Copy the new maximum trace into the separate maximum-trace
513 * structure. (this way the maximum trace is permanently saved,
514 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
515 */
516static void
517__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
518{
519 struct trace_array_cpu *data = tr->data[cpu];
520 struct trace_array_cpu *max_data = tr->data[cpu];
521
522 max_tr.cpu = cpu;
523 max_tr.time_start = data->preempt_timestamp;
524
525 max_data = max_tr.data[cpu];
526 max_data->saved_latency = tracing_max_latency;
527 max_data->critical_start = data->critical_start;
528 max_data->critical_end = data->critical_end;
529
530 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
531 max_data->pid = tsk->pid;
532 max_data->uid = task_uid(tsk);
533 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
534 max_data->policy = tsk->policy;
535 max_data->rt_priority = tsk->rt_priority;
536
537 /* record this tasks comm */
538 tracing_record_cmdline(tsk);
539}
540
414/** 541/**
415 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 542 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
416 * @tr: tracer 543 * @tr: tracer
@@ -425,16 +552,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
425{ 552{
426 struct ring_buffer *buf = tr->buffer; 553 struct ring_buffer *buf = tr->buffer;
427 554
555 if (trace_stop_count)
556 return;
557
428 WARN_ON_ONCE(!irqs_disabled()); 558 WARN_ON_ONCE(!irqs_disabled());
429 __raw_spin_lock(&ftrace_max_lock); 559 __raw_spin_lock(&ftrace_max_lock);
430 560
431 tr->buffer = max_tr.buffer; 561 tr->buffer = max_tr.buffer;
432 max_tr.buffer = buf; 562 max_tr.buffer = buf;
433 563
434 ftrace_disable_cpu();
435 ring_buffer_reset(tr->buffer);
436 ftrace_enable_cpu();
437
438 __update_max_tr(tr, tsk, cpu); 564 __update_max_tr(tr, tsk, cpu);
439 __raw_spin_unlock(&ftrace_max_lock); 565 __raw_spin_unlock(&ftrace_max_lock);
440} 566}
@@ -452,21 +578,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
452{ 578{
453 int ret; 579 int ret;
454 580
581 if (trace_stop_count)
582 return;
583
455 WARN_ON_ONCE(!irqs_disabled()); 584 WARN_ON_ONCE(!irqs_disabled());
456 __raw_spin_lock(&ftrace_max_lock); 585 __raw_spin_lock(&ftrace_max_lock);
457 586
458 ftrace_disable_cpu(); 587 ftrace_disable_cpu();
459 588
460 ring_buffer_reset(max_tr.buffer);
461 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 589 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
462 590
591 if (ret == -EBUSY) {
592 /*
593 * We failed to swap the buffer due to a commit taking
594 * place on this CPU. We fail to record, but we reset
595 * the max trace buffer (no one writes directly to it)
596 * and flag that it failed.
597 */
598 trace_array_printk(&max_tr, _THIS_IP_,
599 "Failed to swap buffers due to commit in progress\n");
600 }
601
463 ftrace_enable_cpu(); 602 ftrace_enable_cpu();
464 603
465 WARN_ON_ONCE(ret && ret != -EAGAIN); 604 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
466 605
467 __update_max_tr(tr, tsk, cpu); 606 __update_max_tr(tr, tsk, cpu);
468 __raw_spin_unlock(&ftrace_max_lock); 607 __raw_spin_unlock(&ftrace_max_lock);
469} 608}
609#endif /* CONFIG_TRACER_MAX_TRACE */
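[editor's sketch] With this change update_max_tr() preserves the worst-case trace by swapping buffer pointers under ftrace_max_lock rather than resetting and re-filling, and update_max_tr_single() tolerates -EBUSY when a commit is still in flight on the target CPU. A self-contained userspace illustration of the pointer-swap idea follows; the names snapshot, take_snapshot() and the pthread spinlock are stand-ins chosen for the sketch, not kernel APIs.

#include <pthread.h>
#include <stdio.h>

struct snapshot {
	int	*live;		/* buffer currently being written */
	int	*max;		/* preserved copy of the worst case */
	pthread_spinlock_t lock;
};

static int buf_a[4], buf_b[4];

/* Swap live and max by exchanging pointers; no data is copied. */
static void take_snapshot(struct snapshot *s)
{
	int *tmp;

	pthread_spin_lock(&s->lock);
	tmp = s->live;
	s->live = s->max;
	s->max = tmp;
	pthread_spin_unlock(&s->lock);
}

int main(void)
{
	struct snapshot s = { .live = buf_a, .max = buf_b };

	pthread_spin_init(&s.lock, PTHREAD_PROCESS_PRIVATE);
	s.live[0] = 42;			/* pretend this run was the slowest */
	take_snapshot(&s);
	printf("max buffer now holds %d\n", s.max[0]);
	pthread_spin_destroy(&s.lock);
	return 0;
}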
470 610
471/** 611/**
472 * register_tracer - register a tracer with the ftrace system. 612 * register_tracer - register a tracer with the ftrace system.
@@ -523,7 +663,6 @@ __acquires(kernel_lock)
523 if (type->selftest && !tracing_selftest_disabled) { 663 if (type->selftest && !tracing_selftest_disabled) {
524 struct tracer *saved_tracer = current_trace; 664 struct tracer *saved_tracer = current_trace;
525 struct trace_array *tr = &global_trace; 665 struct trace_array *tr = &global_trace;
526 int i;
527 666
528 /* 667 /*
529 * Run a selftest on this tracer. 668 * Run a selftest on this tracer.
@@ -532,8 +671,7 @@ __acquires(kernel_lock)
532 * internal tracing to verify that everything is in order. 671 * internal tracing to verify that everything is in order.
533 * If we fail, we do not register this tracer. 672 * If we fail, we do not register this tracer.
534 */ 673 */
535 for_each_tracing_cpu(i) 674 tracing_reset_online_cpus(tr);
536 tracing_reset(tr, i);
537 675
538 current_trace = type; 676 current_trace = type;
539 /* the test is responsible for initializing and enabling */ 677 /* the test is responsible for initializing and enabling */
@@ -546,8 +684,7 @@ __acquires(kernel_lock)
546 goto out; 684 goto out;
547 } 685 }
548 /* Only reset on passing, to avoid touching corrupted buffers */ 686 /* Only reset on passing, to avoid touching corrupted buffers */
549 for_each_tracing_cpu(i) 687 tracing_reset_online_cpus(tr);
550 tracing_reset(tr, i);
551 688
552 printk(KERN_CONT "PASSED\n"); 689 printk(KERN_CONT "PASSED\n");
553 } 690 }
@@ -622,21 +759,42 @@ void unregister_tracer(struct tracer *type)
622 mutex_unlock(&trace_types_lock); 759 mutex_unlock(&trace_types_lock);
623} 760}
624 761
625void tracing_reset(struct trace_array *tr, int cpu) 762static void __tracing_reset(struct trace_array *tr, int cpu)
626{ 763{
627 ftrace_disable_cpu(); 764 ftrace_disable_cpu();
628 ring_buffer_reset_cpu(tr->buffer, cpu); 765 ring_buffer_reset_cpu(tr->buffer, cpu);
629 ftrace_enable_cpu(); 766 ftrace_enable_cpu();
630} 767}
631 768
769void tracing_reset(struct trace_array *tr, int cpu)
770{
771 struct ring_buffer *buffer = tr->buffer;
772
773 ring_buffer_record_disable(buffer);
774
775 /* Make sure all commits have finished */
776 synchronize_sched();
777 __tracing_reset(tr, cpu);
778
779 ring_buffer_record_enable(buffer);
780}
781
632void tracing_reset_online_cpus(struct trace_array *tr) 782void tracing_reset_online_cpus(struct trace_array *tr)
633{ 783{
784 struct ring_buffer *buffer = tr->buffer;
634 int cpu; 785 int cpu;
635 786
787 ring_buffer_record_disable(buffer);
788
789 /* Make sure all commits have finished */
790 synchronize_sched();
791
636 tr->time_start = ftrace_now(tr->cpu); 792 tr->time_start = ftrace_now(tr->cpu);
637 793
638 for_each_online_cpu(cpu) 794 for_each_online_cpu(cpu)
639 tracing_reset(tr, cpu); 795 __tracing_reset(tr, cpu);
796
797 ring_buffer_record_enable(buffer);
640} 798}
641 799
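[editor's sketch] tracing_reset() now quiesces the buffer before wiping it: recording is disabled, synchronize_sched() waits out any commit that had already started, and only then is the per-cpu buffer reset and recording re-enabled. A rough single-threaded userspace analogue of that disable/drain/reset ordering is below; it uses a plain writer counter instead of RCU and is only meant to show the sequence, not to be a concurrency-correct buffer.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool record_on = true;
static atomic_int  writers_in_flight;
static int buffer[8], buffer_len;

static void writer_append(int v)
{
	if (!atomic_load(&record_on))
		return;			/* recording disabled, drop the event */
	atomic_fetch_add(&writers_in_flight, 1);
	buffer[buffer_len++] = v;	/* the "commit" */
	atomic_fetch_sub(&writers_in_flight, 1);
}

static void reset_buffer(void)
{
	atomic_store(&record_on, false);	/* ring_buffer_record_disable() */
	while (atomic_load(&writers_in_flight))
		;				/* stand-in for synchronize_sched() */
	buffer_len = 0;				/* __tracing_reset() */
	atomic_store(&record_on, true);		/* ring_buffer_record_enable() */
}

int main(void)
{
	writer_append(1);
	writer_append(2);
	reset_buffer();
	printf("entries after reset: %d\n", buffer_len);
	return 0;
}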
642void tracing_reset_current(int cpu) 800void tracing_reset_current(int cpu)
@@ -667,8 +825,10 @@ static void trace_init_cmdlines(void)
667 cmdline_idx = 0; 825 cmdline_idx = 0;
668} 826}
669 827
670static int trace_stop_count; 828int is_tracing_stopped(void)
671static DEFINE_SPINLOCK(tracing_start_lock); 829{
830 return trace_stop_count;
831}
672 832
673/** 833/**
674 * ftrace_off_permanent - disable all ftrace code permanently 834 * ftrace_off_permanent - disable all ftrace code permanently
@@ -837,7 +997,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
837 997
838 entry->preempt_count = pc & 0xff; 998 entry->preempt_count = pc & 0xff;
839 entry->pid = (tsk) ? tsk->pid : 0; 999 entry->pid = (tsk) ? tsk->pid : 0;
840 entry->tgid = (tsk) ? tsk->tgid : 0; 1000 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
841 entry->flags = 1001 entry->flags =
842#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1002#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
843 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1003 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -850,14 +1010,15 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
850} 1010}
851EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1011EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
852 1012
853struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 1013struct ring_buffer_event *
854 int type, 1014trace_buffer_lock_reserve(struct ring_buffer *buffer,
855 unsigned long len, 1015 int type,
856 unsigned long flags, int pc) 1016 unsigned long len,
1017 unsigned long flags, int pc)
857{ 1018{
858 struct ring_buffer_event *event; 1019 struct ring_buffer_event *event;
859 1020
860 event = ring_buffer_lock_reserve(tr->buffer, len); 1021 event = ring_buffer_lock_reserve(buffer, len);
861 if (event != NULL) { 1022 if (event != NULL) {
862 struct trace_entry *ent = ring_buffer_event_data(event); 1023 struct trace_entry *ent = ring_buffer_event_data(event);
863 1024
@@ -867,58 +1028,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
867 1028
868 return event; 1029 return event;
869} 1030}
870static void ftrace_trace_stack(struct trace_array *tr,
871 unsigned long flags, int skip, int pc);
872static void ftrace_trace_userstack(struct trace_array *tr,
873 unsigned long flags, int pc);
874 1031
875static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 1032static inline void
876 struct ring_buffer_event *event, 1033__trace_buffer_unlock_commit(struct ring_buffer *buffer,
877 unsigned long flags, int pc, 1034 struct ring_buffer_event *event,
878 int wake) 1035 unsigned long flags, int pc,
1036 int wake)
879{ 1037{
880 ring_buffer_unlock_commit(tr->buffer, event); 1038 ring_buffer_unlock_commit(buffer, event);
881 1039
882 ftrace_trace_stack(tr, flags, 6, pc); 1040 ftrace_trace_stack(buffer, flags, 6, pc);
883 ftrace_trace_userstack(tr, flags, pc); 1041 ftrace_trace_userstack(buffer, flags, pc);
884 1042
885 if (wake) 1043 if (wake)
886 trace_wake_up(); 1044 trace_wake_up();
887} 1045}
888 1046
889void trace_buffer_unlock_commit(struct trace_array *tr, 1047void trace_buffer_unlock_commit(struct ring_buffer *buffer,
890 struct ring_buffer_event *event, 1048 struct ring_buffer_event *event,
891 unsigned long flags, int pc) 1049 unsigned long flags, int pc)
892{ 1050{
893 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 1051 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
894} 1052}
895 1053
896struct ring_buffer_event * 1054struct ring_buffer_event *
897trace_current_buffer_lock_reserve(int type, unsigned long len, 1055trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1056 int type, unsigned long len,
898 unsigned long flags, int pc) 1057 unsigned long flags, int pc)
899{ 1058{
900 return trace_buffer_lock_reserve(&global_trace, 1059 *current_rb = global_trace.buffer;
1060 return trace_buffer_lock_reserve(*current_rb,
901 type, len, flags, pc); 1061 type, len, flags, pc);
902} 1062}
903EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 1063EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
904 1064
905void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 1065void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1066 struct ring_buffer_event *event,
906 unsigned long flags, int pc) 1067 unsigned long flags, int pc)
907{ 1068{
908 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 1069 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
909} 1070}
910EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1071EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
911 1072
912void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 1073void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
913 unsigned long flags, int pc) 1074 struct ring_buffer_event *event,
1075 unsigned long flags, int pc)
914{ 1076{
915 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 1077 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
916} 1078}
917EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1079EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
918 1080
919void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 1081void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1082 struct ring_buffer_event *event)
920{ 1083{
921 ring_buffer_discard_commit(global_trace.buffer, event); 1084 ring_buffer_discard_commit(buffer, event);
922} 1085}
923EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 1086EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
924 1087
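[editor's sketch] After this hunk every reserve/commit helper takes the ring buffer explicitly instead of reaching through global_trace, so a caller's pattern becomes: look up the buffer once, reserve an event, fill it, then commit unless a filter discards it (trace_function() below is the in-tree example). The sketch uses a toy, made-up buffer API (rb_reserve()/rb_commit()/rb_discard()) purely to show that calling convention, since the real ring_buffer_* functions only exist inside the kernel.

#include <stdio.h>
#include <string.h>

/* Toy stand-ins for ring_buffer_lock_reserve()/unlock_commit()/discard. */
struct rb_event { char data[64]; size_t len; int committed; };
static struct rb_event slot;

static struct rb_event *rb_reserve(size_t len)
{
	if (len > sizeof(slot.data))
		return NULL;		/* reservation failed */
	slot.len = len;
	slot.committed = 0;
	return &slot;
}

static void rb_commit(struct rb_event *ev)  { ev->committed = 1; }
static void rb_discard(struct rb_event *ev) { ev->committed = 0; }

/* Caller-side pattern: reserve, fill, then commit unless filtered out. */
static void trace_one(const char *msg, int filtered)
{
	struct rb_event *ev = rb_reserve(strlen(msg) + 1);

	if (!ev)
		return;			/* buffer full: silently drop */
	memcpy(ev->data, msg, strlen(msg) + 1);
	if (filtered)
		rb_discard(ev);		/* filter said no, throw the event away */
	else
		rb_commit(ev);
}

int main(void)
{
	trace_one("sched_switch prev=1 next=2", 0);
	printf("committed: %d, payload: %s\n", slot.committed, slot.data);
	return 0;
}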
@@ -928,6 +1091,7 @@ trace_function(struct trace_array *tr,
928 int pc) 1091 int pc)
929{ 1092{
930 struct ftrace_event_call *call = &event_function; 1093 struct ftrace_event_call *call = &event_function;
1094 struct ring_buffer *buffer = tr->buffer;
931 struct ring_buffer_event *event; 1095 struct ring_buffer_event *event;
932 struct ftrace_entry *entry; 1096 struct ftrace_entry *entry;
933 1097
@@ -935,7 +1099,7 @@ trace_function(struct trace_array *tr,
935 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1099 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
936 return; 1100 return;
937 1101
938 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 1102 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
939 flags, pc); 1103 flags, pc);
940 if (!event) 1104 if (!event)
941 return; 1105 return;
@@ -943,58 +1107,10 @@ trace_function(struct trace_array *tr,
943 entry->ip = ip; 1107 entry->ip = ip;
944 entry->parent_ip = parent_ip; 1108 entry->parent_ip = parent_ip;
945 1109
946 if (!filter_check_discard(call, entry, tr->buffer, event)) 1110 if (!filter_check_discard(call, entry, buffer, event))
947 ring_buffer_unlock_commit(tr->buffer, event); 1111 ring_buffer_unlock_commit(buffer, event);
948}
949
950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
951static int __trace_graph_entry(struct trace_array *tr,
952 struct ftrace_graph_ent *trace,
953 unsigned long flags,
954 int pc)
955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
957 struct ring_buffer_event *event;
958 struct ftrace_graph_ent_entry *entry;
959
960 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
961 return 0;
962
963 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
964 sizeof(*entry), flags, pc);
965 if (!event)
966 return 0;
967 entry = ring_buffer_event_data(event);
968 entry->graph_ent = *trace;
969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
971
972 return 1;
973} 1112}
974 1113
975static void __trace_graph_return(struct trace_array *tr,
976 struct ftrace_graph_ret *trace,
977 unsigned long flags,
978 int pc)
979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
981 struct ring_buffer_event *event;
982 struct ftrace_graph_ret_entry *entry;
983
984 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
985 return;
986
987 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
988 sizeof(*entry), flags, pc);
989 if (!event)
990 return;
991 entry = ring_buffer_event_data(event);
992 entry->ret = *trace;
993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
995}
996#endif
997
998void 1114void
999ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1115ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1000 unsigned long ip, unsigned long parent_ip, unsigned long flags, 1116 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,17 +1120,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1004 trace_function(tr, ip, parent_ip, flags, pc); 1120 trace_function(tr, ip, parent_ip, flags, pc);
1005} 1121}
1006 1122
1007static void __ftrace_trace_stack(struct trace_array *tr, 1123#ifdef CONFIG_STACKTRACE
1124static void __ftrace_trace_stack(struct ring_buffer *buffer,
1008 unsigned long flags, 1125 unsigned long flags,
1009 int skip, int pc) 1126 int skip, int pc)
1010{ 1127{
1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack; 1128 struct ftrace_event_call *call = &event_kernel_stack;
1013 struct ring_buffer_event *event; 1129 struct ring_buffer_event *event;
1014 struct stack_entry *entry; 1130 struct stack_entry *entry;
1015 struct stack_trace trace; 1131 struct stack_trace trace;
1016 1132
1017 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1133 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1018 sizeof(*entry), flags, pc); 1134 sizeof(*entry), flags, pc);
1019 if (!event) 1135 if (!event)
1020 return; 1136 return;
@@ -1027,32 +1143,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1027 trace.entries = entry->caller; 1143 trace.entries = entry->caller;
1028 1144
1029 save_stack_trace(&trace); 1145 save_stack_trace(&trace);
1030 if (!filter_check_discard(call, entry, tr->buffer, event)) 1146 if (!filter_check_discard(call, entry, buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event); 1147 ring_buffer_unlock_commit(buffer, event);
1032#endif
1033} 1148}
1034 1149
1035static void ftrace_trace_stack(struct trace_array *tr, 1150void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1036 unsigned long flags, 1151 int skip, int pc)
1037 int skip, int pc)
1038{ 1152{
1039 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1153 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1040 return; 1154 return;
1041 1155
1042 __ftrace_trace_stack(tr, flags, skip, pc); 1156 __ftrace_trace_stack(buffer, flags, skip, pc);
1043} 1157}
1044 1158
1045void __trace_stack(struct trace_array *tr, 1159void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1046 unsigned long flags, 1160 int pc)
1047 int skip, int pc)
1048{ 1161{
1049 __ftrace_trace_stack(tr, flags, skip, pc); 1162 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1050} 1163}
1051 1164
1052static void ftrace_trace_userstack(struct trace_array *tr, 1165void
1053 unsigned long flags, int pc) 1166ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1054{ 1167{
1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack; 1168 struct ftrace_event_call *call = &event_user_stack;
1057 struct ring_buffer_event *event; 1169 struct ring_buffer_event *event;
1058 struct userstack_entry *entry; 1170 struct userstack_entry *entry;
@@ -1061,12 +1173,13 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1061 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1173 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1062 return; 1174 return;
1063 1175
1064 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1176 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1065 sizeof(*entry), flags, pc); 1177 sizeof(*entry), flags, pc);
1066 if (!event) 1178 if (!event)
1067 return; 1179 return;
1068 entry = ring_buffer_event_data(event); 1180 entry = ring_buffer_event_data(event);
1069 1181
1182 entry->tgid = current->tgid;
1070 memset(&entry->caller, 0, sizeof(entry->caller)); 1183 memset(&entry->caller, 0, sizeof(entry->caller));
1071 1184
1072 trace.nr_entries = 0; 1185 trace.nr_entries = 0;
@@ -1075,9 +1188,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1075 trace.entries = entry->caller; 1188 trace.entries = entry->caller;
1076 1189
1077 save_stack_trace_user(&trace); 1190 save_stack_trace_user(&trace);
1078 if (!filter_check_discard(call, entry, tr->buffer, event)) 1191 if (!filter_check_discard(call, entry, buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event); 1192 ring_buffer_unlock_commit(buffer, event);
1080#endif
1081} 1193}
1082 1194
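[editor's sketch] Both stack helpers now live under a single #ifdef CONFIG_STACKTRACE and capture at most FTRACE_STACK_ENTRIES (8) program counters into the reserved event. As a userspace analogue of that fixed-depth capture, glibc's backtrace()/backtrace_symbols() do the equivalent job; the helper names below are invented for the example.

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>

#define STACK_ENTRIES 8	/* mirrors FTRACE_STACK_ENTRIES */

static void record_stack(void)
{
	void *caller[STACK_ENTRIES];
	int n = backtrace(caller, STACK_ENTRIES);	/* capture up to 8 frames */
	char **syms = backtrace_symbols(caller, n);
	int i;

	for (i = 0; syms && i < n; i++)
		printf("  %s\n", syms[i]);
	free(syms);
}

static void leaf(void)  { record_stack(); }
static void outer(void) { leaf(); }

int main(void)
{
	outer();	/* prints main -> outer -> leaf -> record_stack */
	return 0;
}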
1083#ifdef UNUSED 1195#ifdef UNUSED
@@ -1087,16 +1199,20 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1087} 1199}
1088#endif /* UNUSED */ 1200#endif /* UNUSED */
1089 1201
1202#endif /* CONFIG_STACKTRACE */
1203
1090static void 1204static void
1091ftrace_trace_special(void *__tr, 1205ftrace_trace_special(void *__tr,
1092 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1206 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1093 int pc) 1207 int pc)
1094{ 1208{
1209 struct ftrace_event_call *call = &event_special;
1095 struct ring_buffer_event *event; 1210 struct ring_buffer_event *event;
1096 struct trace_array *tr = __tr; 1211 struct trace_array *tr = __tr;
1212 struct ring_buffer *buffer = tr->buffer;
1097 struct special_entry *entry; 1213 struct special_entry *entry;
1098 1214
1099 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1215 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1100 sizeof(*entry), 0, pc); 1216 sizeof(*entry), 0, pc);
1101 if (!event) 1217 if (!event)
1102 return; 1218 return;
@@ -1104,7 +1220,9 @@ ftrace_trace_special(void *__tr,
1104 entry->arg1 = arg1; 1220 entry->arg1 = arg1;
1105 entry->arg2 = arg2; 1221 entry->arg2 = arg2;
1106 entry->arg3 = arg3; 1222 entry->arg3 = arg3;
1107 trace_buffer_unlock_commit(tr, event, 0, pc); 1223
1224 if (!filter_check_discard(call, entry, buffer, event))
1225 trace_buffer_unlock_commit(buffer, event, 0, pc);
1108} 1226}
1109 1227
1110void 1228void
@@ -1115,62 +1233,6 @@ __trace_special(void *__tr, void *__data,
1115} 1233}
1116 1234
1117void 1235void
1118tracing_sched_switch_trace(struct trace_array *tr,
1119 struct task_struct *prev,
1120 struct task_struct *next,
1121 unsigned long flags, int pc)
1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1124 struct ring_buffer_event *event;
1125 struct ctx_switch_entry *entry;
1126
1127 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1128 sizeof(*entry), flags, pc);
1129 if (!event)
1130 return;
1131 entry = ring_buffer_event_data(event);
1132 entry->prev_pid = prev->pid;
1133 entry->prev_prio = prev->prio;
1134 entry->prev_state = prev->state;
1135 entry->next_pid = next->pid;
1136 entry->next_prio = next->prio;
1137 entry->next_state = next->state;
1138 entry->next_cpu = task_cpu(next);
1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1142}
1143
1144void
1145tracing_sched_wakeup_trace(struct trace_array *tr,
1146 struct task_struct *wakee,
1147 struct task_struct *curr,
1148 unsigned long flags, int pc)
1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1151 struct ring_buffer_event *event;
1152 struct ctx_switch_entry *entry;
1153
1154 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1155 sizeof(*entry), flags, pc);
1156 if (!event)
1157 return;
1158 entry = ring_buffer_event_data(event);
1159 entry->prev_pid = curr->pid;
1160 entry->prev_prio = curr->prio;
1161 entry->prev_state = curr->state;
1162 entry->next_pid = wakee->pid;
1163 entry->next_prio = wakee->prio;
1164 entry->next_state = wakee->state;
1165 entry->next_cpu = task_cpu(wakee);
1166
1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1169 ftrace_trace_stack(tr, flags, 6, pc);
1170 ftrace_trace_userstack(tr, flags, pc);
1171}
1172
1173void
1174ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1236ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1175{ 1237{
1176 struct trace_array *tr = &global_trace; 1238 struct trace_array *tr = &global_trace;
@@ -1194,68 +1256,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1194 local_irq_restore(flags); 1256 local_irq_restore(flags);
1195} 1257}
1196 1258
1197#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1198int trace_graph_entry(struct ftrace_graph_ent *trace)
1199{
1200 struct trace_array *tr = &global_trace;
1201 struct trace_array_cpu *data;
1202 unsigned long flags;
1203 long disabled;
1204 int ret;
1205 int cpu;
1206 int pc;
1207
1208 if (!ftrace_trace_task(current))
1209 return 0;
1210
1211 if (!ftrace_graph_addr(trace->func))
1212 return 0;
1213
1214 local_irq_save(flags);
1215 cpu = raw_smp_processor_id();
1216 data = tr->data[cpu];
1217 disabled = atomic_inc_return(&data->disabled);
1218 if (likely(disabled == 1)) {
1219 pc = preempt_count();
1220 ret = __trace_graph_entry(tr, trace, flags, pc);
1221 } else {
1222 ret = 0;
1223 }
1224 /* Only do the atomic if it is not already set */
1225 if (!test_tsk_trace_graph(current))
1226 set_tsk_trace_graph(current);
1227
1228 atomic_dec(&data->disabled);
1229 local_irq_restore(flags);
1230
1231 return ret;
1232}
1233
1234void trace_graph_return(struct ftrace_graph_ret *trace)
1235{
1236 struct trace_array *tr = &global_trace;
1237 struct trace_array_cpu *data;
1238 unsigned long flags;
1239 long disabled;
1240 int cpu;
1241 int pc;
1242
1243 local_irq_save(flags);
1244 cpu = raw_smp_processor_id();
1245 data = tr->data[cpu];
1246 disabled = atomic_inc_return(&data->disabled);
1247 if (likely(disabled == 1)) {
1248 pc = preempt_count();
1249 __trace_graph_return(tr, trace, flags, pc);
1250 }
1251 if (!trace->depth)
1252 clear_tsk_trace_graph(current);
1253 atomic_dec(&data->disabled);
1254 local_irq_restore(flags);
1255}
1256#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1257
1258
1259/** 1259/**
1260 * trace_vbprintk - write binary msg to tracing buffer 1260 * trace_vbprintk - write binary msg to tracing buffer
1261 * 1261 *
@@ -1268,6 +1268,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1268 1268
1269 struct ftrace_event_call *call = &event_bprint; 1269 struct ftrace_event_call *call = &event_bprint;
1270 struct ring_buffer_event *event; 1270 struct ring_buffer_event *event;
1271 struct ring_buffer *buffer;
1271 struct trace_array *tr = &global_trace; 1272 struct trace_array *tr = &global_trace;
1272 struct trace_array_cpu *data; 1273 struct trace_array_cpu *data;
1273 struct bprint_entry *entry; 1274 struct bprint_entry *entry;
@@ -1300,7 +1301,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1300 goto out_unlock; 1301 goto out_unlock;
1301 1302
1302 size = sizeof(*entry) + sizeof(u32) * len; 1303 size = sizeof(*entry) + sizeof(u32) * len;
1303 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1304 buffer = tr->buffer;
1305 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1306 flags, pc);
1304 if (!event) 1307 if (!event)
1305 goto out_unlock; 1308 goto out_unlock;
1306 entry = ring_buffer_event_data(event); 1309 entry = ring_buffer_event_data(event);
@@ -1308,8 +1311,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1308 entry->fmt = fmt; 1311 entry->fmt = fmt;
1309 1312
1310 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1313 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1311 if (!filter_check_discard(call, entry, tr->buffer, event)) 1314 if (!filter_check_discard(call, entry, buffer, event))
1312 ring_buffer_unlock_commit(tr->buffer, event); 1315 ring_buffer_unlock_commit(buffer, event);
1313 1316
1314out_unlock: 1317out_unlock:
1315 __raw_spin_unlock(&trace_buf_lock); 1318 __raw_spin_unlock(&trace_buf_lock);
@@ -1324,14 +1327,30 @@ out:
1324} 1327}
1325EXPORT_SYMBOL_GPL(trace_vbprintk); 1328EXPORT_SYMBOL_GPL(trace_vbprintk);
1326 1329
1327int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1330int trace_array_printk(struct trace_array *tr,
1331 unsigned long ip, const char *fmt, ...)
1332{
1333 int ret;
1334 va_list ap;
1335
1336 if (!(trace_flags & TRACE_ITER_PRINTK))
1337 return 0;
1338
1339 va_start(ap, fmt);
1340 ret = trace_array_vprintk(tr, ip, fmt, ap);
1341 va_end(ap);
1342 return ret;
1343}
1344
1345int trace_array_vprintk(struct trace_array *tr,
1346 unsigned long ip, const char *fmt, va_list args)
1328{ 1347{
1329 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1348 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1330 static char trace_buf[TRACE_BUF_SIZE]; 1349 static char trace_buf[TRACE_BUF_SIZE];
1331 1350
1332 struct ftrace_event_call *call = &event_print; 1351 struct ftrace_event_call *call = &event_print;
1333 struct ring_buffer_event *event; 1352 struct ring_buffer_event *event;
1334 struct trace_array *tr = &global_trace; 1353 struct ring_buffer *buffer;
1335 struct trace_array_cpu *data; 1354 struct trace_array_cpu *data;
1336 int cpu, len = 0, size, pc; 1355 int cpu, len = 0, size, pc;
1337 struct print_entry *entry; 1356 struct print_entry *entry;
@@ -1359,7 +1378,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1359 trace_buf[len] = 0; 1378 trace_buf[len] = 0;
1360 1379
1361 size = sizeof(*entry) + len + 1; 1380 size = sizeof(*entry) + len + 1;
1362 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1381 buffer = tr->buffer;
1382 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1383 irq_flags, pc);
1363 if (!event) 1384 if (!event)
1364 goto out_unlock; 1385 goto out_unlock;
1365 entry = ring_buffer_event_data(event); 1386 entry = ring_buffer_event_data(event);
@@ -1367,8 +1388,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1367 1388
1368 memcpy(&entry->buf, trace_buf, len); 1389 memcpy(&entry->buf, trace_buf, len);
1369 entry->buf[len] = 0; 1390 entry->buf[len] = 0;
1370 if (!filter_check_discard(call, entry, tr->buffer, event)) 1391 if (!filter_check_discard(call, entry, buffer, event))
1371 ring_buffer_unlock_commit(tr->buffer, event); 1392 ring_buffer_unlock_commit(buffer, event);
1372 1393
1373 out_unlock: 1394 out_unlock:
1374 __raw_spin_unlock(&trace_buf_lock); 1395 __raw_spin_unlock(&trace_buf_lock);
@@ -1380,6 +1401,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1380 1401
1381 return len; 1402 return len;
1382} 1403}
1404
1405int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1406{
1407 return trace_array_printk(&global_trace, ip, fmt, args);
1408}
1383EXPORT_SYMBOL_GPL(trace_vprintk); 1409EXPORT_SYMBOL_GPL(trace_vprintk);
1384 1410
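[editor's sketch] trace_array_printk() is the varargs front end and trace_array_vprintk() the va_list worker, the usual printf()/vprintf() split, so callers such as the -EBUSY message in update_max_tr_single() can log into an arbitrary trace_array. The same split in plain, runnable C with invented names:

#include <stdarg.h>
#include <stdio.h>

/* va_list worker: everything interesting happens here. */
static int log_varray(FILE *dst, const char *fmt, va_list ap)
{
	return vfprintf(dst, fmt, ap);
}

/* Varargs front end: just package the arguments and delegate. */
static int log_array(FILE *dst, const char *fmt, ...)
{
	va_list ap;
	int ret;

	va_start(ap, fmt);
	ret = log_varray(dst, fmt, ap);
	va_end(ap);
	return ret;
}

int main(void)
{
	log_array(stdout, "Failed to swap buffers on cpu %d\n", 3);
	return 0;
}

As a point of C usage, a va_list should only ever be handed to the v-variant; forwarding it through a "..." parameter is not well defined.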
1385enum trace_file_type { 1411enum trace_file_type {
@@ -1519,6 +1545,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1519 return ent; 1545 return ent;
1520} 1546}
1521 1547
1548static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1549{
1550 struct trace_array *tr = iter->tr;
1551 struct ring_buffer_event *event;
1552 struct ring_buffer_iter *buf_iter;
1553 unsigned long entries = 0;
1554 u64 ts;
1555
1556 tr->data[cpu]->skipped_entries = 0;
1557
1558 if (!iter->buffer_iter[cpu])
1559 return;
1560
1561 buf_iter = iter->buffer_iter[cpu];
1562 ring_buffer_iter_reset(buf_iter);
1563
1564 /*
1565 * We could have the case with the max latency tracers
1566 * that a reset never took place on a cpu. This is evident
1567 * by the timestamp being before the start of the buffer.
1568 */
1569 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1570 if (ts >= iter->tr->time_start)
1571 break;
1572 entries++;
1573 ring_buffer_read(buf_iter, NULL);
1574 }
1575
1576 tr->data[cpu]->skipped_entries = entries;
1577}
1578
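[editor's sketch] tracing_iter_reset() peeks at each buffered event and counts how many carry a timestamp older than tr->time_start, so the iterator and the entry statistics agree on where the current trace begins. The same skip-stale-entries idea on a plain array, with made-up types and integer timestamps:

#include <stddef.h>
#include <stdio.h>

struct event { unsigned long long ts; const char *msg; };

/* Count leading events older than time_start; return first valid index. */
static size_t skip_stale(const struct event *ev, size_t n,
			 unsigned long long time_start, size_t *skipped)
{
	size_t i = 0;

	while (i < n && ev[i].ts < time_start)
		i++;
	*skipped = i;
	return i;
}

int main(void)
{
	struct event ev[] = {
		{ 100, "old wakeup" }, { 200, "old switch" }, { 350, "fresh" },
	};
	size_t skipped, first = skip_stale(ev, 3, 300, &skipped);

	printf("skipped %zu entries, first valid: %s\n", skipped, ev[first].msg);
	return 0;
}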
1522/* 1579/*
1523 * No necessary locking here. The worst thing which can 1580 * No necessary locking here. The worst thing which can
1524 * happen is loosing events consumed at the same time 1581 * happen is loosing events consumed at the same time
@@ -1557,10 +1614,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1557 1614
1558 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1615 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1559 for_each_tracing_cpu(cpu) 1616 for_each_tracing_cpu(cpu)
1560 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1617 tracing_iter_reset(iter, cpu);
1561 } else 1618 } else
1562 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1619 tracing_iter_reset(iter, cpu_file);
1563
1564 1620
1565 ftrace_enable_cpu(); 1621 ftrace_enable_cpu();
1566 1622
@@ -1589,10 +1645,10 @@ static void print_lat_help_header(struct seq_file *m)
1589 seq_puts(m, "# | / _----=> need-resched \n"); 1645 seq_puts(m, "# | / _----=> need-resched \n");
1590 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1646 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1591 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1647 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1592 seq_puts(m, "# |||| / \n"); 1648 seq_puts(m, "# |||| /_--=> lock-depth \n");
1593 seq_puts(m, "# ||||| delay \n"); 1649 seq_puts(m, "# |||||/ delay \n");
1594 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1650 seq_puts(m, "# cmd pid |||||| time | caller \n");
1595 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1651 seq_puts(m, "# \\ / |||||| \\ | / \n");
1596} 1652}
1597 1653
1598static void print_func_help_header(struct seq_file *m) 1654static void print_func_help_header(struct seq_file *m)
@@ -1609,16 +1665,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1609 struct trace_array *tr = iter->tr; 1665 struct trace_array *tr = iter->tr;
1610 struct trace_array_cpu *data = tr->data[tr->cpu]; 1666 struct trace_array_cpu *data = tr->data[tr->cpu];
1611 struct tracer *type = current_trace; 1667 struct tracer *type = current_trace;
1612 unsigned long total; 1668 unsigned long entries = 0;
1613 unsigned long entries; 1669 unsigned long total = 0;
1670 unsigned long count;
1614 const char *name = "preemption"; 1671 const char *name = "preemption";
1672 int cpu;
1615 1673
1616 if (type) 1674 if (type)
1617 name = type->name; 1675 name = type->name;
1618 1676
1619 entries = ring_buffer_entries(iter->tr->buffer); 1677
1620 total = entries + 1678 for_each_tracing_cpu(cpu) {
1621 ring_buffer_overruns(iter->tr->buffer); 1679 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1680 /*
1681 * If this buffer has skipped entries, then we hold all
1682 * entries for the trace and we need to ignore the
1683 * ones before the time stamp.
1684 */
1685 if (tr->data[cpu]->skipped_entries) {
1686 count -= tr->data[cpu]->skipped_entries;
1687 /* total is the same as the entries */
1688 total += count;
1689 } else
1690 total += count +
1691 ring_buffer_overrun_cpu(tr->buffer, cpu);
1692 entries += count;
1693 }
1622 1694
1623 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1695 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1624 name, UTS_RELEASE); 1696 name, UTS_RELEASE);
@@ -1660,7 +1732,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1660 seq_puts(m, "\n# => ended at: "); 1732 seq_puts(m, "\n# => ended at: ");
1661 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1733 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1662 trace_print_seq(m, &iter->seq); 1734 trace_print_seq(m, &iter->seq);
1663 seq_puts(m, "#\n"); 1735 seq_puts(m, "\n#\n");
1664 } 1736 }
1665 1737
1666 seq_puts(m, "#\n"); 1738 seq_puts(m, "#\n");
@@ -1679,6 +1751,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1679 if (cpumask_test_cpu(iter->cpu, iter->started)) 1751 if (cpumask_test_cpu(iter->cpu, iter->started))
1680 return; 1752 return;
1681 1753
1754 if (iter->tr->data[iter->cpu]->skipped_entries)
1755 return;
1756
1682 cpumask_set_cpu(iter->cpu, iter->started); 1757 cpumask_set_cpu(iter->cpu, iter->started);
1683 1758
1684 /* Don't print started cpu buffer for the first entry of the trace */ 1759 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1941,19 +2016,23 @@ __tracing_open(struct inode *inode, struct file *file)
1941 if (ring_buffer_overruns(iter->tr->buffer)) 2016 if (ring_buffer_overruns(iter->tr->buffer))
1942 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2017 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1943 2018
2019 /* stop the trace while dumping */
2020 tracing_stop();
2021
1944 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2022 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1945 for_each_tracing_cpu(cpu) { 2023 for_each_tracing_cpu(cpu) {
1946 2024
1947 iter->buffer_iter[cpu] = 2025 iter->buffer_iter[cpu] =
1948 ring_buffer_read_start(iter->tr->buffer, cpu); 2026 ring_buffer_read_start(iter->tr->buffer, cpu);
2027 tracing_iter_reset(iter, cpu);
1949 } 2028 }
1950 } else { 2029 } else {
1951 cpu = iter->cpu_file; 2030 cpu = iter->cpu_file;
1952 iter->buffer_iter[cpu] = 2031 iter->buffer_iter[cpu] =
1953 ring_buffer_read_start(iter->tr->buffer, cpu); 2032 ring_buffer_read_start(iter->tr->buffer, cpu);
2033 tracing_iter_reset(iter, cpu);
1954 } 2034 }
1955 2035
1956 /* TODO stop tracer */
1957 ret = seq_open(file, &tracer_seq_ops); 2036 ret = seq_open(file, &tracer_seq_ops);
1958 if (ret < 0) { 2037 if (ret < 0) {
1959 fail_ret = ERR_PTR(ret); 2038 fail_ret = ERR_PTR(ret);
@@ -1963,9 +2042,6 @@ __tracing_open(struct inode *inode, struct file *file)
1963 m = file->private_data; 2042 m = file->private_data;
1964 m->private = iter; 2043 m->private = iter;
1965 2044
1966 /* stop the trace while dumping */
1967 tracing_stop();
1968
1969 mutex_unlock(&trace_types_lock); 2045 mutex_unlock(&trace_types_lock);
1970 2046
1971 return iter; 2047 return iter;
@@ -1976,6 +2052,7 @@ __tracing_open(struct inode *inode, struct file *file)
1976 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2052 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1977 } 2053 }
1978 free_cpumask_var(iter->started); 2054 free_cpumask_var(iter->started);
2055 tracing_start();
1979 fail: 2056 fail:
1980 mutex_unlock(&trace_types_lock); 2057 mutex_unlock(&trace_types_lock);
1981 kfree(iter->trace); 2058 kfree(iter->trace);
@@ -2257,8 +2334,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2257 len += 3; /* "no" and newline */ 2334 len += 3; /* "no" and newline */
2258 } 2335 }
2259 2336
2260 /* +2 for \n and \0 */ 2337 /* +1 for \0 */
2261 buf = kmalloc(len + 2, GFP_KERNEL); 2338 buf = kmalloc(len + 1, GFP_KERNEL);
2262 if (!buf) { 2339 if (!buf) {
2263 mutex_unlock(&trace_types_lock); 2340 mutex_unlock(&trace_types_lock);
2264 return -ENOMEM; 2341 return -ENOMEM;
@@ -2281,7 +2358,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2281 } 2358 }
2282 mutex_unlock(&trace_types_lock); 2359 mutex_unlock(&trace_types_lock);
2283 2360
2284 WARN_ON(r >= len + 2); 2361 WARN_ON(r >= len + 1);
2285 2362
2286 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2363 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2287 2364
@@ -2292,23 +2369,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2292/* Try to assign a tracer specific option */ 2369/* Try to assign a tracer specific option */
2293static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2370static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2294{ 2371{
2295 struct tracer_flags *trace_flags = trace->flags; 2372 struct tracer_flags *tracer_flags = trace->flags;
2296 struct tracer_opt *opts = NULL; 2373 struct tracer_opt *opts = NULL;
2297 int ret = 0, i = 0; 2374 int ret = 0, i = 0;
2298 int len; 2375 int len;
2299 2376
2300 for (i = 0; trace_flags->opts[i].name; i++) { 2377 for (i = 0; tracer_flags->opts[i].name; i++) {
2301 opts = &trace_flags->opts[i]; 2378 opts = &tracer_flags->opts[i];
2302 len = strlen(opts->name); 2379 len = strlen(opts->name);
2303 2380
2304 if (strncmp(cmp, opts->name, len) == 0) { 2381 if (strncmp(cmp, opts->name, len) == 0) {
2305 ret = trace->set_flag(trace_flags->val, 2382 ret = trace->set_flag(tracer_flags->val,
2306 opts->bit, !neg); 2383 opts->bit, !neg);
2307 break; 2384 break;
2308 } 2385 }
2309 } 2386 }
2310 /* Not found */ 2387 /* Not found */
2311 if (!trace_flags->opts[i].name) 2388 if (!tracer_flags->opts[i].name)
2312 return -EINVAL; 2389 return -EINVAL;
2313 2390
2314 /* Refused to handle */ 2391 /* Refused to handle */
@@ -2316,9 +2393,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2316 return ret; 2393 return ret;
2317 2394
2318 if (neg) 2395 if (neg)
2319 trace_flags->val &= ~opts->bit; 2396 tracer_flags->val &= ~opts->bit;
2320 else 2397 else
2321 trace_flags->val |= opts->bit; 2398 tracer_flags->val |= opts->bit;
2322 2399
2323 return 0; 2400 return 0;
2324} 2401}
@@ -2333,22 +2410,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2333 trace_flags |= mask; 2410 trace_flags |= mask;
2334 else 2411 else
2335 trace_flags &= ~mask; 2412 trace_flags &= ~mask;
2336
2337 if (mask == TRACE_ITER_GLOBAL_CLK) {
2338 u64 (*func)(void);
2339
2340 if (enabled)
2341 func = trace_clock_global;
2342 else
2343 func = trace_clock_local;
2344
2345 mutex_lock(&trace_types_lock);
2346 ring_buffer_set_clock(global_trace.buffer, func);
2347
2348 if (max_tr.buffer)
2349 ring_buffer_set_clock(max_tr.buffer, func);
2350 mutex_unlock(&trace_types_lock);
2351 }
2352} 2413}
2353 2414
2354static ssize_t 2415static ssize_t
@@ -3316,6 +3377,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3316 return cnt; 3377 return cnt;
3317} 3378}
3318 3379
3380static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3381 size_t cnt, loff_t *ppos)
3382{
3383 char buf[64];
3384 int bufiter = 0;
3385 int i;
3386
3387 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3388 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3389 "%s%s%s%s", i ? " " : "",
3390 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3391 i == trace_clock_id ? "]" : "");
3392 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3393
3394 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3395}
3396
3397static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3398 size_t cnt, loff_t *fpos)
3399{
3400 char buf[64];
3401 const char *clockstr;
3402 int i;
3403
3404 if (cnt >= sizeof(buf))
3405 return -EINVAL;
3406
3407 if (copy_from_user(&buf, ubuf, cnt))
3408 return -EFAULT;
3409
3410 buf[cnt] = 0;
3411
3412 clockstr = strstrip(buf);
3413
3414 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3415 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3416 break;
3417 }
3418 if (i == ARRAY_SIZE(trace_clocks))
3419 return -EINVAL;
3420
3421 trace_clock_id = i;
3422
3423 mutex_lock(&trace_types_lock);
3424
3425 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3426 if (max_tr.buffer)
3427 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3428
3429 mutex_unlock(&trace_types_lock);
3430
3431 *fpos += cnt;
3432
3433 return cnt;
3434}
3435
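[editor's sketch] The new trace_clock control file lists the available clocks with the active one in brackets (for example "[local] global"), and writing a bare clock name switches both the live and max buffers over. From user space that is just a read and a write on the debugfs file; the sketch below assumes debugfs is mounted at the conventional /sys/kernel/debug path and that the kernel has tracing enabled.

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/trace_clock";
	char cur[128] = "";
	FILE *f = fopen(path, "r");

	if (f) {
		if (fgets(cur, sizeof(cur), f))
			printf("available clocks: %s", cur); /* e.g. "[local] global" */
		fclose(f);
	}

	f = fopen(path, "w");
	if (!f) {
		perror(path);		/* typically needs root */
		return 1;
	}
	fputs("global\n", f);		/* any listed name is accepted */
	fclose(f);
	return 0;
}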
3319static const struct file_operations tracing_max_lat_fops = { 3436static const struct file_operations tracing_max_lat_fops = {
3320 .open = tracing_open_generic, 3437 .open = tracing_open_generic,
3321 .read = tracing_max_lat_read, 3438 .read = tracing_max_lat_read,
@@ -3353,6 +3470,12 @@ static const struct file_operations tracing_mark_fops = {
3353 .write = tracing_mark_write, 3470 .write = tracing_mark_write,
3354}; 3471};
3355 3472
3473static const struct file_operations trace_clock_fops = {
3474 .open = tracing_open_generic,
3475 .read = tracing_clock_read,
3476 .write = tracing_clock_write,
3477};
3478
3356struct ftrace_buffer_info { 3479struct ftrace_buffer_info {
3357 struct trace_array *tr; 3480 struct trace_array *tr;
3358 void *spare; 3481 void *spare;
@@ -3633,9 +3756,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3756 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3757 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635 3758
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3759 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640 3760
3641 kfree(s); 3761 kfree(s);
@@ -4066,11 +4186,13 @@ static __init int tracer_init_debugfs(void)
4066 trace_create_file("current_tracer", 0644, d_tracer, 4186 trace_create_file("current_tracer", 0644, d_tracer,
4067 &global_trace, &set_tracer_fops); 4187 &global_trace, &set_tracer_fops);
4068 4188
4189#ifdef CONFIG_TRACER_MAX_TRACE
4069 trace_create_file("tracing_max_latency", 0644, d_tracer, 4190 trace_create_file("tracing_max_latency", 0644, d_tracer,
4070 &tracing_max_latency, &tracing_max_lat_fops); 4191 &tracing_max_latency, &tracing_max_lat_fops);
4071 4192
4072 trace_create_file("tracing_thresh", 0644, d_tracer, 4193 trace_create_file("tracing_thresh", 0644, d_tracer,
4073 &tracing_thresh, &tracing_max_lat_fops); 4194 &tracing_thresh, &tracing_max_lat_fops);
4195#endif
4074 4196
4075 trace_create_file("README", 0444, d_tracer, 4197 trace_create_file("README", 0444, d_tracer,
4076 NULL, &tracing_readme_fops); 4198 NULL, &tracing_readme_fops);
@@ -4087,6 +4209,9 @@ static __init int tracer_init_debugfs(void)
4087 trace_create_file("saved_cmdlines", 0444, d_tracer, 4209 trace_create_file("saved_cmdlines", 0444, d_tracer,
4088 NULL, &tracing_saved_cmdlines_fops); 4210 NULL, &tracing_saved_cmdlines_fops);
4089 4211
4212 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4213 &trace_clock_fops);
4214
4090#ifdef CONFIG_DYNAMIC_FTRACE 4215#ifdef CONFIG_DYNAMIC_FTRACE
4091 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4216 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4092 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4217 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4265,7 +4390,6 @@ void ftrace_dump(void)
4265 4390
4266__init static int tracer_alloc_buffers(void) 4391__init static int tracer_alloc_buffers(void)
4267{ 4392{
4268 struct trace_array_cpu *data;
4269 int ring_buf_size; 4393 int ring_buf_size;
4270 int i; 4394 int i;
4271 int ret = -ENOMEM; 4395 int ret = -ENOMEM;
@@ -4315,7 +4439,7 @@ __init static int tracer_alloc_buffers(void)
4315 4439
4316 /* Allocate the first page for all buffers */ 4440 /* Allocate the first page for all buffers */
4317 for_each_tracing_cpu(i) { 4441 for_each_tracing_cpu(i) {
4318 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4442 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4319 max_tr.data[i] = &per_cpu(max_data, i); 4443 max_tr.data[i] = &per_cpu(max_data, i);
4320 } 4444 }
4321 4445
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e9559..86bcff94791a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,6 +7,7 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
@@ -34,8 +35,6 @@ enum trace_type {
34 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
35 TRACE_USER_STACK, 36 TRACE_USER_STACK,
36 TRACE_HW_BRANCHES, 37 TRACE_HW_BRANCHES,
37 TRACE_SYSCALL_ENTER,
38 TRACE_SYSCALL_EXIT,
39 TRACE_KMEM_ALLOC, 38 TRACE_KMEM_ALLOC,
40 TRACE_KMEM_FREE, 39 TRACE_KMEM_FREE,
41 TRACE_POWER, 40 TRACE_POWER,
@@ -44,157 +43,54 @@ enum trace_type {
44 __TRACE_LAST_TYPE, 43 __TRACE_LAST_TYPE,
45}; 44};
46 45
47/* 46enum kmemtrace_type_id {
48 * Function trace entry - function address and parent function addres: 47 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
49 */ 48 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
50struct ftrace_entry { 49 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
51 struct trace_entry ent;
52 unsigned long ip;
53 unsigned long parent_ip;
54};
55
56/* Function call entry */
57struct ftrace_graph_ent_entry {
58 struct trace_entry ent;
59 struct ftrace_graph_ent graph_ent;
60}; 50};
61 51
62/* Function return entry */
63struct ftrace_graph_ret_entry {
64 struct trace_entry ent;
65 struct ftrace_graph_ret ret;
66};
67extern struct tracer boot_tracer; 52extern struct tracer boot_tracer;
68 53
69/* 54#undef __field
70 * Context switch trace entry - which task (and prio) we switched from/to: 55#define __field(type, item) type item;
71 */
72struct ctx_switch_entry {
73 struct trace_entry ent;
74 unsigned int prev_pid;
75 unsigned char prev_prio;
76 unsigned char prev_state;
77 unsigned int next_pid;
78 unsigned char next_prio;
79 unsigned char next_state;
80 unsigned int next_cpu;
81};
82 56
83/* 57#undef __field_struct
84 * Special (free-form) trace entry: 58#define __field_struct(type, item) __field(type, item)
85 */
86struct special_entry {
87 struct trace_entry ent;
88 unsigned long arg1;
89 unsigned long arg2;
90 unsigned long arg3;
91};
92 59
93/* 60#undef __field_desc
94 * Stack-trace entry: 61#define __field_desc(type, container, item)
95 */
96 62
97#define FTRACE_STACK_ENTRIES 8 63#undef __array
64#define __array(type, item, size) type item[size];
98 65
99struct stack_entry { 66#undef __array_desc
100 struct trace_entry ent; 67#define __array_desc(type, container, item, size)
101 unsigned long caller[FTRACE_STACK_ENTRIES];
102};
103 68
104struct userstack_entry { 69#undef __dynamic_array
105 struct trace_entry ent; 70#define __dynamic_array(type, item) type item[];
106 unsigned long caller[FTRACE_STACK_ENTRIES];
107};
108 71
109/* 72#undef F_STRUCT
110 * trace_printk entry: 73#define F_STRUCT(args...) args
111 */
112struct bprint_entry {
113 struct trace_entry ent;
114 unsigned long ip;
115 const char *fmt;
116 u32 buf[];
117};
118 74
119struct print_entry { 75#undef FTRACE_ENTRY
120 struct trace_entry ent; 76#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121 unsigned long ip; 77 struct struct_name { \
122 char buf[]; 78 struct trace_entry ent; \
123}; 79 tstruct \
124 80 }
125#define TRACE_OLD_SIZE 88
126
127struct trace_field_cont {
128 unsigned char type;
129 /* Temporary till we get rid of this completely */
130 char buf[TRACE_OLD_SIZE - 1];
131};
132
133struct trace_mmiotrace_rw {
134 struct trace_entry ent;
135 struct mmiotrace_rw rw;
136};
137
138struct trace_mmiotrace_map {
139 struct trace_entry ent;
140 struct mmiotrace_map map;
141};
142
143struct trace_boot_call {
144 struct trace_entry ent;
145 struct boot_trace_call boot_call;
146};
147
148struct trace_boot_ret {
149 struct trace_entry ent;
150 struct boot_trace_ret boot_ret;
151};
152
153#define TRACE_FUNC_SIZE 30
154#define TRACE_FILE_SIZE 20
155struct trace_branch {
156 struct trace_entry ent;
157 unsigned line;
158 char func[TRACE_FUNC_SIZE+1];
159 char file[TRACE_FILE_SIZE+1];
160 char correct;
161};
162
163struct hw_branch_entry {
164 struct trace_entry ent;
165 u64 from;
166 u64 to;
167};
168 81
169struct trace_power { 82#undef TP_ARGS
170 struct trace_entry ent; 83#define TP_ARGS(args...) args
171 struct power_trace state_data;
172};
173 84
174enum kmemtrace_type_id { 85#undef FTRACE_ENTRY_DUP
175 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 86#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
176 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
177 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
178};
179
180struct kmemtrace_alloc_entry {
181 struct trace_entry ent;
182 enum kmemtrace_type_id type_id;
183 unsigned long call_site;
184 const void *ptr;
185 size_t bytes_req;
186 size_t bytes_alloc;
187 gfp_t gfp_flags;
188 int node;
189};
190 87
191struct kmemtrace_free_entry { 88#include "trace_entries.h"
192 struct trace_entry ent;
193 enum kmemtrace_type_id type_id;
194 unsigned long call_site;
195 const void *ptr;
196};
197 89
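[editor's sketch] The open-coded entry structs above are replaced by X-macro definitions pulled in from trace_entries.h: FTRACE_ENTRY() wraps a struct trace_entry plus whatever __field()/__array() lines the entry declares. A compilable, simplified illustration of how one such definition expands is below; struct trace_entry and the field list here are cut down for the example and do not match the real layouts.

#include <stdio.h>

/* Simplified stand-ins for the kernel's trace_entries.h machinery. */
struct trace_entry { unsigned short type; unsigned char flags; };

#define __field(type, item)	type item;
#define F_STRUCT(args...)	args
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\
	struct struct_name {					\
		struct trace_entry ent;				\
		tstruct						\
	};

/* One entry definition in the trace_entries.h style... */
FTRACE_ENTRY(function, ftrace_entry, 1,
	F_STRUCT(
		__field(unsigned long, ip)
		__field(unsigned long, parent_ip)
	),
	"ip=%lx parent=%lx"
)

/*
 * ...expands to:
 *	struct ftrace_entry {
 *		struct trace_entry ent;
 *		unsigned long ip;
 *		unsigned long parent_ip;
 *	};
 */
int main(void)
{
	struct ftrace_entry e = { .ip = 0x1000, .parent_ip = 0x2000 };

	printf("sizeof(struct ftrace_entry) = %zu, ip = %#lx\n",
	       sizeof(e), e.ip);
	return 0;
}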
90/*
91 * syscalls are special, and need special handling, this is why
92 * they are not included in trace_entries.h
93 */
198struct syscall_trace_enter { 94struct syscall_trace_enter {
199 struct trace_entry ent; 95 struct trace_entry ent;
200 int nr; 96 int nr;
@@ -207,13 +103,12 @@ struct syscall_trace_exit {
207 unsigned long ret; 103 unsigned long ret;
208}; 104};
209 105
210
211/* 106/*
212 * trace_flag_type is an enumeration that holds different 107 * trace_flag_type is an enumeration that holds different
213 * states when a trace occurs. These are: 108 * states when a trace occurs. These are:
214 * IRQS_OFF - interrupts were disabled 109 * IRQS_OFF - interrupts were disabled
215 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 110 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
216 * NEED_RESCED - reschedule is requested 111 * NEED_RESCHED - reschedule is requested
217 * HARDIRQ - inside an interrupt handler 112 * HARDIRQ - inside an interrupt handler
218 * SOFTIRQ - inside a softirq handler 113 * SOFTIRQ - inside a softirq handler
219 */ 114 */
@@ -236,9 +131,6 @@ struct trace_array_cpu {
236 atomic_t disabled; 131 atomic_t disabled;
237 void *buffer_page; /* ring buffer spare */ 132 void *buffer_page; /* ring buffer spare */
238 133
239 /* these fields get copied into max-trace: */
240 unsigned long trace_idx;
241 unsigned long overrun;
242 unsigned long saved_latency; 134 unsigned long saved_latency;
243 unsigned long critical_start; 135 unsigned long critical_start;
244 unsigned long critical_end; 136 unsigned long critical_end;
@@ -246,6 +138,7 @@ struct trace_array_cpu {
246 unsigned long nice; 138 unsigned long nice;
247 unsigned long policy; 139 unsigned long policy;
248 unsigned long rt_priority; 140 unsigned long rt_priority;
141 unsigned long skipped_entries;
249 cycle_t preempt_timestamp; 142 cycle_t preempt_timestamp;
250 pid_t pid; 143 pid_t pid;
251 uid_t uid; 144 uid_t uid;
@@ -319,10 +212,6 @@ extern void __ftrace_bad_type(void);
319 TRACE_KMEM_ALLOC); \ 212 TRACE_KMEM_ALLOC); \
320 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 213 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
321 TRACE_KMEM_FREE); \ 214 TRACE_KMEM_FREE); \
322 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
323 TRACE_SYSCALL_ENTER); \
324 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
325 TRACE_SYSCALL_EXIT); \
326 __ftrace_bad_type(); \ 215 __ftrace_bad_type(); \
327 } while (0) 216 } while (0)
328 217
@@ -398,7 +287,6 @@ struct tracer {
398 struct tracer *next; 287 struct tracer *next;
399 int print_max; 288 int print_max;
400 struct tracer_flags *flags; 289 struct tracer_flags *flags;
401 struct tracer_stat *stats;
402}; 290};
403 291
404 292
@@ -423,12 +311,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
423 311
424struct ring_buffer_event; 312struct ring_buffer_event;
425 313
426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 314struct ring_buffer_event *
427 int type, 315trace_buffer_lock_reserve(struct ring_buffer *buffer,
428 unsigned long len, 316 int type,
429 unsigned long flags, 317 unsigned long len,
430 int pc); 318 unsigned long flags,
431void trace_buffer_unlock_commit(struct trace_array *tr, 319 int pc);
320void trace_buffer_unlock_commit(struct ring_buffer *buffer,
432 struct ring_buffer_event *event, 321 struct ring_buffer_event *event,
433 unsigned long flags, int pc); 322 unsigned long flags, int pc);
434 323
@@ -467,6 +356,7 @@ void trace_function(struct trace_array *tr,
467 356
468void trace_graph_return(struct ftrace_graph_ret *trace); 357void trace_graph_return(struct ftrace_graph_ret *trace);
469int trace_graph_entry(struct ftrace_graph_ent *trace); 358int trace_graph_entry(struct ftrace_graph_ent *trace);
359void set_graph_array(struct trace_array *tr);
470 360
471void tracing_start_cmdline_record(void); 361void tracing_start_cmdline_record(void);
472void tracing_stop_cmdline_record(void); 362void tracing_stop_cmdline_record(void);
@@ -475,35 +365,46 @@ void tracing_stop_sched_switch_record(void);
475void tracing_start_sched_switch_record(void); 365void tracing_start_sched_switch_record(void);
476int register_tracer(struct tracer *type); 366int register_tracer(struct tracer *type);
477void unregister_tracer(struct tracer *type); 367void unregister_tracer(struct tracer *type);
368int is_tracing_stopped(void);
478 369
479extern unsigned long nsecs_to_usecs(unsigned long nsecs); 370extern unsigned long nsecs_to_usecs(unsigned long nsecs);
480 371
372#ifdef CONFIG_TRACER_MAX_TRACE
481extern unsigned long tracing_max_latency; 373extern unsigned long tracing_max_latency;
482extern unsigned long tracing_thresh; 374extern unsigned long tracing_thresh;
483 375
484void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 376void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
485void update_max_tr_single(struct trace_array *tr, 377void update_max_tr_single(struct trace_array *tr,
486 struct task_struct *tsk, int cpu); 378 struct task_struct *tsk, int cpu);
379#endif /* CONFIG_TRACER_MAX_TRACE */
487 380
488void __trace_stack(struct trace_array *tr, 381#ifdef CONFIG_STACKTRACE
489 unsigned long flags, 382void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
490 int skip, int pc); 383 int skip, int pc);
491 384
492extern cycle_t ftrace_now(int cpu); 385void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
386 int pc);
493 387
494#ifdef CONFIG_CONTEXT_SWITCH_TRACER 388void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
495typedef void 389 int pc);
496(*tracer_switch_func_t)(void *private, 390#else
497 void *__rq, 391static inline void ftrace_trace_stack(struct trace_array *tr,
498 struct task_struct *prev, 392 unsigned long flags, int skip, int pc)
499 struct task_struct *next); 393{
500 394}
501struct tracer_switch_ops { 395
502 tracer_switch_func_t func; 396static inline void ftrace_trace_userstack(struct trace_array *tr,
503 void *private; 397 unsigned long flags, int pc)
504 struct tracer_switch_ops *next; 398{
505}; 399}
506#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 400
401static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
402 int skip, int pc)
403{
404}
405#endif /* CONFIG_STACKTRACE */
406
407extern cycle_t ftrace_now(int cpu);
507 408
508extern void trace_find_cmdline(int pid, char comm[]); 409extern void trace_find_cmdline(int pid, char comm[]);
509 410
@@ -513,6 +414,10 @@ extern unsigned long ftrace_update_tot_cnt;
513extern int DYN_FTRACE_TEST_NAME(void); 414extern int DYN_FTRACE_TEST_NAME(void);
514#endif 415#endif
515 416
417extern int ring_buffer_expanded;
418extern bool tracing_selftest_disabled;
419DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
420
516#ifdef CONFIG_FTRACE_STARTUP_TEST 421#ifdef CONFIG_FTRACE_STARTUP_TEST
517extern int trace_selftest_startup_function(struct tracer *trace, 422extern int trace_selftest_startup_function(struct tracer *trace,
518 struct trace_array *tr); 423 struct trace_array *tr);
@@ -544,9 +449,16 @@ extern int
544trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 449trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
545extern int 450extern int
546trace_vprintk(unsigned long ip, const char *fmt, va_list args); 451trace_vprintk(unsigned long ip, const char *fmt, va_list args);
452extern int
453trace_array_vprintk(struct trace_array *tr,
454 unsigned long ip, const char *fmt, va_list args);
455int trace_array_printk(struct trace_array *tr,
456 unsigned long ip, const char *fmt, ...);
547 457
548extern unsigned long trace_flags; 458extern unsigned long trace_flags;
549 459
460extern int trace_clock_id;
461
550/* Standard output formatting function used for function return traces */ 462/* Standard output formatting function used for function return traces */
551#ifdef CONFIG_FUNCTION_GRAPH_TRACER 463#ifdef CONFIG_FUNCTION_GRAPH_TRACER
552extern enum print_line_t print_graph_function(struct trace_iterator *iter); 464extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -609,6 +521,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
609#endif 521#endif
610 522
611/* 523/*
 524 * struct trace_parser - serves for reading the user input separated by spaces
525 * @cont: set if the input is not complete - no final space char was found
526 * @buffer: holds the parsed user input
 527 * @idx: user input length
528 * @size: buffer size
529 */
530struct trace_parser {
531 bool cont;
532 char *buffer;
533 unsigned idx;
534 unsigned size;
535};
536
537static inline bool trace_parser_loaded(struct trace_parser *parser)
538{
539 return (parser->idx != 0);
540}
541
542static inline bool trace_parser_cont(struct trace_parser *parser)
543{
544 return parser->cont;
545}
546
547static inline void trace_parser_clear(struct trace_parser *parser)
548{
549 parser->cont = false;
550 parser->idx = 0;
551}
552
553extern int trace_parser_get_init(struct trace_parser *parser, int size);
554extern void trace_parser_put(struct trace_parser *parser);
555extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
556 size_t cnt, loff_t *ppos);
557
558/*
612 * trace_iterator_flags is an enumeration that defines bit 559 * trace_iterator_flags is an enumeration that defines bit
613 * positions into trace_flags that controls the output. 560 * positions into trace_flags that controls the output.
614 * 561 *
@@ -635,9 +582,8 @@ enum trace_iterator_flags {
635 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 582 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
636 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 583 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
637 TRACE_ITER_LATENCY_FMT = 0x40000, 584 TRACE_ITER_LATENCY_FMT = 0x40000,
638 TRACE_ITER_GLOBAL_CLK = 0x80000, 585 TRACE_ITER_SLEEP_TIME = 0x80000,
639 TRACE_ITER_SLEEP_TIME = 0x100000, 586 TRACE_ITER_GRAPH_TIME = 0x100000,
640 TRACE_ITER_GRAPH_TIME = 0x200000,
641}; 587};
642 588
643/* 589/*
@@ -734,6 +680,7 @@ struct ftrace_event_field {
734 struct list_head link; 680 struct list_head link;
735 char *name; 681 char *name;
736 char *type; 682 char *type;
683 int filter_type;
737 int offset; 684 int offset;
738 int size; 685 int size;
739 int is_signed; 686 int is_signed;
@@ -743,13 +690,15 @@ struct event_filter {
743 int n_preds; 690 int n_preds;
744 struct filter_pred **preds; 691 struct filter_pred **preds;
745 char *filter_string; 692 char *filter_string;
693 bool no_reset;
746}; 694};
747 695
748struct event_subsystem { 696struct event_subsystem {
749 struct list_head list; 697 struct list_head list;
750 const char *name; 698 const char *name;
751 struct dentry *entry; 699 struct dentry *entry;
752 void *filter; 700 struct event_filter *filter;
701 int nr_events;
753}; 702};
754 703
755struct filter_pred; 704struct filter_pred;
@@ -777,6 +726,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
777 char *filter_string); 726 char *filter_string);
778extern void print_subsystem_event_filter(struct event_subsystem *system, 727extern void print_subsystem_event_filter(struct event_subsystem *system,
779 struct trace_seq *s); 728 struct trace_seq *s);
729extern int filter_assign_type(const char *type);
780 730
781static inline int 731static inline int
782filter_check_discard(struct ftrace_event_call *call, void *rec, 732filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -791,58 +741,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
791 return 0; 741 return 0;
792} 742}
793 743
794#define DEFINE_COMPARISON_PRED(type) \
795static int filter_pred_##type(struct filter_pred *pred, void *event, \
796 int val1, int val2) \
797{ \
798 type *addr = (type *)(event + pred->offset); \
799 type val = (type)pred->val; \
800 int match = 0; \
801 \
802 switch (pred->op) { \
803 case OP_LT: \
804 match = (*addr < val); \
805 break; \
806 case OP_LE: \
807 match = (*addr <= val); \
808 break; \
809 case OP_GT: \
810 match = (*addr > val); \
811 break; \
812 case OP_GE: \
813 match = (*addr >= val); \
814 break; \
815 default: \
816 break; \
817 } \
818 \
819 return match; \
820}
821
822#define DEFINE_EQUALITY_PRED(size) \
823static int filter_pred_##size(struct filter_pred *pred, void *event, \
824 int val1, int val2) \
825{ \
826 u##size *addr = (u##size *)(event + pred->offset); \
827 u##size val = (u##size)pred->val; \
828 int match; \
829 \
830 match = (val == *addr) ^ pred->not; \
831 \
832 return match; \
833}
834
835extern struct mutex event_mutex; 744extern struct mutex event_mutex;
836extern struct list_head ftrace_events; 745extern struct list_head ftrace_events;
837 746
838extern const char *__start___trace_bprintk_fmt[]; 747extern const char *__start___trace_bprintk_fmt[];
839extern const char *__stop___trace_bprintk_fmt[]; 748extern const char *__stop___trace_bprintk_fmt[];
840 749
841#undef TRACE_EVENT_FORMAT 750#undef FTRACE_ENTRY
842#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 751#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
843 extern struct ftrace_event_call event_##call; 752 extern struct ftrace_event_call event_##call;
844#undef TRACE_EVENT_FORMAT_NOFILTER 753#undef FTRACE_ENTRY_DUP
845#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 754#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
846#include "trace_event_types.h" 755 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
756#include "trace_entries.h"
847 757
848#endif /* _LINUX_KERNEL_TRACE_H */ 758#endif /* _LINUX_KERNEL_TRACE_H */
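
The trace_parser helpers declared above replace several open-coded get_user() loops. A kernel-style fragment (not standalone-compilable; error handling trimmed) sketching the intended call sequence, mirroring how ftrace_event_write() in trace_events.c further down consumes it — EVENT_BUF_SIZE here is simply the caller's buffer limit:

	struct trace_parser parser;
	ssize_t read;

	if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
		return -ENOMEM;				/* allocates parser.buffer */

	read = trace_get_user(&parser, ubuf, cnt, ppos);	/* copy one token from user space */

	if (trace_parser_loaded(&parser)) {
		parser.buffer[parser.idx] = 0;		/* NUL-terminate the token */
		/* act on parser.buffer here */
	}

	trace_parser_put(&parser);			/* frees parser.buffer */
	return read;
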
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -131,7 +129,9 @@ struct tracer boot_tracer __read_mostly =
131 129
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
134 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 136 struct trace_array *tr = boot_trace;
137 137
@@ -144,20 +144,24 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 145 preempt_disable();
146 146
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 149 sizeof(*entry), 0, 0);
149 if (!event) 150 if (!event)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 166 struct trace_array *tr = boot_trace;
163 167
@@ -167,13 +171,15 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 171 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 172 preempt_disable();
169 173
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 176 sizeof(*entry), 0, 0);
172 if (!event) 177 if (!event)
173 goto out; 178 goto out;
174 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 183 out:
178 preempt_enable(); 184 preempt_enable();
179} 185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
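
The hunk above only moves prev_time and the lock protecting it into one cacheline-aligned struct; the monotonicity logic is unchanged. A minimal user-space sketch of that "never go backwards" idea, with a pthread mutex standing in for the raw spinlock (names and types are illustrative, not the kernel's):

#include <pthread.h>
#include <stdint.h>

static struct {
	uint64_t	prev_time;
	pthread_mutex_t	lock;
} clock_state = { 0, PTHREAD_MUTEX_INITIALIZER };

/* Return a timestamp that is globally monotonic even if the per-CPU
 * clock feeding 'now' drifts slightly backwards between callers. */
static uint64_t global_monotonic(uint64_t now)
{
	pthread_mutex_lock(&clock_state.lock);
	if ((int64_t)(now - clock_state.prev_time) < 0)
		now = clock_state.prev_time + 1;
	clock_state.prev_time = now;
	pthread_mutex_unlock(&clock_state.lock);
	return now;
}
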
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..a431748ddd6e
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,383 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
 10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
 30 * structure is laid out. This allows the internal structure
31 * to be deciphered for the format file. Although these macros
32 * may become out of sync with the internal structure, they
33 * will create a compile error if it happens. Since the
 34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
 42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
55/*
 56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(power, trace_power,
334
335 TRACE_POWER,
336
337 F_STRUCT(
338 __field_struct( struct power_trace, state_data )
339 __field_desc( s64, state_data, stamp )
340 __field_desc( s64, state_data, end )
341 __field_desc( int, state_data, type )
342 __field_desc( int, state_data, state )
343 ),
344
345 F_printk("%llx->%llx type:%u state:%u",
346 __entry->stamp, __entry->end,
347 __entry->type, __entry->state)
348);
349
350FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
351
352 TRACE_KMEM_ALLOC,
353
354 F_STRUCT(
355 __field( enum kmemtrace_type_id, type_id )
356 __field( unsigned long, call_site )
357 __field( const void *, ptr )
358 __field( size_t, bytes_req )
359 __field( size_t, bytes_alloc )
360 __field( gfp_t, gfp_flags )
361 __field( int, node )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
365 " flags:%x node:%d",
366 __entry->type_id, __entry->call_site, __entry->ptr,
367 __entry->bytes_req, __entry->bytes_alloc,
368 __entry->gfp_flags, __entry->node)
369);
370
371FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
372
373 TRACE_KMEM_FREE,
374
375 F_STRUCT(
376 __field( enum kmemtrace_type_id, type_id )
377 __field( unsigned long, call_site )
378 __field( const void *, ptr )
379 ),
380
381 F_printk("type:%u call_site:%lx ptr:%p",
382 __entry->type_id, __entry->call_site, __entry->ptr)
383);
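
trace_entries.h relies on the same FTRACE_ENTRY() invocations being re-read under different macro definitions (trace.h above, for instance, redefines FTRACE_ENTRY to declare the event_##call objects). A self-contained user-space sketch of that multiple-expansion technique — the macro bodies below are illustrative, not the kernel's:

#include <stdio.h>

#define F_STRUCT(...)		__VA_ARGS__
#define __field(type, item)	type item;

/* One possible expansion: turn each entry description into a struct. */
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\
	struct struct_name { tstruct }

FTRACE_ENTRY(function, ftrace_entry, 1,
	F_STRUCT(
		__field(unsigned long,	ip)
		__field(unsigned long,	parent_ip)
	),
	"%lx <-- %lx");

int main(void)
{
	struct ftrace_entry e = { .ip = 0x1234, .parent_ip = 0x5678 };

	printf("%lx <-- %lx\n", e.ip, e.parent_ip);
	return 0;
}
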
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 11ba5bb4ed0a..55a25c933d15 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,6 +5,7 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
10int ftrace_profile_enable(int event_id) 11int ftrace_profile_enable(int event_id)
@@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id)
14 15
15 mutex_lock(&event_mutex); 16 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 17 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id && event->profile_enable) { 18 if (event->id == event_id && event->profile_enable &&
19 try_module_get(event->mod)) {
18 ret = event->profile_enable(event); 20 ret = event->profile_enable(event);
19 break; 21 break;
20 } 22 }
@@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id)
32 list_for_each_entry(event, &ftrace_events, list) { 34 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 35 if (event->id == event_id) {
34 event->profile_disable(event); 36 event->profile_disable(event);
37 module_put(event->mod);
35 break; 38 break;
36 } 39 }
37 } 40 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 6db005e12487..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,178 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(unsigned long long, ret.calltime, calltime)
30 TRACE_FIELD(unsigned long long, ret.rettime, rettime)
31 TRACE_FIELD(unsigned long, ret.overrun, overrun)
32 TRACE_FIELD(int, ret.depth, depth)
33 ),
34 TP_RAW_FMT("<-- %lx (%d)")
35);
36
37TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
38 TRACE_STRUCT(
39 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
40 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
41 TRACE_FIELD(unsigned char, prev_state, prev_state)
42 TRACE_FIELD(unsigned int, next_pid, next_pid)
43 TRACE_FIELD(unsigned char, next_prio, next_prio)
44 TRACE_FIELD(unsigned char, next_state, next_state)
45 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
46 ),
47 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
48);
49
50TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
51 TRACE_STRUCT(
52 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
53 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
54 TRACE_FIELD(unsigned char, prev_state, prev_state)
55 TRACE_FIELD(unsigned int, next_pid, next_pid)
56 TRACE_FIELD(unsigned char, next_prio, next_prio)
57 TRACE_FIELD(unsigned char, next_state, next_state)
58 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
59 ),
60 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
61);
62
63TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
64 TRACE_STRUCT(
65 TRACE_FIELD(unsigned long, arg1, arg1)
66 TRACE_FIELD(unsigned long, arg2, arg2)
67 TRACE_FIELD(unsigned long, arg3, arg3)
68 ),
69 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
70);
71
72/*
73 * Stack-trace entry:
74 */
75
76/* #define FTRACE_STACK_ENTRIES 8 */
77
78TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
79 TRACE_STRUCT(
80 TRACE_FIELD(unsigned long, caller[0], stack0)
81 TRACE_FIELD(unsigned long, caller[1], stack1)
82 TRACE_FIELD(unsigned long, caller[2], stack2)
83 TRACE_FIELD(unsigned long, caller[3], stack3)
84 TRACE_FIELD(unsigned long, caller[4], stack4)
85 TRACE_FIELD(unsigned long, caller[5], stack5)
86 TRACE_FIELD(unsigned long, caller[6], stack6)
87 TRACE_FIELD(unsigned long, caller[7], stack7)
88 ),
89 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
90 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
91);
92
93TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
94 TRACE_STRUCT(
95 TRACE_FIELD(unsigned long, caller[0], stack0)
96 TRACE_FIELD(unsigned long, caller[1], stack1)
97 TRACE_FIELD(unsigned long, caller[2], stack2)
98 TRACE_FIELD(unsigned long, caller[3], stack3)
99 TRACE_FIELD(unsigned long, caller[4], stack4)
100 TRACE_FIELD(unsigned long, caller[5], stack5)
101 TRACE_FIELD(unsigned long, caller[6], stack6)
102 TRACE_FIELD(unsigned long, caller[7], stack7)
103 ),
104 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
105 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
106);
107
108TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
109 TRACE_STRUCT(
110 TRACE_FIELD(unsigned long, ip, ip)
111 TRACE_FIELD(char *, fmt, fmt)
112 TRACE_FIELD_ZERO_CHAR(buf)
113 ),
114 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
115);
116
117TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
118 TRACE_STRUCT(
119 TRACE_FIELD(unsigned long, ip, ip)
120 TRACE_FIELD_ZERO_CHAR(buf)
121 ),
122 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
123);
124
125TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
126 TRACE_STRUCT(
127 TRACE_FIELD(unsigned int, line, line)
128 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
129 TRACE_FUNC_SIZE+1, func)
130 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
131 TRACE_FUNC_SIZE+1, file)
132 TRACE_FIELD(char, correct, correct)
133 ),
134 TP_RAW_FMT("%u:%s:%s (%u)")
135);
136
137TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
138 TRACE_STRUCT(
139 TRACE_FIELD(u64, from, from)
140 TRACE_FIELD(u64, to, to)
141 ),
142 TP_RAW_FMT("from: %llx to: %llx")
143);
144
145TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
146 TRACE_STRUCT(
147 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
148 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
149 TRACE_FIELD(int, state_data.type, type)
150 TRACE_FIELD(int, state_data.state, state)
151 ),
152 TP_RAW_FMT("%llx->%llx type:%u state:%u")
153);
154
155TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
156 TRACE_STRUCT(
157 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
158 TRACE_FIELD(unsigned long, call_site, call_site)
159 TRACE_FIELD(const void *, ptr, ptr)
160 TRACE_FIELD(size_t, bytes_req, bytes_req)
161 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
162 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
163 TRACE_FIELD(int, node, node)
164 ),
165 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
166 " flags:%x node:%d")
167);
168
169TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
170 TRACE_STRUCT(
171 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
172 TRACE_FIELD(unsigned long, call_site, call_site)
173 TRACE_FIELD(const void *, ptr, ptr)
174 ),
175 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
176);
177
178#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..56c260b83a9c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,16 +17,20 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
24#undef TRACE_SYSTEM
22#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
23 26
24DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
25 28
26LIST_HEAD(ftrace_events); 29LIST_HEAD(ftrace_events);
27 30
28int trace_define_field(struct ftrace_event_call *call, char *type, 31int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 32 const char *name, int offset, int size, int is_signed,
33 int filter_type)
30{ 34{
31 struct ftrace_event_field *field; 35 struct ftrace_event_field *field;
32 36
@@ -42,9 +46,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 46 if (!field->type)
43 goto err; 47 goto err;
44 48
49 if (filter_type == FILTER_OTHER)
50 field->filter_type = filter_assign_type(type);
51 else
52 field->filter_type = filter_type;
53
45 field->offset = offset; 54 field->offset = offset;
46 field->size = size; 55 field->size = size;
47 field->is_signed = is_signed; 56 field->is_signed = is_signed;
57
48 list_add(&field->link, &call->fields); 58 list_add(&field->link, &call->fields);
49 59
50 return 0; 60 return 0;
@@ -60,6 +70,29 @@ err:
60} 70}
61EXPORT_SYMBOL_GPL(trace_define_field); 71EXPORT_SYMBOL_GPL(trace_define_field);
62 72
73#define __common_field(type, item) \
74 ret = trace_define_field(call, #type, "common_" #item, \
75 offsetof(typeof(ent), item), \
76 sizeof(ent.item), \
77 is_signed_type(type), FILTER_OTHER); \
78 if (ret) \
79 return ret;
80
81int trace_define_common_fields(struct ftrace_event_call *call)
82{
83 int ret;
84 struct trace_entry ent;
85
86 __common_field(unsigned short, type);
87 __common_field(unsigned char, flags);
88 __common_field(unsigned char, preempt_count);
89 __common_field(int, pid);
90 __common_field(int, lock_depth);
91
92 return ret;
93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95
63#ifdef CONFIG_MODULES 96#ifdef CONFIG_MODULES
64 97
65static void trace_destroy_fields(struct ftrace_event_call *call) 98static void trace_destroy_fields(struct ftrace_event_call *call)
@@ -84,14 +117,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 117 if (call->enabled) {
85 call->enabled = 0; 118 call->enabled = 0;
86 tracing_stop_cmdline_record(); 119 tracing_stop_cmdline_record();
87 call->unregfunc(); 120 call->unregfunc(call->data);
88 } 121 }
89 break; 122 break;
90 case 1: 123 case 1:
91 if (!call->enabled) { 124 if (!call->enabled) {
92 call->enabled = 1; 125 call->enabled = 1;
93 tracing_start_cmdline_record(); 126 tracing_start_cmdline_record();
94 call->regfunc(); 127 call->regfunc(call->data);
95 } 128 }
96 break; 129 break;
97 } 130 }
@@ -198,11 +231,9 @@ static ssize_t
198ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
199 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
200{ 233{
234 struct trace_parser parser;
201 size_t read = 0; 235 size_t read = 0;
202 int i, set = 1;
203 ssize_t ret; 236 ssize_t ret;
204 char *buf;
205 char ch;
206 237
207 if (!cnt || cnt < 0) 238 if (!cnt || cnt < 0)
208 return 0; 239 return 0;
@@ -211,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
211 if (ret < 0) 242 if (ret < 0)
212 return ret; 243 return ret;
213 244
214 ret = get_user(ch, ubuf++); 245 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
215 if (ret)
216 return ret;
217 read++;
218 cnt--;
219
220 /* skip white space */
221 while (cnt && isspace(ch)) {
222 ret = get_user(ch, ubuf++);
223 if (ret)
224 return ret;
225 read++;
226 cnt--;
227 }
228
229 /* Only white space found? */
230 if (isspace(ch)) {
231 file->f_pos += read;
232 ret = read;
233 return ret;
234 }
235
236 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
237 if (!buf)
238 return -ENOMEM; 246 return -ENOMEM;
239 247
240 if (cnt > EVENT_BUF_SIZE) 248 read = trace_get_user(&parser, ubuf, cnt, ppos);
241 cnt = EVENT_BUF_SIZE; 249
250 if (trace_parser_loaded((&parser))) {
251 int set = 1;
242 252
243 i = 0; 253 if (*parser.buffer == '!')
244 while (cnt && !isspace(ch)) {
245 if (!i && ch == '!')
246 set = 0; 254 set = 0;
247 else
248 buf[i++] = ch;
249 255
250 ret = get_user(ch, ubuf++); 256 parser.buffer[parser.idx] = 0;
257
258 ret = ftrace_set_clr_event(parser.buffer + !set, set);
251 if (ret) 259 if (ret)
252 goto out_free; 260 goto out_put;
253 read++;
254 cnt--;
255 } 261 }
256 buf[i] = 0;
257
258 file->f_pos += read;
259
260 ret = ftrace_set_clr_event(buf, set);
261 if (ret)
262 goto out_free;
263 262
264 ret = read; 263 ret = read;
265 264
266 out_free: 265 out_put:
267 kfree(buf); 266 trace_parser_put(&parser);
268 267
269 return ret; 268 return ret;
270} 269}
@@ -546,7 +545,7 @@ static int trace_write_header(struct trace_seq *s)
546 FIELD(unsigned char, flags), 545 FIELD(unsigned char, flags),
547 FIELD(unsigned char, preempt_count), 546 FIELD(unsigned char, preempt_count),
548 FIELD(int, pid), 547 FIELD(int, pid),
549 FIELD(int, tgid)); 548 FIELD(int, lock_depth));
550} 549}
551 550
552static ssize_t 551static ssize_t
@@ -574,7 +573,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
574 trace_seq_printf(s, "format:\n"); 573 trace_seq_printf(s, "format:\n");
575 trace_write_header(s); 574 trace_write_header(s);
576 575
577 r = call->show_format(s); 576 r = call->show_format(call, s);
578 if (!r) { 577 if (!r) {
579 /* 578 /*
580 * ug! The format output is bigger than a PAGE!! 579 * ug! The format output is bigger than a PAGE!!
@@ -849,8 +848,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
849 848
850 /* First see if we did not already create this dir */ 849 /* First see if we did not already create this dir */
851 list_for_each_entry(system, &event_subsystems, list) { 850 list_for_each_entry(system, &event_subsystems, list) {
852 if (strcmp(system->name, name) == 0) 851 if (strcmp(system->name, name) == 0) {
852 system->nr_events++;
853 return system->entry; 853 return system->entry;
854 }
854 } 855 }
855 856
856 /* need to create new entry */ 857 /* need to create new entry */
@@ -869,6 +870,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
869 return d_events; 870 return d_events;
870 } 871 }
871 872
873 system->nr_events = 1;
872 system->name = kstrdup(name, GFP_KERNEL); 874 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) { 875 if (!system->name) {
874 debugfs_remove(system->entry); 876 debugfs_remove(system->entry);
@@ -920,15 +922,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 922 if (strcmp(call->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 923 d_events = event_subsystem_dir(call->system, d_events);
922 924
923 if (call->raw_init) {
924 ret = call->raw_init();
925 if (ret < 0) {
926 pr_warning("Could not initialize trace point"
927 " events/%s\n", call->name);
928 return ret;
929 }
930 }
931
932 call->dir = debugfs_create_dir(call->name, d_events); 925 call->dir = debugfs_create_dir(call->name, d_events);
933 if (!call->dir) { 926 if (!call->dir) {
934 pr_warning("Could not create debugfs " 927 pr_warning("Could not create debugfs "
@@ -945,7 +938,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
945 id); 938 id);
946 939
947 if (call->define_fields) { 940 if (call->define_fields) {
948 ret = call->define_fields(); 941 ret = call->define_fields(call);
949 if (ret < 0) { 942 if (ret < 0) {
950 pr_warning("Could not initialize trace point" 943 pr_warning("Could not initialize trace point"
951 " events/%s\n", call->name); 944 " events/%s\n", call->name);
@@ -987,6 +980,32 @@ struct ftrace_module_file_ops {
987 struct file_operations filter; 980 struct file_operations filter;
988}; 981};
989 982
983static void remove_subsystem_dir(const char *name)
984{
985 struct event_subsystem *system;
986
987 if (strcmp(name, TRACE_SYSTEM) == 0)
988 return;
989
990 list_for_each_entry(system, &event_subsystems, list) {
991 if (strcmp(system->name, name) == 0) {
992 if (!--system->nr_events) {
993 struct event_filter *filter = system->filter;
994
995 debugfs_remove_recursive(system->entry);
996 list_del(&system->list);
997 if (filter) {
998 kfree(filter->filter_string);
999 kfree(filter);
1000 }
1001 kfree(system->name);
1002 kfree(system);
1003 }
1004 break;
1005 }
1006 }
1007}
1008
990static struct ftrace_module_file_ops * 1009static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 1010trace_create_file_ops(struct module *mod)
992{ 1011{
@@ -1027,6 +1046,7 @@ static void trace_module_add_events(struct module *mod)
1027 struct ftrace_module_file_ops *file_ops = NULL; 1046 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end; 1047 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events; 1048 struct dentry *d_events;
1049 int ret;
1030 1050
1031 start = mod->trace_events; 1051 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events; 1052 end = mod->trace_events + mod->num_trace_events;
@@ -1042,7 +1062,15 @@ static void trace_module_add_events(struct module *mod)
1042 /* The linker may leave blanks */ 1062 /* The linker may leave blanks */
1043 if (!call->name) 1063 if (!call->name)
1044 continue; 1064 continue;
1045 1065 if (call->raw_init) {
1066 ret = call->raw_init();
1067 if (ret < 0) {
1068 if (ret != -ENOSYS)
1069 pr_warning("Could not initialize trace "
1070 "point events/%s\n", call->name);
1071 continue;
1072 }
1073 }
1046 /* 1074 /*
1047 * This module has events, create file ops for this module 1075 * This module has events, create file ops for this module
1048 * if not already done. 1076 * if not already done.
@@ -1077,6 +1105,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_del(&call->list); 1105 list_del(&call->list);
1078 trace_destroy_fields(call); 1106 trace_destroy_fields(call);
1079 destroy_preds(call); 1107 destroy_preds(call);
1108 remove_subsystem_dir(call->system);
1080 } 1109 }
1081 } 1110 }
1082 1111
@@ -1125,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self,
1125} 1154}
1126#endif /* CONFIG_MODULES */ 1155#endif /* CONFIG_MODULES */
1127 1156
1128struct notifier_block trace_module_nb = { 1157static struct notifier_block trace_module_nb = {
1129 .notifier_call = trace_module_notify, 1158 .notifier_call = trace_module_notify,
1130 .priority = 0, 1159 .priority = 0,
1131}; 1160};
@@ -1133,6 +1162,18 @@ struct notifier_block trace_module_nb = {
1133extern struct ftrace_event_call __start_ftrace_events[]; 1162extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[]; 1163extern struct ftrace_event_call __stop_ftrace_events[];
1135 1164
1165static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1166
1167static __init int setup_trace_event(char *str)
1168{
1169 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1170 ring_buffer_expanded = 1;
1171 tracing_selftest_disabled = 1;
1172
1173 return 1;
1174}
1175__setup("trace_event=", setup_trace_event);
1176
1136static __init int event_trace_init(void) 1177static __init int event_trace_init(void)
1137{ 1178{
1138 struct ftrace_event_call *call; 1179 struct ftrace_event_call *call;
@@ -1140,6 +1181,8 @@ static __init int event_trace_init(void)
1140 struct dentry *entry; 1181 struct dentry *entry;
1141 struct dentry *d_events; 1182 struct dentry *d_events;
1142 int ret; 1183 int ret;
1184 char *buf = bootup_event_buf;
1185 char *token;
1143 1186
1144 d_tracer = tracing_init_dentry(); 1187 d_tracer = tracing_init_dentry();
1145 if (!d_tracer) 1188 if (!d_tracer)
@@ -1179,12 +1222,34 @@ static __init int event_trace_init(void)
1179 /* The linker may leave blanks */ 1222 /* The linker may leave blanks */
1180 if (!call->name) 1223 if (!call->name)
1181 continue; 1224 continue;
1225 if (call->raw_init) {
1226 ret = call->raw_init();
1227 if (ret < 0) {
1228 if (ret != -ENOSYS)
1229 pr_warning("Could not initialize trace "
1230 "point events/%s\n", call->name);
1231 continue;
1232 }
1233 }
1182 list_add(&call->list, &ftrace_events); 1234 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops, 1235 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops, 1236 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops); 1237 &ftrace_event_format_fops);
1186 } 1238 }
1187 1239
1240 while (true) {
1241 token = strsep(&buf, ",");
1242
1243 if (!token)
1244 break;
1245 if (!*token)
1246 continue;
1247
1248 ret = ftrace_set_clr_event(token, 1);
1249 if (ret)
1250 pr_warning("Failed to enable trace event: %s\n", token);
1251 }
1252
1188 ret = register_module_notifier(&trace_module_nb); 1253 ret = register_module_notifier(&trace_module_nb);
1189 if (ret) 1254 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n"); 1255 pr_warning("Failed to register trace events module notifier\n");
@@ -1261,6 +1326,18 @@ static __init void event_trace_self_tests(void)
1261 if (!call->regfunc) 1326 if (!call->regfunc)
1262 continue; 1327 continue;
1263 1328
1329/*
1330 * Testing syscall events here is pretty useless, but
1331 * we still do it if configured. But this is time consuming.
1332 * What we really need is a user thread to perform the
1333 * syscalls as we test.
1334 */
1335#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1336 if (call->system &&
1337 strcmp(call->system, "syscalls") == 0)
1338 continue;
1339#endif
1340
1264 pr_info("Testing event %s: ", call->name); 1341 pr_info("Testing event %s: ", call->name);
1265 1342
1266 /* 1343 /*
@@ -1334,12 +1411,13 @@ static __init void event_trace_self_tests(void)
1334 1411
1335#ifdef CONFIG_FUNCTION_TRACER 1412#ifdef CONFIG_FUNCTION_TRACER
1336 1413
1337static DEFINE_PER_CPU(atomic_t, test_event_disable); 1414static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1338 1415
1339static void 1416static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1417function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{ 1418{
1342 struct ring_buffer_event *event; 1419 struct ring_buffer_event *event;
1420 struct ring_buffer *buffer;
1343 struct ftrace_entry *entry; 1421 struct ftrace_entry *entry;
1344 unsigned long flags; 1422 unsigned long flags;
1345 long disabled; 1423 long disabled;
@@ -1350,14 +1428,15 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1350 pc = preempt_count(); 1428 pc = preempt_count();
1351 resched = ftrace_preempt_disable(); 1429 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id(); 1430 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1431 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1354 1432
1355 if (disabled != 1) 1433 if (disabled != 1)
1356 goto out; 1434 goto out;
1357 1435
1358 local_save_flags(flags); 1436 local_save_flags(flags);
1359 1437
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1438 event = trace_current_buffer_lock_reserve(&buffer,
1439 TRACE_FN, sizeof(*entry),
1361 flags, pc); 1440 flags, pc);
1362 if (!event) 1441 if (!event)
1363 goto out; 1442 goto out;
@@ -1365,10 +1444,10 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1365 entry->ip = ip; 1444 entry->ip = ip;
1366 entry->parent_ip = parent_ip; 1445 entry->parent_ip = parent_ip;
1367 1446
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1447 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1369 1448
1370 out: 1449 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1450 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1372 ftrace_preempt_enable(resched); 1451 ftrace_preempt_enable(resched);
1373} 1452}
1374 1453
@@ -1392,10 +1471,10 @@ static __init void event_trace_self_test_with_function(void)
1392 1471
1393static __init int event_trace_self_tests_init(void) 1472static __init int event_trace_self_tests_init(void)
1394{ 1473{
1395 1474 if (!tracing_selftest_disabled) {
1396 event_trace_self_tests(); 1475 event_trace_self_tests();
1397 1476 event_trace_self_test_with_function();
1398 event_trace_self_test_with_function(); 1477 }
1399 1478
1400 return 0; 1479 return 0;
1401} 1480}
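
The new __setup("trace_event=", ...) handler above lets events be enabled from the kernel command line before user space is up: the buffered string is split on ',' in event_trace_init() and each token is fed to ftrace_set_clr_event(). A hypothetical boot-line example (the event names are illustrative, not taken from this patch):

	trace_event=sched:sched_switch,irq:irq_handler_entry
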
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f32dc9d1ea7b..23245785927f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
121 } operand; 121 } operand;
122}; 122};
123 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
124DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
@@ -163,6 +204,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
163 return match; 204 return match;
164} 205}
165 206
207/* Filter predicate for char * pointers */
208static int filter_pred_pchar(struct filter_pred *pred, void *event,
209 int val1, int val2)
210{
211 char **addr = (char **)(event + pred->offset);
212 int cmp, match;
213
214 cmp = strncmp(*addr, pred->str_val, pred->str_len);
215
216 match = (!cmp) ^ pred->not;
217
218 return match;
219}
220
166/* 221/*
167 * Filter predicate for dynamic sized arrays of characters. 222 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end 223 * These are implemented through a list of strings at the end
@@ -176,11 +231,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
176static int filter_pred_strloc(struct filter_pred *pred, void *event, 231static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2) 232 int val1, int val2)
178{ 233{
179 unsigned short str_loc = *(unsigned short *)(event + pred->offset); 234 u32 str_item = *(u32 *)(event + pred->offset);
235 int str_loc = str_item & 0xffff;
236 int str_len = str_item >> 16;
180 char *addr = (char *)(event + str_loc); 237 char *addr = (char *)(event + str_loc);
181 int cmp, match; 238 int cmp, match;
182 239
183 cmp = strncmp(addr, pred->str_val, pred->str_len); 240 cmp = strncmp(addr, pred->str_val, str_len);
184 241
185 match = (!cmp) ^ pred->not; 242 match = (!cmp) ^ pred->not;
186 243
@@ -293,7 +350,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
293 struct event_filter *filter = call->filter; 350 struct event_filter *filter = call->filter;
294 351
295 mutex_lock(&event_mutex); 352 mutex_lock(&event_mutex);
296 if (filter->filter_string) 353 if (filter && filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string); 354 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else 355 else
299 trace_seq_printf(s, "none\n"); 356 trace_seq_printf(s, "none\n");
@@ -306,7 +363,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
306 struct event_filter *filter = system->filter; 363 struct event_filter *filter = system->filter;
307 364
308 mutex_lock(&event_mutex); 365 mutex_lock(&event_mutex);
309 if (filter->filter_string) 366 if (filter && filter->filter_string)
310 trace_seq_printf(s, "%s\n", filter->filter_string); 367 trace_seq_printf(s, "%s\n", filter->filter_string);
311 else 368 else
312 trace_seq_printf(s, "none\n"); 369 trace_seq_printf(s, "none\n");
@@ -374,6 +431,9 @@ void destroy_preds(struct ftrace_event_call *call)
374 struct event_filter *filter = call->filter; 431 struct event_filter *filter = call->filter;
375 int i; 432 int i;
376 433
434 if (!filter)
435 return;
436
377 for (i = 0; i < MAX_FILTER_PRED; i++) { 437 for (i = 0; i < MAX_FILTER_PRED; i++) {
378 if (filter->preds[i]) 438 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]); 439 filter_free_pred(filter->preds[i]);
@@ -384,17 +444,19 @@ void destroy_preds(struct ftrace_event_call *call)
384 call->filter = NULL; 444 call->filter = NULL;
385} 445}
386 446
387int init_preds(struct ftrace_event_call *call) 447static int init_preds(struct ftrace_event_call *call)
388{ 448{
389 struct event_filter *filter; 449 struct event_filter *filter;
390 struct filter_pred *pred; 450 struct filter_pred *pred;
391 int i; 451 int i;
392 452
453 if (call->filter)
454 return 0;
455
393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
394 if (!call->filter) 457 if (!call->filter)
395 return -ENOMEM; 458 return -ENOMEM;
396 459
397 call->filter_active = 0;
398 filter->n_preds = 0; 460 filter->n_preds = 0;
399 461
400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 462 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -416,30 +478,55 @@ oom:
416 478
417 return -ENOMEM; 479 return -ENOMEM;
418} 480}
419EXPORT_SYMBOL_GPL(init_preds);
420 481
421static void filter_free_subsystem_preds(struct event_subsystem *system) 482static int init_subsystem_preds(struct event_subsystem *system)
422{ 483{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call; 484 struct ftrace_event_call *call;
425 int i; 485 int err;
426 486
427 if (filter->n_preds) { 487 list_for_each_entry(call, &ftrace_events, list) {
428 for (i = 0; i < filter->n_preds; i++) 488 if (!call->define_fields)
429 filter_free_pred(filter->preds[i]); 489 continue;
430 kfree(filter->preds); 490
431 filter->preds = NULL; 491 if (strcmp(call->system, system->name) != 0)
432 filter->n_preds = 0; 492 continue;
493
494 err = init_preds(call);
495 if (err)
496 return err;
433 } 497 }
434 498
499 return 0;
500}
501
502enum {
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{
511 struct ftrace_event_call *call;
512
435 list_for_each_entry(call, &ftrace_events, list) { 513 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields) 514 if (!call->define_fields)
437 continue; 515 continue;
438 516
439 if (!strcmp(call->system, system->name)) { 517 if (strcmp(call->system, system->name) != 0)
440 filter_disable_preds(call); 518 continue;
441 remove_filter_string(call->filter); 519
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
442 } 523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call);
529 remove_filter_string(call->filter);
443 } 530 }
444} 531}
445 532
@@ -468,12 +555,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
468 return 0; 555 return 0;
469} 556}
470 557
471enum { 558int filter_assign_type(const char *type)
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
476static int is_string_field(const char *type)
477{ 559{
478 if (strstr(type, "__data_loc") && strstr(type, "char")) 560 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING; 561 return FILTER_DYN_STRING;
@@ -481,12 +563,19 @@ static int is_string_field(const char *type)
481 if (strchr(type, '[') && strstr(type, "char")) 563 if (strchr(type, '[') && strstr(type, "char"))
482 return FILTER_STATIC_STRING; 564 return FILTER_STATIC_STRING;
483 565
484 return 0; 566 return FILTER_OTHER;
567}
568
569static bool is_string_field(struct ftrace_event_field *field)
570{
571 return field->filter_type == FILTER_DYN_STRING ||
572 field->filter_type == FILTER_STATIC_STRING ||
573 field->filter_type == FILTER_PTR_STRING;
485} 574}
486 575
487static int is_legal_op(struct ftrace_event_field *field, int op) 576static int is_legal_op(struct ftrace_event_field *field, int op)
488{ 577{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
490 return 0; 579 return 0;
491 580
492 return 1; 581 return 1;
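
With this change the string-ness of a field is decided once, when the field is defined, by filter_assign_type(), and recorded in field->filter_type; is_string_field() then only compares against the FILTER_* constants. A userspace mirror of the classification, assuming the usual enum ordering (FILTER_OTHER = 0, then static, dynamic and pointer strings):

    #include <stdio.h>
    #include <string.h>

    enum { FILTER_OTHER, FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING };

    /* Userspace mirror of filter_assign_type(): classify a field by its type string. */
    static int assign_type(const char *type)
    {
            if (strstr(type, "__data_loc") && strstr(type, "char"))
                    return FILTER_DYN_STRING;
            if (strchr(type, '[') && strstr(type, "char"))
                    return FILTER_STATIC_STRING;
            return FILTER_OTHER;
    }

    int main(void)
    {
            printf("%d\n", assign_type("__data_loc char[] comm")); /* FILTER_DYN_STRING */
            printf("%d\n", assign_type("char[16]"));               /* FILTER_STATIC_STRING */
            printf("%d\n", assign_type("unsigned long"));          /* FILTER_OTHER */
            return 0;
    }
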
@@ -537,22 +626,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
537 626
538static int filter_add_pred(struct filter_parse_state *ps, 627static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call, 628 struct ftrace_event_call *call,
540 struct filter_pred *pred) 629 struct filter_pred *pred,
630 bool dry_run)
541{ 631{
542 struct ftrace_event_field *field; 632 struct ftrace_event_field *field;
543 filter_pred_fn_t fn; 633 filter_pred_fn_t fn;
544 unsigned long long val; 634 unsigned long long val;
545 int string_type;
546 int ret; 635 int ret;
547 636
548 pred->fn = filter_pred_none; 637 pred->fn = filter_pred_none;
549 638
550 if (pred->op == OP_AND) { 639 if (pred->op == OP_AND) {
551 pred->pop_n = 2; 640 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 641 fn = filter_pred_and;
642 goto add_pred_fn;
553 } else if (pred->op == OP_OR) { 643 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2; 644 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 645 fn = filter_pred_or;
646 goto add_pred_fn;
556 } 647 }
557 648
558 field = find_event_field(call, pred->field_name); 649 field = find_event_field(call, pred->field_name);
@@ -568,16 +659,17 @@ static int filter_add_pred(struct filter_parse_state *ps,
568 return -EINVAL; 659 return -EINVAL;
569 } 660 }
570 661
571 string_type = is_string_field(field->type); 662 if (is_string_field(field)) {
572 if (string_type) { 663 pred->str_len = field->size;
573 if (string_type == FILTER_STATIC_STRING) 664
665 if (field->filter_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string; 666 fn = filter_pred_string;
575 else 667 else if (field->filter_type == FILTER_DYN_STRING)
576 fn = filter_pred_strloc; 668 fn = filter_pred_strloc;
577 pred->str_len = field->size; 669 else {
578 if (pred->op == OP_NE) 670 fn = filter_pred_pchar;
579 pred->not = 1; 671 pred->str_len = strlen(pred->str_val);
580 return filter_add_pred_fn(ps, call, pred, fn); 672 }
581 } else { 673 } else {
582 if (field->is_signed) 674 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val); 675 ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,41 +680,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
588 return -EINVAL; 680 return -EINVAL;
589 } 681 }
590 pred->val = val; 682 pred->val = val;
591 }
592 683
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 684 fn = select_comparison_fn(pred->op, field->size,
594 if (!fn) { 685 field->is_signed);
595 parse_error(ps, FILT_ERR_INVALID_OP, 0); 686 if (!fn) {
596 return -EINVAL; 687 parse_error(ps, FILT_ERR_INVALID_OP, 0);
688 return -EINVAL;
689 }
597 } 690 }
598 691
599 if (pred->op == OP_NE) 692 if (pred->op == OP_NE)
600 pred->not = 1; 693 pred->not = 1;
601 694
602 return filter_add_pred_fn(ps, call, pred, fn); 695add_pred_fn:
696 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn);
698 return 0;
603} 699}
604 700
605static int filter_add_subsystem_pred(struct filter_parse_state *ps, 701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system, 702 struct event_subsystem *system,
607 struct filter_pred *pred, 703 struct filter_pred *pred,
608 char *filter_string) 704 char *filter_string,
705 bool dry_run)
609{ 706{
610 struct event_filter *filter = system->filter;
611 struct ftrace_event_call *call; 707 struct ftrace_event_call *call;
612 int err = 0; 708 int err = 0;
613 709 bool fail = true;
614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
617
618 if (!filter->preds)
619 return -ENOMEM;
620 }
621
622 if (filter->n_preds == MAX_FILTER_PRED) {
623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
624 return -ENOSPC;
625 }
626 710
627 list_for_each_entry(call, &ftrace_events, list) { 711 list_for_each_entry(call, &ftrace_events, list) {
628 712
@@ -632,19 +716,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
632 if (strcmp(call->system, system->name)) 716 if (strcmp(call->system, system->name))
633 continue; 717 continue;
634 718
635 err = filter_add_pred(ps, call, pred); 719 if (call->filter->no_reset)
636 if (err) { 720 continue;
637 filter_free_subsystem_preds(system); 721
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 722 err = filter_add_pred(ps, call, pred, dry_run);
639 goto out; 723 if (err)
640 } 724 call->filter->no_reset = true;
641 replace_filter_string(call->filter, filter_string); 725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
642 } 730 }
643 731
644 filter->preds[filter->n_preds] = pred; 732 if (fail) {
645 filter->n_preds++; 733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
646out: 734 return err;
647 return err; 735 }
736 return 0;
648} 737}
649 738
650static void parse_init(struct filter_parse_state *ps, 739static void parse_init(struct filter_parse_state *ps,
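
The reworked filter_add_subsystem_pred() above tries the predicate against every event in the subsystem instead of keeping one shared predicate array: events whose fields reject it are marked no_reset and skipped from then on, and the call fails only if no event accepted the predicate at all. A rough userspace sketch of that accept-any bookkeeping; all names here are hypothetical:

    #include <stdbool.h>
    #include <stdio.h>

    struct ev { const char *name; bool rejects_pred; bool no_reset; };

    /* Succeed if at least one event takes the predicate; remember rejecters. */
    static int add_pred_to_subsystem(struct ev *evs, int n)
    {
            bool fail = true;

            for (int i = 0; i < n; i++) {
                    if (evs[i].no_reset)            /* rejected an earlier predicate */
                            continue;
                    if (evs[i].rejects_pred)
                            evs[i].no_reset = true; /* drop it from later passes */
                    else
                            fail = false;           /* this event accepted it */
            }
            return fail ? -1 : 0;
    }

    int main(void)
    {
            struct ev evs[] = {
                    { .name = "sched_switch", .rejects_pred = false },
                    { .name = "sched_wakeup", .rejects_pred = true  },
            };
            printf("%d\n", add_pred_to_subsystem(evs, 2)); /* 0: one event accepted */
            return 0;
    }
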
@@ -1003,12 +1092,14 @@ static int check_preds(struct filter_parse_state *ps)
1003static int replace_preds(struct event_subsystem *system, 1092static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call, 1093 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps, 1094 struct filter_parse_state *ps,
1006 char *filter_string) 1095 char *filter_string,
1096 bool dry_run)
1007{ 1097{
1008 char *operand1 = NULL, *operand2 = NULL; 1098 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred; 1099 struct filter_pred *pred;
1010 struct postfix_elt *elt; 1100 struct postfix_elt *elt;
1011 int err; 1101 int err;
1102 int n_preds = 0;
1012 1103
1013 err = check_preds(ps); 1104 err = check_preds(ps);
1014 if (err) 1105 if (err)
@@ -1027,24 +1118,14 @@ static int replace_preds(struct event_subsystem *system,
1027 continue; 1118 continue;
1028 } 1119 }
1029 1120
1121 if (n_preds++ == MAX_FILTER_PRED) {
1122 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1123 return -ENOSPC;
1124 }
1125
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1126 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1127 pred = create_logical_pred(elt->op);
1032 if (!pred) 1128 goto add_pred;
1033 return -ENOMEM;
1034 if (call) {
1035 err = filter_add_pred(ps, call, pred);
1036 filter_free_pred(pred);
1037 } else {
1038 err = filter_add_subsystem_pred(ps, system,
1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
1047 continue;
1048 } 1129 }
1049 1130
1050 if (!operand1 || !operand2) { 1131 if (!operand1 || !operand2) {
@@ -1053,17 +1134,15 @@ static int replace_preds(struct event_subsystem *system,
1053 } 1134 }
1054 1135
1055 pred = create_pred(elt->op, operand1, operand2); 1136 pred = create_pred(elt->op, operand1, operand2);
1137add_pred:
1056 if (!pred) 1138 if (!pred)
1057 return -ENOMEM; 1139 return -ENOMEM;
1058 if (call) { 1140 if (call)
1059 err = filter_add_pred(ps, call, pred); 1141 err = filter_add_pred(ps, call, pred, false);
1060 filter_free_pred(pred); 1142 else
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred, 1143 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string); 1144 filter_string, dry_run);
1064 if (err) 1145 filter_free_pred(pred);
1065 filter_free_pred(pred);
1066 }
1067 if (err) 1146 if (err)
1068 return err; 1147 return err;
1069 1148
@@ -1081,6 +1160,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1081 1160
1082 mutex_lock(&event_mutex); 1161 mutex_lock(&event_mutex);
1083 1162
1163 err = init_preds(call);
1164 if (err)
1165 goto out_unlock;
1166
1084 if (!strcmp(strstrip(filter_string), "0")) { 1167 if (!strcmp(strstrip(filter_string), "0")) {
1085 filter_disable_preds(call); 1168 filter_disable_preds(call);
1086 remove_filter_string(call->filter); 1169 remove_filter_string(call->filter);
@@ -1103,7 +1186,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1103 goto out; 1186 goto out;
1104 } 1187 }
1105 1188
1106 err = replace_preds(NULL, call, ps, filter_string); 1189 err = replace_preds(NULL, call, ps, filter_string, false);
1107 if (err) 1190 if (err)
1108 append_filter_err(ps, call->filter); 1191 append_filter_err(ps, call->filter);
1109 1192
@@ -1126,8 +1209,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1126 1209
1127 mutex_lock(&event_mutex); 1210 mutex_lock(&event_mutex);
1128 1211
1212 err = init_subsystem_preds(system);
1213 if (err)
1214 goto out_unlock;
1215
1129 if (!strcmp(strstrip(filter_string), "0")) { 1216 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system); 1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1131 remove_filter_string(system->filter); 1218 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex); 1219 mutex_unlock(&event_mutex);
1133 return 0; 1220 return 0;
@@ -1138,7 +1225,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1138 if (!ps) 1225 if (!ps)
1139 goto out_unlock; 1226 goto out_unlock;
1140 1227
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string); 1228 replace_filter_string(system->filter, filter_string);
1143 1229
1144 parse_init(ps, filter_ops, filter_string); 1230 parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1234,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1148 goto out; 1234 goto out;
1149 } 1235 }
1150 1236
1151 err = replace_preds(system, NULL, ps, filter_string); 1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1152 if (err) 1238
                                                    1239 /* try to see which events the filter can be applied to */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1153 append_filter_err(ps, system->filter); 1242 append_filter_err(ps, system->filter);
1243 goto out;
1244 }
1245
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1247
1248 /* really apply the filter to the events */
1249 err = replace_preds(system, NULL, ps, filter_string, false);
1250 if (err) {
1251 append_filter_err(ps, system->filter);
1252 filter_free_subsystem_preds(system, 2);
1253 }
1154 1254
1155out: 1255out:
1156 filter_opstack_clear(ps); 1256 filter_opstack_clear(ps);
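
Taken together, apply_subsystem_event_filter() now works in two passes over the subsystem's events: clear everything and the no_reset flags (FILTER_INIT_NO_RESET), do a dry run of replace_preds() to learn which events can take the filter, reset only those events (FILTER_SKIP_NO_RESET), then run replace_preds() again for real. The dry run keeps a half-applied filter from clobbering events whose field layout never matched the expression. Condensed from the hunk above, the control flow is roughly:

            filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);

            err = replace_preds(system, NULL, ps, filter_string, true);   /* dry run */
            if (err)
                    goto out;

            filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);

            err = replace_preds(system, NULL, ps, filter_string, false);  /* real run */
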
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..9753fcc61bc5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,116 +15,209 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
                                                    55 /* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
83
84#undef __array
85#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \
88 offsetof(typeof(field), item), \
89 sizeof(field.item)); \
90 if (!ret) \
91 return 0;
35 92
36#undef TRACE_FIELD_SPECIAL 93#undef __array_desc
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 94#define __array_desc(type, container, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 97 offsetof(typeof(field), container.item), \
41 (unsigned int)sizeof(field.item)); \ 98 sizeof(field.container.item)); \
42 if (!ret) \ 99 if (!ret) \
43 return 0; 100 return 0;
44 101
45#undef TRACE_FIELD_ZERO_CHAR 102#undef __dynamic_array
46#define TRACE_FIELD_ZERO_CHAR(item) \ 103#define __dynamic_array(type, item) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
48 "offset:%u;\tsize:0;\n", \ 105 "offset:%zu;\tsize:0;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 106 offsetof(typeof(field), item)); \
50 if (!ret) \ 107 if (!ret) \
51 return 0; 108 return 0;
52 109
53#undef TRACE_FIELD_SIGN 110#undef F_printk
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
55 TRACE_FIELD(type, item, assign)
56 112
57#undef TP_RAW_FMT 113#undef __entry
58#define TP_RAW_FMT(args...) args 114#define __entry REC
59 115
60#undef TRACE_EVENT_FORMAT 116#undef FTRACE_ENTRY
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
62static int \ 118static int \
63ftrace_format_##call(struct trace_seq *s) \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
120 struct trace_seq *s) \
64{ \ 121{ \
65 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
66 int ret; \ 123 int ret = 0; \
67 \ 124 \
68 tstruct; \ 125 tstruct; \
69 \ 126 \
70 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
71 \ 128 \
72 return ret; \ 129 return ret; \
73} 130}
74 131
75#undef TRACE_EVENT_FORMAT_NOFILTER 132#include "trace_entries.h"
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 133
77 tpfmt) \ 134
78static int \ 135#undef __field
79ftrace_format_##call(struct trace_seq *s) \ 136#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \
138 offsetof(typeof(field), item), \
139 sizeof(field.item), \
140 is_signed_type(type), FILTER_OTHER); \
141 if (ret) \
142 return ret;
143
144#undef __field_desc
145#define __field_desc(type, container, item) \
146 ret = trace_define_field(event_call, #type, #item, \
147 offsetof(typeof(field), \
148 container.item), \
149 sizeof(field.container.item), \
150 is_signed_type(type), FILTER_OTHER); \
151 if (ret) \
152 return ret;
153
154#undef __array
155#define __array(type, item, len) \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), 0, FILTER_OTHER); \
160 if (ret) \
161 return ret;
162
163#undef __array_desc
164#define __array_desc(type, container, item, len) \
165 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
167 offsetof(typeof(field), \
168 container.item), \
169 sizeof(field.container.item), 0, \
170 FILTER_OTHER); \
171 if (ret) \
172 return ret;
173
174#undef __dynamic_array
175#define __dynamic_array(type, item)
176
177#undef FTRACE_ENTRY
178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
179int \
180ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
80{ \ 181{ \
81 struct args field; \ 182 struct struct_name field; \
82 int ret; \ 183 int ret; \
83 \ 184 \
84 tstruct; \ 185 ret = trace_define_common_fields(event_call); \
186 if (ret) \
187 return ret; \
85 \ 188 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 189 tstruct; \
87 \ 190 \
88 return ret; \ 191 return ret; \
89} 192}
90 193
91#include "trace_event_types.h" 194#include "trace_entries.h"
92
93#undef TRACE_ZERO_CHAR
94#define TRACE_ZERO_CHAR(arg)
95 195
96#undef TRACE_FIELD
97#define TRACE_FIELD(type, item, assign)\
98 entry->item = assign;
99 196
100#undef TRACE_FIELD 197#undef __field
101#define TRACE_FIELD(type, item, assign)\ 198#define __field(type, item)
102 entry->item = assign;
103 199
104#undef TRACE_FIELD_SIGN 200#undef __field_desc
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 201#define __field_desc(type, container, item)
106 TRACE_FIELD(type, item, assign)
107 202
108#undef TP_CMD 203#undef __array
109#define TP_CMD(cmd...) cmd 204#define __array(type, item, len)
110 205
111#undef TRACE_ENTRY 206#undef __array_desc
112#define TRACE_ENTRY entry 207#define __array_desc(type, container, item, len)
113 208
114#undef TRACE_FIELD_SPECIAL 209#undef __dynamic_array
115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 210#define __dynamic_array(type, item)
116 cmd;
117 211
118#undef TRACE_EVENT_FORMAT 212#undef FTRACE_ENTRY
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \ 214static int ftrace_raw_init_event_##call(void); \
122 \ 215 \
123struct ftrace_event_call __used \ 216struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \ 217__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \ 218__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \ 219 .name = #call, \
127 .id = proto, \ 220 .id = type, \
128 .system = __stringify(TRACE_SYSTEM), \ 221 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \ 222 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \ 223 .show_format = ftrace_format_##call, \
@@ -133,74 +226,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 226static int ftrace_raw_init_event_##call(void) \
134{ \ 227{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 228 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 229 return 0; \
138} \ 230} \
139 231
140#undef TRACE_EVENT_FORMAT_NOFILTER 232#include "trace_entries.h"
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
143 \
144struct ftrace_event_call __used \
145__attribute__((__aligned__(4))) \
146__attribute__((section("_ftrace_events"))) event_##call = { \
147 .name = #call, \
148 .id = proto, \
149 .system = __stringify(TRACE_SYSTEM), \
150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
206#include "trace_event_types.h"
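
The rewritten trace_export.c leans on repeated inclusion of trace_entries.h: the field macros (__field, __array, __dynamic_array, ...) and FTRACE_ENTRY are redefined before each #include, so a single table of entry descriptions expands first into compile-time checks, then into the ftrace_format_*() dumpers, then into ftrace_define_fields_*() and the event registration stubs. A tiny userspace illustration of the same re-include ("X-macro") technique, with made-up names:

    #include <stdio.h>

    #define ENTRIES \
            ENTRY(pid,  int)  \
            ENTRY(prio, long)

    /* Expansion 1: build the struct from the table. */
    #define ENTRY(name, type) type name;
    struct record { ENTRIES };
    #undef ENTRY

    /* Expansion 2: emit a "format" line per field, like ftrace_format_*(). */
    static void show_format(void)
    {
    #define ENTRY(name, type) \
            printf("\tfield:%s %s;\tsize:%zu;\n", #type, #name, \
                   sizeof(((struct record *)0)->name));
            ENTRIES
    #undef ENTRY
    }

    int main(void)
    {
            show_format();
            return 0;
    }
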
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 75ef000613c3..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
289 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
290{ 290{
291 char str[KSYM_SYMBOL_LEN];
292 long count = (long)data; 291 long count = (long)data;
293 292
294 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%ps:", (void *)ip);
295 seq_printf(m, "%s:", str);
296 294
297 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
298 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
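
The trace_functions.c hunk drops the explicit kallsyms_lookup() into a KSYM_SYMBOL_LEN stack buffer in favour of the %ps printk extension, which resolves a kernel address to its bare symbol name inside vsprintf itself. Side by side, the change boils down to this kernel-side fragment (shown only for contrast, not standalone code):

            char str[KSYM_SYMBOL_LEN];

            kallsyms_lookup(ip, NULL, NULL, NULL, str);   /* old: lookup by hand */
            seq_printf(m, "%s:", str);

            seq_printf(m, "%ps:", (void *)ip);            /* new: one call, no buffer */
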
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -166,10 +166,123 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
166 return ret; 166 return ret;
167} 167}
168 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
169static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
170{ 280{
171 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
172 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
173 if (ret) 286 if (ret)
174 return ret; 287 return ret;
175 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -177,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
177 return 0; 290 return 0;
178} 291}
179 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
180static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
181{ 299{
182 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
183 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
184} 302}
185 303
186static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
187{
188 if (nb / 100)
189 return 3;
190 if (nb / 10)
191 return 2;
192 return 1;
193}
194 305
195static enum print_line_t 306static enum print_line_t
196print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
197{ 308{
198 int i;
199 int ret; 309 int ret;
200 int log10_this = log10_cpu(cpu);
201 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
202
203 310
204 /* 311 /*
205 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
206 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
207 * email: 314 * email:
208 */ 315 */
209 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
210
211 /*
212 * Tricky - we space the CPU field according to the max
213 * number of online CPUs. On a 2-cpu system it would take
214 * a maximum of 1 digit - on a 128 cpu system it would
215 * take up to 3 digits:
216 */
217 for (i = 0; i < log10_all - log10_this; i++) {
218 ret = trace_seq_printf(s, " ");
219 if (!ret)
220 return TRACE_TYPE_PARTIAL_LINE;
221 }
222 ret = trace_seq_printf(s, "%d) ", cpu);
223 if (!ret) 317 if (!ret)
224 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
225 319
@@ -270,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
270} 364}
271 365
272 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
273/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
274static enum print_line_t 377static enum print_line_t
275verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -427,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
427 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
428 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
429 } 532 }
533
430 /* Proc */ 534 /* Proc */
431 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
432 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -565,11 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
565 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
566 } 670 }
567 671
568 ret = seq_print_ip_sym(s, call->func, 0); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
569 if (!ret)
570 return TRACE_TYPE_PARTIAL_LINE;
571
572 ret = trace_seq_printf(s, "();\n");
573 if (!ret) 673 if (!ret)
574 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
575 675
@@ -612,11 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
612 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
613 } 713 }
614 714
615 ret = seq_print_ip_sym(s, call->func, 0); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
616 if (!ret)
617 return TRACE_TYPE_PARTIAL_LINE;
618
619 ret = trace_seq_printf(s, "() {\n");
620 if (!ret) 716 if (!ret)
621 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
622 718
@@ -672,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
672 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
673 } 769 }
674 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
675 return 0; 778 return 0;
676} 779}
677 780
@@ -866,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
866 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
867} 970}
868 971
972static void print_lat_header(struct seq_file *s)
973{
                                                    974	static const char spaces[] = "                " /* 16 spaces */
                                                    975		"    "              /* 4 spaces */
                                                    976		"                 "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
869static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
870{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
871 /* 1st line */ 1001 /* 1st line */
872 seq_printf(s, "# "); 1002 seq_printf(s, "#");
873 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
874 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
875 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
876 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
877 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
878 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
879 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
880 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
881 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
882 1014
883 /* 2nd line */ 1015 /* 2nd line */
884 seq_printf(s, "# "); 1016 seq_printf(s, "#");
885 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
886 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
887 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
888 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
889 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
890 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
891 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
892 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
893 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
@@ -934,6 +1068,8 @@ static struct tracer graph_trace __read_mostly = {
934 1068
935static __init int init_graph_trace(void) 1069static __init int init_graph_trace(void)
936{ 1070{
1071 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1072
937 return register_tracer(&graph_trace); 1073 return register_tracer(&graph_trace);
938} 1074}
939 1075
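
init_graph_trace() now sizes the CPU column once with snprintf(NULL, 0, ...), which returns the number of characters the formatted value would need without writing anything; print_graph_cpu() then pads with " %*d) ". That replaces the old log10_cpu() helper and the per-line padding loop. A standalone sketch of the same trick, with an assumed CPU count:

    #include <stdio.h>

    int main(void)
    {
            int nr_cpu_ids = 128;    /* assumed value, just for the demo */
            int max_bytes_for_cpu;

            /* snprintf(NULL, 0, ...) returns the formatted length, writes nothing. */
            max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

            /* " %*d) " right-aligns every CPU id in that fixed width. */
            printf(" %*d) first_func();\n", max_bytes_for_cpu, 3);
            printf(" %*d) other_func();\n", max_bytes_for_cpu, 127);
            return 0;
    }
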
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
@@ -178,7 +170,6 @@ out_unlock:
178out: 170out:
179 data->critical_sequence = max_sequence; 171 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 172 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 173 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 174}
184 175
@@ -208,7 +199,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 199 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 200 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 201 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 202
213 local_save_flags(flags); 203 local_save_flags(flags);
214 204
@@ -379,6 +369,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 369 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 370 /* make sure that the tracer is visible */
381 smp_wmb(); 371 smp_wmb();
372 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 373 start_irqsoff_tracer(tr);
383} 374}
384 375
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
311 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 314 int pc = preempt_count();
313 315
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 316 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 317 sizeof(*entry), 0, pc);
316 if (!event) { 318 if (!event) {
317 atomic_inc(&dropped_count); 319 atomic_inc(&dropped_count);
@@ -319,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 321 }
320 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 323 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 327}
324 328
325void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +337,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
335{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
341 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 344 int pc = preempt_count();
339 345
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 346 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 347 sizeof(*entry), 0, pc);
342 if (!event) { 348 if (!event) {
343 atomic_inc(&dropped_count); 349 atomic_inc(&dropped_count);
@@ -345,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 351 }
346 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
347 entry->map = *map; 353 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 357}
350 358
351void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
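
The mmiotrace hunks, like the sched_switch and wakeup ones further down, all move to the same commit pattern: reserve space on the ring buffer, fill the entry, and only commit it if the event's filter does not discard it. Stripped of the per-event details, the shape repeated throughout this series is the following kernel-side fragment:

            event = trace_buffer_lock_reserve(buffer, type, sizeof(*entry), flags, pc);
            if (!event)
                    return;                 /* dropped: no buffer space */
            entry = ring_buffer_event_data(event);
            /* ... fill *entry from the probe arguments ... */
            if (!filter_check_discard(call, entry, buffer, event))
                    trace_buffer_unlock_commit(buffer, event, flags, pc);
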
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e8..f572f44c6e1e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
407 * since individual threads might have already quit! 407 * since individual threads might have already quit!
408 */ 408 */
409 rcu_read_lock(); 409 rcu_read_lock();
410 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
411 if (task) 411 if (task)
412 mm = get_task_mm(task); 412 mm = get_task_mm(task);
413 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
460 return ret; 460 return ret;
461} 461}
462 462
463static int 463/**
464lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
465{ 472{
466 int hardirq, softirq; 473 int hardirq, softirq;
467 char comm[TASK_COMM_LEN]; 474 int ret;
468 475
469 trace_find_cmdline(entry->pid, comm);
470 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
471 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
472 478
473 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
474 comm, entry->pid, cpu,
475 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
476 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
477 'X' : '.', 482 'X' : '.',
@@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
481 hardirq ? 'h' : softirq ? 's' : '.')) 486 hardirq ? 'h' : softirq ? 's' : '.'))
482 return 0; 487 return 0;
483 488
489 if (entry->lock_depth < 0)
490 ret = trace_seq_putc(s, '.');
491 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth);
493 if (!ret)
494 return 0;
495
484 if (entry->preempt_count) 496 if (entry->preempt_count)
485 return trace_seq_printf(s, "%x", entry->preempt_count); 497 return trace_seq_printf(s, "%x", entry->preempt_count);
486 return trace_seq_puts(s, "."); 498 return trace_seq_putc(s, '.');
499}
500
501static int
502lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
503{
504 char comm[TASK_COMM_LEN];
505
506 trace_find_cmdline(entry->pid, comm);
507
508 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
509 comm, entry->pid, cpu))
510 return 0;
511
512 return trace_print_lat_fmt(s, entry);
487} 513}
488 514
489static unsigned long preempt_mark_thresh = 100; 515static unsigned long preempt_mark_thresh = 100;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8a30d9874cd4..fe1a00f1445a 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -38,6 +38,7 @@ static void probe_power_end(struct power_trace *it)
38{ 38{
39 struct ftrace_event_call *call = &event_power; 39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event; 40 struct ring_buffer_event *event;
41 struct ring_buffer *buffer;
41 struct trace_power *entry; 42 struct trace_power *entry;
42 struct trace_array_cpu *data; 43 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace; 44 struct trace_array *tr = power_trace;
@@ -45,18 +46,20 @@ static void probe_power_end(struct power_trace *it)
45 if (!trace_power_enabled) 46 if (!trace_power_enabled)
46 return; 47 return;
47 48
49 buffer = tr->buffer;
50
48 preempt_disable(); 51 preempt_disable();
49 it->end = ktime_get(); 52 it->end = ktime_get();
50 data = tr->data[smp_processor_id()]; 53 data = tr->data[smp_processor_id()];
51 54
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 55 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
53 sizeof(*entry), 0, 0); 56 sizeof(*entry), 0, 0);
54 if (!event) 57 if (!event)
55 goto out; 58 goto out;
56 entry = ring_buffer_event_data(event); 59 entry = ring_buffer_event_data(event);
57 entry->state_data = *it; 60 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event)) 61 if (!filter_check_discard(call, entry, buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0); 62 trace_buffer_unlock_commit(buffer, event, 0, 0);
60 out: 63 out:
61 preempt_enable(); 64 preempt_enable();
62} 65}
@@ -66,6 +69,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
66{ 69{
67 struct ftrace_event_call *call = &event_power; 70 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event; 71 struct ring_buffer_event *event;
72 struct ring_buffer *buffer;
69 struct trace_power *entry; 73 struct trace_power *entry;
70 struct trace_array_cpu *data; 74 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace; 75 struct trace_array *tr = power_trace;
@@ -73,6 +77,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
73 if (!trace_power_enabled) 77 if (!trace_power_enabled)
74 return; 78 return;
75 79
80 buffer = tr->buffer;
81
76 memset(it, 0, sizeof(struct power_trace)); 82 memset(it, 0, sizeof(struct power_trace));
77 it->state = level; 83 it->state = level;
78 it->type = type; 84 it->type = type;
@@ -81,14 +87,14 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
81 it->end = it->stamp; 87 it->end = it->stamp;
82 data = tr->data[smp_processor_id()]; 88 data = tr->data[smp_processor_id()];
83 89
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER, 90 event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
85 sizeof(*entry), 0, 0); 91 sizeof(*entry), 0, 0);
86 if (!event) 92 if (!event)
87 goto out; 93 goto out;
88 entry = ring_buffer_event_data(event); 94 entry = ring_buffer_event_data(event);
89 entry->state_data = *it; 95 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event)) 96 if (!filter_check_discard(call, entry, buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0); 97 trace_buffer_unlock_commit(buffer, event, 0, 0);
92 out: 98 out:
93 preempt_enable(); 99 preempt_enable();
94} 100}
@@ -144,14 +150,12 @@ static void power_trace_reset(struct trace_array *tr)
144 150
145static int power_trace_init(struct trace_array *tr) 151static int power_trace_init(struct trace_array *tr)
146{ 152{
147 int cpu;
148 power_trace = tr; 153 power_trace = tr;
149 154
150 trace_power_enabled = 1; 155 trace_power_enabled = 1;
151 tracing_power_register(); 156 tracing_power_register();
152 157
153 for_each_cpu(cpu, cpu_possible_mask) 158 tracing_reset_online_cpus(tr);
154 tracing_reset(tr, cpu);
155 return 0; 159 return 0;
156} 160}
157 161
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
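
The latency bookkeeping now stays in nanoseconds end to end; the nsecs_to_usecs() conversions are gone and a new maximum is only recorded while tracing is live. Condensed from the hunk above:

	T0 = data->preempt_timestamp;		/* when the wakeup was traced   */
	T1 = ftrace_now(cpu);			/* when the task finally ran    */
	delta = T1 - T0;			/* wakeup latency, in nanoseconds */

	if (!report_latency(delta))		/* below thresh / not a new max */
		goto out_unlock;

	if (likely(!is_tracing_stopped())) {
		tracing_max_latency = delta;	/* still nanoseconds */
		update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
	}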
@@ -186,11 +177,6 @@ out:
186 177
187static void __wakeup_reset(struct trace_array *tr) 178static void __wakeup_reset(struct trace_array *tr)
188{ 179{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 180 wakeup_cpu = -1;
195 wakeup_prio = -1; 181 wakeup_prio = -1;
196 182
@@ -204,6 +190,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 190{
205 unsigned long flags; 191 unsigned long flags;
206 192
193 tracing_reset_online_cpus(tr);
194
207 local_irq_save(flags); 195 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 196 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 197 __wakeup_reset(tr);
@@ -247,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
247 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
248 236
249 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
250 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
251 240
252 wakeup_task = p; 241 wakeup_task = p;
@@ -296,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
296 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
297 } 286 }
298 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
299 wakeup_reset(tr); 295 wakeup_reset(tr);
300 296
301 /* 297 /*
@@ -328,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
328 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
329 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
330 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
331} 328}
332 329
333static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
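
One detail in the registration hunk: if register_trace_sched_migrate_task() fails, start_wakeup_tracer() prints a message and returns with the probes registered earlier still attached. A minimal sketch of the stricter pattern, shown here with just two of the probes (the label name is illustrative):

static void start_probes_sketch(void)
{
	int ret;

	ret = register_trace_sched_wakeup(probe_wakeup);
	if (ret)
		return;				/* nothing to unwind yet */

	ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
	if (ret) {
		pr_info("wakeup trace: Couldn't activate tracepoint"
			" probe to kernel_sched_migrate_task\n");
		goto fail_deprobe_wakeup;	/* unwind what already succeeded */
	}
	return;

 fail_deprobe_wakeup:
	unregister_trace_sched_wakeup(probe_wakeup);
}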
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 6a2a9d484cd6..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192 192
193 (*pos)++; 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201
202 if (i >= max_stack_trace.nr_entries ||
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
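
The rewrite leans on the seq_file contract: ->start() returns the record for *pos (or SEQ_START_TOKEN for a header), ->next() bumps *pos and returns the following record, ->stop() undoes whatever ->start() locked, and ->show() prints one record. A self-contained toy iterator over a static array, following the same shape (all names here are hypothetical, not part of this patch):

#include <linux/kernel.h>
#include <linux/seq_file.h>

static int example_vals[] = { 3, 1, 4, 1, 5 };

static void *example_start(struct seq_file *m, loff_t *pos)
{
	if (*pos == 0)
		return SEQ_START_TOKEN;		/* position 0: the header line */
	if (*pos > ARRAY_SIZE(example_vals))
		return NULL;			/* past the end: stop iteration */
	return &example_vals[*pos - 1];
}

static void *example_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;				/* advance, then re-resolve the record */
	return example_start(m, pos);
}

static void example_stop(struct seq_file *m, void *v)
{
	/* nothing to unlock in this toy version */
}

static int example_show(struct seq_file *m, void *v)
{
	if (v == SEQ_START_TOKEN)
		return seq_printf(m, "# values\n");
	return seq_printf(m, "%d\n", *(int *)v);
}

static const struct seq_operations example_seq_ops = {
	.start	= example_start,
	.next	= example_next,
	.stop	= example_stop,
	.show	= example_show,
};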
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239
240 sprint_symbol(str, addr);
241 227
242 return seq_printf(m, "%s\n", str); 228 return seq_printf(m, "%pF\n", (void *)addr);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
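
trace_lookup_stack() no longer needs its own CONFIG_KALLSYMS branch because the "%pF" printk extension resolves a function address to symbol+offset when kallsyms is available and falls back to the raw value otherwise. For instance (output is illustrative):

	seq_printf(m, "%pF\n", (void *)stack_dump_trace[i]);
	/* kallsyms enabled:   schedule+0x4c/0x9f0           */
	/* kallsyms disabled:  the raw address printed in hex */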
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index aea321c82fa0..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
@@ -200,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
200{ 203{
201 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
202 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
203 int i; 207 int i;
204 208
205 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
206 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
207 211
208 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
209 if (!*pos && session->ts->stat_headers) 213 if (session->ts->stat_headers) {
210 return SEQ_START_TOKEN; 214 if (n == 0)
215 return SEQ_START_TOKEN;
216 n--;
217 }
211 218
212 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
213 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
214 node = rb_next(node); 221 node = rb_next(node);
215 222
216 return node; 223 return node;
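
The point of the reshuffle in stat_seq_start(): when headers are present, sequence position 0 belongs to the header line and position n maps to the (n-1)-th rbtree node; previously the rbtree walk used *pos directly, so once the header had consumed position 0 the first entry could be skipped. Condensed, the new mapping is:

	int n = *pos;				/* seq_file position requested */

	if (session->ts->stat_headers) {
		if (n == 0)
			return SEQ_START_TOKEN;	/* slot 0: the header line    */
		n--;				/* slots 1.. map to nodes 0.. */
	}

	node = rb_first(&session->stat_root);
	for (i = 0; node && i < n; i++)		/* walk to the n-th node */
		node = rb_next(node);
	return node;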
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..8712ce3c6a0e 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_counter.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
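
print_syscall_enter() follows the usual trace_seq discipline: the seq buffer is a fixed-size scratch page, so every write is checked and the handler returns TRACE_TYPE_PARTIAL_LINE rather than emit a truncated record. The reworked argument loop also moves the separator so the output reads like name(arg: val, arg: val) with no trailing comma. The core of the loop, condensed from the hunk above:

	for (i = 0; i < entry->nb_args; i++) {
		/* "type " prefix only in verbose mode, now via the global flag */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* "name: value", separator only between arguments */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}
	if (!trace_seq_putc(s, ')') || !trace_seq_putc(s, '\n'))
		return TRACE_TYPE_PARTIAL_LINE;
	return TRACE_TYPE_HANDLED;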
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
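
The SYSCALL_FIELD() macro does two jobs at once: it supplies the four values (type string, name string, offset, size) for one field line of the format file, and it wedges a size check into the first of them; if sizeof(type) ever disagrees with the struct member, the expansion references __bad_type_size(), which is declared but never defined, so the build breaks at link time. Expanded by hand for the nr field, the first trace_seq_printf() in syscall_enter_format() is effectively:

	ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
			sizeof(int) != sizeof(trace.nr) ?
				__bad_type_size() :	/* link error on mismatch */
				"int",			/* %s  : field type       */
			"nr",				/* %s  : field name       */
			offsetof(typeof(trace), nr),	/* %zu : offset in record */
			sizeof(trace.nr));		/* %zu : field size       */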
@@ -193,58 +280,244 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
 302 pr_info("event trace: Could not activate "
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data;
389 int syscall_nr;
390 int size;
233 391
234 ret = register_ftrace_event(&syscall_enter_event); 392 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 394 return;
237 syscall_enter_event.type); 395
238 WARN_ON_ONCE(1); 396 sys_data = syscall_nr_to_meta(syscall_nr);
397 if (!sys_data)
398 return;
399
400 /* get the size after alignment with the u32 buffer size field */
401 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
402 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32);
404
405 do {
406 char raw_data[size];
407
408 /* zero the dead bytes from align to not leak stack to user */
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
410
411 rec = (struct syscall_trace_enter *) raw_data;
412 tracing_generic_entry_update(&rec->ent, 0, 0);
413 rec->ent.type = sys_data->enter_id;
414 rec->nr = syscall_nr;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
416 (unsigned long *)&rec->args);
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
418 } while(0);
419}
420
421int reg_prof_syscall_enter(char *name)
422{
423 int ret = 0;
424 int num;
425
426 num = syscall_name_to_nr(name);
427 if (num < 0 || num >= NR_syscalls)
428 return -ENOSYS;
429
430 mutex_lock(&syscall_trace_lock);
431 if (!sys_prof_refcount_enter)
432 ret = register_trace_sys_enter(prof_syscall_enter);
433 if (ret) {
 434 pr_info("event trace: Could not activate "
435 "syscall entry trace point");
436 } else {
437 set_bit(num, enabled_prof_enter_syscalls);
438 sys_prof_refcount_enter++;
239 } 439 }
440 mutex_unlock(&syscall_trace_lock);
441 return ret;
442}
240 443
241 ret = register_ftrace_event(&syscall_exit_event); 444void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 445{
243 printk(KERN_WARNING "event %d failed to register\n", 446 int num;
244 syscall_exit_event.type); 447
245 WARN_ON_ONCE(1); 448 num = syscall_name_to_nr(name);
449 if (num < 0 || num >= NR_syscalls)
450 return;
451
452 mutex_lock(&syscall_trace_lock);
453 sys_prof_refcount_enter--;
454 clear_bit(num, enabled_prof_enter_syscalls);
455 if (!sys_prof_refcount_enter)
456 unregister_trace_sys_enter(prof_syscall_enter);
457 mutex_unlock(&syscall_trace_lock);
458}
459
460static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{
462 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec;
464 int syscall_nr;
465
466 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
468 return;
469
470 sys_data = syscall_nr_to_meta(syscall_nr);
471 if (!sys_data)
472 return;
473
474 tracing_generic_entry_update(&rec.ent, 0, 0);
475 rec.ent.type = sys_data->exit_id;
476 rec.nr = syscall_nr;
477 rec.ret = syscall_get_return_value(current, regs);
478
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
480}
481
482int reg_prof_syscall_exit(char *name)
483{
484 int ret = 0;
485 int num;
486
487 num = syscall_name_to_nr(name);
488 if (num < 0 || num >= NR_syscalls)
489 return -ENOSYS;
490
491 mutex_lock(&syscall_trace_lock);
492 if (!sys_prof_refcount_exit)
493 ret = register_trace_sys_exit(prof_syscall_exit);
494 if (ret) {
 495 pr_info("event trace: Could not activate "
496 "syscall entry trace point");
497 } else {
498 set_bit(num, enabled_prof_exit_syscalls);
499 sys_prof_refcount_exit++;
246 } 500 }
501 mutex_unlock(&syscall_trace_lock);
502 return ret;
503}
247 504
248 return register_tracer(&syscall_tracer); 505void unreg_prof_syscall_exit(char *name)
506{
507 int num;
508
509 num = syscall_name_to_nr(name);
510 if (num < 0 || num >= NR_syscalls)
511 return;
512
513 mutex_lock(&syscall_trace_lock);
514 sys_prof_refcount_exit--;
515 clear_bit(num, enabled_prof_exit_syscalls);
516 if (!sys_prof_refcount_exit)
517 unregister_trace_sys_exit(prof_syscall_exit);
518 mutex_unlock(&syscall_trace_lock);
249} 519}
250device_initcall(register_ftrace_syscalls); 520
521#endif
522
523
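
prof_syscall_enter() builds its record on the stack and sizes it so that, together with the u32 length header perf prepends, the sample comes out 8-byte aligned; the last u64 of the buffer is zeroed so the padding never leaks stack contents. Worked through for a 3-argument syscall on a 64-bit configuration, assuming sizeof(*rec) == 16 (the exact value depends on struct trace_entry layout):

	size  = sizeof(unsigned long) * 3 + 16;		/* payload: 24 + 16 = 40   */
	size  = ALIGN(size + sizeof(u32), sizeof(u64));	/* ALIGN(40 + 4, 8) = 48   */
	size -= sizeof(u32);				/* raw_data is 44 bytes:
							   44 + perf's 4-byte size
							   field = 48, 8-aligned  */

	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;	/* clears bytes 36..43, so
							   the 4 pad bytes 40..43
							   can never leak stack   */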
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
137 144
138 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 145 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
139 146
140 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) 147 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
141 ret = list_entry(workqueue_cpu_stat(cpu)->list.next, 148 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
142 struct cpu_workqueue_stats, list); 149 struct cpu_workqueue_stats, list);
150 kref_get(&ret->kref);
151 }
143 152
144 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 153 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
145 154
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
162static void *workqueue_stat_next(void *prev, int idx) 171static void *workqueue_stat_next(void *prev, int idx)
163{ 172{
164 struct cpu_workqueue_stats *prev_cws = prev; 173 struct cpu_workqueue_stats *prev_cws = prev;
174 struct cpu_workqueue_stats *ret;
165 int cpu = prev_cws->cpu; 175 int cpu = prev_cws->cpu;
166 unsigned long flags; 176 unsigned long flags;
167 void *ret = NULL;
168 177
169 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 178 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
170 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { 179 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
175 return NULL; 184 return NULL;
176 } while (!(ret = workqueue_stat_start_cpu(cpu))); 185 } while (!(ret = workqueue_stat_start_cpu(cpu)));
177 return ret; 186 return ret;
187 } else {
188 ret = list_entry(prev_cws->list.next,
189 struct cpu_workqueue_stats, list);
190 kref_get(&ret->kref);
178 } 191 }
179 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 192 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
180 193
181 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, 194 return ret;
182 list);
183} 195}
184 196
185static int workqueue_stat_show(struct seq_file *s, void *p) 197static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
203 return 0; 215 return 0;
204} 216}
205 217
218static void workqueue_stat_release(void *stat)
219{
220 struct cpu_workqueue_stats *node = stat;
221
222 kref_put(&node->kref, cpu_workqueue_stat_free);
223}
224
206static int workqueue_stat_headers(struct seq_file *s) 225static int workqueue_stat_headers(struct seq_file *s)
207{ 226{
208 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); 227 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
215 .stat_start = workqueue_stat_start, 234 .stat_start = workqueue_stat_start,
216 .stat_next = workqueue_stat_next, 235 .stat_next = workqueue_stat_next,
217 .stat_show = workqueue_stat_show, 236 .stat_show = workqueue_stat_show,
237 .stat_release = workqueue_stat_release,
218 .stat_headers = workqueue_stat_headers 238 .stat_headers = workqueue_stat_headers
219}; 239};
220 240
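
Taken together, the kref changes in this file close a use-after-free window: an entry handed to the stat iterator could previously be kfree()d by probe_workqueue_destruction() while still being displayed. Now every pointer handed out carries its own reference and the stat core returns it through the new .stat_release hook. The lifecycle, condensed from the hunks above:

	/* probe_workqueue_creation(): the stats list owns the initial ref */
	kref_init(&cws->kref);

	/* workqueue_stat_start_cpu()/_next(): each entry given to the
	 * iterator takes an extra reference before the lock is dropped */
	kref_get(&ret->kref);

	/* probe_workqueue_destruction(): the list drops its reference;
	 * the node survives if the iterator still holds one */
	list_del(&node->list);
	kref_put(&node->kref, cpu_workqueue_stat_free);

	/* workqueue_stat_release() (.stat_release): the iterator's
	 * reference is dropped; the last put frees the node */
	kref_put(&node->kref, cpu_workqueue_stat_free);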
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..9489a0a9b1be 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
24#include <linux/tracepoint.h> 24#include <linux/tracepoint.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h>
27 28
28extern struct tracepoint __start___tracepoints[]; 29extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[]; 30extern struct tracepoint __stop___tracepoints[];
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
242{ 243{
243 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 244 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
244 245
246 if (elem->regfunc && !elem->state && active)
247 elem->regfunc();
248 else if (elem->unregfunc && elem->state && !active)
249 elem->unregfunc();
250
245 /* 251 /*
246 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 252 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
247 * probe callbacks array is consistent before setting a pointer to it. 253 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
261 */ 267 */
262static void disable_tracepoint(struct tracepoint *elem) 268static void disable_tracepoint(struct tracepoint *elem)
263{ 269{
270 if (elem->unregfunc && elem->state)
271 elem->unregfunc();
272
264 elem->state = 0; 273 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL); 274 rcu_assign_pointer(elem->funcs, NULL);
266} 275}
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
554 563
555 switch (val) { 564 switch (val) {
556 case MODULE_STATE_COMING: 565 case MODULE_STATE_COMING:
557 tracepoint_update_probe_range(mod->tracepoints,
558 mod->tracepoints + mod->num_tracepoints);
559 break;
560 case MODULE_STATE_GOING: 566 case MODULE_STATE_GOING:
561 tracepoint_update_probe_range(mod->tracepoints, 567 tracepoint_update_probe_range(mod->tracepoints,
562 mod->tracepoints + mod->num_tracepoints); 568 mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
577__initcall(init_tracepoints); 583__initcall(init_tracepoints);
578 584
579#endif /* CONFIG_MODULES */ 585#endif /* CONFIG_MODULES */
586
587#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
588
589/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
590static int sys_tracepoint_refcount;
591
592void syscall_regfunc(void)
593{
594 unsigned long flags;
595 struct task_struct *g, *t;
596
597 if (!sys_tracepoint_refcount) {
598 read_lock_irqsave(&tasklist_lock, flags);
599 do_each_thread(g, t) {
600 /* Skip kernel threads. */
601 if (t->mm)
602 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
603 } while_each_thread(g, t);
604 read_unlock_irqrestore(&tasklist_lock, flags);
605 }
606 sys_tracepoint_refcount++;
607}
608
609void syscall_unregfunc(void)
610{
611 unsigned long flags;
612 struct task_struct *g, *t;
613
614 sys_tracepoint_refcount--;
615 if (!sys_tracepoint_refcount) {
616 read_lock_irqsave(&tasklist_lock, flags);
617 do_each_thread(g, t) {
618 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
619 } while_each_thread(g, t);
620 read_unlock_irqrestore(&tasklist_lock, flags);
621 }
622}
623#endif
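
syscall_regfunc()/syscall_unregfunc() run under tracepoints_mutex when the first sys_enter/sys_exit probe is attached and when the last one goes away; the TIF_SYSCALL_TRACEPOINT flag they toggle is what makes each architecture take its slow syscall path and fire the tracepoints. A rough sketch of the arch-side consumer (this lives in arch entry code elsewhere in the series, not in this file; simplified and x86-flavoured):

	/* in the arch's syscall_trace_enter()/leave() slow path, roughly: */
	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);	/* orig_ax: x86 syscall nr  */
	/* ... on the return side ... */
	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_exit(regs, regs->ax);		/* ax: x86 return value     */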
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
600 * schedule_work - put work task in global workqueue 598 * schedule_work - put work task in global workqueue
601 * @work: job to be done 599 * @work: job to be done
602 * 600 *
603 * This puts a job in the kernel-global workqueue. 601 * Returns zero if @work was already on the kernel-global workqueue and
602 * non-zero otherwise.
603 *
604 * This puts a job in the kernel-global workqueue if it was not already
605 * queued and leaves it in the same position on the kernel-global
606 * workqueue otherwise.
604 */ 607 */
605int schedule_work(struct work_struct *work) 608int schedule_work(struct work_struct *work)
606{ 609{