aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/acct.c8
-rw-r--r--kernel/cgroup.c1
-rw-r--r--kernel/cred.c293
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c11
-rw-r--r--kernel/futex.c47
-rw-r--r--kernel/irq/chip.c74
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/internals.h13
-rw-r--r--kernel/irq/manage.c102
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/resend.c3
-rw-r--r--kernel/irq/spurious.c1
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/kthread.c4
-rw-r--r--kernel/lockdep.c792
-rw-r--r--kernel/lockdep_internals.h2
-rw-r--r--kernel/lockdep_proc.c128
-rw-r--r--kernel/perf_counter.c173
-rw-r--r--kernel/printk.c175
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcuclassic.c807
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/rcupreempt.c1539
-rw-r--r--kernel/rcupreempt_trace.c334
-rw-r--r--kernel/rcutorture.c202
-rw-r--r--kernel/rcutree.c280
-rw-r--r--kernel/rcutree.h253
-rw-r--r--kernel/rcutree_plugin.h532
-rw-r--r--kernel/rcutree_trace.c88
-rw-r--r--kernel/sched.c1232
-rw-r--r--kernel/sched_cpupri.c30
-rw-r--r--kernel/sched_debug.c4
-rw-r--r--kernel/sched_fair.c84
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c62
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/spinlock.c230
-rw-r--r--kernel/sysctl.c25
-rw-r--r--kernel/timer.c3
-rw-r--r--kernel/workqueue.c9
42 files changed, 3560 insertions, 4060 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..b833bd5cc127 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,11 +80,9 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
81obj-$(CONFIG_SECCOMP) += seccomp.o 81obj-$(CONFIG_SECCOMP) += seccomp.o
82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
84obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
88obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 9f3391090b3e..9a4715a2f6bf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -491,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
491 u64 run_time; 491 u64 run_time;
492 struct timespec uptime; 492 struct timespec uptime;
493 struct tty_struct *tty; 493 struct tty_struct *tty;
494 const struct cred *orig_cred;
495
496 /* Perform file operations on behalf of whoever enabled accounting */
497 orig_cred = override_creds(file->f_cred);
494 498
495 /* 499 /*
496 * First check to see if there is enough free_space to continue 500 * First check to see if there is enough free_space to continue
497 * the process accounting system. 501 * the process accounting system.
498 */ 502 */
499 if (!check_free_space(acct, file)) 503 if (!check_free_space(acct, file))
500 return; 504 goto out;
501 505
502 /* 506 /*
503 * Fill the accounting struct with the needed info as recorded 507 * Fill the accounting struct with the needed info as recorded
@@ -578,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
578 sizeof(acct_t), &file->f_pos); 582 sizeof(acct_t), &file->f_pos);
579 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 583 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
580 set_fs(fs); 584 set_fs(fs);
585out:
586 revert_creds(orig_cred);
581} 587}
582 588
583/** 589/**
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b6eadfe30e7b..c7ece8f027f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -600,6 +600,7 @@ static struct inode_operations cgroup_dir_inode_operations;
600static struct file_operations proc_cgroupstats_operations; 600static struct file_operations proc_cgroupstats_operations;
601 601
602static struct backing_dev_info cgroup_backing_dev_info = { 602static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup",
603 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 604 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
604}; 605};
605 606
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..006fcab009d5 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
18#include <linux/cn_proc.h> 18#include <linux/cn_proc.h>
19#include "cred-internals.h" 19#include "cred-internals.h"
20 20
21#if 0
22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif
32
21static struct kmem_cache *cred_jar; 33static struct kmem_cache *cred_jar;
22 34
23/* 35/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
36 */ 48 */
37struct cred init_cred = { 49struct cred init_cred = {
38 .usage = ATOMIC_INIT(4), 50 .usage = ATOMIC_INIT(4),
51#ifdef CONFIG_DEBUG_CREDENTIALS
52 .subscribers = ATOMIC_INIT(2),
53 .magic = CRED_MAGIC,
54#endif
39 .securebits = SECUREBITS_DEFAULT, 55 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET, 56 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET, 57 .cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
48#endif 64#endif
49}; 65};
50 66
67static inline void set_cred_subscribers(struct cred *cred, int n)
68{
69#ifdef CONFIG_DEBUG_CREDENTIALS
70 atomic_set(&cred->subscribers, n);
71#endif
72}
73
74static inline int read_cred_subscribers(const struct cred *cred)
75{
76#ifdef CONFIG_DEBUG_CREDENTIALS
77 return atomic_read(&cred->subscribers);
78#else
79 return 0;
80#endif
81}
82
83static inline void alter_cred_subscribers(const struct cred *_cred, int n)
84{
85#ifdef CONFIG_DEBUG_CREDENTIALS
86 struct cred *cred = (struct cred *) _cred;
87
88 atomic_add(n, &cred->subscribers);
89#endif
90}
91
51/* 92/*
52 * Dispose of the shared task group credentials 93 * Dispose of the shared task group credentials
53 */ 94 */
@@ -85,9 +126,22 @@ static void put_cred_rcu(struct rcu_head *rcu)
85{ 126{
86 struct cred *cred = container_of(rcu, struct cred, rcu); 127 struct cred *cred = container_of(rcu, struct cred, rcu);
87 128
129 kdebug("put_cred_rcu(%p)", cred);
130
131#ifdef CONFIG_DEBUG_CREDENTIALS
132 if (cred->magic != CRED_MAGIC_DEAD ||
133 atomic_read(&cred->usage) != 0 ||
134 read_cred_subscribers(cred) != 0)
135 panic("CRED: put_cred_rcu() sees %p with"
136 " mag %x, put %p, usage %d, subscr %d\n",
137 cred, cred->magic, cred->put_addr,
138 atomic_read(&cred->usage),
139 read_cred_subscribers(cred));
140#else
88 if (atomic_read(&cred->usage) != 0) 141 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n", 142 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage)); 143 cred, atomic_read(&cred->usage));
144#endif
91 145
92 security_cred_free(cred); 146 security_cred_free(cred);
93 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
@@ -106,12 +160,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
106 */ 160 */
107void __put_cred(struct cred *cred) 161void __put_cred(struct cred *cred)
108{ 162{
163 kdebug("__put_cred(%p{%d,%d})", cred,
164 atomic_read(&cred->usage),
165 read_cred_subscribers(cred));
166
109 BUG_ON(atomic_read(&cred->usage) != 0); 167 BUG_ON(atomic_read(&cred->usage) != 0);
168#ifdef CONFIG_DEBUG_CREDENTIALS
169 BUG_ON(read_cred_subscribers(cred) != 0);
170 cred->magic = CRED_MAGIC_DEAD;
171 cred->put_addr = __builtin_return_address(0);
172#endif
173 BUG_ON(cred == current->cred);
174 BUG_ON(cred == current->real_cred);
110 175
111 call_rcu(&cred->rcu, put_cred_rcu); 176 call_rcu(&cred->rcu, put_cred_rcu);
112} 177}
113EXPORT_SYMBOL(__put_cred); 178EXPORT_SYMBOL(__put_cred);
114 179
180/*
181 * Clean up a task's credentials when it exits
182 */
183void exit_creds(struct task_struct *tsk)
184{
185 struct cred *cred;
186
187 kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
188 atomic_read(&tsk->cred->usage),
189 read_cred_subscribers(tsk->cred));
190
191 cred = (struct cred *) tsk->real_cred;
192 tsk->real_cred = NULL;
193 validate_creds(cred);
194 alter_cred_subscribers(cred, -1);
195 put_cred(cred);
196
197 cred = (struct cred *) tsk->cred;
198 tsk->cred = NULL;
199 validate_creds(cred);
200 alter_cred_subscribers(cred, -1);
201 put_cred(cred);
202
203 cred = (struct cred *) tsk->replacement_session_keyring;
204 if (cred) {
205 tsk->replacement_session_keyring = NULL;
206 validate_creds(cred);
207 put_cred(cred);
208 }
209}
210
211/*
212 * Allocate blank credentials, such that the credentials can be filled in at a
213 * later date without risk of ENOMEM.
214 */
215struct cred *cred_alloc_blank(void)
216{
217 struct cred *new;
218
219 new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
220 if (!new)
221 return NULL;
222
223#ifdef CONFIG_KEYS
224 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
225 if (!new->tgcred) {
226 kfree(new);
227 return NULL;
228 }
229 atomic_set(&new->tgcred->usage, 1);
230#endif
231
232 atomic_set(&new->usage, 1);
233
234 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
235 goto error;
236
237#ifdef CONFIG_DEBUG_CREDENTIALS
238 new->magic = CRED_MAGIC;
239#endif
240 return new;
241
242error:
243 abort_creds(new);
244 return NULL;
245}
246
115/** 247/**
116 * prepare_creds - Prepare a new set of credentials for modification 248 * prepare_creds - Prepare a new set of credentials for modification
117 * 249 *
@@ -132,16 +264,19 @@ struct cred *prepare_creds(void)
132 const struct cred *old; 264 const struct cred *old;
133 struct cred *new; 265 struct cred *new;
134 266
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1); 267 validate_process_creds();
136 268
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL); 269 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new) 270 if (!new)
139 return NULL; 271 return NULL;
140 272
273 kdebug("prepare_creds() alloc %p", new);
274
141 old = task->cred; 275 old = task->cred;
142 memcpy(new, old, sizeof(struct cred)); 276 memcpy(new, old, sizeof(struct cred));
143 277
144 atomic_set(&new->usage, 1); 278 atomic_set(&new->usage, 1);
279 set_cred_subscribers(new, 0);
145 get_group_info(new->group_info); 280 get_group_info(new->group_info);
146 get_uid(new->user); 281 get_uid(new->user);
147 282
@@ -157,6 +292,7 @@ struct cred *prepare_creds(void)
157 292
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 293 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error; 294 goto error;
295 validate_creds(new);
160 return new; 296 return new;
161 297
162error: 298error:
@@ -229,9 +365,12 @@ struct cred *prepare_usermodehelper_creds(void)
229 if (!new) 365 if (!new)
230 return NULL; 366 return NULL;
231 367
368 kdebug("prepare_usermodehelper_creds() alloc %p", new);
369
232 memcpy(new, &init_cred, sizeof(struct cred)); 370 memcpy(new, &init_cred, sizeof(struct cred));
233 371
234 atomic_set(&new->usage, 1); 372 atomic_set(&new->usage, 1);
373 set_cred_subscribers(new, 0);
235 get_group_info(new->group_info); 374 get_group_info(new->group_info);
236 get_uid(new->user); 375 get_uid(new->user);
237 376
@@ -250,6 +389,7 @@ struct cred *prepare_usermodehelper_creds(void)
250#endif 389#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) 390 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error; 391 goto error;
392 validate_creds(new);
253 393
254 BUG_ON(atomic_read(&new->usage) != 1); 394 BUG_ON(atomic_read(&new->usage) != 1);
255 return new; 395 return new;
@@ -286,6 +426,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
286 ) { 426 ) {
287 p->real_cred = get_cred(p->cred); 427 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred); 428 get_cred(p->cred);
429 alter_cred_subscribers(p->cred, 2);
430 kdebug("share_creds(%p{%d,%d})",
431 p->cred, atomic_read(&p->cred->usage),
432 read_cred_subscribers(p->cred));
289 atomic_inc(&p->cred->user->processes); 433 atomic_inc(&p->cred->user->processes);
290 return 0; 434 return 0;
291 } 435 }
@@ -331,6 +475,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
331 475
332 atomic_inc(&new->user->processes); 476 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new); 477 p->cred = p->real_cred = get_cred(new);
478 alter_cred_subscribers(new, 2);
479 validate_creds(new);
334 return 0; 480 return 0;
335 481
336error_put: 482error_put:
@@ -355,13 +501,20 @@ error_put:
355int commit_creds(struct cred *new) 501int commit_creds(struct cred *new)
356{ 502{
357 struct task_struct *task = current; 503 struct task_struct *task = current;
358 const struct cred *old; 504 const struct cred *old = task->real_cred;
359 505
360 BUG_ON(task->cred != task->real_cred); 506 kdebug("commit_creds(%p{%d,%d})", new,
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2); 507 atomic_read(&new->usage),
508 read_cred_subscribers(new));
509
510 BUG_ON(task->cred != old);
511#ifdef CONFIG_DEBUG_CREDENTIALS
512 BUG_ON(read_cred_subscribers(old) < 2);
513 validate_creds(old);
514 validate_creds(new);
515#endif
362 BUG_ON(atomic_read(&new->usage) < 1); 516 BUG_ON(atomic_read(&new->usage) < 1);
363 517
364 old = task->real_cred;
365 security_commit_creds(new, old); 518 security_commit_creds(new, old);
366 519
367 get_cred(new); /* we will require a ref for the subj creds too */ 520 get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +543,14 @@ int commit_creds(struct cred *new)
390 * cheaply with the new uid cache, so if it matters 543 * cheaply with the new uid cache, so if it matters
391 * we should be checking for it. -DaveM 544 * we should be checking for it. -DaveM
392 */ 545 */
546 alter_cred_subscribers(new, 2);
393 if (new->user != old->user) 547 if (new->user != old->user)
394 atomic_inc(&new->user->processes); 548 atomic_inc(&new->user->processes);
395 rcu_assign_pointer(task->real_cred, new); 549 rcu_assign_pointer(task->real_cred, new);
396 rcu_assign_pointer(task->cred, new); 550 rcu_assign_pointer(task->cred, new);
397 if (new->user != old->user) 551 if (new->user != old->user)
398 atomic_dec(&old->user->processes); 552 atomic_dec(&old->user->processes);
553 alter_cred_subscribers(old, -2);
399 554
400 sched_switch_user(task); 555 sched_switch_user(task);
401 556
@@ -428,6 +583,13 @@ EXPORT_SYMBOL(commit_creds);
428 */ 583 */
429void abort_creds(struct cred *new) 584void abort_creds(struct cred *new)
430{ 585{
586 kdebug("abort_creds(%p{%d,%d})", new,
587 atomic_read(&new->usage),
588 read_cred_subscribers(new));
589
590#ifdef CONFIG_DEBUG_CREDENTIALS
591 BUG_ON(read_cred_subscribers(new) != 0);
592#endif
431 BUG_ON(atomic_read(&new->usage) < 1); 593 BUG_ON(atomic_read(&new->usage) < 1);
432 put_cred(new); 594 put_cred(new);
433} 595}
@@ -444,7 +606,20 @@ const struct cred *override_creds(const struct cred *new)
444{ 606{
445 const struct cred *old = current->cred; 607 const struct cred *old = current->cred;
446 608
447 rcu_assign_pointer(current->cred, get_cred(new)); 609 kdebug("override_creds(%p{%d,%d})", new,
610 atomic_read(&new->usage),
611 read_cred_subscribers(new));
612
613 validate_creds(old);
614 validate_creds(new);
615 get_cred(new);
616 alter_cred_subscribers(new, 1);
617 rcu_assign_pointer(current->cred, new);
618 alter_cred_subscribers(old, -1);
619
620 kdebug("override_creds() = %p{%d,%d}", old,
621 atomic_read(&old->usage),
622 read_cred_subscribers(old));
448 return old; 623 return old;
449} 624}
450EXPORT_SYMBOL(override_creds); 625EXPORT_SYMBOL(override_creds);
@@ -460,7 +635,15 @@ void revert_creds(const struct cred *old)
460{ 635{
461 const struct cred *override = current->cred; 636 const struct cred *override = current->cred;
462 637
638 kdebug("revert_creds(%p{%d,%d})", old,
639 atomic_read(&old->usage),
640 read_cred_subscribers(old));
641
642 validate_creds(old);
643 validate_creds(override);
644 alter_cred_subscribers(old, 1);
463 rcu_assign_pointer(current->cred, old); 645 rcu_assign_pointer(current->cred, old);
646 alter_cred_subscribers(override, -1);
464 put_cred(override); 647 put_cred(override);
465} 648}
466EXPORT_SYMBOL(revert_creds); 649EXPORT_SYMBOL(revert_creds);
@@ -502,11 +685,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
502 if (!new) 685 if (!new)
503 return NULL; 686 return NULL;
504 687
688 kdebug("prepare_kernel_cred() alloc %p", new);
689
505 if (daemon) 690 if (daemon)
506 old = get_task_cred(daemon); 691 old = get_task_cred(daemon);
507 else 692 else
508 old = get_cred(&init_cred); 693 old = get_cred(&init_cred);
509 694
695 validate_creds(old);
696
510 *new = *old; 697 *new = *old;
511 get_uid(new->user); 698 get_uid(new->user);
512 get_group_info(new->group_info); 699 get_group_info(new->group_info);
@@ -526,7 +713,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
526 goto error; 713 goto error;
527 714
528 atomic_set(&new->usage, 1); 715 atomic_set(&new->usage, 1);
716 set_cred_subscribers(new, 0);
529 put_cred(old); 717 put_cred(old);
718 validate_creds(new);
530 return new; 719 return new;
531 720
532error: 721error:
@@ -589,3 +778,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
589 return security_kernel_create_files_as(new, inode); 778 return security_kernel_create_files_as(new, inode);
590} 779}
591EXPORT_SYMBOL(set_create_files_as); 780EXPORT_SYMBOL(set_create_files_as);
781
782#ifdef CONFIG_DEBUG_CREDENTIALS
783
784/*
785 * dump invalid credentials
786 */
787static void dump_invalid_creds(const struct cred *cred, const char *label,
788 const struct task_struct *tsk)
789{
790 printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
791 label, cred,
792 cred == &init_cred ? "[init]" : "",
793 cred == tsk->real_cred ? "[real]" : "",
794 cred == tsk->cred ? "[eff]" : "");
795 printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
796 cred->magic, cred->put_addr);
797 printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
798 atomic_read(&cred->usage),
799 read_cred_subscribers(cred));
800 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
801 cred->uid, cred->euid, cred->suid, cred->fsuid);
802 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
803 cred->gid, cred->egid, cred->sgid, cred->fsgid);
804#ifdef CONFIG_SECURITY
805 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
806 if ((unsigned long) cred->security >= PAGE_SIZE &&
807 (((unsigned long) cred->security & 0xffffff00) !=
808 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
809 printk(KERN_ERR "CRED: ->security {%x, %x}\n",
810 ((u32*)cred->security)[0],
811 ((u32*)cred->security)[1]);
812#endif
813}
814
815/*
816 * report use of invalid credentials
817 */
818void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
819{
820 printk(KERN_ERR "CRED: Invalid credentials\n");
821 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
822 dump_invalid_creds(cred, "Specified", current);
823 BUG();
824}
825EXPORT_SYMBOL(__invalid_creds);
826
827/*
828 * check the credentials on a process
829 */
830void __validate_process_creds(struct task_struct *tsk,
831 const char *file, unsigned line)
832{
833 if (tsk->cred == tsk->real_cred) {
834 if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
835 creds_are_invalid(tsk->cred)))
836 goto invalid_creds;
837 } else {
838 if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
839 read_cred_subscribers(tsk->cred) < 1 ||
840 creds_are_invalid(tsk->real_cred) ||
841 creds_are_invalid(tsk->cred)))
842 goto invalid_creds;
843 }
844 return;
845
846invalid_creds:
847 printk(KERN_ERR "CRED: Invalid process credentials\n");
848 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
849
850 dump_invalid_creds(tsk->real_cred, "Real", tsk);
851 if (tsk->cred != tsk->real_cred)
852 dump_invalid_creds(tsk->cred, "Effective", tsk);
853 else
854 printk(KERN_ERR "CRED: Effective creds == Real creds\n");
855 BUG();
856}
857EXPORT_SYMBOL(__validate_process_creds);
858
859/*
860 * check creds for do_exit()
861 */
862void validate_creds_for_do_exit(struct task_struct *tsk)
863{
864 kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
865 tsk->real_cred, tsk->cred,
866 atomic_read(&tsk->cred->usage),
867 read_cred_subscribers(tsk->cred));
868
869 __validate_process_creds(tsk, __FILE__, __LINE__);
870}
871
872#endif /* CONFIG_DEBUG_CREDENTIALS */
diff --git a/kernel/exit.c b/kernel/exit.c
index 869dc221733e..ae5d8660ddff 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -901,6 +901,8 @@ NORET_TYPE void do_exit(long code)
901 901
902 tracehook_report_exit(&code); 902 tracehook_report_exit(&code);
903 903
904 validate_creds_for_do_exit(tsk);
905
904 /* 906 /*
905 * We're taking recursive faults here in do_exit. Safest is to just 907 * We're taking recursive faults here in do_exit. Safest is to just
906 * leave this task alone and wait for reboot. 908 * leave this task alone and wait for reboot.
@@ -1009,7 +1011,10 @@ NORET_TYPE void do_exit(long code)
1009 if (tsk->splice_pipe) 1011 if (tsk->splice_pipe)
1010 __free_pipe_info(tsk->splice_pipe); 1012 __free_pipe_info(tsk->splice_pipe);
1011 1013
1014 validate_creds_for_do_exit(tsk);
1015
1012 preempt_disable(); 1016 preempt_disable();
1017 exit_rcu();
1013 /* causes final put_task_struct in finish_task_switch(). */ 1018 /* causes final put_task_struct in finish_task_switch(). */
1014 tsk->state = TASK_DEAD; 1019 tsk->state = TASK_DEAD;
1015 schedule(); 1020 schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index e6c04d462ab2..bfee931ee3fb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -152,8 +152,7 @@ void __put_task_struct(struct task_struct *tsk)
152 WARN_ON(atomic_read(&tsk->usage)); 152 WARN_ON(atomic_read(&tsk->usage));
153 WARN_ON(tsk == current); 153 WARN_ON(tsk == current);
154 154
155 put_cred(tsk->real_cred); 155 exit_creds(tsk);
156 put_cred(tsk->cred);
157 delayacct_tsk_free(tsk); 156 delayacct_tsk_free(tsk);
158 157
159 if (!profile_handoff_task(tsk)) 158 if (!profile_handoff_task(tsk))
@@ -1008,10 +1007,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1008 copy_flags(clone_flags, p); 1007 copy_flags(clone_flags, p);
1009 INIT_LIST_HEAD(&p->children); 1008 INIT_LIST_HEAD(&p->children);
1010 INIT_LIST_HEAD(&p->sibling); 1009 INIT_LIST_HEAD(&p->sibling);
1011#ifdef CONFIG_PREEMPT_RCU 1010 rcu_copy_process(p);
1012 p->rcu_read_lock_nesting = 0;
1013 p->rcu_flipctr_idx = 0;
1014#endif /* #ifdef CONFIG_PREEMPT_RCU */
1015 p->vfork_done = NULL; 1011 p->vfork_done = NULL;
1016 spin_lock_init(&p->alloc_lock); 1012 spin_lock_init(&p->alloc_lock);
1017 1013
@@ -1297,8 +1293,7 @@ bad_fork_cleanup_put_domain:
1297 module_put(task_thread_info(p)->exec_domain->module); 1293 module_put(task_thread_info(p)->exec_domain->module);
1298bad_fork_cleanup_count: 1294bad_fork_cleanup_count:
1299 atomic_dec(&p->cred->user->processes); 1295 atomic_dec(&p->cred->user->processes);
1300 put_cred(p->real_cred); 1296 exit_creds(p);
1301 put_cred(p->cred);
1302bad_fork_free: 1297bad_fork_free:
1303 free_task(p); 1298 free_task(p);
1304fork_out: 1299fork_out:
diff --git a/kernel/futex.c b/kernel/futex.c
index e18cfbdc7190..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
115 /* rt_waiter storage for requeue_pi: */ 115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 116 struct rt_mutex_waiter *rt_waiter;
117 117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key;
120
118 /* Bitset for the optional bitmasked wakeup */ 121 /* Bitset for the optional bitmasked wakeup */
119 u32 bitset; 122 u32 bitset;
120}; 123};
@@ -1089,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1089 if (!top_waiter) 1092 if (!top_waiter)
1090 return 0; 1093 return 0;
1091 1094
1095 /* Ensure we requeue to the expected futex. */
1096 if (!match_futex(top_waiter->requeue_pi_key, key2))
1097 return -EINVAL;
1098
1092 /* 1099 /*
1093 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1100 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1094 * the contended case or if set_waiters is 1. The pi_state is returned 1101 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1276,6 +1283,12 @@ retry_private:
1276 continue; 1283 continue;
1277 } 1284 }
1278 1285
1286 /* Ensure we requeue to the expected futex for requeue_pi. */
1287 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1288 ret = -EINVAL;
1289 break;
1290 }
1291
1279 /* 1292 /*
1280 * Requeue nr_requeue waiters and possibly one more in the case 1293 * Requeue nr_requeue waiters and possibly one more in the case
1281 * of requeue_pi if we couldn't acquire the lock atomically. 1294 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1751,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1751 q.pi_state = NULL; 1764 q.pi_state = NULL;
1752 q.bitset = bitset; 1765 q.bitset = bitset;
1753 q.rt_waiter = NULL; 1766 q.rt_waiter = NULL;
1767 q.requeue_pi_key = NULL;
1754 1768
1755 if (abs_time) { 1769 if (abs_time) {
1756 to = &timeout; 1770 to = &timeout;
@@ -1858,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1858 1872
1859 q.pi_state = NULL; 1873 q.pi_state = NULL;
1860 q.rt_waiter = NULL; 1874 q.rt_waiter = NULL;
1875 q.requeue_pi_key = NULL;
1861retry: 1876retry:
1862 q.key = FUTEX_KEY_INIT; 1877 q.key = FUTEX_KEY_INIT;
1863 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1878 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2118,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2118 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2133 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2119 * via the following: 2134 * via the following:
2120 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2135 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2121 * 2) wakeup on uaddr2 after a requeue and subsequent unlock 2136 * 2) wakeup on uaddr2 after a requeue
2122 * 3) signal (before or after requeue) 2137 * 3) signal
2123 * 4) timeout (before or after requeue) 2138 * 4) timeout
2124 * 2139 *
2125 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. 2140 * If 3, cleanup and return -ERESTARTNOINTR.
2126 * 2141 *
2127 * If 2, we may then block on trying to take the rt_mutex and return via: 2142 * If 2, we may then block on trying to take the rt_mutex and return via:
2128 * 5) successful lock 2143 * 5) successful lock
@@ -2130,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2130 * 7) timeout 2145 * 7) timeout
2131 * 8) other lock acquisition failure 2146 * 8) other lock acquisition failure
2132 * 2147 *
2133 * If 6, we setup a restart_block with futex_lock_pi() as the function. 2148 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2134 * 2149 *
2135 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2150 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2136 * 2151 *
@@ -2169,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2169 debug_rt_mutex_init_waiter(&rt_waiter); 2184 debug_rt_mutex_init_waiter(&rt_waiter);
2170 rt_waiter.task = NULL; 2185 rt_waiter.task = NULL;
2171 2186
2172 q.pi_state = NULL;
2173 q.bitset = bitset;
2174 q.rt_waiter = &rt_waiter;
2175
2176 key2 = FUTEX_KEY_INIT; 2187 key2 = FUTEX_KEY_INIT;
2177 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2188 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2178 if (unlikely(ret != 0)) 2189 if (unlikely(ret != 0))
2179 goto out; 2190 goto out;
2180 2191
2192 q.pi_state = NULL;
2193 q.bitset = bitset;
2194 q.rt_waiter = &rt_waiter;
2195 q.requeue_pi_key = &key2;
2196
2181 /* Prepare to wait on uaddr. */ 2197 /* Prepare to wait on uaddr. */
2182 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2198 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2183 if (ret) 2199 if (ret)
@@ -2248,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2248 rt_mutex_unlock(pi_mutex); 2264 rt_mutex_unlock(pi_mutex);
2249 } else if (ret == -EINTR) { 2265 } else if (ret == -EINTR) {
2250 /* 2266 /*
2251 * We've already been requeued, but we have no way to 2267 * We've already been requeued, but cannot restart by calling
2252 * restart by calling futex_lock_pi() directly. We 2268 * futex_lock_pi() directly. We could restart this syscall, but
2253 * could restart the syscall, but that will look at 2269 * it would detect that the user space "val" changed and return
2254 * the user space value and return right away. So we 2270 * -EWOULDBLOCK. Save the overhead of the restart and return
2255 * drop back with EWOULDBLOCK to tell user space that 2271 * -EWOULDBLOCK directly.
2256 * "val" has been changed. That's the same what the
2257 * restart of the syscall would do in
2258 * futex_wait_setup().
2259 */ 2272 */
2260 ret = -EWOULDBLOCK; 2273 ret = -EWOULDBLOCK;
2261 } 2274 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
222} 222}
223EXPORT_SYMBOL(set_irq_chip_data); 223EXPORT_SYMBOL(set_irq_chip_data);
224 224
225/**
226 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
227 *
228 * @irq: Interrupt number
229 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
230 *
231 * The IRQ_NESTED_THREAD flag indicates that on
232 * request_threaded_irq() no separate interrupt thread should be
233 * created for the irq as the handler are called nested in the
234 * context of a demultiplexing interrupt handler thread.
235 */
236void set_irq_nested_thread(unsigned int irq, int nest)
237{
238 struct irq_desc *desc = irq_to_desc(irq);
239 unsigned long flags;
240
241 if (!desc)
242 return;
243
244 spin_lock_irqsave(&desc->lock, flags);
245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD;
247 else
248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags);
250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252
225/* 253/*
226 * default enable function 254 * default enable function
227 */ 255 */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
299 } 327 }
300} 328}
301 329
330/*
331 * handle_nested_irq - Handle a nested irq from a irq thread
332 * @irq: the interrupt number
333 *
334 * Handle interrupts which are nested into a threaded interrupt
335 * handler. The handler function is called inside the calling
336 * threads context.
337 */
338void handle_nested_irq(unsigned int irq)
339{
340 struct irq_desc *desc = irq_to_desc(irq);
341 struct irqaction *action;
342 irqreturn_t action_ret;
343
344 might_sleep();
345
346 spin_lock_irq(&desc->lock);
347
348 kstat_incr_irqs_this_cpu(irq, desc);
349
350 action = desc->action;
351 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
352 goto out_unlock;
353
354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock);
356
357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret);
360
361 spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS;
363
364out_unlock:
365 spin_unlock_irq(&desc->lock);
366}
367EXPORT_SYMBOL_GPL(handle_nested_irq);
368
302/** 369/**
303 * handle_simple_irq - Simple and software-decoded IRQs. 370 * handle_simple_irq - Simple and software-decoded IRQs.
304 * @irq: the interrupt number 371 * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
382 449
383 spin_lock(&desc->lock); 450 spin_lock(&desc->lock);
384 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
385 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 452
453 if (unlikely(desc->status & IRQ_ONESHOT))
454 desc->status |= IRQ_MASKED;
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
386 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
387out_unlock: 457out_unlock:
388 spin_unlock(&desc->lock); 458 spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
572 desc->chip = &dummy_irq_chip; 642 desc->chip = &dummy_irq_chip;
573 } 643 }
574 644
645 chip_bus_lock(irq, desc);
575 spin_lock_irqsave(&desc->lock, flags); 646 spin_lock_irqsave(&desc->lock, flags);
576 647
577 /* Uninstall? */ 648 /* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
591 desc->chip->startup(irq); 662 desc->chip->startup(irq);
592 } 663 }
593 spin_unlock_irqrestore(&desc->lock, flags); 664 spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc);
594} 666}
595EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
596 668
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
161 161
162 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
163 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node; 164 node = first_online_node;
165 165
166 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
172 172
173 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
174 desc[i].irq = i; 174 desc[i].irq = i;
175#ifdef CONFIG_SMP
176 desc[i].node = node;
177#endif
175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 178 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 179 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
177 alloc_desc_masks(&desc[i], node, true); 180 alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e70ed5592eb9..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -44,6 +44,19 @@ extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void irq_set_thread_affinity(struct irq_desc *desc); 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46 46
47/* Inline functions for support of irq chips on slow busses */
48static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
49{
50 if (unlikely(desc->chip->bus_lock))
51 desc->chip->bus_lock(irq);
52}
53
54static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
55{
56 if (unlikely(desc->chip->bus_sync_unlock))
57 desc->chip->bus_sync_unlock(irq);
58}
59
47/* 60/*
48 * Debugging printout: 61 * Debugging printout:
49 */ 62 */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0ec9ed831737..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -230,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
230 if (!desc) 230 if (!desc)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc);
233 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
234 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
235 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
236} 238}
237EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
238 240
@@ -294,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
294 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
295 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
296 * 298 *
297 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
298 */ 301 */
299void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
300{ 303{
@@ -304,9 +307,11 @@ void enable_irq(unsigned int irq)
304 if (!desc) 307 if (!desc)
305 return; 308 return;
306 309
310 chip_bus_lock(irq, desc);
307 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
308 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
309 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
310} 315}
311EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
312 317
@@ -436,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 return ret; 441 return ret;
437} 442}
438 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
439static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
440{ 465{
441 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -451,6 +476,23 @@ static int irq_wait_for_interrupt(struct irqaction *action)
451 return -1; 476 return -1;
452} 477}
453 478
479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler finished. unmask if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
454#ifdef CONFIG_SMP 496#ifdef CONFIG_SMP
455/* 497/*
456 * Check whether we need to change the affinity of the interrupt thread. 498 * Check whether we need to change the affinity of the interrupt thread.
@@ -492,7 +534,7 @@ static int irq_thread(void *data)
492 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
493 struct irqaction *action = data; 535 struct irqaction *action = data;
494 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
495 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
496 538
497 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
498 current->irqaction = action; 540 current->irqaction = action;
@@ -518,6 +560,9 @@ static int irq_thread(void *data)
518 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
519 561
520 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
521 } 566 }
522 567
523 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -565,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
565 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
566 const char *old_name = NULL; 611 const char *old_name = NULL;
567 unsigned long flags; 612 unsigned long flags;
568 int shared = 0; 613 int nested, shared = 0;
569 int ret; 614 int ret;
570 615
571 if (!desc) 616 if (!desc)
@@ -590,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
590 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
591 } 636 }
592 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided from
652 * the driver for non nested interrupt handling by the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
593 /* 658 /*
594 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
595 */ 662 */
596 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
597 struct task_struct *t; 664 struct task_struct *t;
598 665
599 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -662,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
662 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
663#endif 730#endif
664 731
665 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
666 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
667 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
668 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
669 desc->depth = 0; 739 desc->depth = 0;
670 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -875,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
875 */ 945 */
876void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
877{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
878 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
879} 956}
880EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
881 958
@@ -884,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
884 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
885 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
886 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
887 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
888 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
889 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -963,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
963 1042
964 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
965 return -EINVAL; 1044 return -EINVAL;
966 if (!handler) 1045
967 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
968 1051
969 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
970 if (!action) 1053 if (!action)
@@ -976,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
976 action->name = devname; 1059 action->name = devname;
977 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
978 1061
1062 chip_bus_lock(irq, desc);
979 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
980 if (retval) 1066 if (retval)
981 kfree(action); 1067 kfree(action);
982 1068
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a92280870e30..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -80,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
81 static int kmod_loop_msg; 81 static int kmod_loop_msg;
82 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
83 va_start(args, fmt); 87 va_start(args, fmt);
84 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
85 va_end(args); 89 va_end(args);
@@ -466,6 +470,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
466 int retval = 0; 470 int retval = 0;
467 471
468 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
469 474
470 helper_lock(); 475 helper_lock();
471 if (sub_info->path[0] == '\0') 476 if (sub_info->path[0] == '\0')
diff --git a/kernel/kthread.c b/kernel/kthread.c
index eb8751aa0418..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -221,7 +218,6 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 220 ignore_signals(tsk);
224 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
226 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
227 223
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..f74d2d7aa605 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
371 * Some daft arches put -1 at the end to indicate its a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 840}
899 841
900/* 842/*
843 * For good efficiency of modular, we use power of 2
844 */
845#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
846#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
847
848/*
849 * The circular_queue and helpers is used to implement the
850 * breadth-first search(BFS)algorithem, by which we can build
851 * the shortest path from the next lock to be acquired to the
852 * previous held lock if there is a circular between them.
853 */
854struct circular_queue {
855 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
856 unsigned int front, rear;
857};
858
859static struct circular_queue lock_cq;
860
861unsigned int max_bfs_queue_depth;
862
863static unsigned int lockdep_dependency_gen_id;
864
865static inline void __cq_init(struct circular_queue *cq)
866{
867 cq->front = cq->rear = 0;
868 lockdep_dependency_gen_id++;
869}
870
871static inline int __cq_empty(struct circular_queue *cq)
872{
873 return (cq->front == cq->rear);
874}
875
876static inline int __cq_full(struct circular_queue *cq)
877{
878 return ((cq->rear + 1) & CQ_MASK) == cq->front;
879}
880
881static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
882{
883 if (__cq_full(cq))
884 return -1;
885
886 cq->element[cq->rear] = elem;
887 cq->rear = (cq->rear + 1) & CQ_MASK;
888 return 0;
889}
890
891static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
892{
893 if (__cq_empty(cq))
894 return -1;
895
896 *elem = cq->element[cq->front];
897 cq->front = (cq->front + 1) & CQ_MASK;
898 return 0;
899}
900
901static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
902{
903 return (cq->rear - cq->front) & CQ_MASK;
904}
905
906static inline void mark_lock_accessed(struct lock_list *lock,
907 struct lock_list *parent)
908{
909 unsigned long nr;
910
911 nr = lock - list_entries;
912 WARN_ON(nr >= nr_list_entries);
913 lock->parent = parent;
914 lock->class->dep_gen_id = lockdep_dependency_gen_id;
915}
916
917static inline unsigned long lock_accessed(struct lock_list *lock)
918{
919 unsigned long nr;
920
921 nr = lock - list_entries;
922 WARN_ON(nr >= nr_list_entries);
923 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
924}
925
926static inline struct lock_list *get_lock_parent(struct lock_list *child)
927{
928 return child->parent;
929}
930
931static inline int get_lock_depth(struct lock_list *child)
932{
933 int depth = 0;
934 struct lock_list *parent;
935
936 while ((parent = get_lock_parent(child))) {
937 child = parent;
938 depth++;
939 }
940 return depth;
941}
942
943static int __bfs(struct lock_list *source_entry,
944 void *data,
945 int (*match)(struct lock_list *entry, void *data),
946 struct lock_list **target_entry,
947 int forward)
948{
949 struct lock_list *entry;
950 struct list_head *head;
951 struct circular_queue *cq = &lock_cq;
952 int ret = 1;
953
954 if (match(source_entry, data)) {
955 *target_entry = source_entry;
956 ret = 0;
957 goto exit;
958 }
959
960 if (forward)
961 head = &source_entry->class->locks_after;
962 else
963 head = &source_entry->class->locks_before;
964
965 if (list_empty(head))
966 goto exit;
967
968 __cq_init(cq);
969 __cq_enqueue(cq, (unsigned long)source_entry);
970
971 while (!__cq_empty(cq)) {
972 struct lock_list *lock;
973
974 __cq_dequeue(cq, (unsigned long *)&lock);
975
976 if (!lock->class) {
977 ret = -2;
978 goto exit;
979 }
980
981 if (forward)
982 head = &lock->class->locks_after;
983 else
984 head = &lock->class->locks_before;
985
986 list_for_each_entry(entry, head, entry) {
987 if (!lock_accessed(entry)) {
988 unsigned int cq_depth;
989 mark_lock_accessed(entry, lock);
990 if (match(entry, data)) {
991 *target_entry = entry;
992 ret = 0;
993 goto exit;
994 }
995
996 if (__cq_enqueue(cq, (unsigned long)entry)) {
997 ret = -1;
998 goto exit;
999 }
1000 cq_depth = __cq_get_elem_count(cq);
1001 if (max_bfs_queue_depth < cq_depth)
1002 max_bfs_queue_depth = cq_depth;
1003 }
1004 }
1005 }
1006exit:
1007 return ret;
1008}
1009
1010static inline int __bfs_forwards(struct lock_list *src_entry,
1011 void *data,
1012 int (*match)(struct lock_list *entry, void *data),
1013 struct lock_list **target_entry)
1014{
1015 return __bfs(src_entry, data, match, target_entry, 1);
1016
1017}
1018
1019static inline int __bfs_backwards(struct lock_list *src_entry,
1020 void *data,
1021 int (*match)(struct lock_list *entry, void *data),
1022 struct lock_list **target_entry)
1023{
1024 return __bfs(src_entry, data, match, target_entry, 0);
1025
1026}
1027
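Taken on its own, the machinery above is textbook BFS over an adjacency structure: a power-of-two ring buffer serves as the work queue and a generation counter stands in for a per-search "visited" flag. A stand-alone user-space sketch of that technique follows; the toy graph and all names are invented for illustration and are not taken from the patch:

#include <stdio.h>

#define QSIZE 16UL                 /* power of 2, so the wrap is a cheap mask */
#define QMASK (QSIZE - 1)
#define NNODES 6

struct queue {
	unsigned int elem[QSIZE];
	unsigned int front, rear;
};

static int q_empty(struct queue *q) { return q->front == q->rear; }
static int q_full(struct queue *q)  { return ((q->rear + 1) & QMASK) == q->front; }

static int q_push(struct queue *q, unsigned int v)
{
	if (q_full(q))
		return -1;
	q->elem[q->rear] = v;
	q->rear = (q->rear + 1) & QMASK;
	return 0;
}

static int q_pop(struct queue *q, unsigned int *v)
{
	if (q_empty(q))
		return -1;
	*v = q->elem[q->front];
	q->front = (q->front + 1) & QMASK;
	return 0;
}

/* adjacency matrix of a small toy graph: 0->1, 1->2, 1->4, 2->3, 4->5 */
static const int adj[NNODES][NNODES] = {
	[0] = { [1] = 1 },
	[1] = { [2] = 1, [4] = 1 },
	[2] = { [3] = 1 },
	[4] = { [5] = 1 },
};

static unsigned int gen_id;          /* bumped per search, like dep_gen_id */
static unsigned int visited[NNODES];
static int parent[NNODES];           /* BFS tree, lets us print the path */

/* return 0 and fill parent[] if target is reachable from src, 1 otherwise */
static int bfs(int src, int target)
{
	struct queue q = { .front = 0, .rear = 0 };
	unsigned int node;

	gen_id++;
	visited[src] = gen_id;
	parent[src] = -1;
	q_push(&q, src);

	while (!q_pop(&q, &node)) {
		for (int next = 0; next < NNODES; next++) {
			if (!adj[node][next] || visited[next] == gen_id)
				continue;
			visited[next] = gen_id;
			parent[next] = node;
			if (next == target)
				return 0;
			if (q_push(&q, next))
				return -1;        /* queue overflow */
		}
	}
	return 1;
}

int main(void)
{
	if (bfs(0, 5) == 0) {
		/* walk the parent chain backwards, like print_circular_bug() */
		for (int n = 5; n != -1; n = parent[n])
			printf("%d%s", n, parent[n] == -1 ? "\n" : " <- ");
	}
	return 0;
}

Bumping the generation counter at the start of each search means the visited state never has to be cleared, which is the same trick the patch plays with dep_gen_id in __cq_init()/mark_lock_accessed().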
1028/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1029 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1030 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1031 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1032 */
909static struct held_lock *check_source, *check_target;
910 1033
911/* 1034/*
912 * Print a dependency chain entry (this is only done when a deadlock 1035 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1036 * has been detected):
914 */ 1037 */
915static noinline int 1038static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1039print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1040{
918 if (debug_locks_silent) 1041 if (debug_locks_silent)
919 return 0; 1042 return 0;
@@ -930,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1053 * header first:
931 */ 1054 */
932static noinline int 1055static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1056print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1057 struct held_lock *check_src,
1058 struct held_lock *check_tgt)
934{ 1059{
935 struct task_struct *curr = current; 1060 struct task_struct *curr = current;
936 1061
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1062 if (debug_locks_silent)
938 return 0; 1063 return 0;
939 1064
940 printk("\n=======================================================\n"); 1065 printk("\n=======================================================\n");
@@ -943,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1068 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1069 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1070 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1071 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1072 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1073 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1074 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1075 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1076
@@ -954,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1079 return 0;
955} 1080}
956 1081
957static noinline int print_circular_bug_tail(void) 1082static inline int class_equal(struct lock_list *entry, void *data)
1083{
1084 return entry->class == data;
1085}
1086
1087static noinline int print_circular_bug(struct lock_list *this,
1088 struct lock_list *target,
1089 struct held_lock *check_src,
1090 struct held_lock *check_tgt)
958{ 1091{
959 struct task_struct *curr = current; 1092 struct task_struct *curr = current;
960 struct lock_list this; 1093 struct lock_list *parent;
1094 int depth;
961 1095
962 if (debug_locks_silent) 1096 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1097 return 0;
964 1098
965 this.class = hlock_class(check_source); 1099 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1100 return 0;
968 1101
969 print_circular_bug_entry(&this, 0); 1102 depth = get_lock_depth(target);
1103
1104 print_circular_bug_header(target, depth, check_src, check_tgt);
1105
1106 parent = get_lock_parent(target);
1107
1108 while (parent) {
1109 print_circular_bug_entry(parent, --depth);
1110 parent = get_lock_parent(parent);
1111 }
970 1112
971 printk("\nother info that might help us debug this:\n\n"); 1113 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1114 lockdep_print_held_locks(curr);
@@ -977,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1119 return 0;
978} 1120}
979 1121
980#define RECURSION_LIMIT 40 1122static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1123{
984 if (!debug_locks_off_graph_unlock()) 1124 if (!debug_locks_off_graph_unlock())
985 return 0; 1125 return 0;
986 1126
987 WARN_ON(1); 1127 WARN(1, "lockdep bfs error:%d\n", ret);
988 1128
989 return 0; 1129 return 0;
990} 1130}
991 1131
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1132static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1133{
995 struct lock_list *entry; 1134 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1135 return 0;
1136}
997 1137
998 if (lockdep_dependency_visit(class, depth)) 1138unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1139{
1140 unsigned long count = 0;
1141 struct lock_list *uninitialized_var(target_entry);
1000 1142
1001 /* 1143 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1144
1007 return ret; 1145 return count;
1008} 1146}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1147unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1148{
1012 unsigned long ret, flags; 1149 unsigned long ret, flags;
1150 struct lock_list this;
1151
1152 this.parent = NULL;
1153 this.class = class;
1013 1154
1014 local_irq_save(flags); 1155 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1156 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1157 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1158 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1159 local_irq_restore(flags);
1019 1160
1020 return ret; 1161 return ret;
1021} 1162}
1022 1163
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1164unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1165{
1026 struct lock_list *entry; 1166 unsigned long count = 0;
1027 unsigned long ret = 1; 1167 struct lock_list *uninitialized_var(target_entry);
1028 1168
1029 if (lockdep_dependency_visit(class, depth)) 1169 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1170
1037 return ret; 1171 return count;
1038} 1172}
1039 1173
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1174unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1175{
1042 unsigned long ret, flags; 1176 unsigned long ret, flags;
1177 struct lock_list this;
1178
1179 this.parent = NULL;
1180 this.class = class;
1043 1181
1044 local_irq_save(flags); 1182 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1183 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1184 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1185 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1186 local_irq_restore(flags);
1049 1187
@@ -1055,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1193 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1194 */
1057static noinline int 1195static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1196check_noncircular(struct lock_list *root, struct lock_class *target,
1197 struct lock_list **target_entry)
1059{ 1198{
1060 struct lock_list *entry; 1199 int result;
1061 1200
1062 if (lockdep_dependency_visit(source, depth)) 1201 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1202
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1203 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1204
1067 max_recursion_depth = depth; 1205 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1206}
1082 1207
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1208#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1211 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1212 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1213 */
1089static enum lock_usage_bit find_usage_bit; 1214
1090static struct lock_class *forwards_match, *backwards_match; 1215static inline int usage_match(struct lock_list *entry, void *bit)
1216{
1217 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1218}
1219
1220
1091 1221
1092/* 1222/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1223 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1224 * at @root->class that matches @bit.
1095 * 1225 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1226 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1227 * into *@target_entry.
1098 * 1228 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1229 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1230 * Return <0 on error.
1101 */ 1231 */
1102static noinline int 1232static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1233find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1234 struct lock_list **target_entry)
1104{ 1235{
1105 struct lock_list *entry; 1236 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1237
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1238 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1239
1122 /* 1240 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1241
1124 */ 1242 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1243}
1133 1244
1134/* 1245/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1246 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1247 * at @root->class that matches @bit.
1137 * 1248 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1249 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1250 * into *@target_entry.
1140 * 1251 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1252 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1253 * Return <0 on error.
1143 */ 1254 */
1144static noinline int 1255static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1256find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1257 struct lock_list **target_entry)
1146{ 1258{
1147 struct lock_list *entry; 1259 int result;
1148 int ret;
1149 1260
1150 if (lockdep_dependency_visit(source, depth)) 1261 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1262
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1263 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1264
1156 if (depth > max_recursion_depth) 1265 return result;
1157 max_recursion_depth = depth; 1266}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1267
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1268static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1269{
1163 backwards_match = source; 1270 int bit;
1164 return 2;
1165 }
1166 1271
1167 if (!source && debug_locks_off_graph_unlock()) { 1272 printk("%*s->", depth, "");
1168 WARN_ON(1); 1273 print_lock_name(class);
1169 return 0; 1274 printk(" ops: %lu", class->ops);
1170 } 1275 printk(" {\n");
1171 1276
1172 /* 1277 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1278 if (class->usage_mask & (1 << bit)) {
1174 */ 1279 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1280
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1281 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1282 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1283 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1284 }
1180 } 1285 }
1181 return 1; 1286 printk("%*s }\n", depth, "");
1287
1288 printk("%*s ... key at: ",depth,"");
1289 print_ip_sym((unsigned long)class->key);
1290}
1291
1292/*
1293 * printk the shortest lock dependencies from @start to @end in reverse order:
1294 */
1295static void __used
1296print_shortest_lock_dependencies(struct lock_list *leaf,
1297 struct lock_list *root)
1298{
1299 struct lock_list *entry = leaf;
1300 int depth;
1301
 1302 /* compute depth from the tree generated by BFS */
1303 depth = get_lock_depth(leaf);
1304
1305 do {
1306 print_lock_class_header(entry->class, depth);
1307 printk("%*s ... acquired at:\n", depth, "");
1308 print_stack_trace(&entry->trace, 2);
1309 printk("\n");
1310
1311 if (depth == 0 && (entry != root)) {
 1312 printk("lockdep:%s bad BFS-generated tree\n", __func__);
1313 break;
1314 }
1315
1316 entry = get_lock_parent(entry);
1317 depth--;
1318 } while (entry && (depth >= 0));
1319
1320 return;
1182} 1321}
1183 1322
1184static int 1323static int
1185print_bad_irq_dependency(struct task_struct *curr, 1324print_bad_irq_dependency(struct task_struct *curr,
1325 struct lock_list *prev_root,
1326 struct lock_list *next_root,
1327 struct lock_list *backwards_entry,
1328 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1329 struct held_lock *prev,
1187 struct held_lock *next, 1330 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1331 enum lock_usage_bit bit1,
@@ -1215,26 +1358,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1358
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1359 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1360 irqclass);
1218 print_lock_name(backwards_match); 1361 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1362 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1363
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1364 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1365
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1366 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1367 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1368 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1369 printk("...");
1227 1370
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1371 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1372
1230 printk("\nother info that might help us debug this:\n\n"); 1373 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1374 lockdep_print_held_locks(curr);
1232 1375
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1376 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1377 printk(" and the holding lock:\n");
1378 if (!save_trace(&prev_root->trace))
1379 return 0;
1380 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1381
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1382 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1383 printk(" and %s-irq-unsafe lock:\n", irqclass);
1384 if (!save_trace(&next_root->trace))
1385 return 0;
1386 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1387
1239 printk("\nstack backtrace:\n"); 1388 printk("\nstack backtrace:\n");
1240 dump_stack(); 1389 dump_stack();
@@ -1248,19 +1397,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1397 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1398{
1250 int ret; 1399 int ret;
1400 struct lock_list this, that;
1401 struct lock_list *uninitialized_var(target_entry);
1402 struct lock_list *uninitialized_var(target_entry1);
1251 1403
1252 find_usage_bit = bit_backwards; 1404 this.parent = NULL;
1253 /* fills in <backwards_match> */ 1405
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1406 this.class = hlock_class(prev);
1255 if (!ret || ret == 1) 1407 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1408 if (ret < 0)
1409 return print_bfs_bug(ret);
1410 if (ret == 1)
1256 return ret; 1411 return ret;
1257 1412
1258 find_usage_bit = bit_forwards; 1413 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1414 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1415 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1416 if (ret < 0)
1417 return print_bfs_bug(ret);
1418 if (ret == 1)
1261 return ret; 1419 return ret;
1262 /* ret == 2 */ 1420
1263 return print_bad_irq_dependency(curr, prev, next, 1421 return print_bad_irq_dependency(curr, &this, &that,
1422 target_entry, target_entry1,
1423 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1424 bit_backwards, bit_forwards, irqclass);
1265} 1425}
1266 1426
@@ -1472,6 +1632,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1632{
1473 struct lock_list *entry; 1633 struct lock_list *entry;
1474 int ret; 1634 int ret;
1635 struct lock_list this;
1636 struct lock_list *uninitialized_var(target_entry);
1475 1637
1476 /* 1638 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1639 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1644,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1644 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1645 * keep the stackframe size of the recursive functions low:
1484 */ 1646 */
1485 check_source = next; 1647 this.class = hlock_class(next);
1486 check_target = prev; 1648 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1649 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1650 if (unlikely(!ret))
1651 return print_circular_bug(&this, target_entry, next, prev);
1652 else if (unlikely(ret < 0))
1653 return print_bfs_bug(ret);
1489 1654
1490 if (!check_prev_add_irq(curr, prev, next)) 1655 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1656 return 0;
@@ -1884,7 +2049,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2049 * print irq inversion bug:
1885 */ 2050 */
1886static int 2051static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2052print_irq_inversion_bug(struct task_struct *curr,
2053 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2054 struct held_lock *this, int forwards,
1889 const char *irqclass) 2055 const char *irqclass)
1890{ 2056{
@@ -1902,17 +2068,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2068 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2069 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2070 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2071 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2072 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2073
1908 printk("\nother info that might help us debug this:\n"); 2074 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2075 lockdep_print_held_locks(curr);
1910 2076
1911 printk("\nthe first lock's dependencies:\n"); 2077 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2078 if (!save_trace(&root->trace))
1913 2079 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2080 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2081
1917 printk("\nstack backtrace:\n"); 2082 printk("\nstack backtrace:\n");
1918 dump_stack(); 2083 dump_stack();
@@ -1929,14 +2094,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2094 enum lock_usage_bit bit, const char *irqclass)
1930{ 2095{
1931 int ret; 2096 int ret;
1932 2097 struct lock_list root;
1933 find_usage_bit = bit; 2098 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2099
1935 ret = find_usage_forwards(hlock_class(this), 0); 2100 root.parent = NULL;
1936 if (!ret || ret == 1) 2101 root.class = hlock_class(this);
2102 ret = find_usage_forwards(&root, bit, &target_entry);
2103 if (ret < 0)
2104 return print_bfs_bug(ret);
2105 if (ret == 1)
1937 return ret; 2106 return ret;
1938 2107
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2108 return print_irq_inversion_bug(curr, &root, target_entry,
2109 this, 1, irqclass);
1940} 2110}
1941 2111
1942/* 2112/*
@@ -1948,14 +2118,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2118 enum lock_usage_bit bit, const char *irqclass)
1949{ 2119{
1950 int ret; 2120 int ret;
1951 2121 struct lock_list root;
1952 find_usage_bit = bit; 2122 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2123
1954 ret = find_usage_backwards(hlock_class(this), 0); 2124 root.parent = NULL;
1955 if (!ret || ret == 1) 2125 root.class = hlock_class(this);
2126 ret = find_usage_backwards(&root, bit, &target_entry);
2127 if (ret < 0)
2128 return print_bfs_bug(ret);
2129 if (ret == 1)
1956 return ret; 2130 return ret;
1957 2131
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2132 return print_irq_inversion_bug(curr, &root, target_entry,
 2133 this, 0, irqclass);
1959} 2134}
1960 2135
1961void print_irqtrace_events(struct task_struct *curr) 2136void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2705,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2705 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2706static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2707 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2708 struct lockdep_map *nest_lock, unsigned long ip,
2709 int references)
2534{ 2710{
2535 struct task_struct *curr = current; 2711 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2712 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2713 struct held_lock *hlock;
2538 unsigned int depth, id; 2714 unsigned int depth, id;
2539 int chain_head = 0; 2715 int chain_head = 0;
2716 int class_idx;
2540 u64 chain_key; 2717 u64 chain_key;
2541 2718
2542 if (!prove_locking) 2719 if (!prove_locking)
@@ -2584,10 +2761,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2761 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2762 return 0;
2586 2763
2764 class_idx = class - lock_classes + 1;
2765
2766 if (depth) {
2767 hlock = curr->held_locks + depth - 1;
2768 if (hlock->class_idx == class_idx && nest_lock) {
2769 if (hlock->references)
2770 hlock->references++;
2771 else
2772 hlock->references = 2;
2773
2774 return 1;
2775 }
2776 }
2777
2587 hlock = curr->held_locks + depth; 2778 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2779 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2780 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2781 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2782 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2783 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2784 hlock->nest_lock = nest_lock;
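The hunk above changes what happens when a task re-acquires a lock class it already holds under a nest_lock: instead of consuming another held_locks slot, the existing entry is reference-counted. A rough user-space model of that bookkeeping (struct and function names are made up for illustration, and the real code also matches on instance and nest_lock):

#include <assert.h>
#include <stdio.h>

#define MAX_DEPTH 8

struct held {
	int class_idx;
	int references;   /* 0 = plain entry, >= 2 = reference-counted */
};

static struct held stack[MAX_DEPTH];
static int depth;

/* push a new entry, or bump the top entry if it is the same class and nested */
static int model_acquire(int class_idx, int nested)
{
	if (depth && nested && stack[depth - 1].class_idx == class_idx) {
		struct held *top = &stack[depth - 1];

		top->references = top->references ? top->references + 1 : 2;
		return depth;
	}
	assert(depth < MAX_DEPTH);
	stack[depth].class_idx = class_idx;
	stack[depth].references = 0;
	return ++depth;
}

/* drop one reference; only pop the entry when no references remain */
static int model_release(int class_idx)
{
	struct held *top = &stack[depth - 1];

	assert(depth && top->class_idx == class_idx);
	if (top->references && --top->references)
		return depth;
	return --depth;
}

int main(void)
{
	model_acquire(1, 0);          /* depth 1, plain entry */
	model_acquire(1, 1);          /* same class, nested: still depth 1 */
	model_acquire(1, 1);          /* references now 3, still depth 1 */
	printf("depth after 3 acquires: %d\n", depth);
	model_release(1);
	model_release(1);
	model_release(1);
	printf("depth after 3 releases: %d\n", depth);
	return 0;
}

Only when the last reference is dropped does the entry actually leave the stack, which mirrors the early return added to lock_release_non_nested() further down.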
@@ -2595,6 +2786,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2786 hlock->read = read;
2596 hlock->check = check; 2787 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2788 hlock->hardirqs_off = !!hardirqs_off;
2789 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2790#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2791 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2792 hlock->holdtime_stamp = sched_clock();
@@ -2703,6 +2895,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2895 return 1;
2704} 2896}
2705 2897
2898static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2899{
2900 if (hlock->instance == lock)
2901 return 1;
2902
2903 if (hlock->references) {
2904 struct lock_class *class = lock->class_cache;
2905
2906 if (!class)
2907 class = look_up_lock_class(lock, 0);
2908
2909 if (DEBUG_LOCKS_WARN_ON(!class))
2910 return 0;
2911
2912 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2913 return 0;
2914
2915 if (hlock->class_idx == class - lock_classes + 1)
2916 return 1;
2917 }
2918
2919 return 0;
2920}
2921
2706static int 2922static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2923__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2924 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2942,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2942 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2943 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2944 break;
2729 if (hlock->instance == lock) 2945 if (match_held_lock(hlock, lock))
2730 goto found_it; 2946 goto found_it;
2731 prev_hlock = hlock; 2947 prev_hlock = hlock;
2732 } 2948 }
@@ -2745,7 +2961,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2961 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2962 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2963 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2964 hlock->nest_lock, hlock->acquire_ip,
2965 hlock->references))
2749 return 0; 2966 return 0;
2750 } 2967 }
2751 2968
@@ -2784,20 +3001,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3001 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3002 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3003 break;
2787 if (hlock->instance == lock) 3004 if (match_held_lock(hlock, lock))
2788 goto found_it; 3005 goto found_it;
2789 prev_hlock = hlock; 3006 prev_hlock = hlock;
2790 } 3007 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3008 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3009
2793found_it: 3010found_it:
2794 lock_release_holdtime(hlock); 3011 if (hlock->instance == lock)
3012 lock_release_holdtime(hlock);
3013
3014 if (hlock->references) {
3015 hlock->references--;
3016 if (hlock->references) {
3017 /*
3018 * We had, and after removing one, still have
3019 * references, the current lock stack is still
3020 * valid. We're done!
3021 */
3022 return 1;
3023 }
3024 }
2795 3025
2796 /* 3026 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3027 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3028 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3029 * entries (if any), recalculating the hash along the way:
2800 */ 3030 */
3031
2801 curr->lockdep_depth = i; 3032 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3033 curr->curr_chain_key = hlock->prev_chain_key;
2803 3034
@@ -2806,7 +3037,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3037 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3038 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3039 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3040 hlock->nest_lock, hlock->acquire_ip,
3041 hlock->references))
2810 return 0; 3042 return 0;
2811 } 3043 }
2812 3044
@@ -2836,7 +3068,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3068 /*
2837 * Is the unlock non-nested: 3069 * Is the unlock non-nested:
2838 */ 3070 */
2839 if (hlock->instance != lock) 3071 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3072 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3073 curr->lockdep_depth--;
2842 3074
@@ -2881,6 +3113,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3113 check_chain_key(curr);
2882} 3114}
2883 3115
3116static int __lock_is_held(struct lockdep_map *lock)
3117{
3118 struct task_struct *curr = current;
3119 int i;
3120
3121 for (i = 0; i < curr->lockdep_depth; i++) {
3122 struct held_lock *hlock = curr->held_locks + i;
3123
3124 if (match_held_lock(hlock, lock))
3125 return 1;
3126 }
3127
3128 return 0;
3129}
3130
2884/* 3131/*
2885 * Check whether we follow the irq-flags state precisely: 3132 * Check whether we follow the irq-flags state precisely:
2886 */ 3133 */
@@ -2957,7 +3204,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3204
2958 current->lockdep_recursion = 1; 3205 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3206 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3207 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3208 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3209 raw_local_irq_restore(flags);
2963} 3210}
@@ -2982,6 +3229,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3229}
2983EXPORT_SYMBOL_GPL(lock_release); 3230EXPORT_SYMBOL_GPL(lock_release);
2984 3231
3232int lock_is_held(struct lockdep_map *lock)
3233{
3234 unsigned long flags;
3235 int ret = 0;
3236
3237 if (unlikely(current->lockdep_recursion))
3238 return ret;
3239
3240 raw_local_irq_save(flags);
3241 check_flags(flags);
3242
3243 current->lockdep_recursion = 1;
3244 ret = __lock_is_held(lock);
3245 current->lockdep_recursion = 0;
3246 raw_local_irq_restore(flags);
3247
3248 return ret;
3249}
3250EXPORT_SYMBOL_GPL(lock_is_held);
3251
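lock_is_held() gives code outside lockdep a way to assert that the current task holds a given map. A minimal module-style sketch of that use, assuming a lockdep-enabled build so the mutex embeds a dep_map; nothing below comes from the patch itself:

#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>

static DEFINE_MUTEX(demo_lock);

/* caller is expected to hold demo_lock; warn loudly if it does not */
static void demo_update(void)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* dep_map only exists on lockdep-enabled builds */
	WARN_ON(!lock_is_held(&demo_lock.dep_map));
#endif
	/* ... touch state that demo_lock protects ... */
}

static int __init demo_init(void)
{
	mutex_lock(&demo_lock);
	demo_update();		/* fine: demo_lock is held */
	mutex_unlock(&demo_lock);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");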
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3252void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3253{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3254 current->lockdep_reclaim_gfp = gfp_mask;
@@ -3041,7 +3308,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3308 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3309 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3310 break;
3044 if (hlock->instance == lock) 3311 if (match_held_lock(hlock, lock))
3045 goto found_it; 3312 goto found_it;
3046 prev_hlock = hlock; 3313 prev_hlock = hlock;
3047 } 3314 }
@@ -3049,6 +3316,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3316 return;
3050 3317
3051found_it: 3318found_it:
3319 if (hlock->instance != lock)
3320 return;
3321
3052 hlock->waittime_stamp = sched_clock(); 3322 hlock->waittime_stamp = sched_clock();
3053 3323
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3358,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3358 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3359 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3360 break;
3091 if (hlock->instance == lock) 3361 if (match_held_lock(hlock, lock))
3092 goto found_it; 3362 goto found_it;
3093 prev_hlock = hlock; 3363 prev_hlock = hlock;
3094 } 3364 }
@@ -3096,6 +3366,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3366 return;
3097 3367
3098found_it: 3368found_it:
3369 if (hlock->instance != lock)
3370 return;
3371
3099 cpu = smp_processor_id(); 3372 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3373 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3374 now = sched_clock();
@@ -3326,7 +3599,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3599 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3600 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3601 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3602 sizeof(struct list_head) * CHAINHASH_SIZE
3603#ifdef CONFIG_PROVE_LOCKING
3604 + sizeof(struct circular_queue)
3605#endif
3606 ) / 1024
3607 );
3330 3608
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3609 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3610 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index e94caa666dba..d4b3dbc79fdb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
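The rewritten iterator above leans on a common seq_file idiom: next() only advances *pos and re-runs start(), so the indexing logic lives in one place. A stand-alone user-space model of the same shape (names invented for illustration):

#include <stdio.h>
#include <stddef.h>

#define START_TOKEN ((void *)1)     /* stand-in for SEQ_START_TOKEN */

static const char *chains[] = { "chain-a", "chain-b", "chain-c" };
static const size_t nr_chains = sizeof(chains) / sizeof(chains[0]);

/* pos 0 yields the header token, pos N yields element N-1 */
static void *demo_start(long long *pos)
{
	if (*pos == 0)
		return START_TOKEN;
	if ((size_t)(*pos - 1) < nr_chains)
		return &chains[*pos - 1];
	return NULL;
}

/* next() does not duplicate the indexing logic: bump *pos and restart */
static void *demo_next(void *v, long long *pos)
{
	(void)v;
	(*pos)++;
	return demo_start(pos);
}

static void demo_show(void *v)
{
	if (v == START_TOKEN)
		printf("all lock chains:\n");
	else
		printf("  %s\n", *(const char **)v);
}

int main(void)
{
	long long pos = 0;
	void *v;

	for (v = demo_start(&pos); v; v = demo_next(v, &pos))
		demo_show(v);
	return 0;
}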
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d7cbc579fc80..e0d91fdf0c3c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,12 +46,18 @@ static atomic_t nr_task_counters __read_mostly;
46 46
47/* 47/*
48 * perf counter paranoia level: 48 * perf counter paranoia level:
49 * 0 - not paranoid 49 * -1 - not paranoid at all
50 * 1 - disallow cpu counters to unpriv 50 * 0 - disallow raw tracepoint access for unpriv
51 * 2 - disallow kernel profiling to unpriv 51 * 1 - disallow cpu counters for unpriv
52 * 2 - disallow kernel profiling for unpriv
52 */ 53 */
53int sysctl_perf_counter_paranoid __read_mostly = 1; 54int sysctl_perf_counter_paranoid __read_mostly = 1;
54 55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_counter_paranoid > -1;
59}
60
55static inline bool perf_paranoid_cpu(void) 61static inline bool perf_paranoid_cpu(void)
56{ 62{
57 return sysctl_perf_counter_paranoid > 0; 63 return sysctl_perf_counter_paranoid > 0;
@@ -469,7 +475,8 @@ static void update_counter_times(struct perf_counter *counter)
469 struct perf_counter_context *ctx = counter->ctx; 475 struct perf_counter_context *ctx = counter->ctx;
470 u64 run_end; 476 u64 run_end;
471 477
472 if (counter->state < PERF_COUNTER_STATE_INACTIVE) 478 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
479 counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
473 return; 480 return;
474 481
475 counter->total_time_enabled = ctx->time - counter->tstamp_enabled; 482 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +525,7 @@ static void __perf_counter_disable(void *info)
518 */ 525 */
519 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { 526 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
520 update_context_time(ctx); 527 update_context_time(ctx);
521 update_counter_times(counter); 528 update_group_times(counter);
522 if (counter == counter->group_leader) 529 if (counter == counter->group_leader)
523 group_sched_out(counter, cpuctx, ctx); 530 group_sched_out(counter, cpuctx, ctx);
524 else 531 else
@@ -573,7 +580,7 @@ static void perf_counter_disable(struct perf_counter *counter)
573 * in, so we can change the state safely. 580 * in, so we can change the state safely.
574 */ 581 */
575 if (counter->state == PERF_COUNTER_STATE_INACTIVE) { 582 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
576 update_counter_times(counter); 583 update_group_times(counter);
577 counter->state = PERF_COUNTER_STATE_OFF; 584 counter->state = PERF_COUNTER_STATE_OFF;
578 } 585 }
579 586
@@ -851,6 +858,27 @@ retry:
851} 858}
852 859
853/* 860/*
861 * Put a counter into inactive state and update time fields.
862 * Enabling the leader of a group effectively enables all
863 * the group members that aren't explicitly disabled, so we
864 * have to update their ->tstamp_enabled also.
865 * Note: this works for group members as well as group leaders
866 * since the non-leader members' sibling_lists will be empty.
867 */
868static void __perf_counter_mark_enabled(struct perf_counter *counter,
869 struct perf_counter_context *ctx)
870{
871 struct perf_counter *sub;
872
873 counter->state = PERF_COUNTER_STATE_INACTIVE;
874 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
875 list_for_each_entry(sub, &counter->sibling_list, list_entry)
876 if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
877 sub->tstamp_enabled =
878 ctx->time - sub->total_time_enabled;
879}
880
881/*
854 * Cross CPU call to enable a performance counter 882 * Cross CPU call to enable a performance counter
855 */ 883 */
856static void __perf_counter_enable(void *info) 884static void __perf_counter_enable(void *info)
@@ -877,8 +905,7 @@ static void __perf_counter_enable(void *info)
877 905
878 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 906 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
879 goto unlock; 907 goto unlock;
880 counter->state = PERF_COUNTER_STATE_INACTIVE; 908 __perf_counter_mark_enabled(counter, ctx);
881 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
882 909
883 /* 910 /*
884 * If the counter is in a group and isn't the group leader, 911 * If the counter is in a group and isn't the group leader,
@@ -971,11 +998,9 @@ static void perf_counter_enable(struct perf_counter *counter)
971 * Since we have the lock this context can't be scheduled 998 * Since we have the lock this context can't be scheduled
972 * in, so we can change the state safely. 999 * in, so we can change the state safely.
973 */ 1000 */
974 if (counter->state == PERF_COUNTER_STATE_OFF) { 1001 if (counter->state == PERF_COUNTER_STATE_OFF)
975 counter->state = PERF_COUNTER_STATE_INACTIVE; 1002 __perf_counter_mark_enabled(counter, ctx);
976 counter->tstamp_enabled = 1003
977 ctx->time - counter->total_time_enabled;
978 }
979 out: 1004 out:
980 spin_unlock_irq(&ctx->lock); 1005 spin_unlock_irq(&ctx->lock);
981} 1006}
@@ -1479,9 +1504,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
1479 counter->attr.enable_on_exec = 0; 1504 counter->attr.enable_on_exec = 0;
1480 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) 1505 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
1481 continue; 1506 continue;
1482 counter->state = PERF_COUNTER_STATE_INACTIVE; 1507 __perf_counter_mark_enabled(counter, ctx);
1483 counter->tstamp_enabled =
1484 ctx->time - counter->total_time_enabled;
1485 enabled = 1; 1508 enabled = 1;
1486 } 1509 }
1487 1510
@@ -1675,6 +1698,11 @@ static void free_counter(struct perf_counter *counter)
1675 atomic_dec(&nr_task_counters); 1698 atomic_dec(&nr_task_counters);
1676 } 1699 }
1677 1700
1701 if (counter->output) {
1702 fput(counter->output->filp);
1703 counter->output = NULL;
1704 }
1705
1678 if (counter->destroy) 1706 if (counter->destroy)
1679 counter->destroy(counter); 1707 counter->destroy(counter);
1680 1708
@@ -1960,6 +1988,8 @@ unlock:
1960 return ret; 1988 return ret;
1961} 1989}
1962 1990
1991int perf_counter_set_output(struct perf_counter *counter, int output_fd);
1992
1963static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1993static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1964{ 1994{
1965 struct perf_counter *counter = file->private_data; 1995 struct perf_counter *counter = file->private_data;
@@ -1983,6 +2013,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1983 case PERF_COUNTER_IOC_PERIOD: 2013 case PERF_COUNTER_IOC_PERIOD:
1984 return perf_counter_period(counter, (u64 __user *)arg); 2014 return perf_counter_period(counter, (u64 __user *)arg);
1985 2015
2016 case PERF_COUNTER_IOC_SET_OUTPUT:
2017 return perf_counter_set_output(counter, arg);
2018
1986 default: 2019 default:
1987 return -ENOTTY; 2020 return -ENOTTY;
1988 } 2021 }
@@ -2253,6 +2286,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2253 2286
2254 WARN_ON_ONCE(counter->ctx->parent_ctx); 2287 WARN_ON_ONCE(counter->ctx->parent_ctx);
2255 mutex_lock(&counter->mmap_mutex); 2288 mutex_lock(&counter->mmap_mutex);
2289 if (counter->output) {
2290 ret = -EINVAL;
2291 goto unlock;
2292 }
2293
2256 if (atomic_inc_not_zero(&counter->mmap_count)) { 2294 if (atomic_inc_not_zero(&counter->mmap_count)) {
2257 if (nr_pages != counter->data->nr_pages) 2295 if (nr_pages != counter->data->nr_pages)
2258 ret = -EINVAL; 2296 ret = -EINVAL;
@@ -2638,6 +2676,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
2638 struct perf_counter *counter, unsigned int size, 2676 struct perf_counter *counter, unsigned int size,
2639 int nmi, int sample) 2677 int nmi, int sample)
2640{ 2678{
2679 struct perf_counter *output_counter;
2641 struct perf_mmap_data *data; 2680 struct perf_mmap_data *data;
2642 unsigned int offset, head; 2681 unsigned int offset, head;
2643 int have_lost; 2682 int have_lost;
@@ -2647,13 +2686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
2647 u64 lost; 2686 u64 lost;
2648 } lost_event; 2687 } lost_event;
2649 2688
2689 rcu_read_lock();
2650 /* 2690 /*
2651 * For inherited counters we send all the output towards the parent. 2691 * For inherited counters we send all the output towards the parent.
2652 */ 2692 */
2653 if (counter->parent) 2693 if (counter->parent)
2654 counter = counter->parent; 2694 counter = counter->parent;
2655 2695
2656 rcu_read_lock(); 2696 output_counter = rcu_dereference(counter->output);
2697 if (output_counter)
2698 counter = output_counter;
2699
2657 data = rcu_dereference(counter->data); 2700 data = rcu_dereference(counter->data);
2658 if (!data) 2701 if (!data)
2659 goto out; 2702 goto out;
@@ -3934,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3934 * have these. 3977 * have these.
3935 */ 3978 */
3936 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && 3979 if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
3980 perf_paranoid_tracepoint_raw() &&
3937 !capable(CAP_SYS_ADMIN)) 3981 !capable(CAP_SYS_ADMIN))
3938 return ERR_PTR(-EPERM); 3982 return ERR_PTR(-EPERM);
3939 3983
@@ -4202,6 +4246,57 @@ err_size:
4202 goto out; 4246 goto out;
4203} 4247}
4204 4248
4249int perf_counter_set_output(struct perf_counter *counter, int output_fd)
4250{
4251 struct perf_counter *output_counter = NULL;
4252 struct file *output_file = NULL;
4253 struct perf_counter *old_output;
4254 int fput_needed = 0;
4255 int ret = -EINVAL;
4256
4257 if (!output_fd)
4258 goto set;
4259
4260 output_file = fget_light(output_fd, &fput_needed);
4261 if (!output_file)
4262 return -EBADF;
4263
4264 if (output_file->f_op != &perf_fops)
4265 goto out;
4266
4267 output_counter = output_file->private_data;
4268
4269 /* Don't chain output fds */
4270 if (output_counter->output)
4271 goto out;
4272
4273 /* Don't set an output fd when we already have an output channel */
4274 if (counter->data)
4275 goto out;
4276
4277 atomic_long_inc(&output_file->f_count);
4278
4279set:
4280 mutex_lock(&counter->mmap_mutex);
4281 old_output = counter->output;
4282 rcu_assign_pointer(counter->output, output_counter);
4283 mutex_unlock(&counter->mmap_mutex);
4284
4285 if (old_output) {
4286 /*
4287 * we need to make sure no existing perf_output_*()
4288 * is still referencing this counter.
4289 */
4290 synchronize_rcu();
4291 fput(old_output->filp);
4292 }
4293
4294 ret = 0;
4295out:
4296 fput_light(output_file, fput_needed);
4297 return ret;
4298}
4299
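
For context, a hedged userspace sketch of how the new PERF_COUNTER_IOC_SET_OUTPUT ioctl might be driven, assuming this kernel's <linux/perf_counter.h> and that the unistd headers export __NR_perf_counter_open; the sys_perf_counter_open() wrapper below is hypothetical:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <linux/perf_counter.h>

	/* Hypothetical wrapper around the raw syscall. */
	static long sys_perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
					  int cpu, int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
	}

	int main(void)
	{
		struct perf_counter_attr attr = {
			.type		= PERF_TYPE_HARDWARE,
			.size		= sizeof(attr),
			.config		= PERF_COUNT_HW_CPU_CYCLES,
			.sample_period	= 100000,
			.sample_type	= PERF_SAMPLE_IP,
		};
		int fd1 = sys_perf_counter_open(&attr, 0, -1, -1, 0);
		int fd2 = sys_perf_counter_open(&attr, 0, -1, -1, 0);

		/* fd1 owns the buffer: 1 control page + 8 data pages. */
		mmap(NULL, 9 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);

		/* Route fd2's samples into fd1's buffer; fd2 must not (and,
		 * after this, cannot) be mmap()ed itself. */
		if (ioctl(fd2, PERF_COUNTER_IOC_SET_OUTPUT, fd1) < 0)
			perror("PERF_COUNTER_IOC_SET_OUTPUT");
		return 0;
	}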
4205/** 4300/**
4206 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu 4301 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
4207 * 4302 *
@@ -4221,15 +4316,15 @@ SYSCALL_DEFINE5(perf_counter_open,
4221 struct file *group_file = NULL; 4316 struct file *group_file = NULL;
4222 int fput_needed = 0; 4317 int fput_needed = 0;
4223 int fput_needed2 = 0; 4318 int fput_needed2 = 0;
4224 int ret; 4319 int err;
4225 4320
4226 /* for future expandability... */ 4321 /* for future expandability... */
4227 if (flags) 4322 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4228 return -EINVAL; 4323 return -EINVAL;
4229 4324
4230 ret = perf_copy_attr(attr_uptr, &attr); 4325 err = perf_copy_attr(attr_uptr, &attr);
4231 if (ret) 4326 if (err)
4232 return ret; 4327 return err;
4233 4328
4234 if (!attr.exclude_kernel) { 4329 if (!attr.exclude_kernel) {
4235 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 4330 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4252,8 +4347,8 @@ SYSCALL_DEFINE5(perf_counter_open,
4252 * Look up the group leader (we will attach this counter to it): 4347 * Look up the group leader (we will attach this counter to it):
4253 */ 4348 */
4254 group_leader = NULL; 4349 group_leader = NULL;
4255 if (group_fd != -1) { 4350 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4256 ret = -EINVAL; 4351 err = -EINVAL;
4257 group_file = fget_light(group_fd, &fput_needed); 4352 group_file = fget_light(group_fd, &fput_needed);
4258 if (!group_file) 4353 if (!group_file)
4259 goto err_put_context; 4354 goto err_put_context;
@@ -4282,18 +4377,24 @@ SYSCALL_DEFINE5(perf_counter_open,
4282 4377
4283 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, 4378 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
4284 NULL, GFP_KERNEL); 4379 NULL, GFP_KERNEL);
4285 ret = PTR_ERR(counter); 4380 err = PTR_ERR(counter);
4286 if (IS_ERR(counter)) 4381 if (IS_ERR(counter))
4287 goto err_put_context; 4382 goto err_put_context;
4288 4383
4289 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); 4384 err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
4290 if (ret < 0) 4385 if (err < 0)
4291 goto err_free_put_context; 4386 goto err_free_put_context;
4292 4387
4293 counter_file = fget_light(ret, &fput_needed2); 4388 counter_file = fget_light(err, &fput_needed2);
4294 if (!counter_file) 4389 if (!counter_file)
4295 goto err_free_put_context; 4390 goto err_free_put_context;
4296 4391
4392 if (flags & PERF_FLAG_FD_OUTPUT) {
4393 err = perf_counter_set_output(counter, group_fd);
4394 if (err)
4395 goto err_fput_free_put_context;
4396 }
4397
4297 counter->filp = counter_file; 4398 counter->filp = counter_file;
4298 WARN_ON_ONCE(ctx->parent_ctx); 4399 WARN_ON_ONCE(ctx->parent_ctx);
4299 mutex_lock(&ctx->mutex); 4400 mutex_lock(&ctx->mutex);
@@ -4307,20 +4408,20 @@ SYSCALL_DEFINE5(perf_counter_open,
4307 list_add_tail(&counter->owner_entry, &current->perf_counter_list); 4408 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
4308 mutex_unlock(&current->perf_counter_mutex); 4409 mutex_unlock(&current->perf_counter_mutex);
4309 4410
4411err_fput_free_put_context:
4310 fput_light(counter_file, fput_needed2); 4412 fput_light(counter_file, fput_needed2);
4311 4413
4312out_fput:
4313 fput_light(group_file, fput_needed);
4314
4315 return ret;
4316
4317err_free_put_context: 4414err_free_put_context:
4318 kfree(counter); 4415 if (err < 0)
4416 kfree(counter);
4319 4417
4320err_put_context: 4418err_put_context:
4321 put_ctx(ctx); 4419 if (err < 0)
4420 put_ctx(ctx);
4421
4422 fput_light(group_file, fput_needed);
4322 4423
4323 goto out_fput; 4424 return err;
4324} 4425}
4325 4426
4326/* 4427/*
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..e10d193a833a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
40 * for_each_console() allows you to iterate on each console
41 */
42#define for_each_console(con) \
43 for (con = console_drivers; con != NULL; con = con->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -372,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 380 logged_chars = 0;
373 break; 381 break;
374 case 6: /* Disable logging to console */ 382 case 6: /* Disable logging to console */
383 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 385 console_loglevel = minimum_console_loglevel;
376 break; 386 break;
377 case 7: /* Enable logging to console */ 387 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 388 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1;
391 }
379 break; 392 break;
380 case 8: /* Set level of messages printed to console */ 393 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 394 error = -EINVAL;
@@ -384,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 397 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 398 len = minimum_console_loglevel;
386 console_loglevel = len; 399 console_loglevel = len;
400 /* Implicitly re-enable logging to console */
401 saved_console_loglevel = -1;
387 error = 0; 402 error = 0;
388 break; 403 break;
389 case 9: /* Number of chars in the log buffer */ 404 case 9: /* Number of chars in the log buffer */
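
Seen from userspace through glibc's klogctl() wrapper for syslog(2), the effect of the saved_console_loglevel change is that a disable/enable pair now round-trips the configured level instead of forcing the default. A minimal sketch (requires CAP_SYS_ADMIN; the command numbers are the case labels handled above):

	#include <sys/klog.h>

	int main(void)
	{
		klogctl(8, NULL, 4);	/* case 8: set console loglevel to 4 */
		klogctl(6, NULL, 0);	/* case 6: level saved, console quieted */
		klogctl(7, NULL, 0);	/* case 7: restored to 4, not to the default */
		return 0;
	}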
@@ -412,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 427{
413 struct console *con; 428 struct console *con;
414 429
415 for (con = console_drivers; con; con = con->next) { 430 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 431 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 432 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 433 (con->flags & CON_ANYTIME)))
@@ -544,7 +559,7 @@ static int have_callable_console(void)
544{ 559{
545 struct console *con; 560 struct console *con;
546 561
547 for (con = console_drivers; con; con = con->next) 562 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 563 if (con->flags & CON_ANYTIME)
549 return 1; 564 return 1;
550 565
@@ -1082,7 +1097,7 @@ void console_unblank(void)
1082 1097
1083 console_locked = 1; 1098 console_locked = 1;
1084 console_may_schedule = 0; 1099 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1100 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1101 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1102 c->unblank();
1088 release_console_sem(); 1103 release_console_sem();
@@ -1097,7 +1112,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1112 struct tty_driver *driver = NULL;
1098 1113
1099 acquire_console_sem(); 1114 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1115 for_each_console(c) {
1101 if (!c->device) 1116 if (!c->device)
1102 continue; 1117 continue;
1103 driver = c->device(c, index); 1118 driver = c->device(c, index);
@@ -1134,25 +1149,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1149 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1150 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1151 * console driver was initialized.
1152 *
1153 * This can happen pretty early during the boot process (because of
1154 * early_printk) - sometimes before setup_arch() completes - be careful
1155 * of what kernel features are used - they may not be initialised yet.
1156 *
1157 * There are two types of consoles - bootconsoles (early_printk) and
1158 * "real" consoles (everything which is not a bootconsole) which are
1159 * handled differently.
1160 * - Any number of bootconsoles can be registered at any time.
1161 * - As soon as a "real" console is registered, all bootconsoles
1162 * will be unregistered automatically.
1163 * - Once a "real" console is registered, any attempt to register a
1164 * bootconsole will be rejected.
1137 */ 1165 */
1138void register_console(struct console *console) 1166void register_console(struct console *newcon)
1139{ 1167{
1140 int i; 1168 int i;
1141 unsigned long flags; 1169 unsigned long flags;
1142 struct console *bootconsole = NULL; 1170 struct console *bcon = NULL;
1143 1171
1144 if (console_drivers) { 1172 /*
1145 if (console->flags & CON_BOOT) 1173 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1174 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1175 */
1148 bootconsole = console_drivers; 1176 if (console_drivers && newcon->flags & CON_BOOT) {
1177 /* find the last or real console */
1178 for_each_console(bcon) {
1179 if (!(bcon->flags & CON_BOOT)) {
1180 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1181 newcon->name, newcon->index);
1182 return;
1183 }
1184 }
1149 } 1185 }
1150 1186
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1187 if (console_drivers && console_drivers->flags & CON_BOOT)
1188 bcon = console_drivers;
1189
1190 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1191 preferred_console = selected_console;
1153 1192
1154 if (console->early_setup) 1193 if (newcon->early_setup)
1155 console->early_setup(); 1194 newcon->early_setup();
1156 1195
1157 /* 1196 /*
1158 * See if we want to use this console driver. If we 1197 * See if we want to use this console driver. If we
@@ -1160,13 +1199,13 @@ void register_console(struct console *console)
1160 * that registers here. 1199 * that registers here.
1161 */ 1200 */
1162 if (preferred_console < 0) { 1201 if (preferred_console < 0) {
1163 if (console->index < 0) 1202 if (newcon->index < 0)
1164 console->index = 0; 1203 newcon->index = 0;
1165 if (console->setup == NULL || 1204 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1205 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1206 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1207 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1208 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1209 preferred_console = 0;
1171 } 1210 }
1172 } 1211 }
@@ -1178,64 +1217,62 @@ void register_console(struct console *console)
1178 */ 1217 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1218 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1219 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1220 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1221 continue;
1183 if (console->index >= 0 && 1222 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1223 newcon->index != console_cmdline[i].index)
1185 continue; 1224 continue;
1186 if (console->index < 0) 1225 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1226 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1227#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1228 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1229 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1230 braille_register_console(newcon,
1192 console_cmdline[i].index, 1231 console_cmdline[i].index,
1193 console_cmdline[i].options, 1232 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1233 console_cmdline[i].brl_options);
1195 return; 1234 return;
1196 } 1235 }
1197#endif 1236#endif
1198 if (console->setup && 1237 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1238 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1239 break;
1201 console->flags |= CON_ENABLED; 1240 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1241 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1242 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1243 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1244 preferred_console = selected_console;
1206 } 1245 }
1207 break; 1246 break;
1208 } 1247 }
1209 1248
1210 if (!(console->flags & CON_ENABLED)) 1249 if (!(newcon->flags & CON_ENABLED))
1211 return; 1250 return;
1212 1251
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1252 /*
1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1253 * If we have a bootconsole, and are switching to a real console,
1215 bootconsole->name, bootconsole->index, 1254 * don't print everything out again, since when the boot console, and
1216 console->name, console->index); 1255 * the real console are the same physical device, it's annoying to
1217 unregister_console(bootconsole); 1256 * see the beginning boot messages twice
1218 console->flags &= ~CON_PRINTBUFFER; 1257 */
1219 } else { 1258 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1259 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1260
1224 /* 1261 /*
1225 * Put this console in the list - keep the 1262 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1263 * preferred driver at the head of the list.
1227 */ 1264 */
1228 acquire_console_sem(); 1265 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1266 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1267 newcon->next = console_drivers;
1231 console_drivers = console; 1268 console_drivers = newcon;
1232 if (console->next) 1269 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1270 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1271 } else {
1235 console->next = console_drivers->next; 1272 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1273 console_drivers->next = newcon;
1237 } 1274 }
1238 if (console->flags & CON_PRINTBUFFER) { 1275 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1276 /*
1240 * release_console_sem() will print out the buffered messages 1277 * release_console_sem() will print out the buffered messages
1241 * for us. 1278 * for us.
@@ -1245,6 +1282,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1282 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1283 }
1247 release_console_sem(); 1284 release_console_sem();
1285
1286 /*
1287 * By unregistering the bootconsoles after we enable the real console
1288 * we get the "console xxx enabled" message on all the consoles -
1289 * boot consoles, real consoles, etc - this is to ensure that end
1290 * users know there might be something in the kernel's log buffer that
1291 * went to the bootconsole (that they do not see on the real console)
1292 */
1293 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1294 /* we need to iterate through twice, to make sure we print
1295 * everything out, before we unregister the console(s)
1296 */
1297 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1298 newcon->name, newcon->index);
1299 for_each_console(bcon)
1300 if (bcon->flags & CON_BOOT)
1301 unregister_console(bcon);
1302 } else {
1303 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1304 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1305 newcon->name, newcon->index);
1306 }
1248} 1307}
1249EXPORT_SYMBOL(register_console); 1308EXPORT_SYMBOL(register_console);
1250 1309
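
For reference, a minimal sketch of a driver-side caller of register_console() as reworked above; the myserial_* names are hypothetical, the hardware write is elided, and <linux/console.h> plus <linux/init.h> are assumed:

	static void myserial_console_write(struct console *con, const char *s,
					   unsigned int count)
	{
		/* push 'count' bytes at 's' to the hardware FIFO (elided) */
	}

	static struct console myserial_console = {
		.name	= "ttyMY",
		.write	= myserial_console_write,
		.flags	= CON_PRINTBUFFER,	/* replay the log buffer when enabled */
		.index	= -1,			/* match any index given on the command line */
	};

	static int __init myserial_console_init(void)
	{
		/* A "real" (non-CON_BOOT) console: registering it will now also
		 * unregister any remaining bootconsoles, as implemented above. */
		register_console(&myserial_console);
		return 0;
	}
	console_initcall(myserial_console_init);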
@@ -1287,11 +1346,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1346
1288static int __init disable_boot_consoles(void) 1347static int __init disable_boot_consoles(void)
1289{ 1348{
1290 if (console_drivers != NULL) { 1349 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1350
1351 for_each_console(con) {
1352 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1353 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1354 con->name, con->index);
1294 return unregister_console(console_drivers); 1355 unregister_console(con);
1295 } 1356 }
1296 } 1357 }
1297 return 0; 1358 return 0;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082c320e4dbf..307c285af59e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
152 if (!dumpable && !capable(CAP_SYS_PTRACE)) 152 if (!dumpable && !capable(CAP_SYS_PTRACE))
153 return -EPERM; 153 return -EPERM;
154 154
155 return security_ptrace_may_access(task, mode); 155 return security_ptrace_access_check(task, mode);
156} 156}
157 157
158bool ptrace_may_access(struct task_struct *task, unsigned int mode) 158bool ptrace_may_access(struct task_struct *task, unsigned int mode)
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
118 * using __stop_machine(). Since we're in an irqs-disabled
119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
129 * sending smp_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * local variable "batch" and emits code like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * that batch# = rdp->batch, see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
290
291/**
292 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
304 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
380 * The grace period handling consists of two steps:
381 * - A new grace period is started.
382 * This is done by rcu_start_batch. The start is not broadcasted to
383 * all cpus, they must pick this up by comparing rcp->cur with
384 * rdp->quiescbatch. All cpus are recorded in the
385 * rcu_ctrlblk.cpumask bitmap.
386 * - All cpus must go through a quiescent state.
387 * Since the start of the grace period is not broadcasted, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
394 * period (if necessary).
395 */
396
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
411 * barrier. Otherwise it can cause tickless idle CPUs to be
412 * included in rcp->cpumask, which will extend grace periods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
440 * switch). If so, and if it hasn't already done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
486 * locking requirements; the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
570 * move these entries to donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
609 * executed by the interrupted code must be see before any RCU
610 * grace-period manupulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
620 * executed by the interrupted code must be seen after any RCU
621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
649 * for them hasn't been started nor scheduled to start
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes rcu mechanism. Assumed to be called early.
790 * That is, before the local timer (SMP) or jiffy timer (uniproc) is set up.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..bd5d5c8e5140 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -98,6 +98,30 @@ void synchronize_rcu(void)
98} 98}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 99EXPORT_SYMBOL_GPL(synchronize_rcu);
100 100
101/**
102 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
103 *
104 * Control will return to the caller some time after a full rcu_bh grace
105 * period has elapsed, in other words after all currently executing rcu_bh
106 * read-side critical sections have completed. RCU read-side critical
107 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
108 * and may be nested.
109 */
110void synchronize_rcu_bh(void)
111{
112 struct rcu_synchronize rcu;
113
114 if (rcu_blocking_is_gp())
115 return;
116
117 init_completion(&rcu.completion);
118 /* Will wake me after RCU finished. */
119 call_rcu_bh(&rcu.head, wakeme_after_rcu);
120 /* Wait for it. */
121 wait_for_completion(&rcu.completion);
122}
123EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
124
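
A minimal sketch of a caller of the new synchronize_rcu_bh() in a typical list update; struct foo, foo_list and foo_lock are hypothetical, and <linux/rculist.h>, <linux/spinlock.h> and <linux/slab.h> are assumed:

	struct foo {
		struct list_head list;
		int data;
	};
	static LIST_HEAD(foo_list);
	static DEFINE_SPINLOCK(foo_lock);

	static void foo_del(struct foo *p)
	{
		spin_lock(&foo_lock);
		list_del_rcu(&p->list);
		spin_unlock(&foo_lock);

		/* Every rcu_read_lock_bh() section that might still see 'p'
		 * has exited once this returns. */
		synchronize_rcu_bh();
		kfree(p);
	}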
101static void rcu_barrier_callback(struct rcu_head *notused) 125static void rcu_barrier_callback(struct rcu_head *notused)
102{ 126{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 127 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +153,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 153static inline void wait_migrated_callbacks(void)
130{ 154{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 155 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
156 smp_mb(); /* In case we didn't sleep. */
132} 157}
133 158
134/* 159/*
@@ -192,9 +217,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 217 wake_up(&rcu_migrate_wq);
193} 218}
194 219
220extern int rcu_cpu_notify(struct notifier_block *self,
221 unsigned long action, void *hcpu);
222
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 223static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 224 unsigned long action, void *hcpu)
197{ 225{
226 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 227 if (action == CPU_DYING) {
199 /* 228 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 229 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +238,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 238 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 239 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 240 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 241 } else if (action == CPU_DOWN_PREPARE) {
242 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 243 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 244 wait_migrated_callbacks();
215 } 245 }
@@ -219,8 +249,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 249
220void __init rcu_init(void) 250void __init rcu_init(void)
221{ 251{
252 int i;
253
222 __rcu_init(); 254 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 255 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
256
257 /*
258 * We don't need protection against CPU-hotplug here because
259 * this is called early in boot, before either interrupts
260 * or the scheduler are operational.
261 */
262 for_each_online_cpu(i)
263 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 264}
225 265
226void rcu_scheduler_starting(void) 266void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
67 * to go through the all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
 298	 * - If we get a scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
303 * we pick the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
 334	 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
 402	 * Now that this task's nesting count has been decremented,
 403	 * NMIs that occur after this point will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
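
Taken together, the lock/unlock pair above reduces to this: the outermost rcu_read_lock() increments one element of a two-counter array chosen by the low bit of the global ->completed counter and remembers the index, and the matching outermost rcu_read_unlock() decrements that same element even if the global counter has flipped in between. A minimal single-threaded model of that bookkeeping (illustrative only; it ignores per-CPU placement, interrupts, NMIs, and memory ordering):

#include <assert.h>
#include <stdio.h>

static long completed;          /* global grace-period ("flip") counter */
static long flipctr[2];         /* per-CPU counter pair in the real code */
static int  nesting;            /* per-task nesting count in the real code */
static int  flipctr_idx;        /* remembered index, per task */

static void read_lock(void)
{
        if (nesting++ == 0) {
                flipctr_idx = completed & 0x1;  /* pick current counter */
                flipctr[flipctr_idx]++;
        }
}

static void read_unlock(void)
{
        if (--nesting == 0)
                flipctr[flipctr_idx]--;         /* same counter as at entry */
}

int main(void)
{
        read_lock();
        completed++;            /* a "flip" occurs inside the critical section */
        read_lock();            /* nested: no counter manipulation */
        read_unlock();
        read_unlock();          /* decrements the counter chosen at entry */
        assert(flipctr[0] == 0 && flipctr[1] == 0);
        printf("both counters drained back to zero\n");
        return 0;
}

The point of remembering the index is visible in main(): even though the flip happens inside the critical section, both counters still drain back to zero, which is what rcu_try_flip_waitzero() later waits for.
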
415
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
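
The advancement above implements a short pipeline: callbacks posted via call_rcu() sit on ->nextlist, move to ->waitlist[0] at the next counter flip, shuffle down one wait stage per subsequent flip, and reach ->donelist after the oldest stage, where they become eligible for invocation. A toy model that uses counts in place of the lists (the GP_STAGES value of 2 is only an assumption for illustration):

#include <stdio.h>

#define GP_STAGES 2                     /* assumed value, for illustration */

int main(void)
{
        /* Counts stand in for the actual callback lists. */
        int next = 3, wait[GP_STAGES] = { 0, 0 }, done = 0;
        int flip, i;

        for (flip = 1; flip <= 3; flip++) {
                done += wait[GP_STAGES - 1];    /* oldest stage is now done */
                for (i = GP_STAGES - 1; i > 0; i--)
                        wait[i] = wait[i - 1];  /* shuffle stages down */
                wait[0] = next;                 /* new arrivals start waiting */
                next = 0;
                printf("after flip %d: next=%d wait[0]=%d wait[1]=%d done=%d\n",
                       flip, next, wait[0], wait[1], done);
        }
        return 0;
}

Running it shows the three posted callbacks taking GP_STAGES flips to reach the done stage, which is exactly why a single flip does not constitute a grace period.
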
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505 * be corrupted by the NMI/SMI which is the most important
506 * part.
507 *
508 * The only thing is that we would bring back the counter
 509		 * to a position that it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
 513		 * On return from the IRQ, the counter's zero bit may be 0
 514		 * and the counter may hold the same value as on return from
 515		 * the NMI/SMI. If the state machine was so unlucky as to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
546
547/**
548 * rcu_irq_exit - Called from exiting Hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
 551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
 563	 * because an NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
 583		 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
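
rcu_irq_enter() and rcu_irq_exit() above use rcu_update_flag as a nesting count so that only the interrupt that actually moved the dynticks counter from even (idle) to odd (active) moves it back on exit; nested NMIs/SMIs merely bump and drop the flag. A simplified user-space model (illustrative only; the else-if folds together the two independent checks in the real code, and in_interrupt() is not modelled):

#include <assert.h>
#include <stdio.h>

static int dynticks = 2;        /* even: CPU idle with the tick stopped */
static int update_flag;         /* irq-nesting count while we own the ++ */

static void irq_enter(void)
{
        if (update_flag)
                update_flag++;                  /* nested NMI/SMI/irq */
        else if ((dynticks & 0x1) == 0) {       /* coming out of idle */
                dynticks++;                     /* now odd: active */
                update_flag++;                  /* we own the exit ++ */
        }
}

static void irq_exit(void)
{
        if (update_flag && --update_flag == 0) {
                dynticks++;                     /* back to even: idle */
                assert((dynticks & 0x1) == 0);
        }
}

int main(void)
{
        irq_enter();            /* irq taken from idle */
        irq_enter();            /* nested NMI */
        irq_exit();             /* NMI return: counter left alone */
        irq_exit();             /* irq return: counter goes even again */
        printf("dynticks=%d\n", dynticks);      /* prints 4 */
        return 0;
}
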
594
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
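
Both helpers above compare a snapshot of a CPU's dynticks counter against its current value, exploiting two facts: an even value means the CPU is idle in dynticks mode, and any change in value means the CPU passed through an interrupt entry or exit, which implies a full memory barrier. A stand-alone restatement of the two tests (illustrative only; the values fed in from main() are arbitrary):

#include <stdio.h>

/* Returns 1 if the CPU must still acknowledge the flip (cf. waitack). */
static int needs_ack(long curr, long snap)
{
        if (curr == snap && (curr & 0x1) == 0)
                return 0;       /* idle the whole time */
        if ((curr - snap) > 2 || (curr & 0x1) == 0)
                return 0;       /* passed through, or now in, dynticks idle */
        return 1;
}

/* Returns 1 if the CPU must still execute a memory barrier (cf. waitmb). */
static int needs_mb(long curr, long snap)
{
        if (curr == snap && (curr & 0x1) == 0)
                return 0;       /* idle the whole time */
        if (curr != snap)
                return 0;       /* entered/exited a handler: implied mb() */
        return 1;
}

int main(void)
{
        printf("idle CPU:  ack=%d mb=%d\n", needs_ack(4, 4), needs_mb(4, 4));
        printf("busy CPU:  ack=%d mb=%d\n", needs_ack(5, 5), needs_mb(5, 5));
        printf("took irqs: ack=%d mb=%d\n", needs_ack(8, 5), needs_mb(8, 5));
        return 0;
}
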
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
 878 * Return 1 once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
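
rcu_try_flip() is a four-state machine: each invocation attempts at most one transition idle -> waitack -> waitzero -> waitmb -> back to idle, and a transition occurs only when the corresponding helper reports success. A compact model with the helpers stubbed out to always succeed (illustrative only; the real helpers can and do return 0 for many passes):

#include <stdio.h>

enum flip_state { FLIP_IDLE, FLIP_WAITACK, FLIP_WAITZERO, FLIP_WAITMB };

static int step_ok(void)        /* stands in for rcu_try_flip_idle() etc. */
{
        return 1;               /* pretend every step succeeds immediately */
}

static enum flip_state advance(enum flip_state s)
{
        switch (s) {
        case FLIP_IDLE:     return step_ok() ? FLIP_WAITACK  : s;
        case FLIP_WAITACK:  return step_ok() ? FLIP_WAITZERO : s;
        case FLIP_WAITZERO: return step_ok() ? FLIP_WAITMB   : s;
        case FLIP_WAITMB:   return step_ok() ? FLIP_IDLE     : s;
        }
        return s;
}

int main(void)
{
        static const char *name[] = { "idle", "waitack", "waitzero", "waitmb" };
        enum flip_state s = FLIP_IDLE;
        int i;

        for (i = 0; i < 5; i++) {
                printf("state = %s\n", name[s]);
                s = advance(s);
        }
        return 0;
}
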
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
 969	 * this CPU has to have exited all prior preempt-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042	 * Otherwise rcu_barrier() will fail.
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
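
The rcu_offline_cpu_enqueue() macro used in the hotplug path above splices a source callback list onto a destination list while preserving order, relying on each list carrying a tail pointer (the address of the last ->next field, or of the list head when the list is empty). A user-space demonstration of the same pointer manipulation (struct node and all names here are invented for illustration; note that, as in the kernel macro, the dstlist argument is reached only through dsttail):

#include <stddef.h>
#include <stdio.h>

struct node { int val; struct node *next; };

/* Same shape as rcu_offline_cpu_enqueue(): move src onto dst, keep order. */
#define enqueue(srclist, srctail, dstlist, dsttail) do {	\
		*(dsttail) = (srclist);				\
		if ((srclist) != NULL) {			\
			(dsttail) = (srctail);			\
			(srclist) = NULL;			\
			(srctail) = &(srclist);			\
		}						\
	} while (0)

int main(void)
{
	struct node b = { 2, NULL };
	struct node a = { 1, &b };
	struct node *src = &a, **srctail = &b.next;
	struct node *dst = NULL, **dsttail = &dst;
	struct node *p;

	enqueue(src, srctail, dst, dsttail);

	for (p = dst; p != NULL; p = p->next)
		printf("%d ", p->val);			/* prints: 1 2 */
	printf("\nsrc is now %s\n", src ? "non-empty" : "empty");
	return 0;
}

After the splice, dsttail points at the new end of the destination list and the source list is reset to empty with its tail pointer aimed back at its own head, ready for reuse.
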
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299 * The current CPU might have already gone
1300			 * offline (between the for_each_online_cpu and
1301 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's mask in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
 182	cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..b33db539a8ad 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -257,14 +257,14 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270static struct rcu_torture_ops *cur_ops = NULL;
@@ -320,7 +320,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 320 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 321 rcu_torture_free(rp);
322 } else 322 } else
323 cur_ops->deferredfree(rp); 323 cur_ops->deferred_free(rp);
324} 324}
325 325
326static void rcu_torture_deferred_free(struct rcu_torture *p) 326static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +329,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 329}
330 330
331static struct rcu_torture_ops rcu_ops = { 331static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 332 .init = NULL,
333 .cleanup = NULL, 333 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 334 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 335 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 336 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 337 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 338 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 339 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 340 .cb_barrier = rcu_barrier,
341 .stats = NULL, 341 .stats = NULL,
342 .irqcapable = 1, 342 .irq_capable = 1,
343 .name = "rcu" 343 .name = "rcu"
344}; 344};
345 345
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 346static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +370,18 @@ static void rcu_sync_torture_init(void)
370} 370}
371 371
372static struct rcu_torture_ops rcu_sync_ops = { 372static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 373 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 374 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 375 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 376 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 377 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 378 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 379 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 380 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 381 .cb_barrier = NULL,
382 .stats = NULL, 382 .stats = NULL,
383 .irqcapable = 1, 383 .irq_capable = 1,
384 .name = "rcu_sync" 384 .name = "rcu_sync"
385}; 385};
386 386
387/* 387/*
@@ -432,33 +432,33 @@ static void rcu_bh_torture_synchronize(void)
432} 432}
433 433
434static struct rcu_torture_ops rcu_bh_ops = { 434static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 435 .init = NULL,
436 .cleanup = NULL, 436 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 437 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 438 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 439 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 440 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 441 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 442 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 443 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 444 .stats = NULL,
445 .irqcapable = 1, 445 .irq_capable = 1,
446 .name = "rcu_bh" 446 .name = "rcu_bh"
447}; 447};
448 448
449static struct rcu_torture_ops rcu_bh_sync_ops = { 449static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 450 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 451 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 452 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 453 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 454 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 455 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 456 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 457 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 458 .cb_barrier = NULL,
459 .stats = NULL, 459 .stats = NULL,
460 .irqcapable = 1, 460 .irq_capable = 1,
461 .name = "rcu_bh_sync" 461 .name = "rcu_bh_sync"
462}; 462};
463 463
464/* 464/*
@@ -530,17 +530,17 @@ static int srcu_torture_stats(char *page)
530} 530}
531 531
532static struct rcu_torture_ops srcu_ops = { 532static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 533 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 534 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 535 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 536 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 537 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 538 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 539 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 540 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 541 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 542 .stats = srcu_torture_stats,
543 .name = "srcu" 543 .name = "srcu"
544}; 544};
545 545
546/* 546/*
@@ -574,32 +574,49 @@ static void sched_torture_synchronize(void)
574} 574}
575 575
576static struct rcu_torture_ops sched_ops = { 576static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 577 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 578 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 579 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 580 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 581 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 582 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 583 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 584 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 585 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 586 .stats = NULL,
587 .irqcapable = 1, 587 .irq_capable = 1,
588 .name = "sched" 588 .name = "sched"
589}; 589};
590 590
591static struct rcu_torture_ops sched_ops_sync = { 591static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 592 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 593 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 594 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 595 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 596 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 597 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 598 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 599 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 600 .cb_barrier = NULL,
601 .stats = NULL, 601 .stats = NULL,
602 .name = "sched_sync" 602 .name = "sched_sync"
603};
604
605extern int rcu_expedited_torture_stats(char *page);
606
607static struct rcu_torture_ops sched_expedited_ops = {
608 .init = rcu_sync_torture_init,
609 .cleanup = NULL,
610 .readlock = sched_torture_read_lock,
611 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
612 .readunlock = sched_torture_read_unlock,
613 .completed = sched_torture_completed,
614 .deferred_free = rcu_sync_torture_deferred_free,
615 .sync = synchronize_sched_expedited,
616 .cb_barrier = NULL,
617 .stats = rcu_expedited_torture_stats,
618 .irq_capable = 1,
619 .name = "sched_expedited"
603}; 620};
604 621
605/* 622/*
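The new sched_expedited entry above differs from its siblings mainly in its ->sync method, which points at synchronize_sched_expedited() so that the expedited grace-period path gets torture coverage as well. A hedged sketch of the update-side pattern that primitive is meant for; the structure and helper names below are invented for illustration only:

    struct item {
            int payload;
    };

    static struct item *global_item;        /* assumed published elsewhere */

    static void retire_item_expedited(void)
    {
            struct item *old = global_item;

            rcu_assign_pointer(global_item, NULL);  /* unpublish the object */
            synchronize_sched_expedited();          /* expedited wait for all */
                                                    /* CPUs to pass a QS      */
            kfree(old);             /* no RCU-sched reader can still see it */
    }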
@@ -635,7 +652,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 652 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 653 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 654 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 655 cur_ops->deferred_free(old_rp);
639 } 656 }
640 rcu_torture_current_version++; 657 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 658 oldbatch = cur_ops->completed();
@@ -700,7 +717,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 717 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 718 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 719 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 720 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 721 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 722 spin_unlock(&rand_lock);
706 preempt_disable(); 723 preempt_disable();
@@ -738,11 +755,11 @@ rcu_torture_reader(void *arg)
738 755
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 756 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 757 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 758 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 759 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 760
744 do { 761 do {
745 if (irqreader && cur_ops->irqcapable) { 762 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 763 if (!timer_pending(&t))
747 mod_timer(&t, 1); 764 mod_timer(&t, 1);
748 } 765 }
@@ -757,7 +774,7 @@ rcu_torture_reader(void *arg)
757 } 774 }
758 if (p->rtort_mbtest == 0) 775 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 776 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 777 cur_ops->read_delay(&rand);
761 preempt_disable(); 778 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 779 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 780 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +795,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 795 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 796 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 797 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 798 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 799 del_timer_sync(&t);
783 while (!kthread_should_stop()) 800 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 801 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1095,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1095 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1096 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1097 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1098 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1099 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1100
1083 mutex_lock(&fullstop_mutex); 1101 mutex_lock(&fullstop_mutex);
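For orientation, rcu_torture_init() consumes this table by matching the torture_type module parameter against each entry's ->name and then driving readers, writers, and the optional stats hook through the chosen callbacks. A paraphrase of that selection step, not the literal code:

    /* Paraphrase of the flavor-selection step in rcu_torture_init(). */
    static struct rcu_torture_ops *rcu_torture_pick_ops(const char *torture_type)
    {
            int i;

            for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
                    if (strcmp(torture_type, torture_ops[i]->name) == 0)
                            return torture_ops[i];  /* e.g. "sched_expedited" */
            return NULL;    /* no flavor with that ->name: init fails */
    }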
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 7717b95c2027..6b11b07cfe7f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,59 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 unsigned long flags;
111 struct rcu_data *rdp;
112
113 local_irq_save(flags);
114 rdp = &per_cpu(rcu_sched_data, cpu);
90 rdp->passed_quiesc = 1; 115 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed; 116 rdp->passed_quiesc_completed = rdp->completed;
117 rcu_preempt_qs(cpu);
118 local_irq_restore(flags);
92} 119}
93 120
94void rcu_bh_qsctr_inc(int cpu) 121void rcu_bh_qs(int cpu)
95{ 122{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 123 unsigned long flags;
124 struct rcu_data *rdp;
125
126 local_irq_save(flags);
127 rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed; 129 rdp->passed_quiesc_completed = rdp->completed;
130 local_irq_restore(flags);
99} 131}
100 132
101#ifdef CONFIG_NO_HZ 133#ifdef CONFIG_NO_HZ
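The renamed hooks are meant to be called from outside this file: a context switch is a quiescent state for RCU-sched (and, via the rcu_preempt_qs() call above, for preemptable RCU), while the softirq loop reports RCU-bh quiescent states between handler runs. An illustrative, not literal, call site:

    /* Illustrative only: a CPU that is about to context-switch reports a
     * quiescent state; rcu_sched_qs() above forwards to rcu_preempt_qs(). */
    static void note_context_switch_sketch(void)
    {
            rcu_sched_qs(smp_processor_id());
    }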
@@ -110,15 +142,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 142static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 143
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 144static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
145static int rcu_pending(int cpu);
113 146
114/* 147/*
115 * Return the number of RCU batches processed thus far for debug & stats. 148 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 149 */
117long rcu_batches_completed(void) 150long rcu_batches_completed_sched(void)
118{ 151{
119 return rcu_state.completed; 152 return rcu_sched_state.completed;
120} 153}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 154EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 155
123/* 156/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 157 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +214,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 214 return 1;
182 } 215 }
183 216
217 /* If preemptable RCU, no point in sending reschedule IPI. */
218 if (rdp->preemptable)
219 return 0;
220
184 /* The CPU is online, so send it a reschedule IPI. */ 221 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 222 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 223 smp_send_reschedule(rdp->cpu);
@@ -193,7 +230,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 230#endif /* #ifdef CONFIG_SMP */
194 231
195#ifdef CONFIG_NO_HZ 232#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 233
198/** 234/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 235 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +249,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 249 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 250 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 251 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 252 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 253 local_irq_restore(flags);
218} 254}
219 255
@@ -232,7 +268,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 268 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 269 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 270 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 271 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 272 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 273 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 274}
@@ -251,7 +287,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 287 if (rdtp->dynticks & 0x1)
252 return; 288 return;
253 rdtp->dynticks_nmi++; 289 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 290 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 291 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 292}
257 293
@@ -270,7 +306,7 @@ void rcu_nmi_exit(void)
270 return; 306 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 307 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 308 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 309 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 310}
275 311
276/** 312/**
@@ -286,7 +322,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 322 if (rdtp->dynticks_nesting++)
287 return; 323 return;
288 rdtp->dynticks++; 324 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 325 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 326 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 327}
292 328
@@ -305,10 +341,10 @@ void rcu_irq_exit(void)
305 return; 341 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 342 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 343 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 344 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 345
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 346 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 347 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 348 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 349 set_need_resched();
314} 350}
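The WARN_ON_ONCE() conversions above all police one invariant: ->dynticks is incremented on every transition into or out of dynticks-idle, so it is even while the CPU is idle and odd otherwise, and ->dynticks_nmi follows the same convention for NMI handlers. That invariant is what lets a remote CPU credit an idle CPU with a quiescent state. A sketch of the idea, ignoring the separate NMI counter:

    /* Sketch: given two samples of a CPU's ->dynticks counter, decide
     * whether that CPU was, or passed through, dynticks idle in between.
     * An even snapshot means "idle at snapshot time"; any change from an
     * odd snapshot means the CPU entered idle at least once since then. */
    static int dynticks_passed_quiescent_sketch(int snap, int curr)
    {
            return (snap & 0x1) == 0 || curr != snap;
    }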
@@ -461,6 +497,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 497
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 498 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 499 for (; rnp_cur < rnp_end; rnp_cur++) {
500 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 501 if (rnp_cur->qsmask == 0)
465 continue; 502 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 503 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +506,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 506 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 507 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 508 smp_processor_id(), (long)(jiffies - rsp->gp_start));
509 trigger_all_cpu_backtrace();
510
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 511 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 512}
474 513
@@ -479,12 +518,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 518
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 519 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 520 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 521 trigger_all_cpu_backtrace();
522
483 spin_lock_irqsave(&rnp->lock, flags); 523 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 524 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 525 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 526 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 527 spin_unlock_irqrestore(&rnp->lock, flags);
528
488 set_need_resched(); /* kick ourselves to get things going. */ 529 set_need_resched(); /* kick ourselves to get things going. */
489} 530}
490 531
@@ -674,6 +715,19 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 715}
675 716
676/* 717/*
718 * Clean up after the prior grace period and let rcu_start_gp() start up
719 * the next grace period if one is needed. Note that the caller must
720 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
721 */
722static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
723 __releases(rnp->lock)
724{
725 rsp->completed = rsp->gpnum;
726 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
727 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
728}
729
730/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 731 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 732 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 733 * group must be represented by the same leaf rcu_node structure.
@@ -694,7 +748,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 748 return;
695 } 749 }
696 rnp->qsmask &= ~mask; 750 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 751 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 752
699 /* Other bits still set at this level, so done. */ 753 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 754 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -714,14 +768,10 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
714 768
715 /* 769 /*
716 * Get here if we are the last CPU to pass through a quiescent 770 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 771 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 772 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 773 */
722 rsp->completed = rsp->gpnum; 774 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 775}
726 776
727/* 777/*
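The rcu_preempted_readers() test added above slots into the walk that propagates quiescent-state reports up the tree: a node may only report its subtree quiet once every qsmask bit is clear and no reader tasks remain queued on it. A simplified paraphrase of cpu_quiet_msk(), with the early-exit and debug checks trimmed:

    static void cpu_quiet_msk_sketch(unsigned long mask, struct rcu_state *rsp,
                                     struct rcu_node *rnp, unsigned long flags)
    {
            for (;;) {
                    rnp->qsmask &= ~mask;   /* this CPU/group is now quiet */
                    if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
                            /* Subtree not yet quiet, so stop here. */
                            spin_unlock_irqrestore(&rnp->lock, flags);
                            return;
                    }
                    mask = rnp->grpmask;    /* our bit in the parent's qsmask */
                    if (rnp->parent == NULL)
                            break;          /* whole tree is quiet */
                    spin_unlock_irqrestore(&rnp->lock, flags);
                    rnp = rnp->parent;
                    spin_lock_irqsave(&rnp->lock, flags);
            }
            cpu_quiet_msk_finish(rsp, flags);  /* ends the grace period */
    }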
@@ -828,11 +878,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 878 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 879 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 880 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 881 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 882 break;
833 } 883 }
884 rcu_preempt_offline_tasks(rsp, rnp);
834 mask = rnp->grpmask; 885 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 886 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 887 rnp = rnp->parent;
837 } while (rnp != NULL); 888 } while (rnp != NULL);
838 lastcomp = rsp->completed; 889 lastcomp = rsp->completed;
@@ -845,7 +896,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
845 /* 896 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 897 * Move callbacks from the outgoing CPU to the running CPU.
847 * Note that the outgoing CPU is now quiescent, so it is now 898 * Note that the outgoing CPU is now quiescent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 899 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 900 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 901 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 902 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +927,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 927 */
877static void rcu_offline_cpu(int cpu) 928static void rcu_offline_cpu(int cpu)
878{ 929{
879 __rcu_offline_cpu(cpu, &rcu_state); 930 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 931 __rcu_offline_cpu(cpu, &rcu_bh_state);
932 rcu_preempt_offline_cpu(cpu);
881} 933}
882 934
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 935#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +1015,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 1015 */
964void rcu_check_callbacks(int cpu, int user) 1016void rcu_check_callbacks(int cpu, int user)
965{ 1017{
1018 if (!rcu_pending(cpu))
1019 return; /* if nothing for RCU to do. */
966 if (user || 1020 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1021 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1022 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1025,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1025 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1026 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1027 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1028 * a quiescent state, so note it.
975 * 1029 *
976 * No memory barrier is required here because both 1030 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1031 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1032 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1033 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1034 */
982 1035
983 rcu_qsctr_inc(cpu); 1036 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1037 rcu_bh_qs(cpu);
985 1038
986 } else if (!in_softirq()) { 1039 } else if (!in_softirq()) {
987 1040
@@ -989,11 +1042,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1042 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1043 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1044 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1045 * critical section, so note it.
993 */ 1046 */
994 1047
995 rcu_bh_qsctr_inc(cpu); 1048 rcu_bh_qs(cpu);
996 } 1049 }
1050 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1051 raise_softirq(RCU_SOFTIRQ);
998} 1052}
999 1053
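rcu_check_callbacks() itself runs from the scheduling-clock interrupt, which is what gives the user/idle tests above their meaning: a tick that arrived from user mode or the idle loop proves the CPU was outside any kernel read-side critical section. A paraphrased sketch of the tick-path caller (the real one lives in the timer code and does more per-tick bookkeeping):

    /* Paraphrased caller: user_tick indicates whether the scheduling-clock
     * interrupt arrived while the CPU was executing in user mode. */
    void update_process_times(int user_tick)
    {
            int cpu = smp_processor_id();

            rcu_check_callbacks(cpu, user_tick);
            /* ... remainder of per-tick bookkeeping ... */
    }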
@@ -1132,6 +1186,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1186{
1133 unsigned long flags; 1187 unsigned long flags;
1134 1188
1189 WARN_ON_ONCE(rdp->beenonline == 0);
1190
1135 /* 1191 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1192 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1193 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1226,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1226 */
1171 smp_mb(); /* See above block comment. */ 1227 smp_mb(); /* See above block comment. */
1172 1228
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1229 __rcu_process_callbacks(&rcu_sched_state,
1230 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1231 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1232 rcu_preempt_process_callbacks();
1175 1233
1176 /* 1234 /*
1177 * Memory references from any later RCU read-side critical sections 1235 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1285,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1285}
1228 1286
1229/* 1287/*
1230 * Queue an RCU callback for invocation after a grace period. 1288 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1289 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1290void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1291{
1234 __call_rcu(head, func, &rcu_state); 1292 __call_rcu(head, func, &rcu_sched_state);
1235} 1293}
1236EXPORT_SYMBOL_GPL(call_rcu); 1294EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1295
1238/* 1296/*
1239 * Queue an RCU callback for invocation after a quicker grace period. 1297 * Queue an RCU callback for invocation after a quicker grace period.
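A typical use of the renamed call_rcu_sched(), shown with a hypothetical structure: embed an rcu_head in the object being retired and free it from the callback once every CPU has passed through a context switch, idle, or user mode.

    struct foo {
            int data;
            struct rcu_head rcu;
    };

    static void foo_reclaim(struct rcu_head *head)
    {
            struct foo *fp = container_of(head, struct foo, rcu);

            kfree(fp);      /* runs only after a full RCU-sched grace period */
    }

    static void foo_retire(struct foo *fp)
    {
            call_rcu_sched(&fp->rcu, foo_reclaim);
    }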
@@ -1305,10 +1363,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1363 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1364 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1365 */
1308int rcu_pending(int cpu) 1366static int rcu_pending(int cpu)
1309{ 1367{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1368 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1369 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1370 rcu_preempt_pending(cpu);
1312} 1371}
1313 1372
1314/* 1373/*
@@ -1320,27 +1379,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1379int rcu_needs_cpu(int cpu)
1321{ 1380{
1322 /* RCU callbacks either ready or pending? */ 1381 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1382 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1383 per_cpu(rcu_bh_data, cpu).nxtlist ||
1384 rcu_preempt_needs_cpu(cpu);
1325} 1385}
1326 1386
1327/* 1387/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1388 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1389 */
1339static void __cpuinit 1390static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1391rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1392{
1342 unsigned long flags; 1393 unsigned long flags;
1343 int i; 1394 int i;
1395 struct rcu_data *rdp = rsp->rda[cpu];
1396 struct rcu_node *rnp = rcu_get_root(rsp);
1397
1398 /* Set up local state, ensuring consistent view of global state. */
1399 spin_lock_irqsave(&rnp->lock, flags);
1400 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1401 rdp->nxtlist = NULL;
1402 for (i = 0; i < RCU_NEXT_SIZE; i++)
1403 rdp->nxttail[i] = &rdp->nxtlist;
1404 rdp->qlen = 0;
1405#ifdef CONFIG_NO_HZ
1406 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1407#endif /* #ifdef CONFIG_NO_HZ */
1408 rdp->cpu = cpu;
1409 spin_unlock_irqrestore(&rnp->lock, flags);
1410}
1411
1412/*
1413 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1414 * offline event can be happening at a given time. Note also that we
1415 * can accept some slop in the rsp->completed access due to the fact
1416 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1417 */
1418static void __cpuinit
1419rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1420{
1421 unsigned long flags;
1344 long lastcomp; 1422 long lastcomp;
1345 unsigned long mask; 1423 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1424 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1432,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1432 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1433 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1434 rdp->beenonline = 1; /* We have now been online. */
1435 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1436 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1437 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1438 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1439
1370 /* 1440 /*
@@ -1405,16 +1475,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1405 1475
1406static void __cpuinit rcu_online_cpu(int cpu) 1476static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1477{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1478 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1479 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1480 rcu_preempt_init_percpu_data(cpu);
1411} 1481}
1412 1482
1413/* 1483/*
1414 * Handle CPU online/offline notifcation events. 1484 * Handle CPU online/offline notification events.
1415 */ 1485 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1486int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1487 unsigned long action, void *hcpu)
1418{ 1488{
1419 long cpu = (long)hcpu; 1489 long cpu = (long)hcpu;
1420 1490
@@ -1486,6 +1556,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1556 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1557 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1558 spin_lock_init(&rnp->lock);
1559 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1560 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1561 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1562 rnp->grplo = j * cpustride;
@@ -1503,16 +1574,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1574 j / rsp->levelspread[i - 1];
1504 } 1575 }
1505 rnp->level = i; 1576 rnp->level = i;
1577 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1578 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1579 }
1507 } 1580 }
1508} 1581}
1509 1582
1510/* 1583/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1584 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1585 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1586 * structure.
1513 */ 1587 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1588#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1589do { \
1590 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1591 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1592 j = 0; \
1518 for_each_possible_cpu(i) { \ 1593 for_each_possible_cpu(i) { \
@@ -1520,32 +1595,43 @@ do { \
1520 j++; \ 1595 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1596 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1597 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1598 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1599 } \
1524} while (0) 1600} while (0)
1525 1601
1526static struct notifier_block __cpuinitdata rcu_nb = { 1602#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1603
1528}; 1604void __init __rcu_init_preempt(void)
1605{
1606 int i; /* All used by RCU_INIT_FLAVOR(). */
1607 int j;
1608 struct rcu_node *rnp;
1609
1610 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1611}
1612
1613#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1614
1615void __init __rcu_init_preempt(void)
1616{
1617}
1618
1619#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1620
1530void __init __rcu_init(void) 1621void __init __rcu_init(void)
1531{ 1622{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1623 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1624 int j;
1534 struct rcu_node *rnp; 1625 struct rcu_node *rnp;
1535 1626
1536 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 1627 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1628#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1629 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1630#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1631 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1632 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1633 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1634 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549} 1635}
1550 1636
1551module_param(blimit, int, 0); 1637module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..bf8a6f9f134d 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
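A worked example of this geometry: with CONFIG_RCU_FANOUT=64 and NR_CPUS=128, RCU_FANOUT_SQ is 4096, so the second branch applies and NUM_RCU_LVLS=2 with NUM_RCU_LVL_0=1, NUM_RCU_LVL_1=(128+63)/64=2, and NUM_RCU_LVL_2=128. RCU_SUM is then 1+2+128+0=131 and NUM_RCU_NODES=131-128=3, that is, one root rcu_node fanning out to two leaf rcu_nodes of 64 CPUs each.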
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
174
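The partitioning above is easiest to see from the enqueue side: new callbacks are always appended at the RCU_NEXT_TAIL position and then age through the partitions as grace periods are assigned and complete. A simplified sketch of the append step performed by __call_rcu(), with irq disabling, queue-length policing, and grace-period forcing omitted:

    static void rcu_enqueue_sketch(struct rcu_data *rdp, struct rcu_head *head,
                                   void (*func)(struct rcu_head *))
    {
            head->func = func;
            head->next = NULL;
            *rdp->nxttail[RCU_NEXT_TAIL] = head;            /* append */
            rdp->nxttail[RCU_NEXT_TAIL] = &head->next;      /* advance tail */
            rdp->qlen++;
    }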
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
241
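Because the hierarchy is stored in this dense heap form, the root is just the first array element and ->level[i] points into the same array, so walking the tree never leaves the one static allocation. The accessor rcutree.c uses is essentially the following sketch:

    static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
    {
            return &rsp->node[0];   /* level 0 is always node[0] */
    }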
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..47789369ea59
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,532 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs_record(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc = 1;
71 rdp->passed_quiesc_completed = rdp->completed;
72}
73
74/*
75 * We have entered the scheduler or are between softirqs in ksoftirqd.
76 * If we are in an RCU read-side critical section, we need to reflect
77 * that in the state of the rcu_node structure corresponding to this CPU.
78 * Caller must disable hardirqs.
79 */
80static void rcu_preempt_qs(int cpu)
81{
82 struct task_struct *t = current;
83 int phase;
84 struct rcu_data *rdp;
85 struct rcu_node *rnp;
86
87 if (t->rcu_read_lock_nesting &&
88 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
89
90 /* Possibly blocking in an RCU read-side critical section. */
91 rdp = rcu_preempt_state.rda[cpu];
92 rnp = rdp->mynode;
93 spin_lock(&rnp->lock);
94 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
95 t->rcu_blocked_node = rnp;
96
97 /*
98 * If this CPU has already checked in, then this task
99 * will hold up the next grace period rather than the
100 * current grace period. Queue the task accordingly.
101 * If the task is queued for the current grace period
102 * (i.e., this CPU has not yet passed through a quiescent
103 * state for the current grace period), then as long
104 * as that task remains queued, the current grace period
105 * cannot end.
106 */
107 phase = !(rnp->qsmask & rdp->grpmask) ^ (rnp->gpnum & 0x1);
108 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
109 smp_mb(); /* Ensure later ctxt swtch seen after above. */
110 spin_unlock(&rnp->lock);
111 }
112
113 /*
114 * Either we were not in an RCU read-side critical section to
115 * begin with, or we have now recorded that critical section
116 * globally. Either way, we can now note a quiescent state
117 * for this CPU. Again, if we were in an RCU read-side critical
118 * section, and if that critical section was blocking the current
119 * grace period, then the fact that the task has been enqueued
120 * means that we continue to block the current grace period.
121 */
122 rcu_preempt_qs_record(cpu);
123 t->rcu_read_unlock_special &= ~(RCU_READ_UNLOCK_NEED_QS |
124 RCU_READ_UNLOCK_GOT_QS);
125}
126
127/*
128 * Tree-preemptable RCU implementation for rcu_read_lock().
129 * Just increment ->rcu_read_lock_nesting, shared state will be updated
130 * if we block.
131 */
132void __rcu_read_lock(void)
133{
134 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
135 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
136}
137EXPORT_SYMBOL_GPL(__rcu_read_lock);
138
139static void rcu_read_unlock_special(struct task_struct *t)
140{
141 int empty;
142 unsigned long flags;
143 unsigned long mask;
144 struct rcu_node *rnp;
145 int special;
146
147 /* NMI handlers cannot block and cannot safely manipulate state. */
148 if (in_nmi())
149 return;
150
151 local_irq_save(flags);
152
153 /*
154 * If RCU core is waiting for this CPU to exit critical section,
155 * let it know that we have done so.
156 */
157 special = t->rcu_read_unlock_special;
158 if (special & RCU_READ_UNLOCK_NEED_QS) {
159 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_GOT_QS;
161 }
162
163 /* Hardware IRQ handlers cannot block. */
164 if (in_irq()) {
165 local_irq_restore(flags);
166 return;
167 }
168
169 /* Clean up if blocked during RCU read-side critical section. */
170 if (special & RCU_READ_UNLOCK_BLOCKED) {
171 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
172
173 /*
174 * Remove this task from the list it blocked on. The
175 * task can migrate while we acquire the lock, but at
176 * most one time. So at most two passes through loop.
177 */
178 for (;;) {
179 rnp = t->rcu_blocked_node;
180 spin_lock(&rnp->lock);
181 if (rnp == t->rcu_blocked_node)
182 break;
183 spin_unlock(&rnp->lock);
184 }
185 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
186 list_del_init(&t->rcu_node_entry);
187 t->rcu_blocked_node = NULL;
188
189 /*
190 * If this was the last task on the current list, and if
191 * we aren't waiting on any CPUs, report the quiescent state.
192 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
193 * drop rnp->lock and restore irq.
194 */
195 if (!empty && rnp->qsmask == 0 &&
196 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
197 t->rcu_read_unlock_special &=
198 ~(RCU_READ_UNLOCK_NEED_QS |
199 RCU_READ_UNLOCK_GOT_QS);
200 if (rnp->parent == NULL) {
201 /* Only one rcu_node in the tree. */
202 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
203 return;
204 }
205 /* Report up the rest of the hierarchy. */
206 mask = rnp->grpmask;
207 spin_unlock_irqrestore(&rnp->lock, flags);
208 rnp = rnp->parent;
209 spin_lock_irqsave(&rnp->lock, flags);
210 cpu_quiet_msk(mask, &rcu_preempt_state, rnp, flags);
211 return;
212 }
213 spin_unlock(&rnp->lock);
214 }
215 local_irq_restore(flags);
216}
217
218/*
219 * Tree-preemptable RCU implementation for rcu_read_unlock().
220 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
221 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
222 * invoke rcu_read_unlock_special() to clean up after a context switch
223 * in an RCU read-side critical section and other special cases.
224 */
225void __rcu_read_unlock(void)
226{
227 struct task_struct *t = current;
228
229 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
230 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
231 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
232 rcu_read_unlock_special(t);
233}
234EXPORT_SYMBOL_GPL(__rcu_read_unlock);
235
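The reader-side pattern built on these two primitives is the usual one; what is new is that the critical section may now be preempted, with __rcu_read_unlock() doing the deferred bookkeeping when that has happened. An illustrative reader, using a hypothetical structure and global pointer:

    struct cfg {
            int threshold;
    };

    static struct cfg *active_cfg;  /* assumed published via rcu_assign_pointer() */

    static int read_threshold(void)
    {
            struct cfg *c;
            int val = -1;

            rcu_read_lock();                /* may be preempted inside */
            c = rcu_dereference(active_cfg);
            if (c)
                    val = c->threshold;
            rcu_read_unlock();              /* may take the slow path above */
            return val;
    }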
236#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
237
238/*
239 * Scan the current list of tasks blocked within RCU read-side critical
240 * sections, printing out the tid of each.
241 */
242static void rcu_print_task_stall(struct rcu_node *rnp)
243{
244 unsigned long flags;
245 struct list_head *lp;
246 int phase = rnp->gpnum & 0x1;
247 struct task_struct *t;
248
249 if (!list_empty(&rnp->blocked_tasks[phase])) {
250 spin_lock_irqsave(&rnp->lock, flags);
251 phase = rnp->gpnum & 0x1; /* re-read under lock. */
252 lp = &rnp->blocked_tasks[phase];
253 list_for_each_entry(t, lp, rcu_node_entry)
254 printk(" P%d", t->pid);
255 spin_unlock_irqrestore(&rnp->lock, flags);
256 }
257}
258
259#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
260
261/*
262 * Check for preempted RCU readers for the specified rcu_node structure.
263 * If the caller needs a reliable answer, it must hold the rcu_node's
264 * ->lock.
265 */
266static int rcu_preempted_readers(struct rcu_node *rnp)
267{
268 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
269}
270
271#ifdef CONFIG_HOTPLUG_CPU
272
273/*
274 * Handle tasklist migration for case in which all CPUs covered by the
275 * specified rcu_node have gone offline. Move them up to the root
276 * rcu_node. The reason for not just moving them to the immediate
277 * parent is to remove the need for rcu_read_unlock_special() to
278 * make more than two attempts to acquire the target rcu_node's lock.
279 *
280 * The caller must hold rnp->lock with irqs disabled.
281 */
282static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
283 struct rcu_node *rnp)
284{
285 int i;
286 struct list_head *lp;
287 struct list_head *lp_root;
288 struct rcu_node *rnp_root = rcu_get_root(rsp);
289 struct task_struct *tp;
290
291 if (rnp == rnp_root) {
292 WARN_ONCE(1, "Last CPU thought to be offlined?");
293 return; /* Shouldn't happen: at least one CPU online. */
294 }
295
296 /*
297 * Move tasks up to root rcu_node. Rely on the fact that the
298 * root rcu_node can be at most one ahead of the rest of the
299 * rcu_nodes in terms of gpnum value. This fact allows us to
300 * move the blocked_tasks[] array directly, element by element.
301 */
302 for (i = 0; i < 2; i++) {
303 lp = &rnp->blocked_tasks[i];
304 lp_root = &rnp_root->blocked_tasks[i];
305 while (!list_empty(lp)) {
306 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
307 spin_lock(&rnp_root->lock); /* irqs already disabled */
308 list_del(&tp->rcu_node_entry);
309 tp->rcu_blocked_node = rnp_root;
310 list_add(&tp->rcu_node_entry, lp_root);
311 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
312 }
313 }
314}
315
316/*
317 * Do CPU-offline processing for preemptable RCU.
318 */
319static void rcu_preempt_offline_cpu(int cpu)
320{
321 __rcu_offline_cpu(cpu, &rcu_preempt_state);
322}
323
324#endif /* #ifdef CONFIG_HOTPLUG_CPU */
325
326/*
327 * Check for a quiescent state from the current CPU. When a task blocks,
328 * the task is recorded in the corresponding CPU's rcu_node structure,
329 * which is checked elsewhere.
330 *
331 * Caller must disable hard irqs.
332 */
333static void rcu_preempt_check_callbacks(int cpu)
334{
335 struct task_struct *t = current;
336
337 if (t->rcu_read_lock_nesting == 0) {
338 t->rcu_read_unlock_special &=
339 ~(RCU_READ_UNLOCK_NEED_QS | RCU_READ_UNLOCK_GOT_QS);
340 rcu_preempt_qs_record(cpu);
341 return;
342 }
343 if (per_cpu(rcu_preempt_data, cpu).qs_pending) {
344 if (t->rcu_read_unlock_special & RCU_READ_UNLOCK_GOT_QS) {
345 rcu_preempt_qs_record(cpu);
346 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_GOT_QS;
347 } else if (!(t->rcu_read_unlock_special &
348 RCU_READ_UNLOCK_NEED_QS)) {
349 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
350 }
351 }
352}
353
354/*
355 * Process callbacks for preemptable RCU.
356 */
357static void rcu_preempt_process_callbacks(void)
358{
359 __rcu_process_callbacks(&rcu_preempt_state,
360 &__get_cpu_var(rcu_preempt_data));
361}
362
363/*
364 * Queue a preemptable-RCU callback for invocation after a grace period.
365 */
366void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
367{
368 __call_rcu(head, func, &rcu_preempt_state);
369}
370EXPORT_SYMBOL_GPL(call_rcu);
371
372/*
373 * Check to see if there is any immediate preemptable-RCU-related work
374 * to be done.
375 */
376static int rcu_preempt_pending(int cpu)
377{
378 return __rcu_pending(&rcu_preempt_state,
379 &per_cpu(rcu_preempt_data, cpu));
380}
381
382/*
383 * Does preemptable RCU need the CPU to stay out of dynticks mode?
384 */
385static int rcu_preempt_needs_cpu(int cpu)
386{
387 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
388}
389
390/*
391 * Initialize preemptable RCU's per-CPU data.
392 */
393static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
394{
395 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
396}
397
398/*
399 * Check for a task exiting while in a preemptable-RCU read-side
400 * critical section, clean up if so. No need to issue warnings,
401 * as debug_check_no_locks_held() already does this if lockdep
402 * is enabled.
403 */
404void exit_rcu(void)
405{
406 struct task_struct *t = current;
407
408 if (t->rcu_read_lock_nesting == 0)
409 return;
410 t->rcu_read_lock_nesting = 1;
411 rcu_read_unlock();
412}
413
414#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
415
416/*
417 * Tell them what RCU they are running.
418 */
419static inline void rcu_bootup_announce(void)
420{
421 printk(KERN_INFO "Hierarchical RCU implementation.\n");
422}
423
424/*
425 * Return the number of RCU batches processed thus far for debug & stats.
426 */
427long rcu_batches_completed(void)
428{
429 return rcu_batches_completed_sched();
430}
431EXPORT_SYMBOL_GPL(rcu_batches_completed);
432
433/*
434 * Because preemptable RCU does not exist, we never have to check for
435 * CPUs being in quiescent states.
436 */
437static void rcu_preempt_qs(int cpu)
438{
439}
440
441#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
442
443/*
444 * Because preemptable RCU does not exist, we never have to check for
445 * tasks blocked within RCU read-side critical sections.
446 */
447static void rcu_print_task_stall(struct rcu_node *rnp)
448{
449}
450
451#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
452
453/*
454 * Because preemptable RCU does not exist, there are never any preempted
455 * RCU readers.
456 */
457static int rcu_preempted_readers(struct rcu_node *rnp)
458{
459 return 0;
460}
461
462#ifdef CONFIG_HOTPLUG_CPU
463
464/*
465 * Because preemptable RCU does not exist, it never needs to migrate
466 * tasks that were blocked within RCU read-side critical sections.
467 */
468static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
469 struct rcu_node *rnp)
470{
471}
472
473/*
474 * Because preemptable RCU does not exist, it never needs CPU-offline
475 * processing.
476 */
477static void rcu_preempt_offline_cpu(int cpu)
478{
479}
480
481#endif /* #ifdef CONFIG_HOTPLUG_CPU */
482
483/*
484 * Because preemptable RCU does not exist, it never has any callbacks
485 * to check.
486 */
487void rcu_preempt_check_callbacks(int cpu)
488{
489}
490
491/*
492 * Because preemptable RCU does not exist, it never has any callbacks
493 * to process.
494 */
495void rcu_preempt_process_callbacks(void)
496{
497}
498
499/*
500 * In classic RCU, call_rcu() is just call_rcu_sched().
501 */
502void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
503{
504 call_rcu_sched(head, func);
505}
506EXPORT_SYMBOL_GPL(call_rcu);
507
508/*
509 * Because preemptable RCU does not exist, it never has any work to do.
510 */
511static int rcu_preempt_pending(int cpu)
512{
513 return 0;
514}
515
516/*
517 * Because preemptable RCU does not exist, it never needs any CPU.
518 */
519static int rcu_preempt_needs_cpu(int cpu)
520{
521 return 0;
522}
523
524/*
525 * Because preemptable RCU does not exist, there is no per-CPU
526 * data to initialize.
527 */
528static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
529{
530}
531
532#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
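Taken together, the two halves of this file pair every preemptable-RCU operation with an empty or pass-through stub under the opposite Kconfig choice, so rcutree.c can call rcu_preempt_check_callbacks(), rcu_preempt_pending() and friends without any #ifdefs of its own. A generic, hypothetical sketch of that pattern (CONFIG_FEATURE_X and feature_x_hook() are invented names):

#ifdef CONFIG_FEATURE_X

static void feature_x_hook(int cpu)
{
	/* real per-CPU work when the feature is configured in */
}

#else /* #ifdef CONFIG_FEATURE_X */

static void feature_x_hook(int cpu)
{
	/* no-op stub: callers stay free of #ifdef clutter */
}

#endif /* #else #ifdef CONFIG_FEATURE_X */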
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..0ea1bff69727 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
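The rewritten init/cleanup above no longer tracks a dentry per file: every file lives under the single "rcu" directory, so one debugfs_remove_recursive() call on that directory tears the whole tree down, both on the error path and at module exit. A hedged sketch of the same idiom with hypothetical names (mytrace, my_fops and the file names are assumptions; debugfs_create_dir(), debugfs_create_file() and debugfs_remove_recursive() are the real interfaces, and the error path mirrors the one above):

#include <linux/debugfs.h>
#include <linux/init.h>

static struct dentry *mydir;

static int __init mytrace_init(void)
{
	mydir = debugfs_create_dir("mytrace", NULL);
	if (!mydir)
		goto free_out;
	if (!debugfs_create_file("stats", 0444, mydir, NULL, &my_fops))
		goto free_out;
	if (!debugfs_create_file("events", 0444, mydir, NULL, &my_fops))
		goto free_out;
	return 0;

free_out:
	debugfs_remove_recursive(mydir);	/* removes the directory and everything under it */
	return 1;
}

static void __exit mytrace_cleanup(void)
{
	debugfs_remove_recursive(mydir);
}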
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..e27a53685ed9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,8 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2); 122static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126 123
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 124static inline int rt_policy(int policy)
148{ 125{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +286,8 @@ void set_tg_uid(struct user_struct *user)
309 286
310/* 287/*
311 * Root task group. 288 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 289 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 290 * be a child to this group.
314 */ 291 */
315struct task_group root_task_group; 292struct task_group root_task_group;
316 293
@@ -318,7 +295,7 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 295/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 296static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 297/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 298static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 299#endif /* CONFIG_FAIR_GROUP_SCHED */
323 300
324#ifdef CONFIG_RT_GROUP_SCHED 301#ifdef CONFIG_RT_GROUP_SCHED
@@ -616,6 +593,7 @@ struct rq {
616 593
617 unsigned char idle_at_tick; 594 unsigned char idle_at_tick;
618 /* For active balancing */ 595 /* For active balancing */
596 int post_schedule;
619 int active_balance; 597 int active_balance;
620 int push_cpu; 598 int push_cpu;
621 /* cpu of this runqueue: */ 599 /* cpu of this runqueue: */
@@ -626,6 +604,9 @@ struct rq {
626 604
627 struct task_struct *migration_thread; 605 struct task_struct *migration_thread;
628 struct list_head migration_queue; 606 struct list_head migration_queue;
607
608 u64 rt_avg;
609 u64 age_stamp;
629#endif 610#endif
630 611
631 /* calc_load related fields */ 612 /* calc_load related fields */
@@ -693,6 +674,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 674#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 675#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 676#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
677#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 678
697inline void update_rq_clock(struct rq *rq) 679inline void update_rq_clock(struct rq *rq)
698{ 680{
@@ -861,6 +843,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 843unsigned int sysctl_sched_shares_thresh = 4;
862 844
863/* 845/*
846 * period over which we average the RT time consumption, measured
847 * in ms.
848 *
849 * default: 1s
850 */
851const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
852
853/*
864 * period over which we measure -rt task cpu usage in us. 854 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 855 * default: 1s
866 */ 856 */
@@ -1278,12 +1268,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1268}
1279#endif /* CONFIG_NO_HZ */ 1269#endif /* CONFIG_NO_HZ */
1280 1270
1271static u64 sched_avg_period(void)
1272{
1273 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1274}
1275
1276static void sched_avg_update(struct rq *rq)
1277{
1278 s64 period = sched_avg_period();
1279
1280 while ((s64)(rq->clock - rq->age_stamp) > period) {
1281 rq->age_stamp += period;
1282 rq->rt_avg /= 2;
1283 }
1284}
1285
1286static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1287{
1288 rq->rt_avg += rt_delta;
1289 sched_avg_update(rq);
1290}
1291
1281#else /* !CONFIG_SMP */ 1292#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1293static void resched_task(struct task_struct *p)
1283{ 1294{
1284 assert_spin_locked(&task_rq(p)->lock); 1295 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1296 set_tsk_need_resched(p);
1286} 1297}
1298
1299static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1300{
1301}
1287#endif /* CONFIG_SMP */ 1302#endif /* CONFIG_SMP */
1288 1303
1289#if BITS_PER_LONG == 32 1304#if BITS_PER_LONG == 32
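The helpers just added keep a decaying sum of RT runtime per runqueue: sched_rt_avg_update() accumulates into rq->rt_avg, and sched_avg_update() halves that sum once per half of sysctl_sched_time_avg, so old RT activity fades geometrically; the average later feeds scale_rt_power() when cpu_power is computed. A small self-contained sketch with invented numbers (it only mirrors the decay loop above, it is not kernel code):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_MSEC	1000000ULL

int main(void)
{
	uint64_t rt_avg = 200 * NSEC_PER_MSEC;	/* 200ms of accumulated RT time */
	uint64_t age_stamp = 0;
	uint64_t clock = 1200 * NSEC_PER_MSEC;	/* rq clock "now" */
	int64_t period = 500 * NSEC_PER_MSEC;	/* sched_avg_period() at the 1s default */

	while ((int64_t)(clock - age_stamp) > period) {
		age_stamp += period;
		rt_avg /= 2;			/* 200ms -> 100ms -> 50ms */
	}
	printf("decayed rt_avg: %llu ms\n",
	       (unsigned long long)(rt_avg / NSEC_PER_MSEC));
	return 0;
}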
@@ -1513,28 +1528,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1528
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1529#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1530
1531struct update_shares_data {
1532 unsigned long rq_weight[NR_CPUS];
1533};
1534
1535static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1536
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1537static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1538
1518/* 1539/*
1519 * Calculate and set the cpu's group shares. 1540 * Calculate and set the cpu's group shares.
1520 */ 1541 */
1521static void 1542static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1543 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1544 unsigned long sd_rq_weight,
1545 struct update_shares_data *usd)
1524{ 1546{
1525 unsigned long shares; 1547 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1548 int boost = 0;
1527
1528 if (!tg->se[cpu])
1529 return;
1530 1549
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1550 rq_weight = usd->rq_weight[cpu];
1551 if (!rq_weight) {
1552 boost = 1;
1553 rq_weight = NICE_0_LOAD;
1554 }
1532 1555
1533 /* 1556 /*
1534 * \Sum shares * rq_weight 1557 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1558 * shares_i = -----------------------------
1536 * \Sum rq_weight 1559 * \Sum_j rq_weight_j
1537 *
1538 */ 1560 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1561 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1562 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
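To make the reworked per-CPU shares computation above concrete (illustrative numbers only): with tg->shares = 1024, a local rq_weight of 2048 and a domain-wide rq_weight sum of 4096, this CPU gets 1024 * 2048 / 4096 = 512 shares before the MIN_SHARES/MAX_SHARES clamp. The new per-CPU update_shares_data buffer simply caches the rq_weight samples taken in tg_shares_up()'s first pass so the second pass over the domain can reuse them, and an idle CPU (the boost case) is weighted as NICE_0_LOAD for the ratio but has both rq_weight and shares written back as zero.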
@@ -1545,8 +1567,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1567 unsigned long flags;
1546 1568
1547 spin_lock_irqsave(&rq->lock, flags); 1569 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1570 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1571 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1572 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1573 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1574 }
@@ -1559,22 +1581,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1581 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1582static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1583{
1562 unsigned long weight, rq_weight = 0; 1584 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1585 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1586 struct sched_domain *sd = data;
1587 unsigned long flags;
1565 int i; 1588 int i;
1566 1589
1590 if (!tg->se[0])
1591 return 0;
1592
1593 local_irq_save(flags);
1594 usd = &__get_cpu_var(update_shares_data);
1595
1567 for_each_cpu(i, sched_domain_span(sd)) { 1596 for_each_cpu(i, sched_domain_span(sd)) {
1597 weight = tg->cfs_rq[i]->load.weight;
1598 usd->rq_weight[i] = weight;
1599
1568 /* 1600 /*
1569 * If there are currently no tasks on the cpu pretend there 1601 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1602 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1603 * run here it will not get delayed by group starvation.
1572 */ 1604 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1605 if (!weight)
1575 weight = NICE_0_LOAD; 1606 weight = NICE_0_LOAD;
1576 1607
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1608 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1609 shares += tg->cfs_rq[i]->shares;
1580 } 1610 }
@@ -1586,7 +1616,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1616 shares = tg->shares;
1587 1617
1588 for_each_cpu(i, sched_domain_span(sd)) 1618 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1619 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1620
1621 local_irq_restore(flags);
1590 1622
1591 return 0; 1623 return 0;
1592} 1624}
@@ -1616,8 +1648,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1648
1617static void update_shares(struct sched_domain *sd) 1649static void update_shares(struct sched_domain *sd)
1618{ 1650{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1651 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1652 u64 now;
1653
1654 if (root_task_group_empty())
1655 return;
1656
1657 now = cpu_clock(raw_smp_processor_id());
1658 elapsed = now - sd->last_update;
1621 1659
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1660 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1661 sd->last_update = now;
@@ -1627,6 +1665,9 @@ static void update_shares(struct sched_domain *sd)
1627 1665
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1666static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1667{
1668 if (root_task_group_empty())
1669 return;
1670
1630 spin_unlock(&rq->lock); 1671 spin_unlock(&rq->lock);
1631 update_shares(sd); 1672 update_shares(sd);
1632 spin_lock(&rq->lock); 1673 spin_lock(&rq->lock);
@@ -1634,6 +1675,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1675
1635static void update_h_load(long cpu) 1676static void update_h_load(long cpu)
1636{ 1677{
1678 if (root_task_group_empty())
1679 return;
1680
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1681 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1682}
1639 1683
@@ -2268,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2268 } 2312 }
2269 2313
2270 /* Adjust by relative CPU power of the group */ 2314 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group, 2315 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2272 avg_load * SCHED_LOAD_SCALE);
2273 2316
2274 if (local_group) { 2317 if (local_group) {
2275 this_load = avg_load; 2318 this_load = avg_load;
@@ -2637,9 +2680,32 @@ void sched_fork(struct task_struct *p, int clone_flags)
2637 set_task_cpu(p, cpu); 2680 set_task_cpu(p, cpu);
2638 2681
2639 /* 2682 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2683 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2684 */
2642 p->prio = current->normal_prio; 2685 p->prio = current->normal_prio;
2686
2687 /*
2688 * Revert to default priority/policy on fork if requested.
2689 */
2690 if (unlikely(p->sched_reset_on_fork)) {
2691 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2692 p->policy = SCHED_NORMAL;
2693
2694 if (p->normal_prio < DEFAULT_PRIO)
2695 p->prio = DEFAULT_PRIO;
2696
2697 if (PRIO_TO_NICE(p->static_prio) < 0) {
2698 p->static_prio = NICE_TO_PRIO(0);
2699 set_load_weight(p);
2700 }
2701
2702 /*
2703 * We don't need the reset flag anymore after the fork. It has
2704 * fulfilled its duty:
2705 */
2706 p->sched_reset_on_fork = 0;
2707 }
2708
2643 if (!rt_prio(p->prio)) 2709 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2710 p->sched_class = &fair_sched_class;
2645 2711
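The block above implements the new per-task sched_reset_on_fork flag: a child forked from a SCHED_FIFO/SCHED_RR or negative-nice parent is demoted to SCHED_NORMAL at default priority, and the flag is cleared once it has done its job. As a hedged sketch of how userspace requests this behaviour (the flag value is an assumption taken from the sched.h definition introduced alongside this change, guarded here in case the toolchain headers already provide it):

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000	/* assumed value from <linux/sched.h> */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 50 };

	/* RT scheduling for this task only; children revert to SCHED_NORMAL. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) != 0)
		perror("sched_setscheduler");
	return 0;
}

The later __sched_setscheduler() hunk in this patch strips the bit from the requested policy, remembers it in p->sched_reset_on_fork, and refuses to let an unprivileged caller clear a flag that is already set.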
@@ -2796,12 +2862,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2862{
2797 struct mm_struct *mm = rq->prev_mm; 2863 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2864 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2865
2806 rq->prev_mm = NULL; 2866 rq->prev_mm = NULL;
2807 2867
@@ -2820,10 +2880,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2820 finish_arch_switch(prev); 2880 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2881 perf_counter_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2882 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2883
2828 fire_sched_in_preempt_notifiers(current); 2884 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2885 if (mm)
@@ -2838,6 +2894,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2894 }
2839} 2895}
2840 2896
2897#ifdef CONFIG_SMP
2898
2899/* assumes rq->lock is held */
2900static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2901{
2902 if (prev->sched_class->pre_schedule)
2903 prev->sched_class->pre_schedule(rq, prev);
2904}
2905
2906/* rq->lock is NOT held, but preemption is disabled */
2907static inline void post_schedule(struct rq *rq)
2908{
2909 if (rq->post_schedule) {
2910 unsigned long flags;
2911
2912 spin_lock_irqsave(&rq->lock, flags);
2913 if (rq->curr->sched_class->post_schedule)
2914 rq->curr->sched_class->post_schedule(rq);
2915 spin_unlock_irqrestore(&rq->lock, flags);
2916
2917 rq->post_schedule = 0;
2918 }
2919}
2920
2921#else
2922
2923static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2924{
2925}
2926
2927static inline void post_schedule(struct rq *rq)
2928{
2929}
2930
2931#endif
2932
2841/** 2933/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2934 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2935 * @prev: the thread we just switched away from.
@@ -2848,6 +2940,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2940 struct rq *rq = this_rq();
2849 2941
2850 finish_task_switch(rq, prev); 2942 finish_task_switch(rq, prev);
2943
2944 /*
2945 * FIXME: do we need to worry about rq being invalidated by the
2946 * task_switch?
2947 */
2948 post_schedule(rq);
2949
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2950#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2951 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2952 preempt_enable();
@@ -3379,9 +3478,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3478{
3380 const struct sched_class *class; 3479 const struct sched_class *class;
3381 3480
3382 for (class = sched_class_highest; class; class = class->next) 3481 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3482 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3483 return 1;
3484 }
3385 3485
3386 return 0; 3486 return 0;
3387} 3487}
@@ -3544,7 +3644,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3644 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3645 * from other group and save more power
3546 */ 3646 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3647 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3648 return;
3549 3649
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3650 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3611,6 +3711,77 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3611} 3711}
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3712#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3713
3714unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3715{
3716 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3717 unsigned long smt_gain = sd->smt_gain;
3718
3719 smt_gain /= weight;
3720
3721 return smt_gain;
3722}
3723
3724unsigned long scale_rt_power(int cpu)
3725{
3726 struct rq *rq = cpu_rq(cpu);
3727 u64 total, available;
3728
3729 sched_avg_update(rq);
3730
3731 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3732 available = total - rq->rt_avg;
3733
3734 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3735 total = SCHED_LOAD_SCALE;
3736
3737 total >>= SCHED_LOAD_SHIFT;
3738
3739 return div_u64(available, total);
3740}
3741
3742static void update_cpu_power(struct sched_domain *sd, int cpu)
3743{
3744 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3745 unsigned long power = SCHED_LOAD_SCALE;
3746 struct sched_group *sdg = sd->groups;
3747
3748 /* here we could scale based on cpufreq */
3749
3750 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3751 power *= arch_scale_smt_power(sd, cpu);
3752 power >>= SCHED_LOAD_SHIFT;
3753 }
3754
3755 power *= scale_rt_power(cpu);
3756 power >>= SCHED_LOAD_SHIFT;
3757
3758 if (!power)
3759 power = 1;
3760
3761 sdg->cpu_power = power;
3762}
3763
3764static void update_group_power(struct sched_domain *sd, int cpu)
3765{
3766 struct sched_domain *child = sd->child;
3767 struct sched_group *group, *sdg = sd->groups;
3768 unsigned long power;
3769
3770 if (!child) {
3771 update_cpu_power(sd, cpu);
3772 return;
3773 }
3774
3775 power = 0;
3776
3777 group = child->groups;
3778 do {
3779 power += group->cpu_power;
3780 group = group->next;
3781 } while (group != child->groups);
3782
3783 sdg->cpu_power = power;
3784}
3614 3785
3615/** 3786/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3787 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
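update_cpu_power() above turns cpu_power into a multiplicative scale: start from SCHED_LOAD_SCALE (1024), shrink it for SMT siblings via arch_scale_smt_power(), then shrink it again by the fraction of recent time not consumed by RT tasks, as reported by scale_rt_power(). Illustrative arithmetic (numbers invented, and the default smt_gain of roughly 1.15 * SCHED_LOAD_SCALE is an assumption about the topology defaults): a two-thread SMT core gives each sibling about 1178 / 2 = 589; if a quarter of the averaged window went to RT work, scale_rt_power() returns about 768/1024, so the final cpu_power is 589 * 768 >> 10, roughly 441. update_group_power() then just sums these per-CPU values up the sched-domain hierarchy.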
@@ -3624,7 +3795,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3795 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3796 * @sgs: variable to hold the statistics for this group.
3626 */ 3797 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3798static inline void update_sg_lb_stats(struct sched_domain *sd,
3799 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3800 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3801 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3802 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3807,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3807 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3808 unsigned long avg_load_per_task;
3637 3809
3638 if (local_group) 3810 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3811 balance_cpu = group_first_cpu(group);
3812 if (balance_cpu == this_cpu)
3813 update_group_power(sd, this_cpu);
3814 }
3640 3815
3641 /* Tally up the load of all CPUs in the group */ 3816 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3817 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3860,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3860 }
3686 3861
3687 /* Adjust by relative CPU power of the group */ 3862 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3863 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3864
3691 3865
3692 /* 3866 /*
@@ -3698,14 +3872,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3872 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3873 * the hierarchy?
3700 */ 3874 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3875 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3876 group->cpu_power;
3703 3877
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3878 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3879 sgs->group_imb = 1;
3706 3880
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3881 sgs->group_capacity =
3708 3882 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3883}
3710 3884
3711/** 3885/**
@@ -3723,9 +3897,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3897 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3898 struct sd_lb_stats *sds)
3725{ 3899{
3900 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3901 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3902 struct sg_lb_stats sgs;
3728 int load_idx; 3903 int load_idx, prefer_sibling = 0;
3904
3905 if (child && child->flags & SD_PREFER_SIBLING)
3906 prefer_sibling = 1;
3729 3907
3730 init_sd_power_savings_stats(sd, sds, idle); 3908 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3909 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3914,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3914 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3915 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3916 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3917 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3918 local_group, cpus, balance, &sgs);
3741 3919
3742 if (local_group && balance && !(*balance)) 3920 if (local_group && balance && !(*balance))
3743 return; 3921 return;
3744 3922
3745 sds->total_load += sgs.group_load; 3923 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3924 sds->total_pwr += group->cpu_power;
3925
3926 /*
3927 * In case the child domain prefers tasks go to siblings
3928 * first, lower the group capacity to one so that we'll try
3929 * and move all the excess tasks away.
3930 */
3931 if (prefer_sibling)
3932 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3933
3748 if (local_group) { 3934 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3935 sds->this_load = sgs.avg_load;
@@ -3763,7 +3949,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3949 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3950 group = group->next;
3765 } while (group != sd->groups); 3951 } while (group != sd->groups);
3766
3767} 3952}
3768 3953
3769/** 3954/**
@@ -3801,28 +3986,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3986 * moving them.
3802 */ 3987 */
3803 3988
3804 pwr_now += sds->busiest->__cpu_power * 3989 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3990 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3991 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3992 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3993 pwr_now /= SCHED_LOAD_SCALE;
3809 3994
3810 /* Amount of load we'd subtract */ 3995 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3996 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3997 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3998 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3999 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 4000 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 4001
3817 /* Amount of load we'd add */ 4002 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 4003 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 4004 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 4005 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 4006 sds->this->cpu_power;
3822 else 4007 else
3823 tmp = sg_div_cpu_power(sds->this, 4008 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 4009 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 4010 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 4011 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 4012 pwr_move /= SCHED_LOAD_SCALE;
3828 4013
@@ -3857,8 +4042,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 4042 sds->max_load - sds->busiest_load_per_task);
3858 4043
3859 /* How much load to actually move to equalise the imbalance */ 4044 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 4045 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 4046 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 4047 / SCHED_LOAD_SCALE;
3863 4048
3864 /* 4049 /*
@@ -3976,6 +4161,26 @@ ret:
3976 return NULL; 4161 return NULL;
3977} 4162}
3978 4163
4164static struct sched_group *group_of(int cpu)
4165{
4166 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
4167
4168 if (!sd)
4169 return NULL;
4170
4171 return sd->groups;
4172}
4173
4174static unsigned long power_of(int cpu)
4175{
4176 struct sched_group *group = group_of(cpu);
4177
4178 if (!group)
4179 return SCHED_LOAD_SCALE;
4180
4181 return group->cpu_power;
4182}
4183
3979/* 4184/*
3980 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4185 * find_busiest_queue - find the busiest runqueue among the cpus in group.
3981 */ 4186 */
@@ -3988,15 +4193,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4193 int i;
3989 4194
3990 for_each_cpu(i, sched_group_cpus(group)) { 4195 for_each_cpu(i, sched_group_cpus(group)) {
4196 unsigned long power = power_of(i);
4197 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4198 unsigned long wl;
3992 4199
3993 if (!cpumask_test_cpu(i, cpus)) 4200 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4201 continue;
3995 4202
3996 rq = cpu_rq(i); 4203 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4204 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4205 wl /= power;
3998 4206
3999 if (rq->nr_running == 1 && wl > imbalance) 4207 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4208 continue;
4001 4209
4002 if (wl > max_load) { 4210 if (wl > max_load) {
@@ -5325,7 +5533,7 @@ need_resched:
5325 preempt_disable(); 5533 preempt_disable();
5326 cpu = smp_processor_id(); 5534 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5535 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5536 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5537 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5538 switch_count = &prev->nivcsw;
5331 5539
@@ -5349,10 +5557,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5557 switch_count = &prev->nvcsw;
5350 } 5558 }
5351 5559
5352#ifdef CONFIG_SMP 5560 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5561
5357 if (unlikely(!rq->nr_running)) 5562 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5563 idle_balance(cpu, rq);
@@ -5378,6 +5583,8 @@ need_resched_nonpreemptible:
5378 } else 5583 } else
5379 spin_unlock_irq(&rq->lock); 5584 spin_unlock_irq(&rq->lock);
5380 5585
5586 post_schedule(rq);
5587
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5588 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5589 goto need_resched_nonpreemptible;
5383 5590
@@ -6123,17 +6330,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6330 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6331 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6332 struct rq *rq;
6333 int reset_on_fork;
6126 6334
6127 /* may grab non-irq protected spin_locks */ 6335 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6336 BUG_ON(in_interrupt());
6129recheck: 6337recheck:
6130 /* double check policy once rq lock held */ 6338 /* double check policy once rq lock held */
6131 if (policy < 0) 6339 if (policy < 0) {
6340 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6341 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6342 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6343 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6344 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6345
6346 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6347 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6348 policy != SCHED_IDLE)
6349 return -EINVAL;
6350 }
6351
6137 /* 6352 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6353 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6354 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6392,10 @@ recheck:
6177 /* can't change other user's priorities */ 6392 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6393 if (!check_same_owner(p))
6179 return -EPERM; 6394 return -EPERM;
6395
6396 /* Normal users shall not reset the sched_reset_on_fork flag */
6397 if (p->sched_reset_on_fork && !reset_on_fork)
6398 return -EPERM;
6180 } 6399 }
6181 6400
6182 if (user) { 6401 if (user) {
@@ -6220,6 +6439,8 @@ recheck:
6220 if (running) 6439 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6440 p->sched_class->put_prev_task(rq, p);
6222 6441
6442 p->sched_reset_on_fork = reset_on_fork;
6443
6223 oldprio = p->prio; 6444 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6445 __setscheduler(rq, p, policy, param->sched_priority);
6225 6446
@@ -6336,14 +6557,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6557 if (p) {
6337 retval = security_task_getscheduler(p); 6558 retval = security_task_getscheduler(p);
6338 if (!retval) 6559 if (!retval)
6339 retval = p->policy; 6560 retval = p->policy
6561 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6562 }
6341 read_unlock(&tasklist_lock); 6563 read_unlock(&tasklist_lock);
6342 return retval; 6564 return retval;
6343} 6565}
6344 6566
6345/** 6567/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6568 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6569 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6570 * @param: structure containing the RT priority.
6349 */ 6571 */
@@ -6571,19 +6793,9 @@ static inline int should_resched(void)
6571 6793
6572static void __cond_resched(void) 6794static void __cond_resched(void)
6573{ 6795{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6796 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6797 schedule();
6576#endif 6798 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6799}
6588 6800
6589int __sched _cond_resched(void) 6801int __sched _cond_resched(void)
@@ -6597,18 +6809,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6809EXPORT_SYMBOL(_cond_resched);
6598 6810
6599/* 6811/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6812 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6813 * call schedule, and on return reacquire the lock.
6602 * 6814 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6815 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6816 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6817 * spin_unlock(), once by hand).
6606 */ 6818 */
6607int cond_resched_lock(spinlock_t *lock) 6819int __cond_resched_lock(spinlock_t *lock)
6608{ 6820{
6609 int resched = should_resched(); 6821 int resched = should_resched();
6610 int ret = 0; 6822 int ret = 0;
6611 6823
6824 lockdep_assert_held(lock);
6825
6612 if (spin_needbreak(lock) || resched) { 6826 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6827 spin_unlock(lock);
6614 if (resched) 6828 if (resched)
@@ -6620,9 +6834,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6834 }
6621 return ret; 6835 return ret;
6622} 6836}
6623EXPORT_SYMBOL(cond_resched_lock); 6837EXPORT_SYMBOL(__cond_resched_lock);
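Renaming the out-of-line helper to __cond_resched_lock() (the unprefixed cond_resched_lock() presumably remains the caller-facing name via a wrapper in the header side of this change, not shown here) also lets it assert with lockdep_assert_held() that the caller really holds the lock it offers to drop. A hedged sketch of the usual calling pattern, with hypothetical types (struct my_cache and struct my_entry are invented):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_entry {
	struct list_head node;
};

struct my_cache {
	spinlock_t lock;
	struct list_head entries;
};

static void my_cache_drain(struct my_cache *c)
{
	struct my_entry *e;

	spin_lock(&c->lock);
	while (!list_empty(&c->entries)) {
		e = list_first_entry(&c->entries, struct my_entry, node);
		list_del(&e->node);
		kfree(e);			/* kfree() does not sleep, so the lock may stay held */
		/* May drop and retake c->lock; the list is re-checked each pass. */
		cond_resched_lock(&c->lock);
	}
	spin_unlock(&c->lock);
}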
6624 6838
6625int __sched cond_resched_softirq(void) 6839int __sched __cond_resched_softirq(void)
6626{ 6840{
6627 BUG_ON(!in_softirq()); 6841 BUG_ON(!in_softirq());
6628 6842
@@ -6634,7 +6848,7 @@ int __sched cond_resched_softirq(void)
6634 } 6848 }
6635 return 0; 6849 return 0;
6636} 6850}
6637EXPORT_SYMBOL(cond_resched_softirq); 6851EXPORT_SYMBOL(__cond_resched_softirq);
6638 6852
6639/** 6853/**
6640 * yield - yield the current processor to other threads. 6854 * yield - yield the current processor to other threads.
@@ -6658,11 +6872,13 @@ EXPORT_SYMBOL(yield);
6658 */ 6872 */
6659void __sched io_schedule(void) 6873void __sched io_schedule(void)
6660{ 6874{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6875 struct rq *rq = raw_rq();
6662 6876
6663 delayacct_blkio_start(); 6877 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6878 atomic_inc(&rq->nr_iowait);
6879 current->in_iowait = 1;
6665 schedule(); 6880 schedule();
6881 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6882 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6883 delayacct_blkio_end();
6668} 6884}
@@ -6670,12 +6886,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6886
6671long __sched io_schedule_timeout(long timeout) 6887long __sched io_schedule_timeout(long timeout)
6672{ 6888{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6889 struct rq *rq = raw_rq();
6674 long ret; 6890 long ret;
6675 6891
6676 delayacct_blkio_start(); 6892 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6893 atomic_inc(&rq->nr_iowait);
6894 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6895 ret = schedule_timeout(timeout);
6896 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6897 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6898 delayacct_blkio_end();
6681 return ret; 6899 return ret;
@@ -6992,8 +7210,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7210
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7211 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7212 /* Need help from migration thread: drop lock and wait. */
7213 struct task_struct *mt = rq->migration_thread;
7214
7215 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7216 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7217 wake_up_process(rq->migration_thread);
7218 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7219 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7220 tlb_migrate_finish(p->mm);
6999 return 0; 7221 return 0;
@@ -7051,6 +7273,11 @@ fail:
7051 return ret; 7273 return ret;
7052} 7274}
7053 7275
7276#define RCU_MIGRATION_IDLE 0
7277#define RCU_MIGRATION_NEED_QS 1
7278#define RCU_MIGRATION_GOT_QS 2
7279#define RCU_MIGRATION_MUST_SYNC 3
7280
7054/* 7281/*
7055 * migration_thread - this is a highprio system thread that performs 7282 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7283 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7285,7 @@ fail:
7058 */ 7285 */
7059static int migration_thread(void *data) 7286static int migration_thread(void *data)
7060{ 7287{
7288 int badcpu;
7061 int cpu = (long)data; 7289 int cpu = (long)data;
7062 struct rq *rq; 7290 struct rq *rq;
7063 7291
@@ -7092,8 +7320,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7320 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7321 list_del_init(head->next);
7094 7322
7095 spin_unlock(&rq->lock); 7323 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7324 spin_unlock(&rq->lock);
7325 __migrate_task(req->task, cpu, req->dest_cpu);
7326 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7327 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7328 spin_unlock(&rq->lock);
7329 } else {
7330 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7331 spin_unlock(&rq->lock);
7332 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7333 }
7097 local_irq_enable(); 7334 local_irq_enable();
7098 7335
7099 complete(&req->done); 7336 complete(&req->done);
@@ -7625,7 +7862,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7862 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7863 register_cpu_notifier(&migration_notifier);
7627 7864
7628 return err; 7865 return 0;
7629} 7866}
7630early_initcall(migration_init); 7867early_initcall(migration_init);
7631#endif 7868#endif
@@ -7672,7 +7909,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7909 break;
7673 } 7910 }
7674 7911
7675 if (!group->__cpu_power) { 7912 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7913 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7914 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7915 "set\n");
@@ -7696,9 +7933,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7933 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7934
7698 printk(KERN_CONT " %s", str); 7935 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7936 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7937 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7938 group->cpu_power);
7702 } 7939 }
7703 7940
7704 group = group->next; 7941 group = group->next;
@@ -7841,7 +8078,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 8078 rq->rd = rd;
7842 8079
7843 cpumask_set_cpu(rq->cpu, rd->span); 8080 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 8081 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 8082 set_rq_online(rq);
7846 8083
7847 spin_unlock_irqrestore(&rq->lock, flags); 8084 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7983,7 +8220,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8220 continue;
7984 8221
7985 cpumask_clear(sched_group_cpus(sg)); 8222 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8223 sg->cpu_power = 0;
7987 8224
7988 for_each_cpu(j, span) { 8225 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8226 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8328,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8328 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8329};
8093 8330
8331struct s_data {
8332#ifdef CONFIG_NUMA
8333 int sd_allnodes;
8334 cpumask_var_t domainspan;
8335 cpumask_var_t covered;
8336 cpumask_var_t notcovered;
8337#endif
8338 cpumask_var_t nodemask;
8339 cpumask_var_t this_sibling_map;
8340 cpumask_var_t this_core_map;
8341 cpumask_var_t send_covered;
8342 cpumask_var_t tmpmask;
8343 struct sched_group **sched_group_nodes;
8344 struct root_domain *rd;
8345};
8346
8347enum s_alloc {
8348 sa_sched_groups = 0,
8349 sa_rootdomain,
8350 sa_tmpmask,
8351 sa_send_covered,
8352 sa_this_core_map,
8353 sa_this_sibling_map,
8354 sa_nodemask,
8355 sa_sched_group_nodes,
8356#ifdef CONFIG_NUMA
8357 sa_notcovered,
8358 sa_covered,
8359 sa_domainspan,
8360#endif
8361 sa_none,
8362};
8363
8094/* 8364/*
8095 * SMT sched-domains: 8365 * SMT sched-domains:
8096 */ 8366 */
@@ -8208,11 +8478,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8478 continue;
8209 } 8479 }
8210 8480
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8481 sg->cpu_power += sd->groups->cpu_power;
8212 } 8482 }
8213 sg = sg->next; 8483 sg = sg->next;
8214 } while (sg != group_head); 8484 } while (sg != group_head);
8215} 8485}
8486
8487static int build_numa_sched_groups(struct s_data *d,
8488 const struct cpumask *cpu_map, int num)
8489{
8490 struct sched_domain *sd;
8491 struct sched_group *sg, *prev;
8492 int n, j;
8493
8494 cpumask_clear(d->covered);
8495 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8496 if (cpumask_empty(d->nodemask)) {
8497 d->sched_group_nodes[num] = NULL;
8498 goto out;
8499 }
8500
8501 sched_domain_node_span(num, d->domainspan);
8502 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8503
8504 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8505 GFP_KERNEL, num);
8506 if (!sg) {
8507 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8508 num);
8509 return -ENOMEM;
8510 }
8511 d->sched_group_nodes[num] = sg;
8512
8513 for_each_cpu(j, d->nodemask) {
8514 sd = &per_cpu(node_domains, j).sd;
8515 sd->groups = sg;
8516 }
8517
8518 sg->cpu_power = 0;
8519 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8520 sg->next = sg;
8521 cpumask_or(d->covered, d->covered, d->nodemask);
8522
8523 prev = sg;
8524 for (j = 0; j < nr_node_ids; j++) {
8525 n = (num + j) % nr_node_ids;
8526 cpumask_complement(d->notcovered, d->covered);
8527 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8528 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8529 if (cpumask_empty(d->tmpmask))
8530 break;
8531 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8532 if (cpumask_empty(d->tmpmask))
8533 continue;
8534 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8535 GFP_KERNEL, num);
8536 if (!sg) {
8537 printk(KERN_WARNING
8538 "Can not alloc domain group for node %d\n", j);
8539 return -ENOMEM;
8540 }
8541 sg->cpu_power = 0;
8542 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8543 sg->next = prev->next;
8544 cpumask_or(d->covered, d->covered, d->tmpmask);
8545 prev->next = sg;
8546 prev = sg;
8547 }
8548out:
8549 return 0;
8550}
8216#endif /* CONFIG_NUMA */ 8551#endif /* CONFIG_NUMA */
8217 8552
8218#ifdef CONFIG_NUMA 8553#ifdef CONFIG_NUMA
@@ -8266,15 +8601,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8601 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pickup more load compared to the group having 8602 * having more cpu_power will pickup more load compared to the group having
8268 * less cpu_power. 8603 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8604 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8605static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8606{
8276 struct sched_domain *child; 8607 struct sched_domain *child;
8277 struct sched_group *group; 8608 struct sched_group *group;
8609 long power;
8610 int weight;
8278 8611
8279 WARN_ON(!sd || !sd->groups); 8612 WARN_ON(!sd || !sd->groups);
8280 8613
@@ -8283,28 +8616,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8616
8284 child = sd->child; 8617 child = sd->child;
8285 8618
8286 sd->groups->__cpu_power = 0; 8619 sd->groups->cpu_power = 0;
8287 8620
8288 /* 8621 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8622 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8623 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8624 /*
8292 * can handle only one task, when there are other idle groups in the 8625 * SMT siblings share the power of a single core.
8293 * same sched domain. 8626 * Usually multiple threads get a better yield out of
8294 */ 8627 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8628 * reflect that in sd->smt_gain.
8296 (child->flags & 8629 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8630 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8631 power *= sd->smt_gain;
8632 power /= weight;
8633 power >>= SCHED_LOAD_SHIFT;
8634 }
8635 sd->groups->cpu_power += power;
8299 return; 8636 return;
8300 } 8637 }
8301 8638
8302 /* 8639 /*
8303 * add cpu_power of each child group to this groups cpu_power 8640 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8641 */
8305 group = child->groups; 8642 group = child->groups;
8306 do { 8643 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8644 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8645 group = group->next;
8309 } while (group != child->groups); 8646 } while (group != child->groups);
8310} 8647}
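[Editor's note] For a leaf SMT domain, the rewritten init_sched_groups_power() divides one core's worth of capacity among its hardware threads, boosted by sd->smt_gain, using fixed-point shifts. A small worked sketch of that arithmetic; the constants below (a load scale of 1024, i.e. shift 10, and an smt_gain of 1178) are illustrative assumptions, not values taken from this diff:

	#include <stdio.h>

	#define LOAD_SHIFT 10			/* assumed: like SCHED_LOAD_SHIFT */
	#define LOAD_SCALE (1L << LOAD_SHIFT)	/* assumed: like SCHED_LOAD_SCALE */

	int main(void)
	{
		long smt_gain = 1178;		/* assumed per-core gain (~1.15 * 1024) */
		int weight = 2;			/* two hardware threads in the domain */
		long power = LOAD_SCALE;

		/* siblings share one core: scale by the gain, split by thread count */
		power *= smt_gain;
		power /= weight;
		power >>= LOAD_SHIFT;		/* back into load-scale units */

		printf("per-thread power = %ld, per-core total ~ %ld\n",
		       power, power * weight);
		return 0;
	}

With those made-up numbers each sibling reports 589 units, so a two-thread core advertises roughly 1178 rather than a full 2048, which is the asymmetry the comment about SMT siblings sharing one core's power describes.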
@@ -8378,280 +8715,285 @@ static void set_domain_attribute(struct sched_domain *sd,
8378 } 8715 }
8379} 8716}
8380 8717
8381/* 8718static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8719 const struct cpumask *cpu_map)
8383 * to the individual cpus 8720{
8384 */ 8721 switch (what) {
8385static int __build_sched_domains(const struct cpumask *cpu_map, 8722 case sa_sched_groups:
8386 struct sched_domain_attr *attr) 8723 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8387{ 8724 d->sched_group_nodes = NULL;
8388 int i, err = -ENOMEM; 8725 case sa_rootdomain:
8389 struct root_domain *rd; 8726 free_rootdomain(d->rd); /* fall through */
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, 8727 case sa_tmpmask:
8391 tmpmask; 8728 free_cpumask_var(d->tmpmask); /* fall through */
8729 case sa_send_covered:
8730 free_cpumask_var(d->send_covered); /* fall through */
8731 case sa_this_core_map:
8732 free_cpumask_var(d->this_core_map); /* fall through */
8733 case sa_this_sibling_map:
8734 free_cpumask_var(d->this_sibling_map); /* fall through */
8735 case sa_nodemask:
8736 free_cpumask_var(d->nodemask); /* fall through */
8737 case sa_sched_group_nodes:
8392#ifdef CONFIG_NUMA 8738#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered; 8739 kfree(d->sched_group_nodes); /* fall through */
8394 struct sched_group **sched_group_nodes = NULL; 8740 case sa_notcovered:
8395 int sd_allnodes = 0; 8741 free_cpumask_var(d->notcovered); /* fall through */
8396 8742 case sa_covered:
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) 8743 free_cpumask_var(d->covered); /* fall through */
8398 goto out; 8744 case sa_domainspan:
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL)) 8745 free_cpumask_var(d->domainspan); /* fall through */
8400 goto free_domainspan; 8746#endif
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) 8747 case sa_none:
8402 goto free_covered; 8748 break;
8403#endif 8749 }
8404 8750}
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415 8751
8752static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8753 const struct cpumask *cpu_map)
8754{
8416#ifdef CONFIG_NUMA 8755#ifdef CONFIG_NUMA
8417 /* 8756 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8418 * Allocate the per-node list of sched groups 8757 return sa_none;
8419 */ 8758 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8759 return sa_domainspan;
8421 GFP_KERNEL); 8760 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8422 if (!sched_group_nodes) { 8761 return sa_covered;
8762 /* Allocate the per-node list of sched groups */
8763 d->sched_group_nodes = kcalloc(nr_node_ids,
8764 sizeof(struct sched_group *), GFP_KERNEL);
8765 if (!d->sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8766 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8767 return sa_notcovered;
8425 } 8768 }
8426#endif 8769 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8770#endif
8428 rd = alloc_rootdomain(); 8771 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8772 return sa_sched_group_nodes;
8773 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8774 return sa_nodemask;
8775 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8776 return sa_this_sibling_map;
8777 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8778 return sa_this_core_map;
8779 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8780 return sa_send_covered;
8781 d->rd = alloc_rootdomain();
8782 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8783 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8784 return sa_tmpmask;
8432 } 8785 }
8786 return sa_rootdomain;
8787}
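[Editor's note] __visit_domain_allocation_hell() returns how far the allocation sequence got as an enum s_alloc value, and __free_domain_allocs() frees exactly that much by switching on the state and deliberately falling through to the earlier cases. A compact sketch of the same staged-allocation/teardown pattern with hypothetical resources (not the scheduler's cpumasks):

	#include <stdio.h>
	#include <stdlib.h>

	enum stage { st_none, st_a, st_b, st_all };	/* how far allocation got */

	struct ctx { void *a; void *b; };

	/* Free everything allocated at or before 'reached', newest first. */
	static void undo(struct ctx *c, enum stage reached)
	{
		switch (reached) {
		case st_all:
		case st_b:
			free(c->b);		/* fall through */
		case st_a:
			free(c->a);		/* fall through */
		case st_none:
			break;
		}
	}

	/* Returns the last stage that was successfully allocated. */
	static enum stage setup(struct ctx *c)
	{
		if (!(c->a = malloc(64)))
			return st_none;		/* nothing to undo */
		if (!(c->b = malloc(64)))
			return st_a;		/* only 'a' needs freeing */
		return st_all;
	}

	int main(void)
	{
		struct ctx c = { 0 };
		enum stage got = setup(&c);

		if (got != st_all) {
			undo(&c, got);		/* free exactly what was allocated */
			return 1;
		}
		/* ... use the context ... */
		undo(&c, st_all);
		return 0;
	}

The single error label in __build_sched_domains() can then hand the recorded state straight to the teardown function, which is what replaces the ladder of free_* goto targets removed later in this diff.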
8433 8788
8789static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8790 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8791{
8792 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8793#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8794 struct sched_domain *parent;
8436#endif
8437
8438 /*
8439 * Set up domains for cpus specified by the cpu_map.
8440 */
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458 8795
8459 sd = &per_cpu(node_domains, i).sd; 8796 d->sd_allnodes = 0;
8460 SD_INIT(sd, NODE); 8797 if (cpumask_weight(cpu_map) >
8798 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8799 sd = &per_cpu(allnodes_domains, i).sd;
8800 SD_INIT(sd, ALLNODES);
8461 set_domain_attribute(sd, attr); 8801 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8802 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8803 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8804 d->sd_allnodes = 1;
8465 p->child = sd; 8805 }
8466 cpumask_and(sched_domain_span(sd), 8806 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8807
8808 sd = &per_cpu(node_domains, i).sd;
8809 SD_INIT(sd, NODE);
8810 set_domain_attribute(sd, attr);
8811 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8812 sd->parent = parent;
8813 if (parent)
8814 parent->child = sd;
8815 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8816#endif
8817 return sd;
8818}
8469 8819
8470 p = sd; 8820static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8821 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8822 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8823{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8824 struct sched_domain *sd;
8475 sd->parent = p; 8825 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8826 SD_INIT(sd, CPU);
8477 p->child = sd; 8827 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8828 cpumask_copy(sched_domain_span(sd), d->nodemask);
8829 sd->parent = parent;
8830 if (parent)
8831 parent->child = sd;
8832 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8833 return sd;
8834}
8479 8835
8836static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8837 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8838 struct sched_domain *parent, int i)
8839{
8840 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8841#ifdef CONFIG_SCHED_MC
8481 p = sd; 8842 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8843 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8844 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8845 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8846 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8847 parent->child = sd;
8487 sd->parent = p; 8848 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8849#endif
8850 return sd;
8851}
8491 8852
8853static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8854 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8855 struct sched_domain *parent, int i)
8856{
8857 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8858#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8859 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8860 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8861 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8862 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8863 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8864 parent->child = sd;
8499 sd->parent = p; 8865 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8866#endif
8503 } 8867 return sd;
8868}
8504 8869
8870static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8871 const struct cpumask *cpu_map, int cpu)
8872{
8873 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8874#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8875 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8876 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8877 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8878 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8879 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8880 &cpu_to_cpu_group,
8512 8881 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8882 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8883#endif
8518
8519#ifdef CONFIG_SCHED_MC 8884#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8885 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8886 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8887 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8888 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8889 &cpu_to_core_group,
8525 8890 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8891 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8892#endif
8531 8893 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8894 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8895 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8896 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8897 &cpu_to_phys_group,
8536 continue; 8898 d->send_covered, d->tmpmask);
8537 8899 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8900#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8901 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8902 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8903 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8904 break;
8548 send_covered, tmpmask); 8905#endif
8906 default:
8907 break;
8549 } 8908 }
8909}
8550 8910
8551 for (i = 0; i < nr_node_ids; i++) { 8911/*
8552 /* Set up node groups */ 8912 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8913 * to the individual cpus
8554 int j; 8914 */
8555 8915static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8916 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8917{
8558 if (cpumask_empty(nodemask)) { 8918 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8919 struct s_data d;
8560 continue; 8920 struct sched_domain *sd;
8561 } 8921 int i;
8922#ifdef CONFIG_NUMA
8923 d.sd_allnodes = 0;
8924#endif
8562 8925
8563 sched_domain_node_span(i, domainspan); 8926 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8927 if (alloc_state != sa_rootdomain)
8928 goto error;
8929 alloc_state = sa_sched_groups;
8565 8930
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8931 /*
8567 GFP_KERNEL, i); 8932 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8933 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8934 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8935 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8936 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8937
8577 sd = &per_cpu(node_domains, j).sd; 8938 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8939 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8940 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8941 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8942 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8943
8586 for (j = 0; j < nr_node_ids; j++) { 8944 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8945 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8946 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8947 }
8588 8948
8589 cpumask_complement(notcovered, covered); 8949 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8950 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8951 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8952
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8953#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8954 /* Set up node groups */
8597 continue; 8955 if (d.sd_allnodes)
8956 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8957
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8958 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8959 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8960 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8961#endif
8616 8962
8617 /* Calculate CPU power for physical packages and nodes */ 8963 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8964#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8965 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8966 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8967 init_sched_groups_power(i, sd);
8623 } 8968 }
8624#endif 8969#endif
8625#ifdef CONFIG_SCHED_MC 8970#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8971 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8972 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8973 init_sched_groups_power(i, sd);
8630 } 8974 }
8631#endif 8975#endif
8632 8976
8633 for_each_cpu(i, cpu_map) { 8977 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8978 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8979 init_sched_groups_power(i, sd);
8637 } 8980 }
8638 8981
8639#ifdef CONFIG_NUMA 8982#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8983 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8984 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8985
8643 if (sd_allnodes) { 8986 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8987 struct sched_group *sg;
8645 8988
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8989 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8990 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8991 init_numa_sched_groups_power(sg);
8649 } 8992 }
8650#endif 8993#endif
8651 8994
8652 /* Attach the domains */ 8995 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8996 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8997#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8998 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8999#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +9001,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 9001#else
8660 sd = &per_cpu(phys_domains, i).sd; 9002 sd = &per_cpu(phys_domains, i).sd;
8661#endif 9003#endif
8662 cpu_attach_domain(sd, rd, i); 9004 cpu_attach_domain(sd, d.rd, i);
8663 } 9005 }
8664 9006
8665 err = 0; 9007 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 9008 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 9009 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 9010
8694#ifdef CONFIG_NUMA
8695error: 9011error:
8696 free_sched_groups(cpu_map, tmpmask); 9012 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 9013 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 9014}
8701 9015
8702static int build_sched_domains(const struct cpumask *cpu_map) 9016static int build_sched_domains(const struct cpumask *cpu_map)
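[Editor's note] After the refactor, __build_sched_domains() builds each CPU's hierarchy by chaining small helpers: each one hangs a new level under the parent it was handed and returns that level as the parent for the next call (sd = __build_numa_...; sd = __build_cpu_...; and so on). A minimal sketch of that "builder returns the new parent" idiom with hypothetical level names:

	#include <stdio.h>
	#include <stdlib.h>

	struct domain {
		const char *name;
		struct domain *parent, *child;
	};

	/* Hang one level below 'parent' and return the new level. */
	static struct domain *build_level(const char *name, struct domain *parent)
	{
		struct domain *d = calloc(1, sizeof(*d));

		if (!d)
			return parent;		/* keep the chain usable on failure */
		d->name = name;
		d->parent = parent;
		if (parent)
			parent->child = d;
		return d;
	}

	int main(void)
	{
		struct domain *sd = NULL;

		/* same shape as: node -> cpu -> mc -> smt for one CPU */
		sd = build_level("node", sd);
		sd = build_level("cpu", sd);
		sd = build_level("mc", sd);
		sd = build_level("smt", sd);

		for (; sd; sd = sd->parent)	/* walk from the leaf back up */
			printf("%s\n", sd->name);
		return 0;
	}

Levels that are compiled out (no CONFIG_SCHED_MC, say) simply return the parent unchanged, which is why the kernel helpers default to "sd = parent" inside their #ifdef guards.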
@@ -9304,11 +9618,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9618 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9619 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9620 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9621 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9622 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9623 */
9310 init_tg_cfs_entry(&init_task_group, 9624 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9625 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9626 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9627 root_task_group.se[i]);
9314 9628
@@ -9334,6 +9648,7 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9648#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9649 rq->sd = NULL;
9336 rq->rd = NULL; 9650 rq->rd = NULL;
9651 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9652 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9653 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9654 rq->push_cpu = 0;
@@ -9398,13 +9713,20 @@ void __init sched_init(void)
9398} 9713}
9399 9714
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9715#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9716static inline int preempt_count_equals(int preempt_offset)
9717{
9718 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9719
9720 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9721}
9722
9723void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9724{
9403#ifdef in_atomic 9725#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9726 static unsigned long prev_jiffy; /* ratelimiting */
9405 9727
9406 if ((!in_atomic() && !irqs_disabled()) || 9728 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9729 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9730 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9731 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9732 return;
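[Editor's note] preempt_count_equals() masks PREEMPT_ACTIVE out of the preemption counter before comparing against the expected offset, so __might_sleep() tolerates callers that are mid-preemption. A tiny sketch of that "mask the status bit, then compare the counter" check; the bit position and base value here are illustrative assumptions, not the kernel's layout:

	#include <stdio.h>

	#define FLAG_ACTIVE  (1u << 28)	/* assumed: a status bit, like PREEMPT_ACTIVE */
	#define COUNT_BASE   0u		/* assumed: like PREEMPT_INATOMIC_BASE */

	/* True if the counter, ignoring the status bit, equals base + offset. */
	static int count_equals(unsigned int count, unsigned int offset)
	{
		unsigned int nested = count & ~FLAG_ACTIVE;

		return nested == COUNT_BASE + offset;
	}

	int main(void)
	{
		/* one nesting level plus the status bit still matches offset 1 */
		printf("%d\n", count_equals(1u | FLAG_ACTIVE, 1));
		/* two nesting levels do not match offset 1 */
		printf("%d\n", count_equals(2u, 1));
		return 0;
	}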
@@ -10581,3 +10903,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10903 .subsys_id = cpuacct_subsys_id,
10582}; 10904};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10905#endif /* CONFIG_CGROUP_CPUACCT */
10906
10907#ifndef CONFIG_SMP
10908
10909int rcu_expedited_torture_stats(char *page)
10910{
10911 return 0;
10912}
10913EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10914
10915void synchronize_sched_expedited(void)
10916{
10917}
10918EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10919
10920#else /* #ifndef CONFIG_SMP */
10921
10922static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10923static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10924
10925#define RCU_EXPEDITED_STATE_POST -2
10926#define RCU_EXPEDITED_STATE_IDLE -1
10927
10928static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10929
10930int rcu_expedited_torture_stats(char *page)
10931{
10932 int cnt = 0;
10933 int cpu;
10934
10935 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10936 for_each_online_cpu(cpu) {
10937 cnt += sprintf(&page[cnt], " %d:%d",
10938 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10939 }
10940 cnt += sprintf(&page[cnt], "\n");
10941 return cnt;
10942}
10943EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10944
10945static long synchronize_sched_expedited_count;
10946
10947/*
10948 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10949 * approach to force grace period to end quickly. This consumes
10950 * significant time on all CPUs, and is thus not recommended for
10951 * any sort of common-case code.
10952 *
10953 * Note that it is illegal to call this function while holding any
10954 * lock that is acquired by a CPU-hotplug notifier. Failing to
10955 * observe this restriction will result in deadlock.
10956 */
10957void synchronize_sched_expedited(void)
10958{
10959 int cpu;
10960 unsigned long flags;
10961 bool need_full_sync = 0;
10962 struct rq *rq;
10963 struct migration_req *req;
10964 long snap;
10965 int trycount = 0;
10966
10967 smp_mb(); /* ensure prior mod happens before capturing snap. */
10968 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10969 get_online_cpus();
10970 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10971 put_online_cpus();
10972 if (trycount++ < 10)
10973 udelay(trycount * num_online_cpus());
10974 else {
10975 synchronize_sched();
10976 return;
10977 }
10978 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10979 smp_mb(); /* ensure test happens before caller kfree */
10980 return;
10981 }
10982 get_online_cpus();
10983 }
10984 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10985 for_each_online_cpu(cpu) {
10986 rq = cpu_rq(cpu);
10987 req = &per_cpu(rcu_migration_req, cpu);
10988 init_completion(&req->done);
10989 req->task = NULL;
10990 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10991 spin_lock_irqsave(&rq->lock, flags);
10992 list_add(&req->list, &rq->migration_queue);
10993 spin_unlock_irqrestore(&rq->lock, flags);
10994 wake_up_process(rq->migration_thread);
10995 }
10996 for_each_online_cpu(cpu) {
10997 rcu_expedited_state = cpu;
10998 req = &per_cpu(rcu_migration_req, cpu);
10999 rq = cpu_rq(cpu);
11000 wait_for_completion(&req->done);
11001 spin_lock_irqsave(&rq->lock, flags);
11002 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
11003 need_full_sync = 1;
11004 req->dest_cpu = RCU_MIGRATION_IDLE;
11005 spin_unlock_irqrestore(&rq->lock, flags);
11006 }
11007 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
11008 mutex_unlock(&rcu_sched_expedited_mutex);
11009 put_online_cpus();
11010 if (need_full_sync)
11011 synchronize_sched();
11012}
11013EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
11014
11015#endif /* #else #ifndef CONFIG_SMP */
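[Editor's note] The SMP synchronize_sched_expedited() above combines two tricks: snapshot a completion counter before trying the mutex, and if the mutex is contended, check whether enough passes have finished since the snapshot that some other caller's pass necessarily covers ours, in which case we can return without doing any work. A standalone POSIX-threads sketch of that counter-snapshot idea; the names, the backoff policy, and the omission of memory barriers are simplifications for illustration, not the kernel's code:

	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>

	static pthread_mutex_t expedite_lock = PTHREAD_MUTEX_INITIALIZER;
	static volatile long pass_count;	/* bumped once per completed pass */

	/*
	 * Heavyweight flush.  If the lock is busy, another caller is already
	 * running a pass; only a pass that *started* after our snapshot is
	 * guaranteed to cover our updates.
	 */
	static void expedited_sync(void)
	{
		long snap = pass_count;

		while (pthread_mutex_trylock(&expedite_lock) != 0) {
			/*
			 * +1 could be a pass already mid-flight when we took the
			 * snapshot; +2 means a whole pass began after it.
			 */
			if (pass_count - snap >= 2)
				return;
			sched_yield();		/* back off, then retry the lock */
		}
		/* ... the expensive per-CPU work would go here ... */
		pass_count++;
		pthread_mutex_unlock(&expedite_lock);
	}

	int main(void)
	{
		expedited_sync();
		printf("passes completed: %ld\n", pass_count);
		return 0;
	}

The kernel version additionally falls back to a plain synchronize_sched() after a bounded number of retries, and fences the counter accesses with smp_mb(), as the comments in the hunk note.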
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index d014efbf947a..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
127 127
128 /* 128 /*
129 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
130 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
131 */ 134 */
132 if (likely(oldpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
134
135 spin_lock_irqsave(&vec->lock, flags);
136
137 vec->count--;
138 if (!vec->count)
139 clear_bit(oldpri, cp->pri_active);
140 cpumask_clear_cpu(cpu, vec->mask);
141
142 spin_unlock_irqrestore(&vec->lock, flags);
143 }
144
145 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
146 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
147 137
@@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
154 144
155 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
156 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
157 159
158 *currpri = newpri; 160 *currpri = newpri;
159} 161}
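[Editor's note] The cpupri_set() reorder above is about visibility: the CPU is added to the vector for its new priority first and only then removed from the old one, so a concurrent push/pull lookup never finds it missing from both. A small sketch of that "publish the new entry before retiring the old" ordering using two hypothetical bit sets (locking omitted; the kernel takes the per-vector spinlocks around each step):

	#include <stdio.h>

	#define NPRI 4

	static unsigned long pri_mask[NPRI];	/* one bit per CPU, per priority */

	static void set_pri(int cpu, int oldpri, int newpri)
	{
		/* 1. make the CPU visible at its new priority ... */
		if (newpri >= 0)
			pri_mask[newpri] |= 1UL << cpu;
		/* 2. ... and only then hide it at the old one. */
		if (oldpri >= 0)
			pri_mask[oldpri] &= ~(1UL << cpu);
	}

	int main(void)
	{
		set_pri(/*cpu*/ 2, /*oldpri*/ -1, /*newpri*/ 1);
		set_pri(2, 1, 3);
		for (int p = 0; p < NPRI; p++)
			printf("pri %d: %#lx\n", p, pri_mask[p]);
		return 0;
	}

With the old order there is a window in which a searcher sees the CPU in neither vector and a push or pull can be missed, which is exactly what the new comment in the hunk warns about.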
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..5ddbd0891267 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 409 PN(se.wait_max);
410 PN(se.wait_sum); 410 PN(se.wait_sum);
411 P(se.wait_count); 411 P(se.wait_count);
412 PN(se.iowait_sum);
413 P(se.iowait_count);
412 P(sched_info.bkl_count); 414 P(sched_info.bkl_count);
413 P(se.nr_migrations); 415 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 416 P(se.nr_migrations_cold);
@@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 481 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 482 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 483 p->se.wait_count = 0;
484 p->se.iowait_sum = 0;
485 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 486 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 487 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 488 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 652e8bdef9aa..aa7f84121016 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
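[Editor's note] The new defaults preserve the ratio that the context line above relies on: sched_nr_latency stays sysctl_sched_latency / sysctl_sched_min_granularity, and 5 ms / 1 ms is still 5, just as 20 ms / 4 ms was. A one-line check of that arithmetic:

	#include <stdio.h>

	int main(void)
	{
		unsigned int latency = 5000000u;	/* 5 ms in ns, new default */
		unsigned int min_gran = 1000000u;	/* 1 ms in ns, new default */

		printf("sched_nr_latency = %u\n", latency / min_gran);	/* prints 5 */
		return 0;
	}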
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
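[Editor's note] The group-scheduling variant of task_of() now warns (under CONFIG_SCHED_DEBUG) when a group entity is passed where a task entity is expected, before doing the usual container_of() back-cast to the enclosing task. A self-contained sketch of that container_of-plus-assertion pattern with hypothetical types:

	#include <assert.h>
	#include <stddef.h>
	#include <stdio.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct entity {
		int my_q;			/* non-zero would mean "owns a runqueue" */
	};

	struct task {
		int pid;
		struct entity se;		/* embedded scheduling entity */
	};

	/* An entity is a task if it does not own a runqueue. */
	#define entity_is_task(e) (!(e)->my_q)

	static struct task *task_of(struct entity *e)
	{
		assert(entity_is_task(e));	/* debug check, like WARN_ON_ONCE */
		return container_of(e, struct task, se);
	}

	int main(void)
	{
		struct task t = { .pid = 42, .se = { .my_q = 0 } };

		printf("pid = %d\n", task_of(&t.se)->pid);
		return 0;
	}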
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
537 schedstat_set(se->wait_count, se->wait_count + 1); 545 schedstat_set(se->wait_count, se->wait_count + 1);
538 schedstat_set(se->wait_sum, se->wait_sum + 546 schedstat_set(se->wait_sum, se->wait_sum +
539 rq_of(cfs_rq)->clock - se->wait_start); 547 rq_of(cfs_rq)->clock - se->wait_start);
548#ifdef CONFIG_SCHEDSTATS
549 if (entity_is_task(se)) {
550 trace_sched_stat_wait(task_of(se),
551 rq_of(cfs_rq)->clock - se->wait_start);
552 }
553#endif
540 schedstat_set(se->wait_start, 0); 554 schedstat_set(se->wait_start, 0);
541} 555}
542 556
@@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
628 se->sleep_start = 0; 642 se->sleep_start = 0;
629 se->sum_sleep_runtime += delta; 643 se->sum_sleep_runtime += delta;
630 644
631 if (tsk) 645 if (tsk) {
632 account_scheduler_latency(tsk, delta >> 10, 1); 646 account_scheduler_latency(tsk, delta >> 10, 1);
647 trace_sched_stat_sleep(tsk, delta);
648 }
633 } 649 }
634 if (se->block_start) { 650 if (se->block_start) {
635 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 651 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
@@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
644 se->sum_sleep_runtime += delta; 660 se->sum_sleep_runtime += delta;
645 661
646 if (tsk) { 662 if (tsk) {
663 if (tsk->in_iowait) {
664 se->iowait_sum += delta;
665 se->iowait_count++;
666 trace_sched_stat_iowait(tsk, delta);
667 }
668
647 /* 669 /*
648 * Blocking time is in units of nanosecs, so shift by 670 * Blocking time is in units of nanosecs, so shift by
649 * 20 to get a milliseconds-range estimation of the 671 * 20 to get a milliseconds-range estimation of the
@@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
705 727
706 vruntime -= thresh; 728 vruntime -= thresh;
707 } 729 }
708
709 /* ensure we never gain time by being placed backwards. */
710 vruntime = max_vruntime(se->vruntime, vruntime);
711 } 730 }
712 731
732 /* ensure we never gain time by being placed backwards. */
733 vruntime = max_vruntime(se->vruntime, vruntime);
734
713 se->vruntime = vruntime; 735 se->vruntime = vruntime;
714} 736}
715 737
@@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq)
1046 * search starts with cpus closest then further out as needed, 1068 * search starts with cpus closest then further out as needed,
1047 * so we always favor a closer, idle cpu. 1069 * so we always favor a closer, idle cpu.
1048 * Domains may include CPUs that are not usable for migration, 1070 * Domains may include CPUs that are not usable for migration,
1049 * hence we need to mask them out (cpu_active_mask) 1071 * hence we need to mask them out (rq->rd->online)
1050 * 1072 *
1051 * Returns the CPU we should wake onto. 1073 * Returns the CPU we should wake onto.
1052 */ 1074 */
1053#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1075#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1054static int wake_idle(int cpu, struct task_struct *p) 1079static int wake_idle(int cpu, struct task_struct *p)
1055{ 1080{
1056 struct sched_domain *sd; 1081 struct sched_domain *sd;
1057 int i; 1082 int i;
1058 unsigned int chosen_wakeup_cpu; 1083 unsigned int chosen_wakeup_cpu;
1059 int this_cpu; 1084 int this_cpu;
1085 struct rq *task_rq = task_rq(p);
1060 1086
1061 /* 1087 /*
1062 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu 1088 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
@@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p)
1089 for_each_domain(cpu, sd) { 1115 for_each_domain(cpu, sd) {
1090 if ((sd->flags & SD_WAKE_IDLE) 1116 if ((sd->flags & SD_WAKE_IDLE)
1091 || ((sd->flags & SD_WAKE_IDLE_FAR) 1117 || ((sd->flags & SD_WAKE_IDLE_FAR)
1092 && !task_hot(p, task_rq(p)->clock, sd))) { 1118 && !task_hot(p, task_rq->clock, sd))) {
1093 for_each_cpu_and(i, sched_domain_span(sd), 1119 for_each_cpu_and(i, sched_domain_span(sd),
1094 &p->cpus_allowed) { 1120 &p->cpus_allowed) {
1095 if (cpu_active(i) && idle_cpu(i)) { 1121 if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1096 if (i != task_cpu(p)) { 1122 if (i != task_cpu(p)) {
1097 schedstat_inc(p, 1123 schedstat_inc(p,
1098 se.nr_wakeups_idle); 1124 se.nr_wakeups_idle);
@@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1235 tg = task_group(p); 1261 tg = task_group(p);
1236 weight = p->se.load.weight; 1262 weight = p->se.load.weight;
1237 1263
1238 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1264 /*
1265 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1266 * due to the sync cause above having dropped tl to 0, we'll always have
1267 * an imbalance, but there's really nothing you can do about that, so
1268 * that's good too.
1269 *
1270 * Otherwise check if either cpus are near enough in load to allow this
1271 * task to be woken on this_cpu.
1272 */
1273 balanced = !tl ||
1274 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1239 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1275 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1240 1276
1241 /* 1277 /*
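[Editor's note] The reworked wake_affine() condition treats tl == 0 (this_cpu effectively idle after the sync discount) as automatically balanced, and otherwise keeps the percentage test: wake on this_cpu when 100 * (tl + this_effect) <= imbalance * (load + prev_effect). A tiny worked example of that inequality; all the load numbers below are made up for illustration, not measurements or kernel defaults:

	#include <stdio.h>

	int main(void)
	{
		unsigned long tl = 1024;	/* assumed load on this_cpu */
		unsigned long load = 2048;	/* assumed load on prev_cpu */
		unsigned long this_eff = 512;	/* assumed effective_load() on this_cpu */
		unsigned long prev_eff = 0;	/* assumed effective_load() on prev_cpu */
		unsigned int imbalance = 125;	/* assumed imbalance percentage */

		int balanced = !tl ||
			100 * (tl + this_eff) <= imbalance * (load + prev_eff);

		printf("balanced = %d\n", balanced);	/* 153600 <= 256000 -> 1 */
		return 0;
	}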
@@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1278 this_rq = cpu_rq(this_cpu); 1314 this_rq = cpu_rq(this_cpu);
1279 new_cpu = prev_cpu; 1315 new_cpu = prev_cpu;
1280 1316
1281 if (prev_cpu == this_cpu)
1282 goto out;
1283 /* 1317 /*
1284 * 'this_sd' is the first domain that both 1318 * 'this_sd' is the first domain that both
1285 * this_cpu and prev_cpu are present in: 1319 * this_cpu and prev_cpu are present in:
@@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1721 sched_info_queued(p); 1755 sched_info_queued(p);
1722 1756
1723 update_curr(cfs_rq); 1757 update_curr(cfs_rq);
1758 if (curr)
1759 se->vruntime = curr->vruntime;
1724 place_entity(cfs_rq, se, 1); 1760 place_entity(cfs_rq, se, 1);
1725 1761
1726 /* 'curr' will be NULL if the child belongs to a different group */ 1762 /* 'curr' will be NULL if the child belongs to a different group */
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..e2dc63a5815d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,4 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 0)
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 2SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 3SCHED_FEAT(ADAPTIVE_GRAN, 1)
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 4SCHED_FEAT(WAKEUP_PREEMPT, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 3918e01994e0..2eb4bd6a526c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,15 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
14
15static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
16{ 19{
17 return rt_rq->rq; 20 return rt_rq->rq;
@@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
26 29
27#define rt_entity_is_task(rt_se) (1) 30#define rt_entity_is_task(rt_se) (1)
28 31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
29static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
30{ 38{
31 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -128,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
128 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
129} 137}
130 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
131#else 144#else
132 145
133static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -602,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
602 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
603 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
604 617
618 sched_rt_avg_update(rq, delta_exec);
619
605 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
606 return; 621 return;
607 622
@@ -874,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
874 889
875 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
876 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
877
878 inc_cpu_load(rq, p->se.load.weight);
879} 892}
880 893
881static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -886,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
886 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
887 900
888 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
889
890 dec_cpu_load(rq, p->se.load.weight);
891} 902}
892 903
893/* 904/*
@@ -1064,6 +1075,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1064 if (p) 1075 if (p)
1065 dequeue_pushable_task(rq, p); 1076 dequeue_pushable_task(rq, p);
1066 1077
1078#ifdef CONFIG_SMP
1079 /*
1080 * We detect this state here so that we can avoid taking the RQ
1081 * lock again later if there is no need to push
1082 */
1083 rq->post_schedule = has_pushable_tasks(rq);
1084#endif
1085
1067 return p; 1086 return p;
1068} 1087}
1069 1088
@@ -1162,13 +1181,6 @@ static int find_lowest_rq(struct task_struct *task)
1162 return -1; /* No targets found */ 1181 return -1; /* No targets found */
1163 1182
1164 /* 1183 /*
1165 * Only consider CPUs that are usable for migration.
1166 * I guess we might want to change cpupri_find() to ignore those
1167 * in the first place.
1168 */
1169 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1170
1171 /*
1172 * At this point we have built a mask of cpus representing the 1184 * At this point we have built a mask of cpus representing the
1173 * lowest priority tasks in the system. Now we want to elect 1185 * lowest priority tasks in the system. Now we want to elect
1174 * the best one based on our affinity and topology. 1186 * the best one based on our affinity and topology.
@@ -1262,11 +1274,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1262 return lowest_rq; 1274 return lowest_rq;
1263} 1275}
1264 1276
1265static inline int has_pushable_tasks(struct rq *rq)
1266{
1267 return !plist_head_empty(&rq->rt.pushable_tasks);
1268}
1269
1270static struct task_struct *pick_next_pushable_task(struct rq *rq) 1277static struct task_struct *pick_next_pushable_task(struct rq *rq)
1271{ 1278{
1272 struct task_struct *p; 1279 struct task_struct *p;
@@ -1466,23 +1473,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1466 pull_rt_task(rq); 1473 pull_rt_task(rq);
1467} 1474}
1468 1475
1469/*
1470 * assumes rq->lock is held
1471 */
1472static int needs_post_schedule_rt(struct rq *rq)
1473{
1474 return has_pushable_tasks(rq);
1475}
1476
1477static void post_schedule_rt(struct rq *rq) 1476static void post_schedule_rt(struct rq *rq)
1478{ 1477{
1479 /*
1480 * This is only called if needs_post_schedule_rt() indicates that
1481 * we need to push tasks away
1482 */
1483 spin_lock_irq(&rq->lock);
1484 push_rt_tasks(rq); 1478 push_rt_tasks(rq);
1485 spin_unlock_irq(&rq->lock);
1486} 1479}
1487 1480
1488/* 1481/*
@@ -1758,7 +1751,6 @@ static const struct sched_class rt_sched_class = {
1758 .rq_online = rq_online_rt, 1751 .rq_online = rq_online_rt,
1759 .rq_offline = rq_offline_rt, 1752 .rq_offline = rq_offline_rt,
1760 .pre_schedule = pre_schedule_rt, 1753 .pre_schedule = pre_schedule_rt,
1761 .needs_post_schedule = needs_post_schedule_rt,
1762 .post_schedule = post_schedule_rt, 1754 .post_schedule = post_schedule_rt,
1763 .task_wake_up = task_wake_up_rt, 1755 .task_wake_up = task_wake_up_rt,
1764 .switched_from = switched_from_rt, 1756 .switched_from = switched_from_rt,
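[Editor's note] The RT class no longer re-checks for pushable tasks in a separate needs_post_schedule hook: pick_next_task_rt() records the answer in rq->post_schedule while the runqueue lock is already held, and post_schedule_rt() just pushes when asked. A small sketch of that "capture the decision under the lock, act on the flag later" shape; the types and locking are simplified stand-ins, not the scheduler's:

	#include <pthread.h>
	#include <stdio.h>

	struct runqueue {
		pthread_mutex_t lock;
		int pushable;			/* work queued that may need pushing */
		int post_schedule;		/* decision captured under the lock */
	};

	static void pick_next(struct runqueue *rq)
	{
		pthread_mutex_lock(&rq->lock);
		/* ... choose the next task ... */
		rq->post_schedule = rq->pushable;	/* remember while locked */
		pthread_mutex_unlock(&rq->lock);
	}

	static void finish_schedule(struct runqueue *rq)
	{
		if (!rq->post_schedule)
			return;			/* avoid retaking the lock for nothing */
		pthread_mutex_lock(&rq->lock);
		/* ... push tasks to other runqueues ... */
		rq->post_schedule = 0;
		pthread_mutex_unlock(&rq->lock);
	}

	int main(void)
	{
		struct runqueue rq = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.pushable = 1,
		};

		pick_next(&rq);
		finish_schedule(&rq);
		printf("post_schedule now %d\n", rq.post_schedule);
		return 0;
	}

That is also why this diff adds rq->post_schedule = 0 to sched_init() and drops the explicit lock/unlock pair from post_schedule_rt(): the flag is consulted by the core scheduler, which already owns the locking.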
diff --git a/kernel/softirq.c b/kernel/softirq.c
index eb5e131a0485..7db25067cd2d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -721,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
721 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
722 cond_resched(); 722 cond_resched();
723 preempt_disable(); 723 preempt_disable();
724 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
725 } 725 }
726 preempt_enable(); 726 preempt_enable();
727 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
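[Editor's note] The spinlock.c rewrite wraps every out-of-line lock function in #ifndef _name, so a configuration that pre-defines the symbol (for example to use an inlined variant) suppresses the generic body, while everyone else keeps the common implementation built on the new __spin_lock()-style inlines. A minimal sketch of that compile-time override pattern, with hypothetical names:

	#include <stdio.h>

	/* A header could "#define my_lock(l) __my_lock_generic(l)" to inline it
	 * and skip the out-of-line definition below. */
	static inline void __my_lock_generic(int *lock)
	{
		*lock = 1;			/* stand-in for the real locking work */
	}

	#ifndef my_lock
	void my_lock(int *lock)
	{
		__my_lock_generic(lock);	/* generic out-of-line wrapper */
	}
	#endif

	int main(void)
	{
		int lock = 0;

		my_lock(&lock);
		printf("locked = %d\n", lock);
		return 0;
	}

Because both variants end up calling the same inline helper, the behaviour is identical either way; only whether the call is a function call or inlined code changes.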
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
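Illustration (not part of the patch): the unlock side follows the same pattern, so the corresponding inline helpers are presumed to mirror the removed bodies, for example:

/*
 * Sketch only: assumed inline counterparts for two of the unlock paths,
 * mirroring the bodies removed above.
 */
static inline void __spin_unlock_bh(spinlock_t *lock)
{
        spin_release(&lock->dep_map, 1, _RET_IP_);
        _raw_spin_unlock(lock);
        preempt_enable_no_resched();
        local_bh_enable_ip((unsigned long)__builtin_return_address(0));
}

static inline void __read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
{
        rwlock_release(&lock->dep_map, 1, _RET_IP_);
        _raw_read_unlock(lock);
        local_irq_restore(flags);
        preempt_enable();
}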
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 58be76017fd0..3125cff1c570 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,7 +49,6 @@
49#include <linux/acpi.h> 49#include <linux/acpi.h>
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/security.h>
53#include <linux/slow-work.h> 52#include <linux/slow-work.h>
54#include <linux/perf_counter.h> 53#include <linux/perf_counter.h>
55 54
@@ -246,6 +245,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
246#endif 245#endif
247 246
248static struct ctl_table kern_table[] = { 247static struct ctl_table kern_table[] = {
248 {
249 .ctl_name = CTL_UNNUMBERED,
250 .procname = "sched_child_runs_first",
251 .data = &sysctl_sched_child_runs_first,
252 .maxlen = sizeof(unsigned int),
253 .mode = 0644,
254 .proc_handler = &proc_dointvec,
255 },
249#ifdef CONFIG_SCHED_DEBUG 256#ifdef CONFIG_SCHED_DEBUG
250 { 257 {
251 .ctl_name = CTL_UNNUMBERED, 258 .ctl_name = CTL_UNNUMBERED,
@@ -300,14 +307,6 @@ static struct ctl_table kern_table[] = {
300 }, 307 },
301 { 308 {
302 .ctl_name = CTL_UNNUMBERED, 309 .ctl_name = CTL_UNNUMBERED,
303 .procname = "sched_child_runs_first",
304 .data = &sysctl_sched_child_runs_first,
305 .maxlen = sizeof(unsigned int),
306 .mode = 0644,
307 .proc_handler = &proc_dointvec,
308 },
309 {
310 .ctl_name = CTL_UNNUMBERED,
311 .procname = "sched_features", 310 .procname = "sched_features",
312 .data = &sysctl_sched_features, 311 .data = &sysctl_sched_features,
313 .maxlen = sizeof(unsigned int), 312 .maxlen = sizeof(unsigned int),
@@ -332,6 +331,14 @@ static struct ctl_table kern_table[] = {
332 }, 331 },
333 { 332 {
334 .ctl_name = CTL_UNNUMBERED, 333 .ctl_name = CTL_UNNUMBERED,
334 .procname = "sched_time_avg",
335 .data = &sysctl_sched_time_avg,
336 .maxlen = sizeof(unsigned int),
337 .mode = 0644,
338 .proc_handler = &proc_dointvec,
339 },
340 {
341 .ctl_name = CTL_UNNUMBERED,
335 .procname = "timer_migration", 342 .procname = "timer_migration",
336 .data = &sysctl_timer_migration, 343 .data = &sysctl_timer_migration,
337 .maxlen = sizeof(unsigned int), 344 .maxlen = sizeof(unsigned int),
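Illustration (not part of the patch): the relocated sched_child_runs_first entry and the new sched_time_avg entry both use the CTL_UNNUMBERED/proc_dointvec pattern shown above. A hedged sketch of how a similar integer knob could be registered — the knob name and table are hypothetical, and the patch's real entries live inside the nested kern_table ("kernel" directory) rather than being registered standalone:

/* Hypothetical example, not from this patch. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static unsigned int example_knob = 1;                   /* hypothetical knob */

static struct ctl_table example_table[] = {
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example_knob",
                .data           = &example_knob,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        { .ctl_name = 0 }                               /* sentinel terminates the table */
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
        /* Registers directly under /proc/sys; the patch's entries sit in kern_table. */
        example_header = register_sysctl_table(example_table);
        return example_header ? 0 : -ENOMEM;
}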
diff --git a/kernel/timer.c b/kernel/timer.c
index a7f07d5a6241..a3d25f415019 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1156,8 +1156,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1156 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1157 account_process_tick(p, user_tick);
1158 run_local_timers(); 1158 run_local_timers();
1159 if (rcu_pending(cpu)) 1159 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1160 printk_tick();
1162 scheduler_tick(); 1161 scheduler_tick();
1163 run_posix_cpu_timers(p); 1162 run_posix_cpu_timers(p);
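Illustration (not part of the patch): update_process_times() now calls rcu_check_callbacks() unconditionally, which presumably means the rcu_pending() check moved into the RCU core itself; the RCU files changed by this series are not shown here, so the following is an assumption about their shape:

/*
 * Sketch only (assumption): the pending check is presumed to happen
 * inside rcu_check_callbacks() now, so the tick path no longer needs it.
 */
void rcu_check_callbacks(int cpu, int user)
{
        /* note user-mode/idle quiescent states for this CPU ... */

        if (rcu_pending(cpu))
                raise_softirq(RCU_SOFTIRQ);
}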
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
600 * schedule_work - put work task in global workqueue 598 * schedule_work - put work task in global workqueue
601 * @work: job to be done 599 * @work: job to be done
602 * 600 *
603 * This puts a job in the kernel-global workqueue. 601 * Returns zero if @work was already on the kernel-global workqueue and
602 * non-zero otherwise.
603 *
604 * This puts a job in the kernel-global workqueue if it was not already
605 * queued and leaves it in the same position on the kernel-global
606 * workqueue otherwise.
604 */ 607 */
605int schedule_work(struct work_struct *work) 608int schedule_work(struct work_struct *work)
606{ 609{
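Illustration (not part of the patch): the reworded kernel-doc spells out schedule_work()'s return value — zero when @work was already pending, non-zero when it was newly queued. A caller sketch with hypothetical names:

/* Hypothetical caller, not from this patch. */
#include <linux/kernel.h>
#include <linux/workqueue.h>

static void example_work_fn(struct work_struct *work)
{
        /* deferred processing; runs later in process context */
}

static DECLARE_WORK(example_work, example_work_fn);

static void example_kick(void)
{
        if (!schedule_work(&example_work))
                pr_debug("example_work was already queued\n");
}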