aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2008-12-30 19:10:19 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2008-12-30 19:10:19 -0500
commit5f34fe1cfc1bdd8b4711bbe37421fba4ed0d1ed4 (patch)
tree85b21c8bb0e53005bd970d648ca093acfd0584a3 /kernel
parenteca1bf5b4fab56d2feb1572d34d59fcd92ea7df3 (diff)
parent6638101c1124c19c8a65b1645e4ecd09e0572f3e (diff)
Merge branch 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (63 commits) stacktrace: provide save_stack_trace_tsk() weak alias rcu: provide RCU options on non-preempt architectures too printk: fix discarding message when recursion_bug futex: clean up futex_(un)lock_pi fault handling "Tree RCU": scalable classic RCU implementation futex: rename field in futex_q to clarify single waiter semantics x86/swiotlb: add default swiotlb_arch_range_needs_mapping x86/swiotlb: add default phys<->bus conversion x86: unify pci iommu setup and allow swiotlb to compile for 32 bit x86: add swiotlb allocation functions swiotlb: consolidate swiotlb info message printing swiotlb: support bouncing of HighMem pages swiotlb: factor out copy to/from device swiotlb: add arch hook to force mapping swiotlb: allow architectures to override phys<->bus<->phys conversions swiotlb: add comment where we handle the overflow of a dma mask on 32 bit rcu: fix rcutorture behavior during reboot resources: skip sanity check of busy resources swiotlb: move some definitions to header swiotlb: allow architectures to override swiotlb pool allocation ... Fix up trivial conflicts in arch/x86/kernel/Makefile arch/x86/mm/init_32.c include/linux/hardirq.h as per Ingo's suggestions.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt25
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/extable.c16
-rw-r--r--kernel/futex.c351
-rw-r--r--kernel/irq/manage.c12
-rw-r--r--kernel/lockdep.c60
-rw-r--r--kernel/lockdep_proc.c28
-rw-r--r--kernel/mutex.c10
-rw-r--r--kernel/notifier.c8
-rw-r--r--kernel/panic.c32
-rw-r--r--kernel/posix-cpu-timers.c10
-rw-r--r--kernel/printk.c2
-rw-r--r--kernel/rcuclassic.c4
-rw-r--r--kernel/rcupreempt.c10
-rw-r--r--kernel/rcupreempt_trace.c10
-rw-r--r--kernel/rcutorture.c66
-rw-r--r--kernel/rcutree.c1535
-rw-r--r--kernel/rcutree_trace.c271
-rw-r--r--kernel/resource.c9
-rw-r--r--kernel/sched.c3
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/softlockup.c2
-rw-r--r--kernel/stacktrace.c11
-rw-r--r--kernel/sys.c2
25 files changed, 2159 insertions, 345 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03dc1fc..bf987b95b356 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,28 +52,3 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_RCU
56 bool "Preemptible RCU"
57 depends on PREEMPT
58 default n
59 help
60 This option reduces the latency of the kernel by making certain
61 RCU sections preemptible. Normally RCU code is non-preemptible, if
62 this option is selected then read-only RCU sections become
63 preemptible. This helps latency, but may expose bugs due to
64 now-naive assumptions about each RCU read-side critical section
65 remaining on a given CPU through its execution.
66
67 Say N if you are unsure.
68
69config RCU_TRACE
70 bool "Enable tracing for RCU - currently stats in debugfs"
71 depends on PREEMPT_RCU
72 select DEBUG_FS
73 default y
74 help
75 This option provides tracing in RCU which presents stats
76 in debugfs for debugging RCU implementation.
77
78 Say Y here if you want to enable RCU tracing
79 Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 027edda63511..e1c5bf3365c0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -73,10 +73,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
73obj-$(CONFIG_SECCOMP) += seccomp.o 73obj-$(CONFIG_SECCOMP) += seccomp.o
74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 74obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o 75obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
76obj-$(CONFIG_TREE_RCU) += rcutree.o
76obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 77obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
77ifeq ($(CONFIG_PREEMPT_RCU),y) 78obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
78obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o 79obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
79endif
80obj-$(CONFIG_RELAY) += relay.o 80obj-$(CONFIG_RELAY) += relay.o
81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 81obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 82obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/exit.c b/kernel/exit.c
index c7422ca92038..a946221879d7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1328,10 +1328,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1328 * group, which consolidates times for all threads in the 1328 * group, which consolidates times for all threads in the
1329 * group including the group leader. 1329 * group including the group leader.
1330 */ 1330 */
1331 thread_group_cputime(p, &cputime);
1331 spin_lock_irq(&p->parent->sighand->siglock); 1332 spin_lock_irq(&p->parent->sighand->siglock);
1332 psig = p->parent->signal; 1333 psig = p->parent->signal;
1333 sig = p->signal; 1334 sig = p->signal;
1334 thread_group_cputime(p, &cputime);
1335 psig->cutime = 1335 psig->cutime =
1336 cputime_add(psig->cutime, 1336 cputime_add(psig->cutime,
1337 cputime_add(cputime.utime, 1337 cputime_add(cputime.utime,
diff --git a/kernel/extable.c b/kernel/extable.c
index feb0317cf09a..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -67,3 +67,19 @@ int kernel_text_address(unsigned long addr)
67 return 1; 67 return 1;
68 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
69} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
diff --git a/kernel/futex.c b/kernel/futex.c
index 4fe790e89d0f..7c6cbabe52b3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -92,11 +92,12 @@ struct futex_pi_state {
92 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
94 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
95 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiter, then make the second condition true.
96 */ 96 */
97struct futex_q { 97struct futex_q {
98 struct plist_node list; 98 struct plist_node list;
99 wait_queue_head_t waiters; 99 /* There can only be a single waiter */
100 wait_queue_head_t waiter;
100 101
101 /* Which hash list lock to use: */ 102 /* Which hash list lock to use: */
102 spinlock_t *lock_ptr; 103 spinlock_t *lock_ptr;
@@ -123,24 +124,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 124static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 125
125/* 126/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 127 * We hash on the keys returned from get_futex_key (see below).
145 */ 128 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 129static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +144,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 144 && key1->both.offset == key2->both.offset);
162} 145}
163 146
147/*
148 * Take a reference to the resource addressed by a key.
149 * Can be called while holding spinlocks.
150 *
151 */
152static void get_futex_key_refs(union futex_key *key)
153{
154 if (!key->both.ptr)
155 return;
156
157 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
158 case FUT_OFF_INODE:
159 atomic_inc(&key->shared.inode->i_count);
160 break;
161 case FUT_OFF_MMSHARED:
162 atomic_inc(&key->private.mm->mm_count);
163 break;
164 }
165}
166
167/*
168 * Drop a reference to the resource addressed by a key.
169 * The hash bucket spinlock must not be held.
170 */
171static void drop_futex_key_refs(union futex_key *key)
172{
173 if (!key->both.ptr)
174 return;
175
176 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
177 case FUT_OFF_INODE:
178 iput(key->shared.inode);
179 break;
180 case FUT_OFF_MMSHARED:
181 mmdrop(key->private.mm);
182 break;
183 }
184}
185
164/** 186/**
165 * get_futex_key - Get parameters which are the keys for a futex. 187 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 188 * @uaddr: virtual address of the futex
@@ -179,12 +201,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 201 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 202 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 203 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 204static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 205{
185 unsigned long address = (unsigned long)uaddr; 206 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 207 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 208 struct page *page;
189 int err; 209 int err;
190 210
@@ -208,100 +228,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 228 return -EFAULT;
209 key->private.mm = mm; 229 key->private.mm = mm;
210 key->private.address = address; 230 key->private.address = address;
231 get_futex_key_refs(key);
211 return 0; 232 return 0;
212 } 233 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 234
221 /* 235again:
222 * Permissions. 236 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 237 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 238 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 239
240 lock_page(page);
241 if (!page->mapping) {
242 unlock_page(page);
243 put_page(page);
244 goto again;
245 }
226 246
227 /* 247 /*
228 * Private mappings are handled in a simple way. 248 * Private mappings are handled in a simple way.
229 * 249 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 250 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 251 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 252 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 253 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 254 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 255 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 256 key->private.mm = mm;
239 key->private.address = address; 257 key->private.address = address;
240 return 0; 258 } else {
259 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
260 key->shared.inode = page->mapping->host;
261 key->shared.pgoff = page->index;
241 } 262 }
242 263
243 /* 264 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 265
254 /* 266 unlock_page(page);
255 * We could walk the page table to read the non-linear 267 put_page(page);
256 * pte, and get the page index without fetching the page 268 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 269}
288 270
289/* 271static inline
290 * Drop a reference to the resource addressed by a key. 272void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 273{
295 if (!key->both.ptr) 274 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 275}
306 276
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 277static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +298,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 298
329/* 299/*
330 * Fault handling. 300 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 301 */
333static int futex_handle_fault(unsigned long address, 302static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 303{
336 struct vm_area_struct * vma; 304 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 305 struct mm_struct *mm = current->mm;
@@ -340,8 +308,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 308 if (attempt > 2)
341 return ret; 309 return ret;
342 310
343 if (!fshared) 311 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 312 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 313 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 314 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +328,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 328 current->min_flt++;
362 } 329 }
363 } 330 }
364 if (!fshared) 331 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 332 return ret;
367} 333}
368 334
@@ -385,6 +351,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 351 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 352 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 353 atomic_set(&pi_state->refcount, 1);
354 pi_state->key = FUTEX_KEY_INIT;
388 355
389 current->pi_state_cache = pi_state; 356 current->pi_state_cache = pi_state;
390 357
@@ -469,7 +436,7 @@ void exit_pi_state_list(struct task_struct *curr)
469 struct list_head *next, *head = &curr->pi_state_list; 436 struct list_head *next, *head = &curr->pi_state_list;
470 struct futex_pi_state *pi_state; 437 struct futex_pi_state *pi_state;
471 struct futex_hash_bucket *hb; 438 struct futex_hash_bucket *hb;
472 union futex_key key; 439 union futex_key key = FUTEX_KEY_INIT;
473 440
474 if (!futex_cmpxchg_enabled) 441 if (!futex_cmpxchg_enabled)
475 return; 442 return;
@@ -614,7 +581,7 @@ static void wake_futex(struct futex_q *q)
614 * The lock in wake_up_all() is a crucial memory barrier after the 581 * The lock in wake_up_all() is a crucial memory barrier after the
615 * plist_del() and also before assigning to q->lock_ptr. 582 * plist_del() and also before assigning to q->lock_ptr.
616 */ 583 */
617 wake_up_all(&q->waiters); 584 wake_up(&q->waiter);
618 /* 585 /*
619 * The waiting task can free the futex_q as soon as this is written, 586 * The waiting task can free the futex_q as soon as this is written,
620 * without taking any locks. This must come last. 587 * without taking any locks. This must come last.
@@ -726,20 +693,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
726 * Wake up all waiters hashed on the physical page that is mapped 693 * Wake up all waiters hashed on the physical page that is mapped
727 * to this virtual address: 694 * to this virtual address:
728 */ 695 */
729static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 696static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
730 int nr_wake, u32 bitset)
731{ 697{
732 struct futex_hash_bucket *hb; 698 struct futex_hash_bucket *hb;
733 struct futex_q *this, *next; 699 struct futex_q *this, *next;
734 struct plist_head *head; 700 struct plist_head *head;
735 union futex_key key; 701 union futex_key key = FUTEX_KEY_INIT;
736 int ret; 702 int ret;
737 703
738 if (!bitset) 704 if (!bitset)
739 return -EINVAL; 705 return -EINVAL;
740 706
741 futex_lock_mm(fshared);
742
743 ret = get_futex_key(uaddr, fshared, &key); 707 ret = get_futex_key(uaddr, fshared, &key);
744 if (unlikely(ret != 0)) 708 if (unlikely(ret != 0))
745 goto out; 709 goto out;
@@ -767,7 +731,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
767 731
768 spin_unlock(&hb->lock); 732 spin_unlock(&hb->lock);
769out: 733out:
770 futex_unlock_mm(fshared); 734 put_futex_key(fshared, &key);
771 return ret; 735 return ret;
772} 736}
773 737
@@ -776,19 +740,16 @@ out:
776 * to this virtual address: 740 * to this virtual address:
777 */ 741 */
778static int 742static int
779futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 743futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
780 u32 __user *uaddr2,
781 int nr_wake, int nr_wake2, int op) 744 int nr_wake, int nr_wake2, int op)
782{ 745{
783 union futex_key key1, key2; 746 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
784 struct futex_hash_bucket *hb1, *hb2; 747 struct futex_hash_bucket *hb1, *hb2;
785 struct plist_head *head; 748 struct plist_head *head;
786 struct futex_q *this, *next; 749 struct futex_q *this, *next;
787 int ret, op_ret, attempt = 0; 750 int ret, op_ret, attempt = 0;
788 751
789retryfull: 752retryfull:
790 futex_lock_mm(fshared);
791
792 ret = get_futex_key(uaddr1, fshared, &key1); 753 ret = get_futex_key(uaddr1, fshared, &key1);
793 if (unlikely(ret != 0)) 754 if (unlikely(ret != 0))
794 goto out; 755 goto out;
@@ -833,18 +794,12 @@ retry:
833 */ 794 */
834 if (attempt++) { 795 if (attempt++) {
835 ret = futex_handle_fault((unsigned long)uaddr2, 796 ret = futex_handle_fault((unsigned long)uaddr2,
836 fshared, attempt); 797 attempt);
837 if (ret) 798 if (ret)
838 goto out; 799 goto out;
839 goto retry; 800 goto retry;
840 } 801 }
841 802
842 /*
843 * If we would have faulted, release mmap_sem,
844 * fault it in and start all over again.
845 */
846 futex_unlock_mm(fshared);
847
848 ret = get_user(dummy, uaddr2); 803 ret = get_user(dummy, uaddr2);
849 if (ret) 804 if (ret)
850 return ret; 805 return ret;
@@ -880,7 +835,8 @@ retry:
880 if (hb1 != hb2) 835 if (hb1 != hb2)
881 spin_unlock(&hb2->lock); 836 spin_unlock(&hb2->lock);
882out: 837out:
883 futex_unlock_mm(fshared); 838 put_futex_key(fshared, &key2);
839 put_futex_key(fshared, &key1);
884 840
885 return ret; 841 return ret;
886} 842}
@@ -889,19 +845,16 @@ out:
889 * Requeue all waiters hashed on one physical page to another 845 * Requeue all waiters hashed on one physical page to another
890 * physical page. 846 * physical page.
891 */ 847 */
892static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 848static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
893 u32 __user *uaddr2,
894 int nr_wake, int nr_requeue, u32 *cmpval) 849 int nr_wake, int nr_requeue, u32 *cmpval)
895{ 850{
896 union futex_key key1, key2; 851 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
897 struct futex_hash_bucket *hb1, *hb2; 852 struct futex_hash_bucket *hb1, *hb2;
898 struct plist_head *head1; 853 struct plist_head *head1;
899 struct futex_q *this, *next; 854 struct futex_q *this, *next;
900 int ret, drop_count = 0; 855 int ret, drop_count = 0;
901 856
902 retry: 857 retry:
903 futex_lock_mm(fshared);
904
905 ret = get_futex_key(uaddr1, fshared, &key1); 858 ret = get_futex_key(uaddr1, fshared, &key1);
906 if (unlikely(ret != 0)) 859 if (unlikely(ret != 0))
907 goto out; 860 goto out;
@@ -924,12 +877,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
924 if (hb1 != hb2) 877 if (hb1 != hb2)
925 spin_unlock(&hb2->lock); 878 spin_unlock(&hb2->lock);
926 879
927 /*
928 * If we would have faulted, release mmap_sem, fault
929 * it in and start all over again.
930 */
931 futex_unlock_mm(fshared);
932
933 ret = get_user(curval, uaddr1); 880 ret = get_user(curval, uaddr1);
934 881
935 if (!ret) 882 if (!ret)
@@ -981,7 +928,8 @@ out_unlock:
981 drop_futex_key_refs(&key1); 928 drop_futex_key_refs(&key1);
982 929
983out: 930out:
984 futex_unlock_mm(fshared); 931 put_futex_key(fshared, &key2);
932 put_futex_key(fshared, &key1);
985 return ret; 933 return ret;
986} 934}
987 935
@@ -990,7 +938,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
990{ 938{
991 struct futex_hash_bucket *hb; 939 struct futex_hash_bucket *hb;
992 940
993 init_waitqueue_head(&q->waiters); 941 init_waitqueue_head(&q->waiter);
994 942
995 get_futex_key_refs(&q->key); 943 get_futex_key_refs(&q->key);
996 hb = hash_futex(&q->key); 944 hb = hash_futex(&q->key);
@@ -1103,8 +1051,7 @@ static void unqueue_me_pi(struct futex_q *q)
1103 * private futexes. 1051 * private futexes.
1104 */ 1052 */
1105static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1053static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1106 struct task_struct *newowner, 1054 struct task_struct *newowner, int fshared)
1107 struct rw_semaphore *fshared)
1108{ 1055{
1109 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1056 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1110 struct futex_pi_state *pi_state = q->pi_state; 1057 struct futex_pi_state *pi_state = q->pi_state;
@@ -1183,7 +1130,7 @@ retry:
1183handle_fault: 1130handle_fault:
1184 spin_unlock(q->lock_ptr); 1131 spin_unlock(q->lock_ptr);
1185 1132
1186 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1133 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1187 1134
1188 spin_lock(q->lock_ptr); 1135 spin_lock(q->lock_ptr);
1189 1136
@@ -1203,12 +1150,13 @@ handle_fault:
1203 * In case we must use restart_block to restart a futex_wait, 1150 * In case we must use restart_block to restart a futex_wait,
1204 * we encode in the 'flags' shared capability 1151 * we encode in the 'flags' shared capability
1205 */ 1152 */
1206#define FLAGS_SHARED 1 1153#define FLAGS_SHARED 0x01
1154#define FLAGS_CLOCKRT 0x02
1207 1155
1208static long futex_wait_restart(struct restart_block *restart); 1156static long futex_wait_restart(struct restart_block *restart);
1209 1157
1210static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1158static int futex_wait(u32 __user *uaddr, int fshared,
1211 u32 val, ktime_t *abs_time, u32 bitset) 1159 u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
1212{ 1160{
1213 struct task_struct *curr = current; 1161 struct task_struct *curr = current;
1214 DECLARE_WAITQUEUE(wait, curr); 1162 DECLARE_WAITQUEUE(wait, curr);
@@ -1225,8 +1173,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1225 q.pi_state = NULL; 1173 q.pi_state = NULL;
1226 q.bitset = bitset; 1174 q.bitset = bitset;
1227 retry: 1175 retry:
1228 futex_lock_mm(fshared); 1176 q.key = FUTEX_KEY_INIT;
1229
1230 ret = get_futex_key(uaddr, fshared, &q.key); 1177 ret = get_futex_key(uaddr, fshared, &q.key);
1231 if (unlikely(ret != 0)) 1178 if (unlikely(ret != 0))
1232 goto out_release_sem; 1179 goto out_release_sem;
@@ -1258,12 +1205,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1258 if (unlikely(ret)) { 1205 if (unlikely(ret)) {
1259 queue_unlock(&q, hb); 1206 queue_unlock(&q, hb);
1260 1207
1261 /*
1262 * If we would have faulted, release mmap_sem, fault it in and
1263 * start all over again.
1264 */
1265 futex_unlock_mm(fshared);
1266
1267 ret = get_user(uval, uaddr); 1208 ret = get_user(uval, uaddr);
1268 1209
1269 if (!ret) 1210 if (!ret)
@@ -1278,12 +1219,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1278 queue_me(&q, hb); 1219 queue_me(&q, hb);
1279 1220
1280 /* 1221 /*
1281 * Now the futex is queued and we have checked the data, we
1282 * don't want to hold mmap_sem while we sleep.
1283 */
1284 futex_unlock_mm(fshared);
1285
1286 /*
1287 * There might have been scheduling since the queue_me(), as we 1222 * There might have been scheduling since the queue_me(), as we
1288 * cannot hold a spinlock across the get_user() in case it 1223 * cannot hold a spinlock across the get_user() in case it
1289 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1224 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1294,7 +1229,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1294 1229
1295 /* add_wait_queue is the barrier after __set_current_state. */ 1230 /* add_wait_queue is the barrier after __set_current_state. */
1296 __set_current_state(TASK_INTERRUPTIBLE); 1231 __set_current_state(TASK_INTERRUPTIBLE);
1297 add_wait_queue(&q.waiters, &wait); 1232 add_wait_queue(&q.waiter, &wait);
1298 /* 1233 /*
1299 * !plist_node_empty() is safe here without any lock. 1234 * !plist_node_empty() is safe here without any lock.
1300 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1235 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
@@ -1307,8 +1242,10 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1307 slack = current->timer_slack_ns; 1242 slack = current->timer_slack_ns;
1308 if (rt_task(current)) 1243 if (rt_task(current))
1309 slack = 0; 1244 slack = 0;
1310 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1245 hrtimer_init_on_stack(&t.timer,
1311 HRTIMER_MODE_ABS); 1246 clockrt ? CLOCK_REALTIME :
1247 CLOCK_MONOTONIC,
1248 HRTIMER_MODE_ABS);
1312 hrtimer_init_sleeper(&t, current); 1249 hrtimer_init_sleeper(&t, current);
1313 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); 1250 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1314 1251
@@ -1363,6 +1300,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 1300
1364 if (fshared) 1301 if (fshared)
1365 restart->futex.flags |= FLAGS_SHARED; 1302 restart->futex.flags |= FLAGS_SHARED;
1303 if (clockrt)
1304 restart->futex.flags |= FLAGS_CLOCKRT;
1366 return -ERESTART_RESTARTBLOCK; 1305 return -ERESTART_RESTARTBLOCK;
1367 } 1306 }
1368 1307
@@ -1370,7 +1309,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1370 queue_unlock(&q, hb); 1309 queue_unlock(&q, hb);
1371 1310
1372 out_release_sem: 1311 out_release_sem:
1373 futex_unlock_mm(fshared); 1312 put_futex_key(fshared, &q.key);
1374 return ret; 1313 return ret;
1375} 1314}
1376 1315
@@ -1378,15 +1317,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1378static long futex_wait_restart(struct restart_block *restart) 1317static long futex_wait_restart(struct restart_block *restart)
1379{ 1318{
1380 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1319 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1381 struct rw_semaphore *fshared = NULL; 1320 int fshared = 0;
1382 ktime_t t; 1321 ktime_t t;
1383 1322
1384 t.tv64 = restart->futex.time; 1323 t.tv64 = restart->futex.time;
1385 restart->fn = do_no_restart_syscall; 1324 restart->fn = do_no_restart_syscall;
1386 if (restart->futex.flags & FLAGS_SHARED) 1325 if (restart->futex.flags & FLAGS_SHARED)
1387 fshared = &current->mm->mmap_sem; 1326 fshared = 1;
1388 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1327 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1389 restart->futex.bitset); 1328 restart->futex.bitset,
1329 restart->futex.flags & FLAGS_CLOCKRT);
1390} 1330}
1391 1331
1392 1332
@@ -1396,7 +1336,7 @@ static long futex_wait_restart(struct restart_block *restart)
1396 * if there are waiters then it will block, it does PI, etc. (Due to 1336 * if there are waiters then it will block, it does PI, etc. (Due to
1397 * races the kernel might see a 0 value of the futex too.) 1337 * races the kernel might see a 0 value of the futex too.)
1398 */ 1338 */
1399static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1339static int futex_lock_pi(u32 __user *uaddr, int fshared,
1400 int detect, ktime_t *time, int trylock) 1340 int detect, ktime_t *time, int trylock)
1401{ 1341{
1402 struct hrtimer_sleeper timeout, *to = NULL; 1342 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1419,8 +1359,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1419 1359
1420 q.pi_state = NULL; 1360 q.pi_state = NULL;
1421 retry: 1361 retry:
1422 futex_lock_mm(fshared); 1362 q.key = FUTEX_KEY_INIT;
1423
1424 ret = get_futex_key(uaddr, fshared, &q.key); 1363 ret = get_futex_key(uaddr, fshared, &q.key);
1425 if (unlikely(ret != 0)) 1364 if (unlikely(ret != 0))
1426 goto out_release_sem; 1365 goto out_release_sem;
@@ -1509,7 +1448,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1509 * exit to complete. 1448 * exit to complete.
1510 */ 1449 */
1511 queue_unlock(&q, hb); 1450 queue_unlock(&q, hb);
1512 futex_unlock_mm(fshared);
1513 cond_resched(); 1451 cond_resched();
1514 goto retry; 1452 goto retry;
1515 1453
@@ -1541,12 +1479,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1541 */ 1479 */
1542 queue_me(&q, hb); 1480 queue_me(&q, hb);
1543 1481
1544 /*
1545 * Now the futex is queued and we have checked the data, we
1546 * don't want to hold mmap_sem while we sleep.
1547 */
1548 futex_unlock_mm(fshared);
1549
1550 WARN_ON(!q.pi_state); 1482 WARN_ON(!q.pi_state);
1551 /* 1483 /*
1552 * Block on the PI mutex: 1484 * Block on the PI mutex:
@@ -1559,7 +1491,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1559 ret = ret ? 0 : -EWOULDBLOCK; 1491 ret = ret ? 0 : -EWOULDBLOCK;
1560 } 1492 }
1561 1493
1562 futex_lock_mm(fshared);
1563 spin_lock(q.lock_ptr); 1494 spin_lock(q.lock_ptr);
1564 1495
1565 if (!ret) { 1496 if (!ret) {
@@ -1625,7 +1556,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1625 1556
1626 /* Unqueue and drop the lock */ 1557 /* Unqueue and drop the lock */
1627 unqueue_me_pi(&q); 1558 unqueue_me_pi(&q);
1628 futex_unlock_mm(fshared);
1629 1559
1630 if (to) 1560 if (to)
1631 destroy_hrtimer_on_stack(&to->timer); 1561 destroy_hrtimer_on_stack(&to->timer);
@@ -1635,34 +1565,30 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1635 queue_unlock(&q, hb); 1565 queue_unlock(&q, hb);
1636 1566
1637 out_release_sem: 1567 out_release_sem:
1638 futex_unlock_mm(fshared); 1568 put_futex_key(fshared, &q.key);
1639 if (to) 1569 if (to)
1640 destroy_hrtimer_on_stack(&to->timer); 1570 destroy_hrtimer_on_stack(&to->timer);
1641 return ret; 1571 return ret;
1642 1572
1643 uaddr_faulted: 1573 uaddr_faulted:
1644 /* 1574 /*
1645 * We have to r/w *(int __user *)uaddr, but we can't modify it 1575 * We have to r/w *(int __user *)uaddr, and we have to modify it
1646 * non-atomically. Therefore, if get_user below is not 1576 * atomically. Therefore, if we continue to fault after get_user()
1647 * enough, we need to handle the fault ourselves, while 1577 * below, we need to handle the fault ourselves, while still holding
1648 * still holding the mmap_sem. 1578 * the mmap_sem. This can occur if the uaddr is under contention as
1649 * 1579 * we have to drop the mmap_sem in order to call get_user().
1650 * ... and hb->lock. :-) --ANK
1651 */ 1580 */
1652 queue_unlock(&q, hb); 1581 queue_unlock(&q, hb);
1653 1582
1654 if (attempt++) { 1583 if (attempt++) {
1655 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1584 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1656 attempt);
1657 if (ret) 1585 if (ret)
1658 goto out_release_sem; 1586 goto out_release_sem;
1659 goto retry_unlocked; 1587 goto retry_unlocked;
1660 } 1588 }
1661 1589
1662 futex_unlock_mm(fshared);
1663
1664 ret = get_user(uval, uaddr); 1590 ret = get_user(uval, uaddr);
1665 if (!ret && (uval != -EFAULT)) 1591 if (!ret)
1666 goto retry; 1592 goto retry;
1667 1593
1668 if (to) 1594 if (to)
@@ -1675,13 +1601,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1675 * This is the in-kernel slowpath: we look up the PI state (if any), 1601 * This is the in-kernel slowpath: we look up the PI state (if any),
1676 * and do the rt-mutex unlock. 1602 * and do the rt-mutex unlock.
1677 */ 1603 */
1678static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1604static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1679{ 1605{
1680 struct futex_hash_bucket *hb; 1606 struct futex_hash_bucket *hb;
1681 struct futex_q *this, *next; 1607 struct futex_q *this, *next;
1682 u32 uval; 1608 u32 uval;
1683 struct plist_head *head; 1609 struct plist_head *head;
1684 union futex_key key; 1610 union futex_key key = FUTEX_KEY_INIT;
1685 int ret, attempt = 0; 1611 int ret, attempt = 0;
1686 1612
1687retry: 1613retry:
@@ -1692,10 +1618,6 @@ retry:
1692 */ 1618 */
1693 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1619 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1694 return -EPERM; 1620 return -EPERM;
1695 /*
1696 * First take all the futex related locks:
1697 */
1698 futex_lock_mm(fshared);
1699 1621
1700 ret = get_futex_key(uaddr, fshared, &key); 1622 ret = get_futex_key(uaddr, fshared, &key);
1701 if (unlikely(ret != 0)) 1623 if (unlikely(ret != 0))
@@ -1754,34 +1676,30 @@ retry_unlocked:
1754out_unlock: 1676out_unlock:
1755 spin_unlock(&hb->lock); 1677 spin_unlock(&hb->lock);
1756out: 1678out:
1757 futex_unlock_mm(fshared); 1679 put_futex_key(fshared, &key);
1758 1680
1759 return ret; 1681 return ret;
1760 1682
1761pi_faulted: 1683pi_faulted:
1762 /* 1684 /*
1763 * We have to r/w *(int __user *)uaddr, but we can't modify it 1685 * We have to r/w *(int __user *)uaddr, and we have to modify it
1764 * non-atomically. Therefore, if get_user below is not 1686 * atomically. Therefore, if we continue to fault after get_user()
1765 * enough, we need to handle the fault ourselves, while 1687 * below, we need to handle the fault ourselves, while still holding
1766 * still holding the mmap_sem. 1688 * the mmap_sem. This can occur if the uaddr is under contention as
1767 * 1689 * we have to drop the mmap_sem in order to call get_user().
1768 * ... and hb->lock. --ANK
1769 */ 1690 */
1770 spin_unlock(&hb->lock); 1691 spin_unlock(&hb->lock);
1771 1692
1772 if (attempt++) { 1693 if (attempt++) {
1773 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1694 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1774 attempt);
1775 if (ret) 1695 if (ret)
1776 goto out; 1696 goto out;
1777 uval = 0; 1697 uval = 0;
1778 goto retry_unlocked; 1698 goto retry_unlocked;
1779 } 1699 }
1780 1700
1781 futex_unlock_mm(fshared);
1782
1783 ret = get_user(uval, uaddr); 1701 ret = get_user(uval, uaddr);
1784 if (!ret && (uval != -EFAULT)) 1702 if (!ret)
1785 goto retry; 1703 goto retry;
1786 1704
1787 return ret; 1705 return ret;
@@ -1908,8 +1826,7 @@ retry:
1908 * PI futexes happens in exit_pi_state(): 1826 * PI futexes happens in exit_pi_state():
1909 */ 1827 */
1910 if (!pi && (uval & FUTEX_WAITERS)) 1828 if (!pi && (uval & FUTEX_WAITERS))
1911 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1829 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1912 FUTEX_BITSET_MATCH_ANY);
1913 } 1830 }
1914 return 0; 1831 return 0;
1915} 1832}
@@ -2003,18 +1920,22 @@ void exit_robust_list(struct task_struct *curr)
2003long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 1920long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2004 u32 __user *uaddr2, u32 val2, u32 val3) 1921 u32 __user *uaddr2, u32 val2, u32 val3)
2005{ 1922{
2006 int ret = -ENOSYS; 1923 int clockrt, ret = -ENOSYS;
2007 int cmd = op & FUTEX_CMD_MASK; 1924 int cmd = op & FUTEX_CMD_MASK;
2008 struct rw_semaphore *fshared = NULL; 1925 int fshared = 0;
2009 1926
2010 if (!(op & FUTEX_PRIVATE_FLAG)) 1927 if (!(op & FUTEX_PRIVATE_FLAG))
2011 fshared = &current->mm->mmap_sem; 1928 fshared = 1;
1929
1930 clockrt = op & FUTEX_CLOCK_REALTIME;
1931 if (clockrt && cmd != FUTEX_WAIT_BITSET)
1932 return -ENOSYS;
2012 1933
2013 switch (cmd) { 1934 switch (cmd) {
2014 case FUTEX_WAIT: 1935 case FUTEX_WAIT:
2015 val3 = FUTEX_BITSET_MATCH_ANY; 1936 val3 = FUTEX_BITSET_MATCH_ANY;
2016 case FUTEX_WAIT_BITSET: 1937 case FUTEX_WAIT_BITSET:
2017 ret = futex_wait(uaddr, fshared, val, timeout, val3); 1938 ret = futex_wait(uaddr, fshared, val, timeout, val3, clockrt);
2018 break; 1939 break;
2019 case FUTEX_WAKE: 1940 case FUTEX_WAKE:
2020 val3 = FUTEX_BITSET_MATCH_ANY; 1941 val3 = FUTEX_BITSET_MATCH_ANY;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 801addda3c43..e9d1c8205a3b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -673,6 +673,18 @@ int request_irq(unsigned int irq, irq_handler_t handler,
673 struct irq_desc *desc; 673 struct irq_desc *desc;
674 int retval; 674 int retval;
675 675
676 /*
677 * handle_IRQ_event() always ignores IRQF_DISABLED except for
678 * the _first_ irqaction (sigh). That can cause oopsing, but
679 * the behavior is classified as "will not fix" so we need to
680 * start nudging drivers away from using that idiom.
681 */
682 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
683 == (IRQF_SHARED|IRQF_DISABLED))
684 pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
685 "guaranteed on shared IRQs\n",
686 irq, devname);
687
676#ifdef CONFIG_LOCKDEP 688#ifdef CONFIG_LOCKDEP
677 /* 689 /*
678 * Lockdep wants atomic interrupt handlers: 690 * Lockdep wants atomic interrupt handlers:
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 74b1878b8bb8..06b0c3568f0b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -137,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
137#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
139 139
140static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
141{ 141{
142 int i; 142 int i;
143 143
144 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
145 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
146 class->contention_point[i] = ip; 146 points[i] = ip;
147 break; 147 break;
148 } 148 }
149 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
150 break; 150 break;
151 } 151 }
152 152
@@ -186,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
187 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
188 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
189 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
190 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
191 194
@@ -210,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
210 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
211 } 214 }
212 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
213} 217}
214 218
215static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -288,14 +292,12 @@ void lockdep_off(void)
288{ 292{
289 current->lockdep_recursion++; 293 current->lockdep_recursion++;
290} 294}
291
292EXPORT_SYMBOL(lockdep_off); 295EXPORT_SYMBOL(lockdep_off);
293 296
294void lockdep_on(void) 297void lockdep_on(void)
295{ 298{
296 current->lockdep_recursion--; 299 current->lockdep_recursion--;
297} 300}
298
299EXPORT_SYMBOL(lockdep_on); 301EXPORT_SYMBOL(lockdep_on);
300 302
301/* 303/*
@@ -577,7 +579,8 @@ static void print_lock_class_header(struct lock_class *class, int depth)
577/* 579/*
578 * printk all lock dependencies starting at <entry>: 580 * printk all lock dependencies starting at <entry>:
579 */ 581 */
580static void print_lock_dependencies(struct lock_class *class, int depth) 582static void __used
583print_lock_dependencies(struct lock_class *class, int depth)
581{ 584{
582 struct lock_list *entry; 585 struct lock_list *entry;
583 586
@@ -2509,7 +2512,6 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2509 if (subclass) 2512 if (subclass)
2510 register_lock_class(lock, subclass, 1); 2513 register_lock_class(lock, subclass, 1);
2511} 2514}
2512
2513EXPORT_SYMBOL_GPL(lockdep_init_map); 2515EXPORT_SYMBOL_GPL(lockdep_init_map);
2514 2516
2515/* 2517/*
@@ -2690,8 +2692,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2690} 2692}
2691 2693
2692static int 2694static int
2693__lock_set_subclass(struct lockdep_map *lock, 2695__lock_set_class(struct lockdep_map *lock, const char *name,
2694 unsigned int subclass, unsigned long ip) 2696 struct lock_class_key *key, unsigned int subclass,
2697 unsigned long ip)
2695{ 2698{
2696 struct task_struct *curr = current; 2699 struct task_struct *curr = current;
2697 struct held_lock *hlock, *prev_hlock; 2700 struct held_lock *hlock, *prev_hlock;
@@ -2718,6 +2721,7 @@ __lock_set_subclass(struct lockdep_map *lock,
2718 return print_unlock_inbalance_bug(curr, lock, ip); 2721 return print_unlock_inbalance_bug(curr, lock, ip);
2719 2722
2720found_it: 2723found_it:
2724 lockdep_init_map(lock, name, key, 0);
2721 class = register_lock_class(lock, subclass, 0); 2725 class = register_lock_class(lock, subclass, 0);
2722 hlock->class_idx = class - lock_classes + 1; 2726 hlock->class_idx = class - lock_classes + 1;
2723 2727
@@ -2902,9 +2906,9 @@ static void check_flags(unsigned long flags)
2902#endif 2906#endif
2903} 2907}
2904 2908
2905void 2909void lock_set_class(struct lockdep_map *lock, const char *name,
2906lock_set_subclass(struct lockdep_map *lock, 2910 struct lock_class_key *key, unsigned int subclass,
2907 unsigned int subclass, unsigned long ip) 2911 unsigned long ip)
2908{ 2912{
2909 unsigned long flags; 2913 unsigned long flags;
2910 2914
@@ -2914,13 +2918,12 @@ lock_set_subclass(struct lockdep_map *lock,
2914 raw_local_irq_save(flags); 2918 raw_local_irq_save(flags);
2915 current->lockdep_recursion = 1; 2919 current->lockdep_recursion = 1;
2916 check_flags(flags); 2920 check_flags(flags);
2917 if (__lock_set_subclass(lock, subclass, ip)) 2921 if (__lock_set_class(lock, name, key, subclass, ip))
2918 check_chain_key(current); 2922 check_chain_key(current);
2919 current->lockdep_recursion = 0; 2923 current->lockdep_recursion = 0;
2920 raw_local_irq_restore(flags); 2924 raw_local_irq_restore(flags);
2921} 2925}
2922 2926EXPORT_SYMBOL_GPL(lock_set_class);
2923EXPORT_SYMBOL_GPL(lock_set_subclass);
2924 2927
2925/* 2928/*
2926 * We are not always called with irqs disabled - do that here, 2929 * We are not always called with irqs disabled - do that here,
@@ -2944,7 +2947,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2944 current->lockdep_recursion = 0; 2947 current->lockdep_recursion = 0;
2945 raw_local_irq_restore(flags); 2948 raw_local_irq_restore(flags);
2946} 2949}
2947
2948EXPORT_SYMBOL_GPL(lock_acquire); 2950EXPORT_SYMBOL_GPL(lock_acquire);
2949 2951
2950void lock_release(struct lockdep_map *lock, int nested, 2952void lock_release(struct lockdep_map *lock, int nested,
@@ -2962,7 +2964,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2962 current->lockdep_recursion = 0; 2964 current->lockdep_recursion = 0;
2963 raw_local_irq_restore(flags); 2965 raw_local_irq_restore(flags);
2964} 2966}
2965
2966EXPORT_SYMBOL_GPL(lock_release); 2967EXPORT_SYMBOL_GPL(lock_release);
2967 2968
2968#ifdef CONFIG_LOCK_STAT 2969#ifdef CONFIG_LOCK_STAT
@@ -3000,7 +3001,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3000 struct held_lock *hlock, *prev_hlock; 3001 struct held_lock *hlock, *prev_hlock;
3001 struct lock_class_stats *stats; 3002 struct lock_class_stats *stats;
3002 unsigned int depth; 3003 unsigned int depth;
3003 int i, point; 3004 int i, contention_point, contending_point;
3004 3005
3005 depth = curr->lockdep_depth; 3006 depth = curr->lockdep_depth;
3006 if (DEBUG_LOCKS_WARN_ON(!depth)) 3007 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3024,18 +3025,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3024found_it: 3025found_it:
3025 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
3026 3027
3027 point = lock_contention_point(hlock_class(hlock), ip); 3028 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3029 contending_point = lock_point(hlock_class(hlock)->contending_point,
3030 lock->ip);
3028 3031
3029 stats = get_lock_stats(hlock_class(hlock)); 3032 stats = get_lock_stats(hlock_class(hlock));
3030 if (point < ARRAY_SIZE(stats->contention_point)) 3033 if (contention_point < LOCKSTAT_POINTS)
3031 stats->contention_point[point]++; 3034 stats->contention_point[contention_point]++;
3035 if (contending_point < LOCKSTAT_POINTS)
3036 stats->contending_point[contending_point]++;
3032 if (lock->cpu != smp_processor_id()) 3037 if (lock->cpu != smp_processor_id())
3033 stats->bounces[bounce_contended + !!hlock->read]++; 3038 stats->bounces[bounce_contended + !!hlock->read]++;
3034 put_lock_stats(stats); 3039 put_lock_stats(stats);
3035} 3040}
3036 3041
3037static void 3042static void
3038__lock_acquired(struct lockdep_map *lock) 3043__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3039{ 3044{
3040 struct task_struct *curr = current; 3045 struct task_struct *curr = current;
3041 struct held_lock *hlock, *prev_hlock; 3046 struct held_lock *hlock, *prev_hlock;
@@ -3084,6 +3089,7 @@ found_it:
3084 put_lock_stats(stats); 3089 put_lock_stats(stats);
3085 3090
3086 lock->cpu = cpu; 3091 lock->cpu = cpu;
3092 lock->ip = ip;
3087} 3093}
3088 3094
3089void lock_contended(struct lockdep_map *lock, unsigned long ip) 3095void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3105,7 +3111,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3105} 3111}
3106EXPORT_SYMBOL_GPL(lock_contended); 3112EXPORT_SYMBOL_GPL(lock_contended);
3107 3113
3108void lock_acquired(struct lockdep_map *lock) 3114void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3109{ 3115{
3110 unsigned long flags; 3116 unsigned long flags;
3111 3117
@@ -3118,7 +3124,7 @@ void lock_acquired(struct lockdep_map *lock)
3118 raw_local_irq_save(flags); 3124 raw_local_irq_save(flags);
3119 check_flags(flags); 3125 check_flags(flags);
3120 current->lockdep_recursion = 1; 3126 current->lockdep_recursion = 1;
3121 __lock_acquired(lock); 3127 __lock_acquired(lock, ip);
3122 current->lockdep_recursion = 0; 3128 current->lockdep_recursion = 0;
3123 raw_local_irq_restore(flags); 3129 raw_local_irq_restore(flags);
3124} 3130}
@@ -3442,7 +3448,6 @@ retry:
3442 if (unlock) 3448 if (unlock)
3443 read_unlock(&tasklist_lock); 3449 read_unlock(&tasklist_lock);
3444} 3450}
3445
3446EXPORT_SYMBOL_GPL(debug_show_all_locks); 3451EXPORT_SYMBOL_GPL(debug_show_all_locks);
3447 3452
3448/* 3453/*
@@ -3463,7 +3468,6 @@ void debug_show_held_locks(struct task_struct *task)
3463{ 3468{
3464 __debug_show_held_locks(task); 3469 __debug_show_held_locks(task);
3465} 3470}
3466
3467EXPORT_SYMBOL_GPL(debug_show_held_locks); 3471EXPORT_SYMBOL_GPL(debug_show_held_locks);
3468 3472
3469void lockdep_sys_exit(void) 3473void lockdep_sys_exit(void)
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
diff --git a/kernel/panic.c b/kernel/panic.c
index 4d5088355bfe..13f06349a786 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -21,6 +21,7 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h> 22#include <linux/random.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/dmi.h>
24 25
25int panic_on_oops; 26int panic_on_oops;
26static unsigned long tainted_mask; 27static unsigned long tainted_mask;
@@ -321,36 +322,27 @@ void oops_exit(void)
321} 322}
322 323
323#ifdef WANT_WARN_ON_SLOWPATH 324#ifdef WANT_WARN_ON_SLOWPATH
324void warn_on_slowpath(const char *file, int line)
325{
326 char function[KSYM_SYMBOL_LEN];
327 unsigned long caller = (unsigned long) __builtin_return_address(0);
328 sprint_symbol(function, caller);
329
330 printk(KERN_WARNING "------------[ cut here ]------------\n");
331 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
332 line, function);
333 print_modules();
334 dump_stack();
335 print_oops_end_marker();
336 add_taint(TAINT_WARN);
337}
338EXPORT_SYMBOL(warn_on_slowpath);
339
340
341void warn_slowpath(const char *file, int line, const char *fmt, ...) 325void warn_slowpath(const char *file, int line, const char *fmt, ...)
342{ 326{
343 va_list args; 327 va_list args;
344 char function[KSYM_SYMBOL_LEN]; 328 char function[KSYM_SYMBOL_LEN];
345 unsigned long caller = (unsigned long)__builtin_return_address(0); 329 unsigned long caller = (unsigned long)__builtin_return_address(0);
330 const char *board;
331
346 sprint_symbol(function, caller); 332 sprint_symbol(function, caller);
347 333
348 printk(KERN_WARNING "------------[ cut here ]------------\n"); 334 printk(KERN_WARNING "------------[ cut here ]------------\n");
349 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 335 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
350 line, function); 336 line, function);
351 va_start(args, fmt); 337 board = dmi_get_system_info(DMI_PRODUCT_NAME);
352 vprintk(fmt, args); 338 if (board)
353 va_end(args); 339 printk(KERN_WARNING "Hardware name: %s\n", board);
340
341 if (fmt) {
342 va_start(args, fmt);
343 vprintk(fmt, args);
344 va_end(args);
345 }
354 346
355 print_modules(); 347 print_modules();
356 dump_stack(); 348 dump_stack();
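The reworked warn_slowpath() adds a "Hardware name:" line from the DMI product string when one is available, and treats the format string as optional; that NULL-fmt path is what allows the separate warn_on_slowpath() above to be deleted, since message-less warnings can call the same function with fmt == NULL. A small sketch of the optional-message pattern (ordinary C, made-up names):

/* One warning helper serving both "no message" and "formatted message"
 * callers: a NULL fmt simply skips the vararg handling. */
#include <stdarg.h>
#include <stdio.h>

static void warn_demo(const char *file, int line, const char *fmt, ...)
{
        va_list args;

        fprintf(stderr, "WARNING: at %s:%d\n", file, line);
        if (fmt) {                              /* the message is optional */
                va_start(args, fmt);
                vfprintf(stderr, fmt, args);
                va_end(args);
        }
}

int main(void)
{
        warn_demo(__FILE__, __LINE__, NULL);              /* WARN_ON() style */
        warn_demo(__FILE__, __LINE__, "count=%d\n", 42);  /* WARN() style */
        return 0;
}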
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a831de..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
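The rewritten thread_group_cputime() takes one snapshot of tsk->signal->cputime.totals, falls back to the task's own counters when the per-CPU totals have not been allocated, and otherwise folds in every CPU's slot. The same flow in miniature, with a plain array standing in for the kernel's per-CPU allocator (all names here are illustrative):

/* Sum per-CPU time totals, or fall back to a single task's numbers when
 * per-CPU accounting is not set up. */
#include <stdio.h>

#define NCPUS 4

struct cputime { unsigned long long utime, stime; };

static void sum_cputime(const struct cputime *totals,
                        const struct cputime *fallback, struct cputime *out)
{
        int cpu;

        if (!totals) {                  /* accounting not enabled yet */
                *out = *fallback;
                return;
        }
        out->utime = out->stime = 0;
        for (cpu = 0; cpu < NCPUS; cpu++) {
                out->utime += totals[cpu].utime;
                out->stime += totals[cpu].stime;
        }
}

int main(void)
{
        struct cputime per_cpu[NCPUS] = { {1, 2}, {3, 4}, {5, 6}, {7, 8} };
        struct cputime own = { 9, 9 }, sum;

        sum_cputime(per_cpu, &own, &sum);
        printf("utime=%llu stime=%llu\n", sum.utime, sum.stime);   /* 16 and 20 */
        return 0;
}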
diff --git a/kernel/printk.c b/kernel/printk.c
index f492f1583d77..e651ab05655f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -662,7 +662,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
662 if (recursion_bug) { 662 if (recursion_bug) {
663 recursion_bug = 0; 663 recursion_bug = 0;
664 strcpy(printk_buf, recursion_bug_msg); 664 strcpy(printk_buf, recursion_bug_msg);
665 printed_len = sizeof(recursion_bug_msg); 665 printed_len = strlen(recursion_bug_msg);
666 } 666 }
667 /* Emit the output into the temporary buffer */ 667 /* Emit the output into the temporary buffer */
668 printed_len += vscnprintf(printk_buf + printed_len, 668 printed_len += vscnprintf(printk_buf + printed_len,
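The one-line printk fix matters because recursion_bug_msg is a string held in a char array: sizeof() counts the terminating NUL while strlen() does not, so the old printed_len left that NUL inside the "already written" region and whatever was appended behind it was silently discarded. A stand-alone illustration (the message text is a stand-in):

/* sizeof() vs strlen() for an array initialized from a string literal, and
 * why appending at strlen() keeps the buffer one contiguous C string. */
#include <stdio.h>
#include <string.h>

static const char msg[] = "BUG: recent printk recursion!\n";

int main(void)
{
        char buf[128];
        size_t len = strlen(msg);       /* excludes the trailing NUL */

        printf("strlen(msg)=%zu sizeof(msg)=%zu\n", len, sizeof(msg));
        strcpy(buf, msg);
        snprintf(buf + len, sizeof(buf) - len, "the message being logged\n");
        fputs(buf, stdout);             /* both lines appear */
        return 0;
}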
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..e503a002f330 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 59236e8b9daa..04982659875a 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -551,6 +551,16 @@ void rcu_irq_exit(void)
551 } 551 }
552} 552}
553 553
554void rcu_nmi_enter(void)
555{
556 rcu_irq_enter();
557}
558
559void rcu_nmi_exit(void)
560{
561 rcu_irq_exit();
562}
563
554static void dyntick_save_progress_counter(int cpu) 564static void dyntick_save_progress_counter(int cpu)
555{ 565{
556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); 566 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 35c2d3360ecf..7c2665cac172 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
149 sp->done_length += cp->done_length; 149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add; 150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove; 151 sp->done_remove += cp->done_remove;
152 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); 152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks; 153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_set(&sp->rcu_try_flip_1, 154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 atomic_read(&cp->rcu_try_flip_1)); 155 &sp->rcu_try_flip_1);
156 atomic_set(&sp->rcu_try_flip_e1, 156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 atomic_read(&cp->rcu_try_flip_e1)); 157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; 158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; 159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; 160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
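The rcupreempt_trace_sum() change fixes the aggregation itself: atomic_set(&sum, atomic_read(&cpu_counter)) overwrites the running total on each pass, so the reported sum was really just the last CPU's counter, whereas atomic_add() accumulates every CPU's contribution. The arithmetic difference in miniature (plain ints standing in for atomic_t):

/* "set" keeps only the last value folded in; "add" accumulates them all. */
#include <stdio.h>

int main(void)
{
        int per_cpu[4] = { 3, 5, 7, 11 };
        int sum_set = 0, sum_add = 0;
        int cpu;

        for (cpu = 0; cpu < 4; cpu++) {
                sum_set = per_cpu[cpu];         /* atomic_set()-style: overwrites */
                sum_add += per_cpu[cpu];        /* atomic_add()-style: accumulates */
        }
        printf("set-style total: %d, add-style total: %d\n", sum_set, sum_add);
        return 0;                               /* prints 11 vs 26 */
}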
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 85cb90588a55..b31065522104 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -39,6 +39,7 @@
39#include <linux/moduleparam.h> 39#include <linux/moduleparam.h>
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/reboot.h>
42#include <linux/freezer.h> 43#include <linux/freezer.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/delay.h> 45#include <linux/delay.h>
@@ -108,7 +109,6 @@ struct rcu_torture {
108 int rtort_mbtest; 109 int rtort_mbtest;
109}; 110};
110 111
111static int fullstop = 0; /* stop generating callbacks at test end. */
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current = NULL;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version = 0;
@@ -136,6 +136,30 @@ static int stutter_pause_test = 0;
136#endif 136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138 138
139#define FULLSTOP_SIGNALED 1 /* Bail due to signal. */
140#define FULLSTOP_CLEANUP 2 /* Orderly shutdown. */
141static int fullstop; /* stop generating callbacks at test end. */
142DEFINE_MUTEX(fullstop_mutex); /* protect fullstop transitions and */
143 /* spawning of kthreads. */
144
145/*
146 * Detect and respond to a signal-based shutdown.
147 */
148static int
149rcutorture_shutdown_notify(struct notifier_block *unused1,
150 unsigned long unused2, void *unused3)
151{
152 if (fullstop)
153 return NOTIFY_DONE;
154 if (signal_pending(current)) {
155 mutex_lock(&fullstop_mutex);
156 if (!ACCESS_ONCE(fullstop))
157 fullstop = FULLSTOP_SIGNALED;
158 mutex_unlock(&fullstop_mutex);
159 }
160 return NOTIFY_DONE;
161}
162
139/* 163/*
140 * Allocate an element from the rcu_tortures pool. 164 * Allocate an element from the rcu_tortures pool.
141 */ 165 */
@@ -199,11 +223,12 @@ rcu_random(struct rcu_random_state *rrsp)
199static void 223static void
200rcu_stutter_wait(void) 224rcu_stutter_wait(void)
201{ 225{
202 while (stutter_pause_test || !rcutorture_runnable) 226 while ((stutter_pause_test || !rcutorture_runnable) && !fullstop) {
203 if (rcutorture_runnable) 227 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1); 228 schedule_timeout_interruptible(1);
205 else 229 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ)); 230 schedule_timeout_interruptible(round_jiffies_relative(HZ));
231 }
207} 232}
208 233
209/* 234/*
@@ -599,7 +624,7 @@ rcu_torture_writer(void *arg)
599 rcu_stutter_wait(); 624 rcu_stutter_wait();
600 } while (!kthread_should_stop() && !fullstop); 625 } while (!kthread_should_stop() && !fullstop);
601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
602 while (!kthread_should_stop()) 627 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
603 schedule_timeout_uninterruptible(1); 628 schedule_timeout_uninterruptible(1);
604 return 0; 629 return 0;
605} 630}
@@ -624,7 +649,7 @@ rcu_torture_fakewriter(void *arg)
624 } while (!kthread_should_stop() && !fullstop); 649 } while (!kthread_should_stop() && !fullstop);
625 650
626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 651 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
627 while (!kthread_should_stop()) 652 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
628 schedule_timeout_uninterruptible(1); 653 schedule_timeout_uninterruptible(1);
629 return 0; 654 return 0;
630} 655}
@@ -734,7 +759,7 @@ rcu_torture_reader(void *arg)
734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 759 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable) 760 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t); 761 del_timer_sync(&t);
737 while (!kthread_should_stop()) 762 while (!kthread_should_stop() && fullstop != FULLSTOP_SIGNALED)
738 schedule_timeout_uninterruptible(1); 763 schedule_timeout_uninterruptible(1);
739 return 0; 764 return 0;
740} 765}
@@ -831,7 +856,7 @@ rcu_torture_stats(void *arg)
831 do { 856 do {
832 schedule_timeout_interruptible(stat_interval * HZ); 857 schedule_timeout_interruptible(stat_interval * HZ);
833 rcu_torture_stats_print(); 858 rcu_torture_stats_print();
834 } while (!kthread_should_stop()); 859 } while (!kthread_should_stop() && !fullstop);
835 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); 860 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
836 return 0; 861 return 0;
837} 862}
@@ -899,7 +924,7 @@ rcu_torture_shuffle(void *arg)
899 do { 924 do {
900 schedule_timeout_interruptible(shuffle_interval * HZ); 925 schedule_timeout_interruptible(shuffle_interval * HZ);
901 rcu_torture_shuffle_tasks(); 926 rcu_torture_shuffle_tasks();
902 } while (!kthread_should_stop()); 927 } while (!kthread_should_stop() && !fullstop);
903 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); 928 VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
904 return 0; 929 return 0;
905} 930}
@@ -914,10 +939,10 @@ rcu_torture_stutter(void *arg)
914 do { 939 do {
915 schedule_timeout_interruptible(stutter * HZ); 940 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1; 941 stutter_pause_test = 1;
917 if (!kthread_should_stop()) 942 if (!kthread_should_stop() && !fullstop)
918 schedule_timeout_interruptible(stutter * HZ); 943 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0; 944 stutter_pause_test = 0;
920 } while (!kthread_should_stop()); 945 } while (!kthread_should_stop() && !fullstop);
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); 946 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0; 947 return 0;
923} 948}
@@ -934,12 +959,27 @@ rcu_torture_print_module_parms(char *tag)
934 stutter, irqreader); 959 stutter, irqreader);
935} 960}
936 961
962static struct notifier_block rcutorture_nb = {
963 .notifier_call = rcutorture_shutdown_notify,
964};
965
937static void 966static void
938rcu_torture_cleanup(void) 967rcu_torture_cleanup(void)
939{ 968{
940 int i; 969 int i;
941 970
942 fullstop = 1; 971 mutex_lock(&fullstop_mutex);
972 if (!fullstop) {
973 /* If being signaled, let it happen, then exit. */
974 mutex_unlock(&fullstop_mutex);
975 schedule_timeout_interruptible(10 * HZ);
976 if (cur_ops->cb_barrier != NULL)
977 cur_ops->cb_barrier();
978 return;
979 }
980 fullstop = FULLSTOP_CLEANUP;
981 mutex_unlock(&fullstop_mutex);
982 unregister_reboot_notifier(&rcutorture_nb);
943 if (stutter_task) { 983 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 984 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task); 985 kthread_stop(stutter_task);
@@ -1015,6 +1055,8 @@ rcu_torture_init(void)
1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1055 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1016 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1056 &srcu_ops, &sched_ops, &sched_ops_sync, };
1017 1057
1058 mutex_lock(&fullstop_mutex);
1059
1018 /* Process args and tell the world that the torturer is on the job. */ 1060 /* Process args and tell the world that the torturer is on the job. */
1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1061 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
1020 cur_ops = torture_ops[i]; 1062 cur_ops = torture_ops[i];
@@ -1024,6 +1066,7 @@ rcu_torture_init(void)
1024 if (i == ARRAY_SIZE(torture_ops)) { 1066 if (i == ARRAY_SIZE(torture_ops)) {
1025 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1067 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1026 torture_type); 1068 torture_type);
1069 mutex_unlock(&fullstop_mutex);
1027 return (-EINVAL); 1070 return (-EINVAL);
1028 } 1071 }
1029 if (cur_ops->init) 1072 if (cur_ops->init)
@@ -1146,9 +1189,12 @@ rcu_torture_init(void)
1146 goto unwind; 1189 goto unwind;
1147 } 1190 }
1148 } 1191 }
1192 register_reboot_notifier(&rcutorture_nb);
1193 mutex_unlock(&fullstop_mutex);
1149 return 0; 1194 return 0;
1150 1195
1151unwind: 1196unwind:
1197 mutex_unlock(&fullstop_mutex);
1152 rcu_torture_cleanup(); 1198 rcu_torture_cleanup();
1153 return firsterr; 1199 return firsterr;
1154} 1200}
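Taken together, the rcutorture changes make shutdown orderly: fullstop becomes a small state machine (FULLSTOP_CLEANUP for a normal cleanup, FULLSTOP_SIGNALED when a signal-driven reboot is underway) protected by fullstop_mutex, every kthread loop now polls it, and the reboot notifier marks the signalled case so the torture threads stop generating callbacks into a system that is going away. The underlying pattern is a single asynchronously-set stop flag polled by worker loops; a user-space sketch in which a signal handler plays the role of the notifier:

/* A worker loop that winds down once an asynchronous event sets the
 * shared stop flag. */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t fullstop;

static void shutdown_handler(int sig)
{
        (void)sig;
        fullstop = 1;                   /* analogous to FULLSTOP_SIGNALED */
}

int main(void)
{
        signal(SIGINT, shutdown_handler);
        while (!fullstop)               /* worker: do a unit of work, recheck */
                sleep(1);
        puts("fullstop set, draining and exiting");
        return 0;
}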
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 000000000000..a342b032112c
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1535 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
23 *
24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 *
27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU
29 */
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/init.h>
33#include <linux/spinlock.h>
34#include <linux/smp.h>
35#include <linux/rcupdate.h>
36#include <linux/interrupt.h>
37#include <linux/sched.h>
38#include <asm/atomic.h>
39#include <linux/bitops.h>
40#include <linux/module.h>
41#include <linux/completion.h>
42#include <linux/moduleparam.h>
43#include <linux/percpu.h>
44#include <linux/notifier.h>
45#include <linux/cpu.h>
46#include <linux/mutex.h>
47#include <linux/time.h>
48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map =
52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
53EXPORT_SYMBOL_GPL(rcu_lock_map);
54#endif
55
56/* Data structures. */
57
58#define RCU_STATE_INITIALIZER(name) { \
59 .level = { &name.node[0] }, \
60 .levelcnt = { \
61 NUM_RCU_LVL_0, /* root of hierarchy. */ \
62 NUM_RCU_LVL_1, \
63 NUM_RCU_LVL_2, \
64 NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
65 }, \
66 .signaled = RCU_SIGNAL_INIT, \
67 .gpnum = -300, \
68 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
71 .n_force_qs = 0, \
72 .n_force_qs_ngp = 0, \
73}
74
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data);
77
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80
81#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
83#endif /* #ifdef CONFIG_NO_HZ */
84
85static int blimit = 10; /* Maximum callbacks per softirq. */
86static int qhimark = 10000; /* If this many pending, ignore blimit. */
87static int qlowmark = 100; /* Once only this many pending, use blimit. */
88
89static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
90
91/*
92 * Return the number of RCU batches processed thus far for debug & stats.
93 */
94long rcu_batches_completed(void)
95{
96 return rcu_state.completed;
97}
98EXPORT_SYMBOL_GPL(rcu_batches_completed);
99
100/*
101 * Return the number of RCU BH batches processed thus far for debug & stats.
102 */
103long rcu_batches_completed_bh(void)
104{
105 return rcu_bh_state.completed;
106}
107EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
108
109/*
110 * Does the CPU have callbacks ready to be invoked?
111 */
112static int
113cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
114{
115 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
116}
117
118/*
119 * Does the current CPU require an as-yet-unscheduled grace period?
120 */
121static int
122cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
123{
124 /* ACCESS_ONCE() because we are accessing outside of lock. */
125 return *rdp->nxttail[RCU_DONE_TAIL] &&
126 ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
127}
128
129/*
130 * Return the root node of the specified rcu_state structure.
131 */
132static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
133{
134 return &rsp->node[0];
135}
136
137#ifdef CONFIG_SMP
138
139/*
140 * If the specified CPU is offline, tell the caller that it is in
141 * a quiescent state. Otherwise, whack it with a reschedule IPI.
142 * Grace periods can end up waiting on an offline CPU when that
143 * CPU is in the process of coming online -- it will be added to the
144 * rcu_node bitmasks before it actually makes it online. The same thing
145 * can happen while a CPU is in the process of coming online. Because this
146 * race is quite rare, we check for it after detecting that the grace
147 * period has been delayed rather than checking each and every CPU
148 * each and every time we start a new grace period.
149 */
150static int rcu_implicit_offline_qs(struct rcu_data *rdp)
151{
152 /*
153 * If the CPU is offline, it is in a quiescent state. We can
154 * trust its state not to change because interrupts are disabled.
155 */
156 if (cpu_is_offline(rdp->cpu)) {
157 rdp->offline_fqs++;
158 return 1;
159 }
160
161 /* The CPU is online, so send it a reschedule IPI. */
162 if (rdp->cpu != smp_processor_id())
163 smp_send_reschedule(rdp->cpu);
164 else
165 set_need_resched();
166 rdp->resched_ipi++;
167 return 0;
168}
169
170#endif /* #ifdef CONFIG_SMP */
171
172#ifdef CONFIG_NO_HZ
173static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
174
175/**
176 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
177 *
178 * Enter nohz mode, in other words, -leave- the mode in which RCU
179 * read-side critical sections can occur. (Though RCU read-side
180 * critical sections can occur in irq handlers in nohz mode, a possibility
181 * handled by rcu_irq_enter() and rcu_irq_exit()).
182 */
183void rcu_enter_nohz(void)
184{
185 unsigned long flags;
186 struct rcu_dynticks *rdtp;
187
188 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
189 local_irq_save(flags);
190 rdtp = &__get_cpu_var(rcu_dynticks);
191 rdtp->dynticks++;
192 rdtp->dynticks_nesting--;
193 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
194 local_irq_restore(flags);
195}
196
197/*
198 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
199 *
200 * Exit nohz mode, in other words, -enter- the mode in which RCU
201 * read-side critical sections normally occur.
202 */
203void rcu_exit_nohz(void)
204{
205 unsigned long flags;
206 struct rcu_dynticks *rdtp;
207
208 local_irq_save(flags);
209 rdtp = &__get_cpu_var(rcu_dynticks);
210 rdtp->dynticks++;
211 rdtp->dynticks_nesting++;
212 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
213 local_irq_restore(flags);
214 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
215}
216
217/**
218 * rcu_nmi_enter - inform RCU of entry to NMI context
219 *
220 * If the CPU was idle with dynamic ticks active, and there is no
221 * irq handler running, this updates rdtp->dynticks_nmi to let the
222 * RCU grace-period handling know that the CPU is active.
223 */
224void rcu_nmi_enter(void)
225{
226 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
227
228 if (rdtp->dynticks & 0x1)
229 return;
230 rdtp->dynticks_nmi++;
231 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
232 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
233}
234
235/**
236 * rcu_nmi_exit - inform RCU of exit from NMI context
237 *
238 * If the CPU was idle with dynamic ticks active, and there is no
239 * irq handler running, this updates rdtp->dynticks_nmi to let the
240 * RCU grace-period handling know that the CPU is no longer active.
241 */
242void rcu_nmi_exit(void)
243{
244 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
245
246 if (rdtp->dynticks & 0x1)
247 return;
248 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
249 rdtp->dynticks_nmi++;
250 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
251}
252
253/**
254 * rcu_irq_enter - inform RCU of entry to hard irq context
255 *
256 * If the CPU was idle with dynamic ticks active, this updates the
257 * rdtp->dynticks to let the RCU handling know that the CPU is active.
258 */
259void rcu_irq_enter(void)
260{
261 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
262
263 if (rdtp->dynticks_nesting++)
264 return;
265 rdtp->dynticks++;
266 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
267 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
268}
269
270/**
271 * rcu_irq_exit - inform RCU of exit from hard irq context
272 *
273 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
274 * to let the RCU handling know that the CPU is going back to idle
275 * with no ticks.
276 */
277void rcu_irq_exit(void)
278{
279 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
280
281 if (--rdtp->dynticks_nesting)
282 return;
283 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
284 rdtp->dynticks++;
285 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
286
287 /* If the interrupt queued a callback, get out of dyntick mode. */
288 if (__get_cpu_var(rcu_data).nxtlist ||
289 __get_cpu_var(rcu_bh_data).nxtlist)
290 set_need_resched();
291}
292
293/*
294 * Record the specified "completed" value, which is later used to validate
295 * dynticks counter manipulations. Specify "rsp->completed - 1" to
296 * unconditionally invalidate any future dynticks manipulations (which is
297 * useful at the beginning of a grace period).
298 */
299static void dyntick_record_completed(struct rcu_state *rsp, long comp)
300{
301 rsp->dynticks_completed = comp;
302}
303
304#ifdef CONFIG_SMP
305
306/*
307 * Recall the previously recorded value of the completion for dynticks.
308 */
309static long dyntick_recall_completed(struct rcu_state *rsp)
310{
311 return rsp->dynticks_completed;
312}
313
314/*
315 * Snapshot the specified CPU's dynticks counter so that we can later
316 * credit them with an implicit quiescent state. Return 1 if this CPU
317 * is already in a quiescent state courtesy of dynticks idle mode.
318 */
319static int dyntick_save_progress_counter(struct rcu_data *rdp)
320{
321 int ret;
322 int snap;
323 int snap_nmi;
324
325 snap = rdp->dynticks->dynticks;
326 snap_nmi = rdp->dynticks->dynticks_nmi;
327 smp_mb(); /* Order sampling of snap with end of grace period. */
328 rdp->dynticks_snap = snap;
329 rdp->dynticks_nmi_snap = snap_nmi;
330 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
331 if (ret)
332 rdp->dynticks_fqs++;
333 return ret;
334}
335
336/*
337 * Return true if the specified CPU has passed through a quiescent
338 * state by virtue of being in or having passed through a dynticks
339 * idle state since the last call to dyntick_save_progress_counter()
340 * for this same CPU.
341 */
342static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
343{
344 long curr;
345 long curr_nmi;
346 long snap;
347 long snap_nmi;
348
349 curr = rdp->dynticks->dynticks;
350 snap = rdp->dynticks_snap;
351 curr_nmi = rdp->dynticks->dynticks_nmi;
352 snap_nmi = rdp->dynticks_nmi_snap;
353 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
354
355 /*
356 * If the CPU passed through or entered a dynticks idle phase with
357 * no active irq/NMI handlers, then we can safely pretend that the CPU
358 * already acknowledged the request to pass through a quiescent
359 * state. Either way, that CPU cannot possibly be in an RCU
360 * read-side critical section that started before the beginning
361 * of the current RCU grace period.
362 */
363 if ((curr != snap || (curr & 0x1) == 0) &&
364 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
365 rdp->dynticks_fqs++;
366 return 1;
367 }
368
369 /* Go check for the CPU being offline. */
370 return rcu_implicit_offline_qs(rdp);
371}
372
373#endif /* #ifdef CONFIG_SMP */
374
375#else /* #ifdef CONFIG_NO_HZ */
376
377static void dyntick_record_completed(struct rcu_state *rsp, long comp)
378{
379}
380
381#ifdef CONFIG_SMP
382
383/*
384 * If there are no dynticks, then the only way that a CPU can passively
385 * be in a quiescent state is to be offline. Unlike dynticks idle, which
386 * is a point in time during the prior (already finished) grace period,
387 * an offline CPU is always in a quiescent state, and that state can thus
388 * be applied unconditionally. So just return the current value of completed.
389 */
390static long dyntick_recall_completed(struct rcu_state *rsp)
391{
392 return rsp->completed;
393}
394
395static int dyntick_save_progress_counter(struct rcu_data *rdp)
396{
397 return 0;
398}
399
400static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
401{
402 return rcu_implicit_offline_qs(rdp);
403}
404
405#endif /* #ifdef CONFIG_SMP */
406
407#endif /* #else #ifdef CONFIG_NO_HZ */
408
409#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
410
411static void record_gp_stall_check_time(struct rcu_state *rsp)
412{
413 rsp->gp_start = jiffies;
414 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
415}
416
417static void print_other_cpu_stall(struct rcu_state *rsp)
418{
419 int cpu;
420 long delta;
421 unsigned long flags;
422 struct rcu_node *rnp = rcu_get_root(rsp);
423 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
424 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
425
426 /* Only let one CPU complain about others per time interval. */
427
428 spin_lock_irqsave(&rnp->lock, flags);
429 delta = jiffies - rsp->jiffies_stall;
430 if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
431 spin_unlock_irqrestore(&rnp->lock, flags);
432 return;
433 }
434 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
435 spin_unlock_irqrestore(&rnp->lock, flags);
436
437 /* OK, time to rat on our buddy... */
438
439 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
440 for (; rnp_cur < rnp_end; rnp_cur++) {
441 if (rnp_cur->qsmask == 0)
442 continue;
443 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
444 if (rnp_cur->qsmask & (1UL << cpu))
445 printk(" %d", rnp_cur->grplo + cpu);
446 }
447 printk(" (detected by %d, t=%ld jiffies)\n",
448 smp_processor_id(), (long)(jiffies - rsp->gp_start));
449 force_quiescent_state(rsp, 0); /* Kick them all. */
450}
451
452static void print_cpu_stall(struct rcu_state *rsp)
453{
454 unsigned long flags;
455 struct rcu_node *rnp = rcu_get_root(rsp);
456
457 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
458 smp_processor_id(), jiffies - rsp->gp_start);
459 dump_stack();
460 spin_lock_irqsave(&rnp->lock, flags);
461 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
462 rsp->jiffies_stall =
463 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
464 spin_unlock_irqrestore(&rnp->lock, flags);
465 set_need_resched(); /* kick ourselves to get things going. */
466}
467
468static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
469{
470 long delta;
471 struct rcu_node *rnp;
472
473 delta = jiffies - rsp->jiffies_stall;
474 rnp = rdp->mynode;
475 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
476
477 /* We haven't checked in, so go dump stack. */
478 print_cpu_stall(rsp);
479
480 } else if (rsp->gpnum != rsp->completed &&
481 delta >= RCU_STALL_RAT_DELAY) {
482
483 /* They had two time units to dump stack, so complain. */
484 print_other_cpu_stall(rsp);
485 }
486}
487
488#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
489
490static void record_gp_stall_check_time(struct rcu_state *rsp)
491{
492}
493
494static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
495{
496}
497
498#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
499
500/*
501 * Update CPU-local rcu_data state to record the newly noticed grace period.
502 * This is used both when we started the grace period and when we notice
503 * that someone else started the grace period.
504 */
505static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
506{
507 rdp->qs_pending = 1;
508 rdp->passed_quiesc = 0;
509 rdp->gpnum = rsp->gpnum;
510 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
511 RCU_JIFFIES_TILL_FORCE_QS;
512}
513
514/*
515 * Did someone else start a new RCU grace period since we last
516 * checked? Update local state appropriately if so. Must be called
517 * on the CPU corresponding to rdp.
518 */
519static int
520check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
521{
522 unsigned long flags;
523 int ret = 0;
524
525 local_irq_save(flags);
526 if (rdp->gpnum != rsp->gpnum) {
527 note_new_gpnum(rsp, rdp);
528 ret = 1;
529 }
530 local_irq_restore(flags);
531 return ret;
532}
533
534/*
535 * Start a new RCU grace period if warranted, re-initializing the hierarchy
536 * in preparation for detecting the next grace period. The caller must hold
537 * the root node's ->lock, which is released before return. Hard irqs must
538 * be disabled.
539 */
540static void
541rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
542 __releases(rcu_get_root(rsp)->lock)
543{
544 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
545 struct rcu_node *rnp = rcu_get_root(rsp);
546 struct rcu_node *rnp_cur;
547 struct rcu_node *rnp_end;
548
549 if (!cpu_needs_another_gp(rsp, rdp)) {
550 spin_unlock_irqrestore(&rnp->lock, flags);
551 return;
552 }
553
554 /* Advance to a new grace period and initialize state. */
555 rsp->gpnum++;
556 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
557 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
558 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
559 RCU_JIFFIES_TILL_FORCE_QS;
560 record_gp_stall_check_time(rsp);
561 dyntick_record_completed(rsp, rsp->completed - 1);
562 note_new_gpnum(rsp, rdp);
563
564 /*
565 * Because we are first, we know that all our callbacks will
566 * be covered by this upcoming grace period, even the ones
567 * that were registered arbitrarily recently.
568 */
569 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
570 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
571
572 /* Special-case the common single-level case. */
573 if (NUM_RCU_NODES == 1) {
574 rnp->qsmask = rnp->qsmaskinit;
575 spin_unlock_irqrestore(&rnp->lock, flags);
576 return;
577 }
578
579 spin_unlock(&rnp->lock); /* leave irqs disabled. */
580
581
582 /* Exclude any concurrent CPU-hotplug operations. */
583 spin_lock(&rsp->onofflock); /* irqs already disabled. */
584
585 /*
586 * Set the quiescent-state-needed bits in all the non-leaf RCU
587 * nodes for all currently online CPUs. This operation relies
588 * on the layout of the hierarchy within the rsp->node[] array.
589 * Note that other CPUs will access only the leaves of the
590 * hierarchy, which still indicate that no grace period is in
591 * progress. In addition, we have excluded CPU-hotplug operations.
592 *
593 * We therefore do not need to hold any locks. Any required
594 * memory barriers will be supplied by the locks guarding the
595 * leaf rcu_nodes in the hierarchy.
596 */
597
598 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
599 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
600 rnp_cur->qsmask = rnp_cur->qsmaskinit;
601
602 /*
603 * Now set up the leaf nodes. Here we must be careful. First,
604 * we need to hold the lock in order to exclude other CPUs, which
605 * might be contending for the leaf nodes' locks. Second, as
606 * soon as we initialize a given leaf node, its CPUs might run
607 * up the rest of the hierarchy. We must therefore acquire locks
608 * for each node that we touch during this stage. (But we still
609 * are excluding CPU-hotplug operations.)
610 *
611 * Note that the grace period cannot complete until we finish
612 * the initialization process, as there will be at least one
613 * qsmask bit set in the root node until that time, namely the
614 * one corresponding to this CPU.
615 */
616 rnp_end = &rsp->node[NUM_RCU_NODES];
617 rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
618 for (; rnp_cur < rnp_end; rnp_cur++) {
619 spin_lock(&rnp_cur->lock); /* irqs already disabled. */
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */
622 }
623
624 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
625 spin_unlock_irqrestore(&rsp->onofflock, flags);
626}
627
628/*
629 * Advance this CPU's callbacks, but only if the current grace period
630 * has ended. This may be called only from the CPU to whom the rdp
631 * belongs.
632 */
633static void
634rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
635{
636 long completed_snap;
637 unsigned long flags;
638
639 local_irq_save(flags);
640 completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
641
642 /* Did another grace period end? */
643 if (rdp->completed != completed_snap) {
644
645 /* Advance callbacks. No harm if list empty. */
646 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
647 rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
648 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
649
650 /* Remember that we saw this grace-period completion. */
651 rdp->completed = completed_snap;
652 }
653 local_irq_restore(flags);
654}
655
656/*
657 * Similar to cpu_quiet(), for which it is a helper function. Allows
658 * a group of CPUs to be quieted at one go, though all the CPUs in the
659 * group must be represented by the same leaf rcu_node structure.
660 * That structure's lock must be held upon entry, and it is released
661 * before return.
662 */
663static void
664cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
665 unsigned long flags)
666 __releases(rnp->lock)
667{
668 /* Walk up the rcu_node hierarchy. */
669 for (;;) {
670 if (!(rnp->qsmask & mask)) {
671
672 /* Our bit has already been cleared, so done. */
673 spin_unlock_irqrestore(&rnp->lock, flags);
674 return;
675 }
676 rnp->qsmask &= ~mask;
677 if (rnp->qsmask != 0) {
678
679 /* Other bits still set at this level, so done. */
680 spin_unlock_irqrestore(&rnp->lock, flags);
681 return;
682 }
683 mask = rnp->grpmask;
684 if (rnp->parent == NULL) {
685
686 /* No more levels. Exit loop holding root lock. */
687
688 break;
689 }
690 spin_unlock_irqrestore(&rnp->lock, flags);
691 rnp = rnp->parent;
692 spin_lock_irqsave(&rnp->lock, flags);
693 }
694
695 /*
696 * Get here if we are the last CPU to pass through a quiescent
697 * state for this grace period. Clean up and let rcu_start_gp()
698 * start up the next grace period if one is needed. Note that
699 * we still hold rnp->lock, as required by rcu_start_gp(), which
700 * will release it.
701 */
702 rsp->completed = rsp->gpnum;
703 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
704 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
705}
706
707/*
708 * Record a quiescent state for the specified CPU, which must either be
709 * the current CPU or an offline CPU. The lastcomp argument is used to
710 * make sure we are still in the grace period of interest. We don't want
711 * to end the current grace period based on quiescent states detected in
712 * an earlier grace period!
713 */
714static void
715cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
716{
717 unsigned long flags;
718 unsigned long mask;
719 struct rcu_node *rnp;
720
721 rnp = rdp->mynode;
722 spin_lock_irqsave(&rnp->lock, flags);
723 if (lastcomp != ACCESS_ONCE(rsp->completed)) {
724
725 /*
726 * Someone beat us to it for this grace period, so leave.
727 * The race with GP start is resolved by the fact that we
728 * hold the leaf rcu_node lock, so that the per-CPU bits
729 * cannot yet be initialized -- so we would simply find our
730 * CPU's bit already cleared in cpu_quiet_msk() if this race
731 * occurred.
732 */
733 rdp->passed_quiesc = 0; /* try again later! */
734 spin_unlock_irqrestore(&rnp->lock, flags);
735 return;
736 }
737 mask = rdp->grpmask;
738 if ((rnp->qsmask & mask) == 0) {
739 spin_unlock_irqrestore(&rnp->lock, flags);
740 } else {
741 rdp->qs_pending = 0;
742
743 /*
744 * This GP can't end until cpu checks in, so all of our
745 * callbacks can be processed during the next GP.
746 */
747 rdp = rsp->rda[smp_processor_id()];
748 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
749
750 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
751 }
752}
753
754/*
755 * Check to see if there is a new grace period of which this CPU
756 * is not yet aware, and if so, set up local rcu_data state for it.
757 * Otherwise, see if this CPU has just passed through its first
758 * quiescent state for this grace period, and record that fact if so.
759 */
760static void
761rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
762{
763 /* If there is now a new grace period, record and return. */
764 if (check_for_new_grace_period(rsp, rdp))
765 return;
766
767 /*
768 * Does this CPU still need to do its part for current grace period?
769 * If no, return and let the other CPUs do their part as well.
770 */
771 if (!rdp->qs_pending)
772 return;
773
774 /*
775 * Was there a quiescent state since the beginning of the grace
776 * period? If no, then exit and wait for the next call.
777 */
778 if (!rdp->passed_quiesc)
779 return;
780
781 /* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
782 cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
783}
784
785#ifdef CONFIG_HOTPLUG_CPU
786
787/*
788 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
789 * and move all callbacks from the outgoing CPU to the current one.
790 */
791static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
792{
793 int i;
794 unsigned long flags;
795 long lastcomp;
796 unsigned long mask;
797 struct rcu_data *rdp = rsp->rda[cpu];
798 struct rcu_data *rdp_me;
799 struct rcu_node *rnp;
800
801 /* Exclude any attempts to start a new grace period. */
802 spin_lock_irqsave(&rsp->onofflock, flags);
803
804 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
805 rnp = rdp->mynode;
806 mask = rdp->grpmask; /* rnp->grplo is constant. */
807 do {
808 spin_lock(&rnp->lock); /* irqs already disabled. */
809 rnp->qsmaskinit &= ~mask;
810 if (rnp->qsmaskinit != 0) {
811 spin_unlock(&rnp->lock); /* irqs already disabled. */
812 break;
813 }
814 mask = rnp->grpmask;
815 spin_unlock(&rnp->lock); /* irqs already disabled. */
816 rnp = rnp->parent;
817 } while (rnp != NULL);
818 lastcomp = rsp->completed;
819
820 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
821
822 /* Being offline is a quiescent state, so go record it. */
823 cpu_quiet(cpu, rsp, rdp, lastcomp);
824
825 /*
826 * Move callbacks from the outgoing CPU to the running CPU.
827 * Note that the outgoing CPU is now quiescent, so it is now
828 * (uncharacteristically) safe to access its rcu_data structure.
829 * Note also that we must carefully retain the order of the
830 * outgoing CPU's callbacks in order for rcu_barrier() to work
831 * correctly. Finally, note that we start all the callbacks
832 * afresh, even those that have passed through a grace period
833 * and are therefore ready to invoke. The theory is that hotplug
834 * events are rare, and that if they are frequent enough to
835 * indefinitely delay callbacks, you have far worse things to
836 * be worrying about.
837 */
838 rdp_me = rsp->rda[smp_processor_id()];
839 if (rdp->nxtlist != NULL) {
840 *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
841 rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
842 rdp->nxtlist = NULL;
843 for (i = 0; i < RCU_NEXT_SIZE; i++)
844 rdp->nxttail[i] = &rdp->nxtlist;
845 rdp_me->qlen += rdp->qlen;
846 rdp->qlen = 0;
847 }
848 local_irq_restore(flags);
849}
850
851/*
852 * Remove the specified CPU from the RCU hierarchy and move any pending
853 * callbacks that it might have to the current CPU. This code assumes
854 * that at least one CPU in the system will remain running at all times.
855 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
856 */
857static void rcu_offline_cpu(int cpu)
858{
859 __rcu_offline_cpu(cpu, &rcu_state);
860 __rcu_offline_cpu(cpu, &rcu_bh_state);
861}
862
863#else /* #ifdef CONFIG_HOTPLUG_CPU */
864
865static void rcu_offline_cpu(int cpu)
866{
867}
868
869#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
870
871/*
872 * Invoke any RCU callbacks that have made it to the end of their grace
873 * period. Throttle as specified by rdp->blimit.
874 */
875static void rcu_do_batch(struct rcu_data *rdp)
876{
877 unsigned long flags;
878 struct rcu_head *next, *list, **tail;
879 int count;
880
881 /* If no callbacks are ready, just return. */
882 if (!cpu_has_callbacks_ready_to_invoke(rdp))
883 return;
884
885 /*
886 * Extract the list of ready callbacks, disabling to prevent
887 * races with call_rcu() from interrupt handlers.
888 */
889 local_irq_save(flags);
890 list = rdp->nxtlist;
891 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
892 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
893 tail = rdp->nxttail[RCU_DONE_TAIL];
894 for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
895 if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
896 rdp->nxttail[count] = &rdp->nxtlist;
897 local_irq_restore(flags);
898
899 /* Invoke callbacks. */
900 count = 0;
901 while (list) {
902 next = list->next;
903 prefetch(next);
904 list->func(list);
905 list = next;
906 if (++count >= rdp->blimit)
907 break;
908 }
909
910 local_irq_save(flags);
911
912 /* Update count, and requeue any remaining callbacks. */
913 rdp->qlen -= count;
914 if (list != NULL) {
915 *tail = rdp->nxtlist;
916 rdp->nxtlist = list;
917 for (count = 0; count < RCU_NEXT_SIZE; count++)
918 if (&rdp->nxtlist == rdp->nxttail[count])
919 rdp->nxttail[count] = tail;
920 else
921 break;
922 }
923
924 /* Reinstate batch limit if we have worked down the excess. */
925 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
926 rdp->blimit = blimit;
927
928 local_irq_restore(flags);
929
930 /* Re-raise the RCU softirq if there are callbacks remaining. */
931 if (cpu_has_callbacks_ready_to_invoke(rdp))
932 raise_softirq(RCU_SOFTIRQ);
933}
934
935/*
936 * Check to see if this CPU is in a non-context-switch quiescent state
937 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
938 * Also schedule the RCU softirq handler.
939 *
940 * This function must be called with hardirqs disabled. It is normally
941 * invoked from the scheduling-clock interrupt. If rcu_pending returns
942 * false, there is no point in invoking rcu_check_callbacks().
943 */
944void rcu_check_callbacks(int cpu, int user)
945{
946 if (user ||
947 (idle_cpu(cpu) && !in_softirq() &&
948 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
949
950 /*
951 * Get here if this CPU took its interrupt from user
952 * mode or from the idle loop, and if this is not a
953 * nested interrupt. In this case, the CPU is in
954 * a quiescent state, so count it.
955 *
956 * No memory barrier is required here because both
957 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference
958 * only CPU-local variables that other CPUs neither
959 * access nor modify, at least not while the corresponding
960 * CPU is online.
961 */
962
963 rcu_qsctr_inc(cpu);
964 rcu_bh_qsctr_inc(cpu);
965
966 } else if (!in_softirq()) {
967
968 /*
969 * Get here if this CPU did not take its interrupt from
970 * softirq, in other words, if it is not interrupting
971 * a rcu_bh read-side critical section. This is an _bh
972 * critical section, so count it.
973 */
974
975 rcu_bh_qsctr_inc(cpu);
976 }
977 raise_softirq(RCU_SOFTIRQ);
978}
979
980#ifdef CONFIG_SMP
981
982/*
983 * Scan the leaf rcu_node structures, processing dyntick state for any that
984 * have not yet encountered a quiescent state, using the function specified.
985 * Returns 1 if the current grace period ends while scanning (possibly
986 * because we made it end).
987 */
988static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
989 int (*f)(struct rcu_data *))
990{
991 unsigned long bit;
992 int cpu;
993 unsigned long flags;
994 unsigned long mask;
995 struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
996 struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
997
998 for (; rnp_cur < rnp_end; rnp_cur++) {
999 mask = 0;
1000 spin_lock_irqsave(&rnp_cur->lock, flags);
1001 if (rsp->completed != lastcomp) {
1002 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1003 return 1;
1004 }
1005 if (rnp_cur->qsmask == 0) {
1006 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1007 continue;
1008 }
1009 cpu = rnp_cur->grplo;
1010 bit = 1;
1011 for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
1012 if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1013 mask |= bit;
1014 }
1015 if (mask != 0 && rsp->completed == lastcomp) {
1016
1017 /* cpu_quiet_msk() releases rnp_cur->lock. */
1018 cpu_quiet_msk(mask, rsp, rnp_cur, flags);
1019 continue;
1020 }
1021 spin_unlock_irqrestore(&rnp_cur->lock, flags);
1022 }
1023 return 0;
1024}
1025
1026/*
1027 * Force quiescent states on reluctant CPUs, and also detect which
1028 * CPUs are in dyntick-idle mode.
1029 */
1030static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1031{
1032 unsigned long flags;
1033 long lastcomp;
1034 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1035 struct rcu_node *rnp = rcu_get_root(rsp);
1036 u8 signaled;
1037
1038 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
1039 return; /* No grace period in progress, nothing to force. */
1040 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
1041 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1042 return; /* Someone else is already on the job. */
1043 }
1044 if (relaxed &&
1045 (long)(rsp->jiffies_force_qs - jiffies) >= 0 &&
1046 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1047 goto unlock_ret; /* no emergency and done recently. */
1048 rsp->n_force_qs++;
1049 spin_lock(&rnp->lock);
1050 lastcomp = rsp->completed;
1051 signaled = rsp->signaled;
1052 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1053 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1054 RCU_JIFFIES_TILL_FORCE_QS;
1055 if (lastcomp == rsp->gpnum) {
1056 rsp->n_force_qs_ngp++;
1057 spin_unlock(&rnp->lock);
1058 goto unlock_ret; /* no GP in progress, time updated. */
1059 }
1060 spin_unlock(&rnp->lock);
1061 switch (signaled) {
1062 case RCU_GP_INIT:
1063
1064 break; /* grace period still initializing, ignore. */
1065
1066 case RCU_SAVE_DYNTICK:
1067
1068 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1069 break; /* So gcc recognizes the dead code. */
1070
1071 /* Record dyntick-idle state. */
1072 if (rcu_process_dyntick(rsp, lastcomp,
1073 dyntick_save_progress_counter))
1074 goto unlock_ret;
1075
1076 /* Update state, record completion counter. */
1077 spin_lock(&rnp->lock);
1078 if (lastcomp == rsp->completed) {
1079 rsp->signaled = RCU_FORCE_QS;
1080 dyntick_record_completed(rsp, lastcomp);
1081 }
1082 spin_unlock(&rnp->lock);
1083 break;
1084
1085 case RCU_FORCE_QS:
1086
1087 /* Check dyntick-idle state, send IPI to laggards. */
1088 if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
1089 rcu_implicit_dynticks_qs))
1090 goto unlock_ret;
1091
1092 /* Leave state in case more forcing is required. */
1093
1094 break;
1095 }
1096unlock_ret:
1097 spin_unlock_irqrestore(&rsp->fqslock, flags);
1098}
1099
1100#else /* #ifdef CONFIG_SMP */
1101
1102static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1103{
1104 set_need_resched();
1105}
1106
1107#endif /* #else #ifdef CONFIG_SMP */
1108
1109/*
1110 * This does the RCU processing work from softirq context for the
1111 * specified rcu_state and rcu_data structures. This may be called
1112 * only from the CPU to whom the rdp belongs.
1113 */
1114static void
1115__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1116{
1117 unsigned long flags;
1118
1119 /*
1120 * If an RCU GP has gone long enough, go check for dyntick
1121 * idle CPUs and, if needed, send resched IPIs.
1122 */
1123 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1124 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1125 force_quiescent_state(rsp, 1);
1126
1127 /*
1128 * Advance callbacks in response to end of earlier grace
1129 * period that some other CPU ended.
1130 */
1131 rcu_process_gp_end(rsp, rdp);
1132
1133 /* Update RCU state based on any recent quiescent states. */
1134 rcu_check_quiescent_state(rsp, rdp);
1135
1136 /* Does this CPU require a not-yet-started grace period? */
1137 if (cpu_needs_another_gp(rsp, rdp)) {
1138 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1139 rcu_start_gp(rsp, flags); /* releases above lock */
1140 }
1141
1142 /* If there are callbacks ready, invoke them. */
1143 rcu_do_batch(rdp);
1144}
1145
1146/*
1147 * Do softirq processing for the current CPU.
1148 */
1149static void rcu_process_callbacks(struct softirq_action *unused)
1150{
1151 /*
1152 * Memory references from any prior RCU read-side critical sections
1153 * executed by the interrupted code must be seen before any RCU
1154 * grace-period manipulations below.
1155 */
1156 smp_mb(); /* See above block comment. */
1157
1158 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
1159 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1160
1161 /*
1162 * Memory references from any later RCU read-side critical sections
1163 * executed by the interrupted code must be seen after any RCU
1164 * grace-period manipulations above.
1165 */
1166 smp_mb(); /* See above block comment. */
1167}
1168
1169static void
1170__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1171 struct rcu_state *rsp)
1172{
1173 unsigned long flags;
1174 struct rcu_data *rdp;
1175
1176 head->func = func;
1177 head->next = NULL;
1178
1179 smp_mb(); /* Ensure RCU update seen before callback registry. */
1180
1181 /*
1182 * Opportunistically note grace-period endings and beginnings.
1183 * Note that we might see a beginning right after we see an
1184 * end, but never vice versa, since this CPU has to pass through
1185 * a quiescent state betweentimes.
1186 */
1187 local_irq_save(flags);
1188 rdp = rsp->rda[smp_processor_id()];
1189 rcu_process_gp_end(rsp, rdp);
1190 check_for_new_grace_period(rsp, rdp);
1191
1192 /* Add the callback to our list. */
1193 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1194 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1195
1196 /* Start a new grace period if one not already started. */
1197 if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
1198 unsigned long nestflag;
1199 struct rcu_node *rnp_root = rcu_get_root(rsp);
1200
1201 spin_lock_irqsave(&rnp_root->lock, nestflag);
1202 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1203 }
1204
1205 /* Force the grace period if too many callbacks or too long waiting. */
1206 if (unlikely(++rdp->qlen > qhimark)) {
1207 rdp->blimit = LONG_MAX;
1208 force_quiescent_state(rsp, 0);
1209 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1210 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1211 force_quiescent_state(rsp, 1);
1212 local_irq_restore(flags);
1213}
1214
1215/*
1216 * Queue an RCU callback for invocation after a grace period.
1217 */
1218void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1219{
1220 __call_rcu(head, func, &rcu_state);
1221}
1222EXPORT_SYMBOL_GPL(call_rcu);
1223
1224/*
1225 * Queue an RCU callback for invocation after a quicker grace period.
1226 */
1227void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1228{
1229 __call_rcu(head, func, &rcu_bh_state);
1230}
1231EXPORT_SYMBOL_GPL(call_rcu_bh);
1232
1233/*
1234 * Check to see if there is any immediate RCU-related work to be done
1235 * by the current CPU, for the specified type of RCU, returning 1 if so.
1236 * The checks are in order of increasing expense: checks that can be
1237 * carried out against CPU-local state are performed first. However,
1238 * we must check for CPU stalls first, else we might not get a chance.
1239 */
1240static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1241{
1242 rdp->n_rcu_pending++;
1243
1244 /* Check for CPU stalls, if enabled. */
1245 check_cpu_stall(rsp, rdp);
1246
1247 /* Is the RCU core waiting for a quiescent state from this CPU? */
1248 if (rdp->qs_pending)
1249 return 1;
1250
1251 /* Does this CPU have callbacks ready to invoke? */
1252 if (cpu_has_callbacks_ready_to_invoke(rdp))
1253 return 1;
1254
1255 /* Has RCU gone idle with this CPU needing another grace period? */
1256 if (cpu_needs_another_gp(rsp, rdp))
1257 return 1;
1258
1259 /* Has another RCU grace period completed? */
1260 if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
1261 return 1;
1262
1263 /* Has a new RCU grace period started? */
1264 if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
1265 return 1;
1266
1267 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1268 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1269 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 ||
1270 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1271 return 1;
1272
1273 /* nothing to do */
1274 return 0;
1275}
1276
1277/*
1278 * Check to see if there is any immediate RCU-related work to be done
1279 * by the current CPU, returning 1 if so. This function is part of the
1280 * RCU implementation; it is -not- an exported member of the RCU API.
1281 */
1282int rcu_pending(int cpu)
1283{
1284 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
1285 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
1286}
1287
1288/*
1289 * Check to see if any future RCU-related work will need to be done
1290 * by the current CPU, even if none need be done immediately, returning
1291 * 1 if so. This function is part of the RCU implementation; it is -not-
1292 * an exported member of the RCU API.
1293 */
1294int rcu_needs_cpu(int cpu)
1295{
1296 /* RCU callbacks either ready or pending? */
1297 return per_cpu(rcu_data, cpu).nxtlist ||
1298 per_cpu(rcu_bh_data, cpu).nxtlist;
1299}
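
For context, these two internal hooks are consulted from the per-tick and NO_HZ idle paths; when rcu_pending() reports immediate work, the caller raises RCU_SOFTIRQ so that rcu_process_callbacks() runs. An illustrative rendering of such a caller, modeled on the tick-time hook rather than copied from this patch:

#include <linux/interrupt.h>	/* raise_softirq(), RCU_SOFTIRQ */

static void example_rcu_tick_check(int cpu)
{
	/* Illustrative only: the real work is deferred to rcu_process_callbacks(). */
	if (rcu_pending(cpu))
		raise_softirq(RCU_SOFTIRQ);
}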
1300
1301/*
1302 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth"
1303 * approach so that we don't have to worry about how long the CPU has
1304 * been gone, or whether it ever was online previously. We do trust the
1305 * ->mynode field, as it is constant for a given struct rcu_data and
1306 * initialized during early boot.
1307 *
1308 * Note that only one online or offline event can be happening at a given
1309 * time. Note also that we can accept some slop in the rsp->completed
1310 * access due to the fact that this CPU cannot possibly have any RCU
1311 * callbacks in flight yet.
1312 */
1313static void
1314rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1315{
1316 unsigned long flags;
1317 int i;
1318 long lastcomp;
1319 unsigned long mask;
1320 struct rcu_data *rdp = rsp->rda[cpu];
1321 struct rcu_node *rnp = rcu_get_root(rsp);
1322
1323 /* Set up local state, ensuring consistent view of global state. */
1324 spin_lock_irqsave(&rnp->lock, flags);
1325 lastcomp = rsp->completed;
1326 rdp->completed = lastcomp;
1327 rdp->gpnum = lastcomp;
1328 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1329 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1330 rdp->beenonline = 1; /* We have now been online. */
1331 rdp->passed_quiesc_completed = lastcomp - 1;
1332 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1333 rdp->nxtlist = NULL;
1334 for (i = 0; i < RCU_NEXT_SIZE; i++)
1335 rdp->nxttail[i] = &rdp->nxtlist;
1336 rdp->qlen = 0;
1337 rdp->blimit = blimit;
1338#ifdef CONFIG_NO_HZ
1339 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1340#endif /* #ifdef CONFIG_NO_HZ */
1341 rdp->cpu = cpu;
1342 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1343
1344 /*
1345 * A new grace period might start here. If so, we won't be part
1346 * of it, but that is OK, as we are currently in a quiescent state.
1347 */
1348
1349 /* Exclude any attempts to start a new GP on large systems. */
1350 spin_lock(&rsp->onofflock); /* irqs already disabled. */
1351
1352 /* Add CPU to rcu_node bitmasks. */
1353 rnp = rdp->mynode;
1354 mask = rdp->grpmask;
1355 do {
1356 /* Exclude any attempts to start a new GP on small systems. */
1357 spin_lock(&rnp->lock); /* irqs already disabled. */
1358 rnp->qsmaskinit |= mask;
1359 mask = rnp->grpmask;
1360 spin_unlock(&rnp->lock); /* irqs already disabled. */
1361 rnp = rnp->parent;
1362 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1363
1364 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
1365
1366 /*
1367 * A new grace period might start here. If so, we will be part of
1368 * it, and its gpnum will be greater than ours, so we will
1369 * participate. It is also possible for the gpnum to have been
1370 * incremented before this function was called, and the bitmasks
1371 * to not be filled out until now, in which case we will also
1372 * participate due to our gpnum being behind.
1373 */
1374
1375 /* Since it is coming online, the CPU is in a quiescent state. */
1376 cpu_quiet(cpu, rsp, rdp, lastcomp);
1377 local_irq_restore(flags);
1378}
1379
1380static void __cpuinit rcu_online_cpu(int cpu)
1381{
1382#ifdef CONFIG_NO_HZ
1383 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1384
1385 rdtp->dynticks_nesting = 1;
1386 rdtp->dynticks |= 1; /* need consecutive #s even for hotplug. */
1387 rdtp->dynticks_nmi = (rdtp->dynticks_nmi + 1) & ~0x1;
1388#endif /* #ifdef CONFIG_NO_HZ */
1389 rcu_init_percpu_data(cpu, &rcu_state);
1390 rcu_init_percpu_data(cpu, &rcu_bh_state);
1391 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1392}
1393
1394/*
1395 * Handle CPU online/offline notification events.
1396 */
1397static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1398 unsigned long action, void *hcpu)
1399{
1400 long cpu = (long)hcpu;
1401
1402 switch (action) {
1403 case CPU_UP_PREPARE:
1404 case CPU_UP_PREPARE_FROZEN:
1405 rcu_online_cpu(cpu);
1406 break;
1407 case CPU_DEAD:
1408 case CPU_DEAD_FROZEN:
1409 case CPU_UP_CANCELED:
1410 case CPU_UP_CANCELED_FROZEN:
1411 rcu_offline_cpu(cpu);
1412 break;
1413 default:
1414 break;
1415 }
1416 return NOTIFY_OK;
1417}
1418
1419/*
1420 * Compute the per-level fanout, either using the exact fanout specified
1421 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1422 */
1423#ifdef CONFIG_RCU_FANOUT_EXACT
1424static void __init rcu_init_levelspread(struct rcu_state *rsp)
1425{
1426 int i;
1427
1428 for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
1429 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
1430}
1431#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
1432static void __init rcu_init_levelspread(struct rcu_state *rsp)
1433{
1434 int ccur;
1435 int cprv;
1436 int i;
1437
1438 cprv = NR_CPUS;
1439 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1440 ccur = rsp->levelcnt[i];
1441 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
1442 cprv = ccur;
1443 }
1444}
1445#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
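
To make the balanced (non-exact) case concrete, the loop is a ceiling division working from the leaves up. A stand-alone sketch with assumed numbers (6 CPUs and a two-level tree with levelcnt = {1, 2}; these values are illustrative, not taken from this patch):

#include <stdio.h>

#define NUM_LVLS 2

int main(void)
{
	int levelcnt[NUM_LVLS] = { 1, 2 };	/* root level, then leaf level */
	int levelspread[NUM_LVLS];
	int cprv = 6;				/* stands in for NR_CPUS */
	int ccur, i;

	for (i = NUM_LVLS - 1; i >= 0; i--) {
		ccur = levelcnt[i];
		levelspread[i] = (cprv + ccur - 1) / ccur;	/* ceiling divide */
		cprv = ccur;
	}

	/*
	 * Prints levelspread[0] = 2, levelspread[1] = 3: each leaf covers
	 * three CPUs (grplo/grphi of 0-2 and 3-5 in rcu_init_one()), and
	 * the root fans out to two leaf nodes.
	 */
	for (i = 0; i < NUM_LVLS; i++)
		printf("levelspread[%d] = %d\n", i, levelspread[i]);
	return 0;
}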
1446
1447/*
1448 * Helper function for rcu_init() that initializes one rcu_state structure.
1449 */
1450static void __init rcu_init_one(struct rcu_state *rsp)
1451{
1452 int cpustride = 1;
1453 int i;
1454 int j;
1455 struct rcu_node *rnp;
1456
1457 /* Initialize the level-tracking arrays. */
1458
1459 for (i = 1; i < NUM_RCU_LVLS; i++)
1460 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
1461 rcu_init_levelspread(rsp);
1462
1463 /* Initialize the elements themselves, starting from the leaves. */
1464
1465 for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
1466 cpustride *= rsp->levelspread[i];
1467 rnp = rsp->level[i];
1468 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1469 spin_lock_init(&rnp->lock);
1470 rnp->qsmask = 0;
1471 rnp->qsmaskinit = 0;
1472 rnp->grplo = j * cpustride;
1473 rnp->grphi = (j + 1) * cpustride - 1;
1474 if (rnp->grphi >= NR_CPUS)
1475 rnp->grphi = NR_CPUS - 1;
1476 if (i == 0) {
1477 rnp->grpnum = 0;
1478 rnp->grpmask = 0;
1479 rnp->parent = NULL;
1480 } else {
1481 rnp->grpnum = j % rsp->levelspread[i - 1];
1482 rnp->grpmask = 1UL << rnp->grpnum;
1483 rnp->parent = rsp->level[i - 1] +
1484 j / rsp->levelspread[i - 1];
1485 }
1486 rnp->level = i;
1487 }
1488 }
1489}
1490
1491/*
1492 * Helper macro for __rcu_init(). To be used nowhere else!
1493 * Assigns leaf node pointers into each CPU's rcu_data structure.
1494 */
1495#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
1496do { \
1497 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1498 j = 0; \
1499 for_each_possible_cpu(i) { \
1500 if (i > rnp[j].grphi) \
1501 j++; \
1502 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1503 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1504 } \
1505} while (0)
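
Under the same illustrative 6-CPU, two-leaf layout, the macro walks the possible CPUs in order and advances to the next leaf node whenever a CPU number passes the current leaf's grphi. A stand-alone rendering of that assignment logic with toy types (not the kernel's):

#include <stdio.h>

struct toy_node { int grphi; };		/* last CPU covered by this leaf */

int main(void)
{
	struct toy_node rnp[2] = { { .grphi = 2 }, { .grphi = 5 } };
	int i, j = 0;

	for (i = 0; i < 6; i++) {	/* stands in for for_each_possible_cpu() */
		if (i > rnp[j].grphi)
			j++;
		printf("cpu %d -> leaf node %d\n", i, j);
	}
	return 0;	/* CPUs 0-2 map to node 0, CPUs 3-5 to node 1. */
}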
1506
1507static struct notifier_block __cpuinitdata rcu_nb = {
1508 .notifier_call = rcu_cpu_notify,
1509};
1510
1511void __init __rcu_init(void)
1512{
1513 int i; /* All used by RCU_DATA_PTR_INIT(). */
1514 int j;
1515 struct rcu_node *rnp;
1516
1517 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
1518#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1519 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1520#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1521 rcu_init_one(&rcu_state);
1522 RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
1523 rcu_init_one(&rcu_bh_state);
1524 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
1525
1526 for_each_online_cpu(i)
1527 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1528 /* Register notifier for non-boot CPUs */
1529 register_cpu_notifier(&rcu_nb);
1530 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1531}
1532
1533module_param(blimit, int, 0);
1534module_param(qhimark, int, 0);
1535module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 000000000000..d6db3e837826
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,271 @@
1/*
2 * Read-Copy Update tracing for hierarchical implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2008
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/debugfs.h>
44#include <linux/seq_file.h>
45
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{
48 if (!rdp->beenonline)
49 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x",
51 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending,
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks,
61 rdp->dynticks->dynticks_nesting,
62 rdp->dynticks->dynticks_nmi,
63 rdp->dynticks_fqs);
64#endif /* #ifdef CONFIG_NO_HZ */
65 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
66 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
67}
68
69#define PRINT_RCU_DATA(name, func, m) \
70 do { \
71 int _p_r_d_i; \
72 \
73 for_each_possible_cpu(_p_r_d_i) \
74 func(m, &per_cpu(name, _p_r_d_i)); \
75 } while (0)
76
77static int show_rcudata(struct seq_file *m, void *unused)
78{
79 seq_puts(m, "rcu:\n");
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0;
84}
85
86static int rcudata_open(struct inode *inode, struct file *file)
87{
88 return single_open(file, show_rcudata, NULL);
89}
90
91static struct file_operations rcudata_fops = {
92 .owner = THIS_MODULE,
93 .open = rcudata_open,
94 .read = seq_read,
95 .llseek = seq_lseek,
96 .release = single_release,
97};
98
99static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{
101 if (!rdp->beenonline)
102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld",
104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending,
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks,
114 rdp->dynticks->dynticks_nesting,
115 rdp->dynticks->dynticks_nmi,
116 rdp->dynticks_fqs);
117#endif /* #ifdef CONFIG_NO_HZ */
118 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
119 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit);
120}
121
122static int show_rcudata_csv(struct seq_file *m, void *unused)
123{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\",");
125#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */
128 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
129 seq_puts(m, "\"rcu:\"\n");
130 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m);
131 seq_puts(m, "\"rcu_bh:\"\n");
132 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
133 return 0;
134}
135
136static int rcudata_csv_open(struct inode *inode, struct file *file)
137{
138 return single_open(file, show_rcudata_csv, NULL);
139}
140
141static struct file_operations rcudata_csv_fops = {
142 .owner = THIS_MODULE,
143 .open = rcudata_csv_open,
144 .read = seq_read,
145 .llseek = seq_lseek,
146 .release = single_release,
147};
148
149static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
150{
151 int level = 0;
152 struct rcu_node *rnp;
153
154 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
155 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
156 rsp->completed, rsp->gpnum, rsp->signaled,
157 (long)(rsp->jiffies_force_qs - jiffies),
158 (int)(jiffies & 0xffff),
159 rsp->n_force_qs, rsp->n_force_qs_ngp,
160 rsp->n_force_qs - rsp->n_force_qs_ngp,
161 rsp->n_force_qs_lh);
162 for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
163 if (rnp->level != level) {
164 seq_puts(m, "\n");
165 level = rnp->level;
166 }
167 seq_printf(m, "%lx/%lx %d:%d ^%d ",
168 rnp->qsmask, rnp->qsmaskinit,
169 rnp->grplo, rnp->grphi, rnp->grpnum);
170 }
171 seq_puts(m, "\n");
172}
173
174static int show_rcuhier(struct seq_file *m, void *unused)
175{
176 seq_puts(m, "rcu:\n");
177 print_one_rcu_state(m, &rcu_state);
178 seq_puts(m, "rcu_bh:\n");
179 print_one_rcu_state(m, &rcu_bh_state);
180 return 0;
181}
182
183static int rcuhier_open(struct inode *inode, struct file *file)
184{
185 return single_open(file, show_rcuhier, NULL);
186}
187
188static struct file_operations rcuhier_fops = {
189 .owner = THIS_MODULE,
190 .open = rcuhier_open,
191 .read = seq_read,
192 .llseek = seq_lseek,
193 .release = single_release,
194};
195
196static int show_rcugp(struct seq_file *m, void *unused)
197{
198 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n",
199 rcu_state.completed, rcu_state.gpnum);
200 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
201 rcu_bh_state.completed, rcu_bh_state.gpnum);
202 return 0;
203}
204
205static int rcugp_open(struct inode *inode, struct file *file)
206{
207 return single_open(file, show_rcugp, NULL);
208}
209
210static struct file_operations rcugp_fops = {
211 .owner = THIS_MODULE,
212 .open = rcugp_open,
213 .read = seq_read,
214 .llseek = seq_lseek,
215 .release = single_release,
216};
217
218static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
219static int __init rcuclassic_trace_init(void)
220{
221 rcudir = debugfs_create_dir("rcu", NULL);
222 if (!rcudir)
223 goto out;
224
225 datadir = debugfs_create_file("rcudata", 0444, rcudir,
226 NULL, &rcudata_fops);
227 if (!datadir)
228 goto free_out;
229
230 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir,
231 NULL, &rcudata_csv_fops);
232 if (!datadir_csv)
233 goto free_out;
234
235 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
236 if (!gpdir)
237 goto free_out;
238
239 hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
240 NULL, &rcuhier_fops);
241 if (!hierdir)
242 goto free_out;
243 return 0;
244free_out:
245 if (datadir)
246 debugfs_remove(datadir);
247 if (datadir_csv)
248 debugfs_remove(datadir_csv);
249 if (gpdir)
250 debugfs_remove(gpdir);
251 debugfs_remove(rcudir);
252out:
253 return 1;
254}
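
Once this init function has run, the statistics can be read like any other debugfs file. A small userspace sketch, assuming debugfs is mounted at the conventional /sys/kernel/debug (the mount point is not established by this patch):

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *fp = fopen("/sys/kernel/debug/rcu/rcugp", "r");

	if (!fp) {
		perror("rcugp");	/* debugfs not mounted, or tracing not built in */
		return 1;
	}
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);	/* e.g. "rcu: completed=... gpnum=..." */
	fclose(fp);
	return 0;
}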
255
256static void __exit rcuclassic_trace_cleanup(void)
257{
258 debugfs_remove(datadir);
259 debugfs_remove(datadir_csv);
260 debugfs_remove(gpdir);
261 debugfs_remove(hierdir);
262 debugfs_remove(rcudir);
263}
264
265
266module_init(rcuclassic_trace_init);
267module_exit(rcuclassic_trace_cleanup);
268
269MODULE_AUTHOR("Paul E. McKenney");
270MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
271MODULE_LICENSE("GPL");
diff --git a/kernel/resource.c b/kernel/resource.c
index 4337063663ef..e633106b12f6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -853,6 +853,15 @@ int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) && 853 if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1)) 854 PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
855 continue; 855 continue;
856 /*
857 * if a resource is "BUSY", it's not a hardware resource
858 * but a driver mapping of such a resource; we don't want
859 * to warn for those; some drivers legitimately map only
860 * partial hardware resources. (example: vesafb)
861 */
862 if (p->flags & IORESOURCE_BUSY)
863 continue;
864
856 printk(KERN_WARNING "resource map sanity check conflict: " 865 printk(KERN_WARNING "resource map sanity check conflict: "
857 "0x%llx 0x%llx 0x%llx 0x%llx %s\n", 866 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
858 (unsigned long long)addr, 867 (unsigned long long)addr,
diff --git a/kernel/sched.c b/kernel/sched.c
index 748ff924a290..22aa9cab3fe5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4192,7 +4192,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4192 4192
4193 if (p == rq->idle) { 4193 if (p == rq->idle) {
4194 p->stime = cputime_add(p->stime, steal); 4194 p->stime = cputime_add(p->stime, steal);
4195 account_group_system_time(p, steal);
4196 if (atomic_read(&rq->nr_iowait) > 0) 4195 if (atomic_read(&rq->nr_iowait) > 0)
4197 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4196 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4198 else 4197 else
@@ -4328,7 +4327,7 @@ void __kprobes sub_preempt_count(int val)
4328 /* 4327 /*
4329 * Underflow? 4328 * Underflow?
4330 */ 4329 */
4331 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4330 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4332 return; 4331 return;
4333 /* 4332 /*
4334 * Is the spinlock portion underflowing? 4333 * Is the spinlock portion underflowing?
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e7c69a720d69..466e75ce271a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -102,20 +102,6 @@ void local_bh_disable(void)
102 102
103EXPORT_SYMBOL(local_bh_disable); 103EXPORT_SYMBOL(local_bh_disable);
104 104
105void __local_bh_enable(void)
106{
107 WARN_ON_ONCE(in_irq());
108
109 /*
110 * softirqs should never be enabled by __local_bh_enable(),
111 * it always nests inside local_bh_enable() sections:
112 */
113 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
114
115 sub_preempt_count(SOFTIRQ_OFFSET);
116}
117EXPORT_SYMBOL_GPL(__local_bh_enable);
118
119/* 105/*
120 * Special-case - softirqs can safely be enabled in 106 * Special-case - softirqs can safely be enabled in
121 * cond_resched_softirq(), or by __do_softirq(), 107 * cond_resched_softirq(), or by __do_softirq(),
@@ -269,6 +255,7 @@ void irq_enter(void)
269{ 255{
270 int cpu = smp_processor_id(); 256 int cpu = smp_processor_id();
271 257
258 rcu_irq_enter();
272 if (idle_cpu(cpu) && !in_interrupt()) { 259 if (idle_cpu(cpu) && !in_interrupt()) {
273 __irq_enter(); 260 __irq_enter();
274 tick_check_idle(cpu); 261 tick_check_idle(cpu);
@@ -295,9 +282,9 @@ void irq_exit(void)
295 282
296#ifdef CONFIG_NO_HZ 283#ifdef CONFIG_NO_HZ
297 /* Make sure that timer wheel updates are propagated */ 284 /* Make sure that timer wheel updates are propagated */
298 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
299 tick_nohz_stop_sched_tick(0);
300 rcu_irq_exit(); 285 rcu_irq_exit();
286 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
287 tick_nohz_stop_sched_tick(0);
301#endif 288#endif
302 preempt_enable_no_resched(); 289 preempt_enable_no_resched();
303} 290}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be6b7d5..1ab790c67b17 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 94b527ef1d1e..eb212f8f8bc8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
11#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
@@ -24,3 +25,13 @@ void print_stack_trace(struct stack_trace *trace, int spaces)
24} 25}
25EXPORT_SYMBOL_GPL(print_stack_trace); 26EXPORT_SYMBOL_GPL(print_stack_trace);
26 27
28/*
29 * Architectures that do not implement save_stack_trace_tsk get this
30 * weak alias and a once-per-bootup warning (whenever this facility
31 * is utilized - for example by procfs):
32 */
33__weak void
34save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
35{
36 WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n");
37}
diff --git a/kernel/sys.c b/kernel/sys.c
index ebe65c2c9873..d356d79e84ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -907,8 +907,8 @@ void do_sys_times(struct tms *tms)
907 struct task_cputime cputime; 907 struct task_cputime cputime;
908 cputime_t cutime, cstime; 908 cputime_t cutime, cstime;
909 909
910 spin_lock_irq(&current->sighand->siglock);
911 thread_group_cputime(current, &cputime); 910 thread_group_cputime(current, &cputime);
911 spin_lock_irq(&current->sighand->siglock);
912 cutime = current->signal->cutime; 912 cutime = current->signal->cutime;
913 cstime = current->signal->cstime; 913 cstime = current->signal->cstime;
914 spin_unlock_irq(&current->sighand->siglock); 914 spin_unlock_irq(&current->sighand->siglock);