Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |    3
-rw-r--r--  kernel/acct.c            |    3
-rw-r--r--  kernel/audit.c           |    2
-rw-r--r--  kernel/auditsc.c         |   10
-rw-r--r--  kernel/cpu.c             |    8
-rw-r--r--  kernel/exit.c            |    9
-rw-r--r--  kernel/fork.c            |   19
-rw-r--r--  kernel/futex.c           | 1067
-rw-r--r--  kernel/futex_compat.c    |   14
-rw-r--r--  kernel/hrtimer.c         |    4
-rw-r--r--  kernel/mutex-debug.c     |    5
-rw-r--r--  kernel/power/Kconfig     |   13
-rw-r--r--  kernel/profile.c         |    2
-rw-r--r--  kernel/rcupdate.c        |   14
-rw-r--r--  kernel/rcutorture.c      |  201
-rw-r--r--  kernel/resource.c        |   38
-rw-r--r--  kernel/rtmutex-debug.c   |  513
-rw-r--r--  kernel/rtmutex-debug.h   |   37
-rw-r--r--  kernel/rtmutex-tester.c  |  440
-rw-r--r--  kernel/rtmutex.c         |  990
-rw-r--r--  kernel/rtmutex.h         |   29
-rw-r--r--  kernel/rtmutex_common.h  |  123
-rw-r--r--  kernel/sched.c           | 1199
-rw-r--r--  kernel/softirq.c         |    4
-rw-r--r--  kernel/softlockup.c      |    4
-rw-r--r--  kernel/sysctl.c          |   27
-rw-r--r--  kernel/timer.c           |    4
-rw-r--r--  kernel/workqueue.c       |    2
28 files changed, 4215 insertions(+), 569 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 752bd7d383af..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -16,6 +16,9 @@ obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 368c4f03fe0e..126ca43d5d2b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -521,6 +521,7 @@ static void do_acct_process(struct file *file)
 
 /**
  * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
  */
 void acct_init_pacct(struct pacct_struct *pacct)
 {
@@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead)
  *
  * handles process accounting for an exiting task
  */
-void acct_process()
+void acct_process(void)
 {
 	struct file *file = NULL;
 
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd7..82443fb433ef 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -818,7 +818,7 @@ err:
  */
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
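
The audit.c hunk above swaps the open-coded SPIN_LOCK_UNLOCKED initializer for the DEFINE_SPINLOCK() helper, which declares and initializes in one step and cooperates with spinlock debugging. A minimal illustration (not part of the patch; my_lock is a made-up name):

	/* Old style: bare static initializer: */
	static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;

	/* New style: declaration and initialization in one macro: */
	static DEFINE_SPINLOCK(my_lock);
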
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda295..dc5e3f01efe7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
 		return;
 
 error_path:
-	if (ctx)
-		kfree(ctx);
+	kfree(ctx);
 	audit_panic("error in audit_log_task_context");
 	return;
 }
@@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
  * @mqdes: MQ descriptor
  * @msg_len: Message length
  * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_abs_timeout: Message timeout in absolute time
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
@@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
  * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
  * @mqdes: MQ descriptor
  * @msg_len: Message length
- * @msg_prio: Message priority
- * @abs_timeout: Message timeout in absolute time
+ * @u_msg_prio: Message priority
+ * @u_abs_timeout: Message timeout in absolute time
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
@@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
  * @uid: msgq user id
  * @gid: msgq group id
  * @mode: msgq mode (permissions)
- * @ipcp: in-kernel IPC permissions
  *
  * Returns 0 for success or NULL context or < 0 on error.
  */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 03dcd981846a..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@
 /* This protects CPUs going up and down... */
 static DEFINE_MUTEX(cpucontrol);
 
-static BLOCKING_NOTIFIER_HEAD(cpu_chain);
+static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
 static struct task_struct *lock_cpu_hotplug_owner;
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 #endif /* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
-int register_cpu_notifier(struct notifier_block *nb)
+int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
 	return blocking_notifier_chain_register(&cpu_chain, nb);
 }
+
+#ifdef CONFIG_HOTPLUG_CPU
+
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
-#ifdef CONFIG_HOTPLUG_CPU
 static inline void check_for_tasks(int cpu)
 {
 	struct task_struct *p;
diff --git a/kernel/exit.c b/kernel/exit.c
index 304ef637be6c..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 	/*
+	 * This must happen late, after the PID is not
+	 * hashed anymore:
+	 */
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
+	if (unlikely(current->pi_state_cache))
+		kfree(current->pi_state_cache);
+	/*
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
 	mutex_debug_check_no_locks_held(tsk);
+	rt_mutex_debug_check_no_locks_held(tsk);
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b4e54ef0225..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
 void free_task(struct task_struct *tsk)
 {
 	free_thread_info(tsk->thread_info);
+	rt_mutex_debug_task_free(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
 	return current->pid;
 }
 
+static inline void rt_mutex_init_task(struct task_struct *p)
+{
+#ifdef CONFIG_RT_MUTEXES
+	spin_lock_init(&p->pi_lock);
+	plist_head_init(&p->pi_waiters, &p->pi_lock);
+	p->pi_blocked_on = NULL;
+# ifdef CONFIG_DEBUG_RT_MUTEXES
+	spin_lock_init(&p->held_list_lock);
+	INIT_LIST_HEAD(&p->held_list_head);
+# endif
+#endif
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags,
 	mpol_fix_fork_child_flag(p);
 #endif
 
+	rt_mutex_init_task(p);
+
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
@@ -1076,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_COMPAT
 	p->compat_robust_list = NULL;
 #endif
+	INIT_LIST_HEAD(&p->pi_state_list);
+	p->pi_state_cache = NULL;
+
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
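
For orientation, the exit.c and fork.c hunks above rely on new per-task PI bookkeeping. A sketch of the task_struct fields involved, inferred from their usage in this diff (the real declarations live in include/linux/sched.h, outside this section):

	struct task_struct {
		/* ... existing fields ... */
	#ifdef CONFIG_RT_MUTEXES
		/* Protects pi_waiters and pi_blocked_on: */
		spinlock_t		pi_lock;
		/* PI waiters blocked on rt_mutexes held by this task,
		 * kept sorted by priority (plist): */
		struct plist_head	pi_waiters;
		/* The rt_mutex waiter this task is blocked on, if any: */
		struct rt_mutex_waiter	*pi_blocked_on;
	#endif
		/* futex_pi_state objects owned by this task; walked by
		 * exit_pi_state_list() late in do_exit(): */
		struct list_head	pi_state_list;
		struct futex_pi_state	*pi_state_cache;
	};

rt_mutex_init_task() initializes exactly the CONFIG_RT_MUTEXES fields, while pi_state_list/pi_state_cache are set up unconditionally in copy_process() and torn down in do_exit() only after the PID is unhashed, so no new waiter can attach to the dying task by TID while the list is being emptied.
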
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..6c91f938005d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
  * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
 #include <linux/signal.h>
 #include <asm/futex.h>
 
+#include "rtmutex_common.h"
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -63,7 +69,7 @@ union futex_key {
 		int offset;
 	} shared;
 	struct {
-		unsigned long uaddr;
+		unsigned long address;
 		struct mm_struct *mm;
 		int offset;
 	} private;
@@ -75,6 +81,27 @@ union futex_key {
 };
 
 /*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+	/*
+	 * list of 'owned' pi_state instances - these have to be
+	 * cleaned up in do_exit() if the task exits prematurely:
+	 */
+	struct list_head list;
+
+	/*
+	 * The PI object:
+	 */
+	struct rt_mutex pi_mutex;
+
+	struct task_struct *owner;
+	atomic_t refcount;
+
+	union futex_key key;
+};
+
+/*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
  *
@@ -87,15 +114,19 @@ struct futex_q {
 	struct list_head list;
 	wait_queue_head_t waiters;
 
-	/* Which hash list lock to use. */
+	/* Which hash list lock to use: */
 	spinlock_t *lock_ptr;
 
-	/* Key which the futex is hashed on. */
+	/* Key which the futex is hashed on: */
 	union futex_key key;
 
-	/* For fd, sigio sent using these. */
+	/* For fd, sigio sent using these: */
 	int fd;
 	struct file *filp;
+
+	/* Optional priority inheritance state: */
+	struct futex_pi_state *pi_state;
+	struct task_struct *task;
 };
 
 /*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 {
+	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * The futex address must be "naturally" aligned.
 	 */
-	key->both.offset = uaddr % PAGE_SIZE;
+	key->both.offset = address % PAGE_SIZE;
 	if (unlikely((key->both.offset % sizeof(u32)) != 0))
 		return -EINVAL;
-	uaddr -= key->both.offset;
+	address -= key->both.offset;
 
 	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping. So check vma first.
 	 */
-	vma = find_extend_vma(mm, uaddr);
+	vma = find_extend_vma(mm, address);
 	if (unlikely(!vma))
 		return -EFAULT;
 
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
 		key->private.mm = mm;
-		key->private.uaddr = uaddr;
+		key->private.address = address;
 		return 0;
 	}
 
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	key->shared.inode = vma->vm_file->f_dentry->d_inode;
 	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
 		return 0;
 	}
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	 * from swap. But that's a lot of code to duplicate here
 	 * for a rare case, so we simply fetch the page.
 	 */
-	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+	err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
 	if (err >= 0) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key)
 	}
 }
 
-static inline int get_futex_value_locked(int *dest, int __user *from)
+static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
 	int ret;
 
 	inc_preempt_count();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
+	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
 	dec_preempt_count();
 
 	return ret ? -EFAULT : 0;
 }
 
 /*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+
+	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+		return -EFAULT;
+
+	switch (handle_mm_fault(mm, vma, address, 1)) {
+	case VM_FAULT_MINOR:
+		current->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		current->maj_flt++;
+		break;
+	default:
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	memset(pi_state, 0, sizeof(*pi_state));
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	atomic_set(&pi_state->refcount, 1);
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!atomic_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already
+	 */
+	if (pi_state->owner) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+	}
+
+	if (current->pi_state_cache)
+		kfree(pi_state);
+	else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		atomic_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+	if ((current->euid != p->euid) && (current->euid != p->uid)) {
+		p = NULL;
+		goto out_unlock;
+	}
+	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+		p = NULL;
+		goto out_unlock;
+	}
+	get_task_struct(p);
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+	struct futex_hash_bucket *hb;
+	struct list_head *next, *head = &curr->pi_state_list;
+	struct futex_pi_state *pi_state;
+	union futex_key key;
+
+	/*
+	 * We are a ZOMBIE and nobody can enqueue itself on
+	 * pi_state_list anymore, but we have to be careful
+	 * versus waiters unqueueing themselfs
+	 */
+	spin_lock_irq(&curr->pi_lock);
+	while (!list_empty(head)) {
+
+		next = head->next;
+		pi_state = list_entry(next, struct futex_pi_state, list);
+		key = pi_state->key;
+		spin_unlock_irq(&curr->pi_lock);
+
+		hb = hash_futex(&key);
+		spin_lock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+		if (head->next != next) {
+			spin_unlock(&hb->lock);
+			continue;
+		}
+
+		list_del_init(&pi_state->list);
+
+		WARN_ON(pi_state->owner != curr);
+
+		pi_state->owner = NULL;
+		spin_unlock_irq(&curr->pi_lock);
+
+		rt_mutex_unlock(&pi_state->pi_mutex);
+
+		spin_unlock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+	}
+	spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_q *this, *next;
+	struct list_head *head;
+	struct task_struct *p;
+	pid_t pid;
+
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &me->key)) {
+			/*
+			 * Another waiter already exists - bump up
+			 * the refcount and return its pi_state:
+			 */
+			pi_state = this->pi_state;
+			atomic_inc(&pi_state->refcount);
+			me->pi_state = pi_state;
+
+			return 0;
+		}
+	}
+
+	/*
+	 * We are the first waiter - try to look up the real owner and
+	 * attach the new pi_state to it:
+	 */
+	pid = uval & FUTEX_TID_MASK;
+	p = futex_find_get_task(pid);
+	if (!p)
+		return -ESRCH;
+
+	pi_state = alloc_pi_state();
+
+	/*
+	 * Initialize the pi_mutex in locked state and make 'p'
+	 * the owner of it:
+	 */
+	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+	/* Store the key for possible exit cleanups: */
+	pi_state->key = me->key;
+
+	spin_lock_irq(&p->pi_lock);
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+	spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	me->pi_state = pi_state;
+
+	return 0;
+}
+
+/*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
  */
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q)
 	q->lock_ptr = NULL;
 }
 
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+	struct task_struct *new_owner;
+	struct futex_pi_state *pi_state = this->pi_state;
+	u32 curval, newval;
+
+	if (!pi_state)
+		return -EINVAL;
+
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+	/*
+	 * This happens when we have stolen the lock and the original
+	 * pending owner did not enqueue itself back on the rt_mutex.
+	 * Thats not a tragedy. We know that way, that a lock waiter
+	 * is on the fly. We make the futex_q waiter the pending owner.
+	 */
+	if (!new_owner)
+		new_owner = this->task;
+
+	/*
+	 * We pass it to the next owner. (The WAITERS bit is always
+	 * kept enabled while there is PI state around. We must also
+	 * preserve the owner died bit.)
+	 */
+	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (curval == -EFAULT)
+		return -EFAULT;
+	if (curval != uval)
+		return -EINVAL;
+
+	list_del_init(&pi_state->owner->pi_state_list);
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+	u32 oldval;
+
+	/*
+	 * There is no waiter, so we unlock the futex. The owner died
+	 * bit has not to be preserved here. We are the owner:
+	 */
+	inc_preempt_count();
+	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+	dec_preempt_count();
+
+	if (oldval == -EFAULT)
+		return oldval;
+	if (oldval != uval)
+		return -EAGAIN;
+
+	return 0;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(unsigned long uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, int nr_wake)
 {
-	union futex_key key;
-	struct futex_hash_bucket *bh;
-	struct list_head *head;
+	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
+	struct list_head *head;
+	union futex_key key;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh = hash_futex(&key);
-	spin_lock(&bh->lock);
-	head = &bh->chain;
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+	head = &hb->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
+			if (this->pi_state)
+				return -EINVAL;
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
 		}
 	}
 
-	spin_unlock(&bh->lock);
+	spin_unlock(&hb->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -324,10 +648,12 @@ out:
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+static int
+futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *bh1, *bh2;
+	struct futex_hash_bucket *hb1, *hb2;
 	struct list_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret, attempt = 0;
@@ -342,27 +668,29 @@ retryfull:
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
 
 retry:
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
+	if (hb1 < hb2)
+		spin_lock(&hb1->lock);
+	spin_lock(&hb2->lock);
+	if (hb1 > hb2)
+		spin_lock(&hb1->lock);
 
-	op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
-		int dummy;
+		u32 dummy;
 
-		spin_unlock(&bh1->lock);
-		if (bh1 != bh2)
-			spin_unlock(&bh2->lock);
+		spin_unlock(&hb1->lock);
+		if (hb1 != hb2)
+			spin_unlock(&hb2->lock);
 
 #ifndef CONFIG_MMU
-		/* we don't get EFAULT from MMU faults if we don't have an MMU,
-		 * but we might get them from range checking */
+		/*
+		 * we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking
+		 */
 		ret = op_ret;
 		goto out;
 #endif
@@ -372,47 +700,34 @@ retry:
 		goto out;
 	}
 
-	/* futex_atomic_op_inuser needs to both read and write
+	/*
+	 * futex_atomic_op_inuser needs to both read and write
 	 * *(int __user *)uaddr2, but we can't modify it
 	 * non-atomically.  Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
-	 * still holding the mmap_sem.  */
+	 * still holding the mmap_sem.
+	 */
 	if (attempt++) {
-		struct vm_area_struct * vma;
-		struct mm_struct *mm = current->mm;
-
-		ret = -EFAULT;
-		if (attempt >= 2 ||
-		    !(vma = find_vma(mm, uaddr2)) ||
-		    vma->vm_start > uaddr2 ||
-		    !(vma->vm_flags & VM_WRITE))
-			goto out;
-
-		switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
-		case VM_FAULT_MINOR:
-			current->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			current->maj_flt++;
-			break;
-		default:
+		if (futex_handle_fault((unsigned long)uaddr2,
+				       attempt))
 			goto out;
-		}
 		goto retry;
 	}
 
-	/* If we would have faulted, release mmap_sem,
-	 * fault it in and start all over again. */
+	/*
+	 * If we would have faulted, release mmap_sem,
+	 * fault it in and start all over again.
+	 */
 	up_read(&current->mm->mmap_sem);
 
-	ret = get_user(dummy, (int __user *)uaddr2);
+	ret = get_user(dummy, uaddr2);
 	if (ret)
 		return ret;
 
 	goto retryfull;
 	}
 
-	head = &bh1->chain;
+	head = &hb1->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
@@ -423,7 +738,7 @@ retry:
 	}
 
 	if (op_ret > 0) {
-		head = &bh2->chain;
+		head = &hb2->chain;
 
 		op_ret = 0;
 		list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +751,9 @@ retry:
 		ret += op_ret;
 	}
 
-	spin_unlock(&bh1->lock);
-	if (bh1 != bh2)
-		spin_unlock(&bh2->lock);
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 out:
 	up_read(&current->mm->mmap_sem);
 	return ret;
@@ -448,11 +763,11 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
-			 int nr_wake, int nr_requeue, int *valp)
+static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
-	struct futex_hash_bucket *bh1, *bh2;
+	struct futex_hash_bucket *hb1, *hb2;
 	struct list_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
 	if (unlikely(ret != 0))
 		goto out;
 
-	bh1 = hash_futex(&key1);
-	bh2 = hash_futex(&key2);
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
 
-	if (bh1 < bh2)
-		spin_lock(&bh1->lock);
-	spin_lock(&bh2->lock);
-	if (bh1 > bh2)
-		spin_lock(&bh1->lock);
+	if (hb1 < hb2)
+		spin_lock(&hb1->lock);
+	spin_lock(&hb2->lock);
+	if (hb1 > hb2)
+		spin_lock(&hb1->lock);
 
-	if (likely(valp != NULL)) {
-		int curval;
+	if (likely(cmpval != NULL)) {
+		u32 curval;
 
-		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);
+		ret = get_futex_value_locked(&curval, uaddr1);
 
 		if (unlikely(ret)) {
-			spin_unlock(&bh1->lock);
-			if (bh1 != bh2)
-				spin_unlock(&bh2->lock);
+			spin_unlock(&hb1->lock);
+			if (hb1 != hb2)
+				spin_unlock(&hb2->lock);
 
-			/* If we would have faulted, release mmap_sem, fault
+			/*
+			 * If we would have faulted, release mmap_sem, fault
 			 * it in and start all over again.
 			 */
 			up_read(&current->mm->mmap_sem);
 
-			ret = get_user(curval, (int __user *)uaddr1);
+			ret = get_user(curval, uaddr1);
 
 			if (!ret)
 				goto retry;
 
 			return ret;
 		}
-		if (curval != *valp) {
+		if (curval != *cmpval) {
 			ret = -EAGAIN;
 			goto out_unlock;
 		}
 	}
 
-	head1 = &bh1->chain;
+	head1 = &hb1->chain;
 	list_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
 			continue;
 		if (++ret <= nr_wake) {
 			wake_futex(this);
 		} else {
-			list_move_tail(&this->list, &bh2->chain);
-			this->lock_ptr = &bh2->lock;
+			/*
+			 * If key1 and key2 hash to the same bucket, no need to
+			 * requeue.
+			 */
+			if (likely(head1 != &hb2->chain)) {
+				list_move_tail(&this->list, &hb2->chain);
+				this->lock_ptr = &hb2->lock;
+			}
 			this->key = key2;
 			get_key_refs(&key2);
 			drop_count++;
 
 			if (ret - nr_wake >= nr_requeue)
 				break;
-			/* Make sure to stop if key1 == key2 */
-			if (head1 == &bh2->chain && head1 != &next->list)
-				head1 = &this->list;
 		}
 	}
 
 out_unlock:
-	spin_unlock(&bh1->lock);
-	if (bh1 != bh2)
-		spin_unlock(&bh2->lock);
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 
 	/* drop_key_refs() must be called outside the spinlocks. */
 	while (--drop_count >= 0)
@@ -543,7 +862,7 @@ out:
 static inline struct futex_hash_bucket *
 queue_lock(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh;
+	struct futex_hash_bucket *hb;
 
 	q->fd = fd;
 	q->filp = filp;
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 	init_waitqueue_head(&q->waiters);
 
 	get_key_refs(&q->key);
-	bh = hash_futex(&q->key);
-	q->lock_ptr = &bh->lock;
+	hb = hash_futex(&q->key);
+	q->lock_ptr = &hb->lock;
 
-	spin_lock(&bh->lock);
-	return bh;
+	spin_lock(&hb->lock);
+	return hb;
 }
 
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	list_add_tail(&q->list, &bh->chain);
-	spin_unlock(&bh->lock);
+	list_add_tail(&q->list, &hb->chain);
+	q->task = current;
+	spin_unlock(&hb->lock);
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	spin_unlock(&bh->lock);
+	spin_unlock(&hb->lock);
 	drop_key_refs(&q->key);
 }
 
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
 /* The key must be already stored in q->key. */
 static void queue_me(struct futex_q *q, int fd, struct file *filp)
 {
-	struct futex_hash_bucket *bh;
-	bh = queue_lock(q, fd, filp);
-	__queue_me(q, bh);
+	struct futex_hash_bucket *hb;
+
+	hb = queue_lock(q, fd, filp);
+	__queue_me(q, hb);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
-	int ret = 0;
 	spinlock_t *lock_ptr;
+	int ret = 0;
 
 	/* In the common case we don't take the spinlock, which is nice. */
  retry:
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q)
 		}
 		WARN_ON(list_empty(&q->list));
 		list_del(&q->list);
+
+		BUG_ON(q->pi_state);
+
 		spin_unlock(lock_ptr);
 		ret = 1;
 	}
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q)
 	return ret;
 }
 
-static int futex_wait(unsigned long uaddr, int val, unsigned long time)
+/*
+ * PI futexes can not be requeued and must remove themself from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	DECLARE_WAITQUEUE(wait, current);
-	int ret, curval;
+	WARN_ON(list_empty(&q->list));
+	list_del(&q->list);
+
+	BUG_ON(!q->pi_state);
+	free_pi_state(q->pi_state);
+	q->pi_state = NULL;
+
+	spin_unlock(&hb->lock);
+
+	drop_key_refs(&q->key);
+}
+
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+{
+	struct task_struct *curr = current;
+	DECLARE_WAITQUEUE(wait, curr);
+	struct futex_hash_bucket *hb;
 	struct futex_q q;
-	struct futex_hash_bucket *bh;
+	u32 uval;
+	int ret;
 
+	q.pi_state = NULL;
  retry:
-	down_read(&current->mm->mmap_sem);
+	down_read(&curr->mm->mmap_sem);
 
 	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
-	bh = queue_lock(&q, -1, NULL);
+	hb = queue_lock(&q, -1, NULL);
 
 	/*
 	 * Access the page AFTER the futex is queued.
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
 	 * We hold the mmap semaphore, so the mapping cannot have changed
 	 * since we looked it up in get_futex_key.
 	 */
-
-	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (unlikely(ret)) {
-		queue_unlock(&q, bh);
+		queue_unlock(&q, hb);
 
-		/* If we would have faulted, release mmap_sem, fault it in and
+		/*
+		 * If we would have faulted, release mmap_sem, fault it in and
 		 * start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		up_read(&curr->mm->mmap_sem);
 
-		ret = get_user(curval, (int __user *)uaddr);
+		ret = get_user(uval, uaddr);
 
 		if (!ret)
 			goto retry;
 		return ret;
 	}
-	if (curval != val) {
-		ret = -EWOULDBLOCK;
-		queue_unlock(&q, bh);
-		goto out_release_sem;
-	}
+	ret = -EWOULDBLOCK;
+	if (uval != val)
+		goto out_unlock_release_sem;
 
 	/* Only actually queue if *uaddr contained val. */
-	__queue_me(&q, bh);
+	__queue_me(&q, hb);
 
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&current->mm->mmap_sem);
+	up_read(&curr->mm->mmap_sem);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
 		return 0;
 	if (time == 0)
 		return -ETIMEDOUT;
-	/* We expect signal_pending(current), but another thread may
-	 * have handled it for us already. */
+	/*
+	 * We expect signal_pending(current), but another thread may
+	 * have handled it for us already.
+	 */
 	return -EINTR;
 
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
  out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+			    struct hrtimer_sleeper *to)
+{
+	struct task_struct *curr = current;
+	struct futex_hash_bucket *hb;
+	u32 uval, newval, curval;
+	struct futex_q q;
+	int ret, attempt = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+	q.pi_state = NULL;
+ retry:
+	down_read(&curr->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &q.key);
+	if (unlikely(ret != 0))
+		goto out_release_sem;
+
+	hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = current->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+
+	/* We own the lock already */
+	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+		if (!detect && 0)
+			force_sig(SIGKILL, current);
+		ret = -EDEADLK;
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Surprise - we got the lock. Just return
+	 * to userspace:
+	 */
+	if (unlikely(!curval))
+		goto out_unlock_release_sem;
+
+	uval = curval;
+	newval = uval | FUTEX_WAITERS;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+	if (unlikely(curval != uval))
+		goto retry_locked;
+
+	/*
+	 * We dont have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, &q);
+
+	if (unlikely(ret)) {
+		/*
+		 * There were no waiters and the owner task lookup
+		 * failed. When the OWNER_DIED bit is set, then we
+		 * know that this is a robust futex and we actually
+		 * take the lock. This is safe as we are protected by
+		 * the hash bucket lock. We also set the waiters bit
+		 * unconditionally here, to simplify glibc handling of
+		 * multiple tasks racing to acquire the lock and
+		 * cleanup the problems which were left by the dead
+		 * owner.
+		 */
+		if (curval & FUTEX_OWNER_DIED) {
+			uval = newval;
+			newval = current->pid |
+				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+			inc_preempt_count();
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			dec_preempt_count();
+
+			if (unlikely(curval == -EFAULT))
+				goto uaddr_faulted;
+			if (unlikely(curval != uval))
+				goto retry_locked;
+			ret = 0;
+		}
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Only actually queue now that the atomic ops are done:
+	 */
+	__queue_me(&q, hb);
+
+	/*
+	 * Now the futex is queued and we have checked the data, we
+	 * don't want to hold mmap_sem while we sleep.
+	 */
+	up_read(&curr->mm->mmap_sem);
+
+	WARN_ON(!q.pi_state);
+	/*
+	 * Block on the PI mutex:
+	 */
+	if (!trylock)
+		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+	else {
+		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+		/* Fixup the trylock return value: */
+		ret = ret ? 0 : -EWOULDBLOCK;
+	}
+
+	down_read(&curr->mm->mmap_sem);
+	hb = queue_lock(&q, -1, NULL);
+
+	/*
+	 * Got the lock. We might not be the anticipated owner if we
+	 * did a lock-steal - fix up the PI-state in that case.
+	 */
+	if (!ret && q.pi_state->owner != curr) {
+		u32 newtid = current->pid | FUTEX_WAITERS;
+
+		/* Owner died? */
+		if (q.pi_state->owner != NULL) {
+			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			list_del_init(&q.pi_state->list);
+			spin_unlock_irq(&q.pi_state->owner->pi_lock);
+		} else
+			newtid |= FUTEX_OWNER_DIED;
+
+		q.pi_state->owner = current;
+
+		spin_lock_irq(&current->pi_lock);
+		list_add(&q.pi_state->list, &current->pi_state_list);
+		spin_unlock_irq(&current->pi_lock);
+
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+		/*
+		 * We own it, so we have to replace the pending owner
+		 * TID. This must be atomic as we have preserve the
+		 * owner died bit here.
+		 */
+		ret = get_user(uval, uaddr);
+		while (!ret) {
+			newval = (uval & FUTEX_OWNER_DIED) | newtid;
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			if (curval == -EFAULT)
+				ret = -EFAULT;
+			if (curval == uval)
+				break;
+			uval = curval;
+		}
+	} else {
+		/*
+		 * Catch the rare case, where the lock was released
+		 * when we were on the way back before we locked
+		 * the hash bucket.
+		 */
+		if (ret && q.pi_state->owner == curr) {
+			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+				ret = 0;
+		}
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+	}
+
+	if (!detect && ret == -EDEADLK && 0)
+		force_sig(SIGKILL, current);
+
+	return ret;
+
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
+ out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+
+ uaddr_faulted:
+	/*
+	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
+	 * non-atomically.  Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock_release_sem;
+
+		goto retry_locked;
+	}
+
+	queue_unlock(&q, hb);
+	up_read(&curr->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	int ret;
+
+	restart->fn = do_no_restart_syscall;
+
+	if (restart->arg2 || restart->arg3) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
+			(u64) restart->arg0;
+	}
+
+	pr_debug("lock_pi restart: %p, %d (%d)\n",
+		 (u32 __user *)restart->arg0, current->pid);
+
+	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+			       0, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	restart->fn = futex_lock_pi_restart;
+
+	/* The other values are filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+			 long nsec, int trylock)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct restart_block *restart;
+	int ret;
+
+	if (sec != MAX_SCHEDULE_TIMEOUT) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires = ktime_set(sec, nsec);
+	}
+
+	ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = futex_lock_pi_restart;
+	restart->arg0 = (unsigned long) uaddr;
+	restart->arg1 = detect;
+	if (to) {
+		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+		restart->arg3 = to->timer.expires.tv64 >> 32;
+	} else
+		restart->arg2 = restart->arg3 = 0;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+	struct futex_hash_bucket *hb;
+	struct futex_q *this, *next;
+	u32 uval;
+	struct list_head *head;
+	union futex_key key;
+	int ret, attempt = 0;
+
+retry:
+	if (get_user(uval, uaddr))
+		return -EFAULT;
+	/*
+	 * We release only a lock we actually own:
+	 */
+	if ((uval & FUTEX_TID_MASK) != current->pid)
+		return -EPERM;
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+
+retry_locked:
+	/*
+	 * To avoid races, try to do the TID -> 0 atomic transition
+	 * again. If it succeeds then we can return without waking
+	 * anyone else up:
+	 */
+	inc_preempt_count();
+	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+	dec_preempt_count();
+
+	if (unlikely(uval == -EFAULT))
+		goto pi_faulted;
+	/*
+	 * Rare case: we managed to release the lock atomically,
+	 * no need to wake anyone else up:
+	 */
+	if (unlikely(uval == current->pid))
+		goto out_unlock;
+
+	/*
+	 * Ok, other tasks may need to be woken up - check waiters
+	 * and do the wakeup if necessary:
+	 */
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (!match_futex (&this->key, &key))
+			continue;
+		ret = wake_futex_pi(uaddr, uval, this);
+		/*
+		 * The atomic access to the futex value
+		 * generated a pagefault, so retry the
+		 * user-access and the wakeup:
+		 */
+		if (ret == -EFAULT)
+			goto pi_faulted;
+		goto out_unlock;
+	}
+	/*
+	 * No waiters - kernel unlocks the futex:
+	 */
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;
+
+out_unlock:
+	spin_unlock(&hb->lock);
+out:
 	up_read(&current->mm->mmap_sem);
+
+	return ret;
+
+pi_faulted:
+	/*
+	 * We have to r/w  *(int __user *)uaddr, but we can't modify it
+	 * non-atomically.  Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock;
+
+		goto retry_locked;
+	}
+
+	spin_unlock(&hb->lock);
+	up_read(&current->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
 	return ret;
 }
 
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp)
 
 	unqueue_me(q);
 	kfree(q);
+
 	return 0;
 }
 
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = {
  * Signal allows caller to avoid the race which would occur if they
  * set the sigio stuff up afterwards.
  */
-static int futex_fd(unsigned long uaddr, int signal)
+static int futex_fd(u32 __user *uaddr, int signal)
 {
 	struct futex_q *q;
 	struct file *filp;
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal)
 		err = -ENOMEM;
 		goto error;
 	}
+	q->pi_state = NULL;
 
 	down_read(&current->mm->mmap_sem);
 	err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1594,7 @@ error:
  * Implementation: user-space maintains a per-thread list of locks it
  * is holding. Upon do_exit(), the kernel carefully walks this list,
  * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  * always manipulated with the lock held, so the list is private and
  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  * field, to allow the kernel to clean up if the thread dies after
@@ -915,7 +1669,7 @@ err_unlock:
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	u32 uval;
+	u32 uval, nval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -932,12 +1686,16 @@ retry:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
-					     uval | FUTEX_OWNER_DIED) != uval)
+		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+						     uval | FUTEX_OWNER_DIED);
+		if (nval == -EFAULT)
+			return -1;
+
+		if (nval != uval)
 			goto retry;
 
 		if (uval & FUTEX_WAITERS)
-			futex_wake((unsigned long)uaddr, 1);
+			futex_wake(uaddr, 1);
 	}
 	return 0;
 }
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr)
 	while (entry != &head->list) {
 		/*
 		 * A pending lock might already be on the list, so
-		 * dont process it twice:
+		 * don't process it twice:
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr)
 	}
 }
 
-long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2, int val3)
+long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
+		u32 __user *uaddr2, u32 val2, u32 val3)
 {
 	int ret;
 
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1782 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1783 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1784 break;
1785 case FUTEX_LOCK_PI:
1786 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1787 break;
1788 case FUTEX_UNLOCK_PI:
1789 ret = futex_unlock_pi(uaddr);
1790 break;
1791 case FUTEX_TRYLOCK_PI:
1792 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1793 break;
1027 default: 1794 default:
1028 ret = -ENOSYS; 1795 ret = -ENOSYS;
1029 } 1796 }
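
These three cases are the kernel half of the PI protocol; userspace only gets here on contention, since an uncontended PI mutex is taken with a plain cmpxchg of 0 -> TID on the futex word. A hedged sketch of the slow-path calls (the opcode values for FUTEX_LOCK_PI and FUTEX_UNLOCK_PI are taken from the era's linux/futex.h and are an assumption here):

    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    #define FUTEX_LOCK_PI    6
    #define FUTEX_UNLOCK_PI  7

    static long pi_lock_slow(unsigned int *futex, const struct timespec *abstime)
    {
            /* the kernel assigns ownership and priority-boosts the holder */
            return syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, abstime, NULL, 0);
    }

    static long pi_unlock_slow(unsigned int *futex)
    {
            return syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }
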
@@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1798}
1032 1799
1033 1800
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1801asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1802 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1803 u32 val3)
1037{ 1804{
1038 struct timespec t; 1805 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1806 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1807 u32 val2 = 0;
1041 1808
1042 if (utime && (op == FUTEX_WAIT)) { 1809 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1810 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1811 return -EFAULT;
1045 if (!timespec_valid(&t)) 1812 if (!timespec_valid(&t))
1046 return -EINVAL; 1813 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1814 if (op == FUTEX_WAIT)
1815 timeout = timespec_to_jiffies(&t) + 1;
1816 else {
1817 timeout = t.tv_sec;
1818 val2 = t.tv_nsec;
1819 }
1048 } 1820 }
1049 /* 1821 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1822 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1823 */
1052 if (op >= FUTEX_REQUEUE) 1824 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1825 val2 = (u32) (unsigned long) utime;
1054 1826
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1827 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1828}
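
Note the asymmetry this hunk introduces: FUTEX_WAIT still converts the timespec into a relative jiffies count, while FUTEX_LOCK_PI smuggles the seconds through 'timeout' and the nanoseconds through 'val2', presumably so futex_lock_pi() can rebuild a high-resolution absolute deadline. Under that assumption a caller would build the timespec like this (sketch only):

    #include <time.h>

    struct timespec ts;

    /* assumed: FUTEX_LOCK_PI deadlines are absolute CLOCK_REALTIME */
    clock_gettime(CLOCK_REALTIME, &ts);
    ts.tv_sec += 1;                 /* give up one second from now */
    /* pass &ts as the utime argument of sys_futex(FUTEX_LOCK_PI) */
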
1058 1829
1059static int futexfs_get_sb(struct file_system_type *fs_type, 1830static int futexfs_get_sb(struct file_system_type *fs_type,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if (utime && (op == FUTEX_WAIT)) { 132 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t)) 135 if (!timespec_valid(&t))
136 return -EINVAL; 136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 137 if (op == FUTEX_WAIT)
138 timeout = timespec_to_jiffies(&t) + 1;
139 else {
140 timeout = t.tv_sec;
141 val2 = t.tv_nsec;
142 }
138 } 143 }
139 if (op >= FUTEX_REQUEUE) 144 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 145 val2 = (int) (unsigned long) utime;
141 146
142 return do_futex((unsigned long)uaddr, op, val, timeout, 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
143 (unsigned long)uaddr2, val2, val3);
144} 148}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 55601b3ce60e..8d3dc29ef41a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu)
833} 833}
834#endif /* CONFIG_HOTPLUG_CPU */ 834#endif /* CONFIG_HOTPLUG_CPU */
835 835
836static int hrtimer_cpu_notify(struct notifier_block *self, 836static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
837 unsigned long action, void *hcpu) 837 unsigned long action, void *hcpu)
838{ 838{
839 long cpu = (long)hcpu; 839 long cpu = (long)hcpu;
@@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
857 return NOTIFY_OK; 857 return NOTIFY_OK;
858} 858}
859 859
860static struct notifier_block hrtimers_nb = { 860static struct notifier_block __devinitdata hrtimers_nb = {
861 .notifier_call = hrtimer_cpu_notify, 861 .notifier_call = hrtimer_cpu_notify,
862}; 862};
863 863
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 036b6285b15c..e38e4bac97ca 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/poison.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
381 382
382void debug_mutex_init_waiter(struct mutex_waiter *waiter) 383void debug_mutex_init_waiter(struct mutex_waiter *waiter)
383{ 384{
384 memset(waiter, 0x11, sizeof(*waiter)); 385 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
385 waiter->magic = waiter; 386 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list); 387 INIT_LIST_HEAD(&waiter->list);
387} 388}
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
397void debug_mutex_free_waiter(struct mutex_waiter *waiter) 398void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{ 399{
399 DEBUG_WARN_ON(!list_empty(&waiter->list)); 400 DEBUG_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter)); 401 memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
401} 402}
402 403
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 404void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
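
The switch from bare magic numbers to named poison values is purely cosmetic; the new <linux/poison.h> include evidently centralizes them. Judging from the literals being replaced, the definitions amount to:

    /* inferred from the 0x11/0x22 literals replaced above */
    #define MUTEX_DEBUG_INIT        0x11
    #define MUTEX_DEBUG_FREE        0x22
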
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index fc311a4673a2..857b4fa09124 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -38,13 +38,22 @@ config PM_DEBUG
38 38
39config PM_TRACE 39config PM_TRACE
40 bool "Suspend/resume event tracing" 40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
42 default y 42 default n
43 ---help--- 43 ---help---
44 This enables some cheesy code to save the last PM event point in the 44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs 45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume). 46 during suspend (or more commonly, during resume).
47 47
48 To use this debugging feature you should attempt to suspend the machine,
49 then reboot it, then run
50
51 dmesg -s 1000000 | grep 'hash matches'
52
53 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume.
55
56
48config SOFTWARE_SUSPEND 57config SOFTWARE_SUSPEND
49 bool "Software Suspend" 58 bool "Software Suspend"
50 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 59 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -299,7 +299,7 @@ out:
299} 299}
300 300
301#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
302static int profile_cpu_callback(struct notifier_block *info, 302static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
304{ 304{
305 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710fc21c..f464f5ae3f11 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
182 return rcu_ctrlblk.completed; 182 return rcu_ctrlblk.completed;
183} 183}
184 184
185/*
186 * Return the number of RCU batches processed thus far. Useful
187 * for debug and statistics.
188 */
189long rcu_batches_completed_bh(void)
190{
191 return rcu_bh_ctrlblk.completed;
192}
193
185static void rcu_barrier_callback(struct rcu_head *notused) 194static void rcu_barrier_callback(struct rcu_head *notused)
186{ 195{
187 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 196 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
540} 549}
541 550
542static int rcu_cpu_notify(struct notifier_block *self, 551static int __devinit rcu_cpu_notify(struct notifier_block *self,
543 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
544{ 553{
545 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
556 return NOTIFY_OK; 565 return NOTIFY_OK;
557} 566}
558 567
559static struct notifier_block rcu_nb = { 568static struct notifier_block __devinitdata rcu_nb = {
560 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
561}; 570};
562 571
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0);
619module_param(rsinterval, int, 0); 628module_param(rsinterval, int, 0);
620#endif 629#endif
621EXPORT_SYMBOL_GPL(rcu_batches_completed); 630EXPORT_SYMBOL_GPL(rcu_batches_completed);
631EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
622EXPORT_SYMBOL_GPL(call_rcu); 632EXPORT_SYMBOL_GPL(call_rcu);
623EXPORT_SYMBOL_GPL(call_rcu_bh); 633EXPORT_SYMBOL_GPL(call_rcu_bh);
624EXPORT_SYMBOL_GPL(synchronize_rcu); 634EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4d1c3d247127 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
53static int verbose; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56static char *torture_type = "rcu"; /* What to torture. */
56 57
57module_param(nreaders, int, 0); 58module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 59MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 65MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65module_param(shuffle_interval, int, 0); 66module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 67MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 68module_param(torture_type, charp, 0);
69MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
70
71#define TORTURE_FLAG "-torture:"
68#define PRINTK_STRING(s) \ 72#define PRINTK_STRING(s) \
69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 73 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
70#define VERBOSE_PRINTK_STRING(s) \ 74#define VERBOSE_PRINTK_STRING(s) \
71 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 75 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
72#define VERBOSE_PRINTK_ERRSTRING(s) \ 76#define VERBOSE_PRINTK_ERRSTRING(s) \
73 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 77 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
74 78
75static char printk_buf[4096]; 79static char printk_buf[4096];
76 80
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
139 spin_unlock_bh(&rcu_torture_lock); 143 spin_unlock_bh(&rcu_torture_lock);
140} 144}
141 145
142static void
143rcu_torture_cb(struct rcu_head *p)
144{
145 int i;
146 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
147
148 if (fullstop) {
149 /* Test is ending, just drop callbacks on the floor. */
150 /* The next initialization will pick up the pieces. */
151 return;
152 }
153 i = rp->rtort_pipe_count;
154 if (i > RCU_TORTURE_PIPE_LEN)
155 i = RCU_TORTURE_PIPE_LEN;
156 atomic_inc(&rcu_torture_wcount[i]);
157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
159 rcu_torture_free(rp);
160 } else
161 call_rcu(p, rcu_torture_cb);
162}
163
164struct rcu_random_state { 146struct rcu_random_state {
165 unsigned long rrs_state; 147 unsigned long rrs_state;
166 unsigned long rrs_count; 148 unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
191} 173}
192 174
193/* 175/*
176 * Operations vector for selecting different types of tests.
177 */
178
179struct rcu_torture_ops {
180 void (*init)(void);
181 void (*cleanup)(void);
182 int (*readlock)(void);
183 void (*readunlock)(int idx);
184 int (*completed)(void);
185 void (*deferredfree)(struct rcu_torture *p);
186 int (*stats)(char *page);
187 char *name;
188};
189static struct rcu_torture_ops *cur_ops = NULL;
190
191/*
192 * Definitions for rcu torture testing.
193 */
194
195static int rcu_torture_read_lock(void)
196{
197 rcu_read_lock();
198 return 0;
199}
200
201static void rcu_torture_read_unlock(int idx)
202{
203 rcu_read_unlock();
204}
205
206static int rcu_torture_completed(void)
207{
208 return rcu_batches_completed();
209}
210
211static void
212rcu_torture_cb(struct rcu_head *p)
213{
214 int i;
215 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
216
217 if (fullstop) {
218 /* Test is ending, just drop callbacks on the floor. */
219 /* The next initialization will pick up the pieces. */
220 return;
221 }
222 i = rp->rtort_pipe_count;
223 if (i > RCU_TORTURE_PIPE_LEN)
224 i = RCU_TORTURE_PIPE_LEN;
225 atomic_inc(&rcu_torture_wcount[i]);
226 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
227 rp->rtort_mbtest = 0;
228 rcu_torture_free(rp);
229 } else
230 cur_ops->deferredfree(rp);
231}
232
233static void rcu_torture_deferred_free(struct rcu_torture *p)
234{
235 call_rcu(&p->rtort_rcu, rcu_torture_cb);
236}
237
238static struct rcu_torture_ops rcu_ops = {
239 .init = NULL,
240 .cleanup = NULL,
241 .readlock = rcu_torture_read_lock,
242 .readunlock = rcu_torture_read_unlock,
243 .completed = rcu_torture_completed,
244 .deferredfree = rcu_torture_deferred_free,
245 .stats = NULL,
246 .name = "rcu"
247};
248
249/*
250 * Definitions for rcu_bh torture testing.
251 */
252
253static int rcu_bh_torture_read_lock(void)
254{
255 rcu_read_lock_bh();
256 return 0;
257}
258
259static void rcu_bh_torture_read_unlock(int idx)
260{
261 rcu_read_unlock_bh();
262}
263
264static int rcu_bh_torture_completed(void)
265{
266 return rcu_batches_completed_bh();
267}
268
269static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
270{
271 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
272}
273
274static struct rcu_torture_ops rcu_bh_ops = {
275 .init = NULL,
276 .cleanup = NULL,
277 .readlock = rcu_bh_torture_read_lock,
278 .readunlock = rcu_bh_torture_read_unlock,
279 .completed = rcu_bh_torture_completed,
280 .deferredfree = rcu_bh_torture_deferred_free,
281 .stats = NULL,
282 .name = "rcu_bh"
283};
284
285static struct rcu_torture_ops *torture_ops[] =
286 { &rcu_ops, &rcu_bh_ops, NULL };
287
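
With the ops vector and the NULL-terminated table in place, the flavor under test is selected by name at module load time, e.g. (parameter names as declared above):

    modprobe rcutorture torture_type=rcu_bh stat_interval=30 verbose=1
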
288/*
194 * RCU torture writer kthread. Repeatedly substitutes a new structure 289 * RCU torture writer kthread. Repeatedly substitutes a new structure
195 * for that pointed to by rcu_torture_current, freeing the old structure 290 * for that pointed to by rcu_torture_current, freeing the old structure
196 * after a series of grace periods (the "pipeline"). 291 * after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
209 304
210 do { 305 do {
211 schedule_timeout_uninterruptible(1); 306 schedule_timeout_uninterruptible(1);
212 if (rcu_batches_completed() == oldbatch)
213 continue;
214 if ((rp = rcu_torture_alloc()) == NULL) 307 if ((rp = rcu_torture_alloc()) == NULL)
215 continue; 308 continue;
216 rp->rtort_pipe_count = 0; 309 rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
225 i = RCU_TORTURE_PIPE_LEN; 318 i = RCU_TORTURE_PIPE_LEN;
226 atomic_inc(&rcu_torture_wcount[i]); 319 atomic_inc(&rcu_torture_wcount[i]);
227 old_rp->rtort_pipe_count++; 320 old_rp->rtort_pipe_count++;
228 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 321 cur_ops->deferredfree(old_rp);
229 } 322 }
230 rcu_torture_current_version++; 323 rcu_torture_current_version++;
231 oldbatch = rcu_batches_completed(); 324 oldbatch = cur_ops->completed();
232 } while (!kthread_should_stop() && !fullstop); 325 } while (!kthread_should_stop() && !fullstop);
233 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 326 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
234 while (!kthread_should_stop()) 327 while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
246rcu_torture_reader(void *arg) 339rcu_torture_reader(void *arg)
247{ 340{
248 int completed; 341 int completed;
342 int idx;
249 DEFINE_RCU_RANDOM(rand); 343 DEFINE_RCU_RANDOM(rand);
250 struct rcu_torture *p; 344 struct rcu_torture *p;
251 int pipe_count; 345 int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
254 set_user_nice(current, 19); 348 set_user_nice(current, 19);
255 349
256 do { 350 do {
257 rcu_read_lock(); 351 idx = cur_ops->readlock();
258 completed = rcu_batches_completed(); 352 completed = cur_ops->completed();
259 p = rcu_dereference(rcu_torture_current); 353 p = rcu_dereference(rcu_torture_current);
260 if (p == NULL) { 354 if (p == NULL) {
261 /* Wait for rcu_torture_writer to get underway */ 355 /* Wait for rcu_torture_writer to get underway */
262 rcu_read_unlock(); 356 cur_ops->readunlock(idx);
263 schedule_timeout_interruptible(HZ); 357 schedule_timeout_interruptible(HZ);
264 continue; 358 continue;
265 } 359 }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
273 pipe_count = RCU_TORTURE_PIPE_LEN; 367 pipe_count = RCU_TORTURE_PIPE_LEN;
274 } 368 }
275 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 369 ++__get_cpu_var(rcu_torture_count)[pipe_count];
276 completed = rcu_batches_completed() - completed; 370 completed = cur_ops->completed() - completed;
277 if (completed > RCU_TORTURE_PIPE_LEN) { 371 if (completed > RCU_TORTURE_PIPE_LEN) {
278 /* Should not happen, but... */ 372 /* Should not happen, but... */
279 completed = RCU_TORTURE_PIPE_LEN; 373 completed = RCU_TORTURE_PIPE_LEN;
280 } 374 }
281 ++__get_cpu_var(rcu_torture_batch)[completed]; 375 ++__get_cpu_var(rcu_torture_batch)[completed];
282 preempt_enable(); 376 preempt_enable();
283 rcu_read_unlock(); 377 cur_ops->readunlock(idx);
284 schedule(); 378 schedule();
285 } while (!kthread_should_stop() && !fullstop); 379 } while (!kthread_should_stop() && !fullstop);
286 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 380 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
311 if (pipesummary[i] != 0) 405 if (pipesummary[i] != 0)
312 break; 406 break;
313 } 407 }
314 cnt += sprintf(&page[cnt], "rcutorture: "); 408 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
315 cnt += sprintf(&page[cnt], 409 cnt += sprintf(&page[cnt],
316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 410 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d", 411 "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
324 atomic_read(&n_rcu_torture_mberror)); 418 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0) 419 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!"); 420 cnt += sprintf(&page[cnt], " !!!");
327 cnt += sprintf(&page[cnt], "\nrcutorture: "); 421 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
328 if (i > 1) { 422 if (i > 1) {
329 cnt += sprintf(&page[cnt], "!!! "); 423 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error); 424 atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
332 cnt += sprintf(&page[cnt], "Reader Pipe: "); 426 cnt += sprintf(&page[cnt], "Reader Pipe: ");
333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 427 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 428 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
335 cnt += sprintf(&page[cnt], "\nrcutorture: "); 429 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
336 cnt += sprintf(&page[cnt], "Reader Batch: "); 430 cnt += sprintf(&page[cnt], "Reader Batch: ");
337 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 431 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
338 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 432 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
339 cnt += sprintf(&page[cnt], "\nrcutorture: "); 433 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
340 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 434 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
341 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
342 cnt += sprintf(&page[cnt], " %d", 436 cnt += sprintf(&page[cnt], " %d",
343 atomic_read(&rcu_torture_wcount[i])); 437 atomic_read(&rcu_torture_wcount[i]));
344 } 438 }
345 cnt += sprintf(&page[cnt], "\n"); 439 cnt += sprintf(&page[cnt], "\n");
440 if (cur_ops->stats != NULL)
441 cnt += cur_ops->stats(&page[cnt]);
346 return cnt; 442 return cnt;
347} 443}
348 444
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
444static inline void 540static inline void
445rcu_torture_print_module_parms(char *tag) 541rcu_torture_print_module_parms(char *tag)
446{ 542{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " 543 printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 544 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n", 545 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, 546 torture_type, tag, nrealreaders, stat_interval, verbose,
451 shuffle_interval); 547 test_no_idle_hz, shuffle_interval);
452} 548}
453 549
454static void 550static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
493 rcu_barrier(); 589 rcu_barrier();
494 590
495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 591 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
592
593 if (cur_ops->cleanup != NULL)
594 cur_ops->cleanup();
496 if (atomic_read(&n_rcu_torture_error)) 595 if (atomic_read(&n_rcu_torture_error))
497 rcu_torture_print_module_parms("End of test: FAILURE"); 596 rcu_torture_print_module_parms("End of test: FAILURE");
498 else 597 else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
508 607
509 /* Process args and tell the world that the torturer is on the job. */ 608 /* Process args and tell the world that the torturer is on the job. */
510 609
610 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
611 cur_ops = torture_ops[i];
612 if (strcmp(torture_type, cur_ops->name) == 0) {
613 break;
614 }
615 }
616 if (cur_ops == NULL) {
617 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
618 torture_type);
619 return (-EINVAL);
620 }
621 if (cur_ops->init != NULL)
622 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
623
511 if (nreaders >= 0) 624 if (nreaders >= 0)
512 nrealreaders = nreaders; 625 nrealreaders = nreaders;
513 else 626 else
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..2404f9b0bc47 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -232,6 +232,44 @@ int release_resource(struct resource *old)
232 232
233EXPORT_SYMBOL(release_resource); 233EXPORT_SYMBOL(release_resource);
234 234
235#ifdef CONFIG_MEMORY_HOTPLUG
236/*
237 * Finds the lowest memory resource that exists within [res->start, res->end).
238 * The caller must specify res->start, res->end and res->flags.
239 * If found, returns 0 and res is overwritten; if not found, returns -1.
240 */
241int find_next_system_ram(struct resource *res)
242{
243 resource_size_t start, end;
244 struct resource *p;
245
246 BUG_ON(!res);
247
248 start = res->start;
249 end = res->end;
250
251 read_lock(&resource_lock);
252 for (p = iomem_resource.child; p ; p = p->sibling) {
253 /* system ram is just marked as IORESOURCE_MEM */
254 if (p->flags != res->flags)
255 continue;
256 if (p->start > end) {
257 p = NULL;
258 break;
259 }
260 if (p->start >= start)
261 break;
262 }
263 read_unlock(&resource_lock);
264 if (!p)
265 return -1;
266 /* copy data */
267 res->start = p->start;
268 res->end = p->end;
269 return 0;
270}
271#endif
272
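
A sketch of how a memory-hotplug caller might walk System RAM with this helper; only the contract comes from the comment above, the iteration pattern is an assumption:

    static void for_each_system_ram(resource_size_t start, resource_size_t end)
    {
            struct resource res;

            res.start = start;
            res.end = end;
            res.flags = IORESOURCE_MEM;     /* System RAM marker, per the comment */

            while (find_next_system_ram(&res) >= 0) {
                    /* process [res.start, res.end] here */
                    if (res.end >= end)
                            break;
                    res.start = res.end + 1;        /* resume after this chunk */
                    res.end = end;
            }
    }
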
235/* 273/*
236 * Find empty slot in the resource tree given range and alignment. 274 * Find empty slot in the resource tree given range and alignment.
237 */ 275 */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..4aa8a2c9f453
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This code is based on the rt.c implementation in the preempt-rt tree.
10 * Portions of said code are
11 *
12 * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Copyright (C) 2006 Esben Nielsen
14 * Copyright (C) 2006 Kihon Technologies Inc.,
15 * Steven Rostedt <rostedt@goodmis.org>
16 *
17 * See rt.c in preempt-rt for proper credits and further information
18 */
19#include <linux/config.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/module.h>
23#include <linux/spinlock.h>
24#include <linux/kallsyms.h>
25#include <linux/syscalls.h>
26#include <linux/interrupt.h>
27#include <linux/plist.h>
28#include <linux/fs.h>
29
30#include "rtmutex_common.h"
31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x)
40
41# define TRACE_OFF() \
42do { \
43 if (rt_trace_on) { \
44 rt_trace_on = 0; \
45 console_verbose(); \
46 if (spin_is_locked(&current->pi_lock)) \
47 spin_unlock(&current->pi_lock); \
48 if (spin_is_locked(&current->held_list_lock)) \
49 spin_unlock(&current->held_list_lock); \
50 } \
51} while (0)
52
53# define TRACE_OFF_NOLOCK() \
54do { \
55 if (rt_trace_on) { \
56 rt_trace_on = 0; \
57 console_verbose(); \
58 } \
59} while (0)
60
61# define TRACE_BUG_LOCKED() \
62do { \
63 TRACE_OFF(); \
64 BUG(); \
65} while (0)
66
67# define TRACE_WARN_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) { \
70 TRACE_OFF(); \
71 WARN_ON(1); \
72 } \
73} while (0)
74
75# define TRACE_BUG_ON_LOCKED(c) \
76do { \
77 if (unlikely(c)) \
78 TRACE_BUG_LOCKED(); \
79} while (0)
80
81#ifdef CONFIG_SMP
82# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
83#else
84# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
85#endif
86
87/*
88 * deadlock detection flag. We turn it off when we detect
89 * the first problem because we don't want to recurse back
90 * into the tracing code when doing error printk or
91 * executing a BUG():
92 */
93int rt_trace_on = 1;
94
95void deadlock_trace_off(void)
96{
97 rt_trace_on = 0;
98}
99
100static void printk_task(task_t *p)
101{
102 if (p)
103 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
104 else
105 printk("<none>");
106}
107
108static void printk_task_short(task_t *p)
109{
110 if (p)
111 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
112 else
113 printk("<none>");
114}
115
116static void printk_lock(struct rt_mutex *lock, int print_owner)
117{
118 if (lock->name)
119 printk(" [%p] {%s}\n",
120 lock, lock->name);
121 else
122 printk(" [%p] {%s:%d}\n",
123 lock, lock->file, lock->line);
124
125 if (print_owner && rt_mutex_owner(lock)) {
126 printk(".. ->owner: %p\n", lock->owner);
127 printk(".. held by: ");
128 printk_task(rt_mutex_owner(lock));
129 printk("\n");
130 }
131 if (rt_mutex_owner(lock)) {
132 printk("... acquired at: ");
133 print_symbol("%s\n", lock->acquire_ip);
134 }
135}
136
137static void printk_waiter(struct rt_mutex_waiter *w)
138{
139 printk("-------------------------\n");
140 printk("| waiter struct %p:\n", w);
141 printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
142 w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
143 w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
144 w->list_entry.prio);
145 printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
146 w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
147 w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
148 w->pi_list_entry.prio);
149 printk("\n| lock:\n");
150 printk_lock(w->lock, 1);
151 printk("| w->ti->task:\n");
152 printk_task(w->task);
153 printk("| blocked at: ");
154 print_symbol("%s\n", w->ip);
155 printk("-------------------------\n");
156}
157
158static void show_task_locks(task_t *p)
159{
160 switch (p->state) {
161 case TASK_RUNNING: printk("R"); break;
162 case TASK_INTERRUPTIBLE: printk("S"); break;
163 case TASK_UNINTERRUPTIBLE: printk("D"); break;
164 case TASK_STOPPED: printk("T"); break;
165 case EXIT_ZOMBIE: printk("Z"); break;
166 case EXIT_DEAD: printk("X"); break;
167 default: printk("?"); break;
168 }
169 printk_task(p);
170 if (p->pi_blocked_on) {
171 struct rt_mutex *lock = p->pi_blocked_on->lock;
172
173 printk(" blocked on:");
174 printk_lock(lock, 1);
175 } else
176 printk(" (not blocked)\n");
177}
178
179void rt_mutex_show_held_locks(task_t *task, int verbose)
180{
181 struct list_head *curr, *cursor = NULL;
182 struct rt_mutex *lock;
183 task_t *t;
184 unsigned long flags;
185 int count = 0;
186
187 if (!rt_trace_on)
188 return;
189
190 if (verbose) {
191 printk("------------------------------\n");
192 printk("| showing all locks held by: | (");
193 printk_task_short(task);
194 printk("):\n");
195 printk("------------------------------\n");
196 }
197
198next:
199 spin_lock_irqsave(&task->held_list_lock, flags);
200 list_for_each(curr, &task->held_list_head) {
201 if (cursor && curr != cursor)
202 continue;
203 lock = list_entry(curr, struct rt_mutex, held_list_entry);
204 t = rt_mutex_owner(lock);
205 WARN_ON(t != task);
206 count++;
207 cursor = curr->next;
208 spin_unlock_irqrestore(&task->held_list_lock, flags);
209
210 printk("\n#%03d: ", count);
211 printk_lock(lock, 0);
212 goto next;
213 }
214 spin_unlock_irqrestore(&task->held_list_lock, flags);
215
216 printk("\n");
217}
218
219void rt_mutex_show_all_locks(void)
220{
221 task_t *g, *p;
222 int count = 10;
223 int unlock = 1;
224
225 printk("\n");
226 printk("----------------------\n");
227 printk("| showing all tasks: |\n");
228 printk("----------------------\n");
229
230 /*
231 * Here we try to get the tasklist_lock as hard as possible,
232 * if not successful after 2 seconds we ignore it (but keep
233 * trying). This is to enable a debug printout even if a
234 * tasklist_lock-holding task deadlocks or crashes.
235 */
236retry:
237 if (!read_trylock(&tasklist_lock)) {
238 if (count == 10)
239 printk("hm, tasklist_lock locked, retrying... ");
240 if (count) {
241 count--;
242 printk(" #%d", 10-count);
243 mdelay(200);
244 goto retry;
245 }
246 printk(" ignoring it.\n");
247 unlock = 0;
248 }
249 if (count != 10)
250 printk(" locked it.\n");
251
252 do_each_thread(g, p) {
253 show_task_locks(p);
254 if (!unlock)
255 if (read_trylock(&tasklist_lock))
256 unlock = 1;
257 } while_each_thread(g, p);
258
259 printk("\n");
260
261 printk("-----------------------------------------\n");
262 printk("| showing all locks held in the system: |\n");
263 printk("-----------------------------------------\n");
264
265 do_each_thread(g, p) {
266 rt_mutex_show_held_locks(p, 0);
267 if (!unlock)
268 if (read_trylock(&tasklist_lock))
269 unlock = 1;
270 } while_each_thread(g, p);
271
272
273 printk("=============================================\n\n");
274
275 if (unlock)
276 read_unlock(&tasklist_lock);
277}
278
279void rt_mutex_debug_check_no_locks_held(task_t *task)
280{
281 struct rt_mutex_waiter *w;
282 struct list_head *curr;
283 struct rt_mutex *lock;
284
285 if (!rt_trace_on)
286 return;
287 if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
288 printk("BUG: PI priority boost leaked!\n");
289 printk_task(task);
290 printk("\n");
291 }
292 if (list_empty(&task->held_list_head))
293 return;
294
295 spin_lock(&task->pi_lock);
296 plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
297 TRACE_OFF();
298
299 printk("hm, PI interest held at exit time? Task:\n");
300 printk_task(task);
301 printk_waiter(w);
302 return;
303 }
304 spin_unlock(&task->pi_lock);
305
306 list_for_each(curr, &task->held_list_head) {
307 lock = list_entry(curr, struct rt_mutex, held_list_entry);
308
309 printk("BUG: %s/%d, lock held at task exit time!\n",
310 task->comm, task->pid);
311 printk_lock(lock, 1);
312 if (rt_mutex_owner(lock) != task)
313 printk("exiting task is not even the owner??\n");
314 }
315}
316
317int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
318{
319 const void *to = from + len;
320 struct list_head *curr;
321 struct rt_mutex *lock;
322 unsigned long flags;
323 void *lock_addr;
324
325 if (!rt_trace_on)
326 return 0;
327
328 spin_lock_irqsave(&current->held_list_lock, flags);
329 list_for_each(curr, &current->held_list_head) {
330 lock = list_entry(curr, struct rt_mutex, held_list_entry);
331 lock_addr = lock;
332 if (lock_addr < from || lock_addr >= to)
333 continue;
334 TRACE_OFF();
335
336 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
337 current->comm, current->pid, lock, from, to);
338 dump_stack();
339 printk_lock(lock, 1);
340 if (rt_mutex_owner(lock) != current)
341 printk("freeing task is not even the owner??\n");
342 return 1;
343 }
344 spin_unlock_irqrestore(&current->held_list_lock, flags);
345
346 return 0;
347}
348
349void rt_mutex_debug_task_free(struct task_struct *task)
350{
351 WARN_ON(!plist_head_empty(&task->pi_waiters));
352 WARN_ON(task->pi_blocked_on);
353}
354
355/*
356 * We fill out the fields in the waiter to store the information about
357 * the deadlock. We print when we return. act_waiter can be NULL in
358 * case of a remove waiter operation.
359 */
360void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
361 struct rt_mutex *lock)
362{
363 struct task_struct *task;
364
365 if (!rt_trace_on || detect || !act_waiter)
366 return;
367
368 task = rt_mutex_owner(act_waiter->lock);
369 if (task && task != current) {
370 act_waiter->deadlock_task_pid = task->pid;
371 act_waiter->deadlock_lock = lock;
372 }
373}
374
375void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
376{
377 struct task_struct *task;
378
379 if (!waiter->deadlock_lock || !rt_trace_on)
380 return;
381
382 task = find_task_by_pid(waiter->deadlock_task_pid);
383 if (!task)
384 return;
385
386 TRACE_OFF_NOLOCK();
387
388 printk("\n============================================\n");
389 printk( "[ BUG: circular locking deadlock detected! ]\n");
390 printk( "--------------------------------------------\n");
391 printk("%s/%d is deadlocking current task %s/%d\n\n",
392 task->comm, task->pid, current->comm, current->pid);
393
394 printk("\n1) %s/%d is trying to acquire this lock:\n",
395 current->comm, current->pid);
396 printk_lock(waiter->lock, 1);
397
398 printk("... trying at: ");
399 print_symbol("%s\n", waiter->ip);
400
401 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
402 printk_lock(waiter->deadlock_lock, 1);
403
404 rt_mutex_show_held_locks(current, 1);
405 rt_mutex_show_held_locks(task, 1);
406
407 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
408 show_stack(task, NULL);
409 printk("\n%s/%d's [current] stackdump:\n\n",
410 current->comm, current->pid);
411 dump_stack();
412 rt_mutex_show_all_locks();
413 printk("[ turning off deadlock detection."
414 "Please report this trace. ]\n\n");
415 local_irq_disable();
416}
417
418void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
419{
420 unsigned long flags;
421
422 if (rt_trace_on) {
423 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
424
425 spin_lock_irqsave(&current->held_list_lock, flags);
426 list_add_tail(&lock->held_list_entry, &current->held_list_head);
427 spin_unlock_irqrestore(&current->held_list_lock, flags);
428
429 lock->acquire_ip = ip;
430 }
431}
432
433void debug_rt_mutex_unlock(struct rt_mutex *lock)
434{
435 unsigned long flags;
436
437 if (rt_trace_on) {
438 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
439 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
440
441 spin_lock_irqsave(&current->held_list_lock, flags);
442 list_del_init(&lock->held_list_entry);
443 spin_unlock_irqrestore(&current->held_list_lock, flags);
444 }
445}
446
447void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
448 struct task_struct *powner __IP_DECL__)
449{
450 unsigned long flags;
451
452 if (rt_trace_on) {
453 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
454
455 spin_lock_irqsave(&powner->held_list_lock, flags);
456 list_add_tail(&lock->held_list_entry, &powner->held_list_head);
457 spin_unlock_irqrestore(&powner->held_list_lock, flags);
458
459 lock->acquire_ip = ip;
460 }
461}
462
463void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
464{
465 unsigned long flags;
466
467 if (rt_trace_on) {
468 struct task_struct *owner = rt_mutex_owner(lock);
469
470 TRACE_WARN_ON_LOCKED(!owner);
471 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
472
473 spin_lock_irqsave(&owner->held_list_lock, flags);
474 list_del_init(&lock->held_list_entry);
475 spin_unlock_irqrestore(&owner->held_list_lock, flags);
476 }
477}
478
479void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
480{
481 memset(waiter, 0x11, sizeof(*waiter));
482 plist_node_init(&waiter->list_entry, MAX_PRIO);
483 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
484}
485
486void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
487{
488 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
489 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
490 TRACE_WARN_ON(waiter->task);
491 memset(waiter, 0x22, sizeof(*waiter));
492}
493
494void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
495{
496 void *addr = lock;
497
498 if (rt_trace_on) {
499 rt_mutex_debug_check_no_locks_freed(addr,
500 sizeof(struct rt_mutex));
501 INIT_LIST_HEAD(&lock->held_list_entry);
502 lock->name = name;
503 }
504}
505
506void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
507{
508}
509
510void rt_mutex_deadlock_account_unlock(struct task_struct *task)
511{
512}
513
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..7612fbc62d70
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */
11
12#define __IP_DECL__ , unsigned long ip
13#define __IP__ , ip
14#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
15
16extern void
17rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
18extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
19extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
20extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
21extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
22extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
23extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
24extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
25 struct task_struct *powner __IP_DECL__);
26extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
27extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
28 struct rt_mutex *lock);
29extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
30# define debug_rt_mutex_reset_waiter(w) \
31 do { (w)->deadlock_lock = NULL; } while (0)
32
33static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
34 int detect)
35{
36 return (waiter != NULL);
37}
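
These __IP_DECL__/__IP__/__RET_IP__ macros let the debug build thread the caller's instruction pointer through the locking slow paths without touching the non-debug signatures (the non-debug rtmutex.h presumably defines all three as empty). A sketch of the intended call pattern; the function names here are illustrative:

    static void __rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
    {
            debug_rt_mutex_lock(lock __IP__);   /* records 'ip' as acquire_ip */
            /* ... actual acquisition ... */
    }

    void rt_mutex_lock(struct rt_mutex *lock)
    {
            /* __RET_IP__ appends ", (unsigned long)__builtin_return_address(0)" */
            __rt_mutex_lock(lock __RET_IP__);
    }
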
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..e82c2f848249
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,440 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/config.h>
10#include <linux/kthread.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/smp_lock.h>
14#include <linux/spinlock.h>
15#include <linux/sysdev.h>
16#include <linux/timer.h>
17
18#include "rtmutex.h"
19
20#define MAX_RT_TEST_THREADS 8
21#define MAX_RT_TEST_MUTEXES 8
22
23static spinlock_t rttest_lock;
24static atomic_t rttest_event;
25
26struct test_thread_data {
27 int opcode;
28 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event;
32 struct sys_device sysdev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static task_t *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54};
55
56static int handle_op(struct test_thread_data *td, int lockwakeup)
57{
58 int i, id, ret = -EINVAL;
59
60 switch(td->opcode) {
61
62 case RTTEST_NOP:
63 return 0;
64
65 case RTTEST_LOCKCONT:
66 td->mutexes[td->opdata] = 1;
67 td->event = atomic_add_return(1, &rttest_event);
68 return 0;
69
70 case RTTEST_RESET:
71 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
72 if (td->mutexes[i] == 4) {
73 rt_mutex_unlock(&mutexes[i]);
74 td->mutexes[i] = 0;
75 }
76 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0;
83
84 case RTTEST_RESETEVENT:
85 atomic_set(&rttest_event, 0);
86 return 0;
87
88 default:
89 if (lockwakeup)
90 return ret;
91 }
92
93 switch(td->opcode) {
94
95 case RTTEST_LOCK:
96 case RTTEST_LOCKNOWAIT:
97 id = td->opdata;
98 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
99 return ret;
100
101 td->mutexes[id] = 1;
102 td->event = atomic_add_return(1, &rttest_event);
103 rt_mutex_lock(&mutexes[id]);
104 td->event = atomic_add_return(1, &rttest_event);
105 td->mutexes[id] = 4;
106 return 0;
107
108 case RTTEST_LOCKINT:
109 case RTTEST_LOCKINTNOWAIT:
110 id = td->opdata;
111 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
112 return ret;
113
114 td->mutexes[id] = 1;
115 td->event = atomic_add_return(1, &rttest_event);
116 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
117 td->event = atomic_add_return(1, &rttest_event);
118 td->mutexes[id] = ret ? 0 : 4;
119 return ret ? -EINTR : 0;
120
121 case RTTEST_UNLOCK:
122 id = td->opdata;
123 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
124 return ret;
125
126 td->event = atomic_add_return(1, &rttest_event);
127 rt_mutex_unlock(&mutexes[id]);
128 td->event = atomic_add_return(1, &rttest_event);
129 td->mutexes[id] = 0;
130 return 0;
131
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default:
148 break;
149 }
150 return ret;
151}
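
The small integers stored in td->mutexes[] form an ad-hoc state machine shared between handle_op() and schedule_rt_mutex_test() below. The encoding is not documented in the source; inferred from the transitions it appears to be:

    /*
     * td->mutexes[i] state codes (inferred, not authoritative):
     *   0  idle / unlocked
     *   1  lock operation requested, not yet blocked
     *   2  blocked inside schedule_rt_mutex_test()
     *   3  woken, waiting for RTTEST_LOCKCONT to resume the acquisition
     *   4  lock held (RTTEST_UNLOCK and RTTEST_RESET act only on this state)
     */
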
152
153/*
154 * Schedule replacement for rtsem_down(). Only called for threads with
155 * PF_MUTEX_TESTER set.
156 *
157 * This allows us to have fine-grained control over the event flow.
158 *
159 */
160void schedule_rt_mutex_test(struct rt_mutex *mutex)
161{
162 int tid, op, dat;
163 struct test_thread_data *td;
164
165 /* We have to lookup the task */
166 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
167 if (threads[tid] == current)
168 break;
169 }
170
171 BUG_ON(tid == MAX_RT_TEST_THREADS);
172
173 td = &thread_data[tid];
174
175 op = td->opcode;
176 dat = td->opdata;
177
178 switch (op) {
179 case RTTEST_LOCK:
180 case RTTEST_LOCKINT:
181 case RTTEST_LOCKNOWAIT:
182 case RTTEST_LOCKINTNOWAIT:
183 if (mutex != &mutexes[dat])
184 break;
185
186 if (td->mutexes[dat] != 1)
187 break;
188
189 td->mutexes[dat] = 2;
190 td->event = atomic_add_return(1, &rttest_event);
191 break;
192
193 case RTTEST_LOCKBKL:
194 default:
195 break;
196 }
197
198 schedule();
199
200
201 switch (op) {
202 case RTTEST_LOCK:
203 case RTTEST_LOCKINT:
204 if (mutex != &mutexes[dat])
205 return;
206
207 if (td->mutexes[dat] != 2)
208 return;
209
210 td->mutexes[dat] = 3;
211 td->event = atomic_add_return(1, &rttest_event);
212 break;
213
214 case RTTEST_LOCKNOWAIT:
215 case RTTEST_LOCKINTNOWAIT:
216 if (mutex != &mutexes[dat])
217 return;
218
219 if (td->mutexes[dat] != 2)
220 return;
221
222 td->mutexes[dat] = 1;
223 td->event = atomic_add_return(1, &rttest_event);
224 return;
225
226 case RTTEST_LOCKBKL:
227 return;
228 default:
229 return;
230 }
231
232 td->opcode = 0;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236
237 if (td->opcode > 0) {
238 int ret;
239
240 set_current_state(TASK_RUNNING);
241 ret = handle_op(td, 1);
242 set_current_state(TASK_INTERRUPTIBLE);
243 if (td->opcode == RTTEST_LOCKCONT)
244 break;
245 td->opcode = ret;
246 }
247
248 /* Wait for the next command to be executed */
249 schedule();
250 }
251
252 /* Restore previous command and data */
253 td->opcode = op;
254 td->opdata = dat;
255}
256
257static int test_func(void *data)
258{
259 struct test_thread_data *td = data;
260 int ret;
261
262 current->flags |= PF_MUTEX_TESTER;
263 allow_signal(SIGHUP);
264
265 for (;;) {
266
267 set_current_state(TASK_INTERRUPTIBLE);
268
269 if (td->opcode > 0) {
270 set_current_state(TASK_RUNNING);
271 ret = handle_op(td, 0);
272 set_current_state(TASK_INTERRUPTIBLE);
273 td->opcode = ret;
274 }
275
276 /* Wait for the next command to be executed */
277 schedule();
278
279 if (signal_pending(current))
280 flush_signals(current);
281
282 if (kthread_should_stop())
283 break;
284 }
285 return 0;
286}
287
288/**
289 * sysfs_test_command - interface for test commands
290 * @dev: thread reference
291 * @buf: command for actual step
292 * @count: length of buffer
293 *
294 * command syntax:
295 *
296 * opcode:data
297 */
298static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
299 size_t count)
300{
301 struct sched_param schedpar;
302 struct test_thread_data *td;
303 char cmdbuf[32];
304 int op, dat, tid, ret;
305
306 td = container_of(dev, struct test_thread_data, sysdev);
307 tid = td->sysdev.id;
308
309 /* strings from sysfs write are not 0 terminated! */
310 if (count >= sizeof(cmdbuf))
311 return -EINVAL;
312
313 /* strip off \n: */
314 if (buf[count-1] == '\n')
315 count--;
316 if (count < 1)
317 return -EINVAL;
318
319 memcpy(cmdbuf, buf, count);
320 cmdbuf[count] = 0;
321
322 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
323 return -EINVAL;
324
325 switch (op) {
326 case RTTEST_SCHEDOT:
327 schedpar.sched_priority = 0;
328 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
329 if (ret)
330 return ret;
331 set_user_nice(current, 0);
332 break;
333
334 case RTTEST_SCHEDRT:
335 schedpar.sched_priority = dat;
336 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
337 if (ret)
338 return ret;
339 break;
340
341 case RTTEST_SIGNAL:
342 send_sig(SIGHUP, threads[tid], 0);
343 break;
344
345 default:
346 if (td->opcode > 0)
347 return -EBUSY;
348 td->opdata = dat;
349 td->opcode = op;
350 wake_up_process(threads[tid]);
351 }
352
353 return count;
354}
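
Putting the pieces together: a test harness drives each thread through its per-device command file and reads status back. A sketch in C; the /sys path is an assumption derived from the "rttest" sysdev class registered below, and "3:0" means RTTEST_LOCK on mutex 0 per the enum above:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static void rttest_cmd(int tid, const char *cmd)
    {
            char path[64];
            int fd;

            snprintf(path, sizeof(path),
                     "/sys/devices/system/rttest/rttest%d/command", tid);
            fd = open(path, O_WRONLY);
            if (fd >= 0) {
                    write(fd, cmd, strlen(cmd));
                    close(fd);
            }
    }

    /* e.g. rttest_cmd(0, "3:0");  -- thread 0: RTTEST_LOCK on mutex 0 */
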
355
356/**
357 * sysfs_test_status - sysfs interface for rt tester
358 * @dev: thread to query
359 * @buf: char buffer to be filled with thread status info
360 */
361static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
362{
363 struct test_thread_data *td;
364 char *curr = buf;
365 task_t *tsk;
366 int i;
367
368 td = container_of(dev, struct test_thread_data, sysdev);
369 tsk = threads[td->sysdev.id];
370
371 spin_lock(&rttest_lock);
372
373 curr += sprintf(curr,
374 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
375 td->opcode, td->event, tsk->state,
376 (MAX_RT_PRIO - 1) - tsk->prio,
377 (MAX_RT_PRIO - 1) - tsk->normal_prio,
378 tsk->pi_blocked_on, td->bkl);
379
380 for (i = MAX_RT_TEST_MUTEXES - 1; i >= 0; i--)
381 curr += sprintf(curr, "%d", td->mutexes[i]);
382
383 spin_unlock(&rttest_lock);
384
385 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
386 mutexes[td->sysdev.id].owner);
387
388 return curr - buf;
389}
390
391static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
392static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
393
394static struct sysdev_class rttest_sysclass = {
395 set_kset_name("rttest"),
396};
397
398static int init_test_thread(int id)
399{
400 thread_data[id].sysdev.cls = &rttest_sysclass;
401 thread_data[id].sysdev.id = id;
402
403 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
404 if (IS_ERR(threads[id]))
405 return PTR_ERR(threads[id]);
406
407 return sysdev_register(&thread_data[id].sysdev);
408}
409
410static int init_rttest(void)
411{
412 int ret, i;
413
414 spin_lock_init(&rttest_lock);
415
416 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
417 rt_mutex_init(&mutexes[i]);
418
419 ret = sysdev_class_register(&rttest_sysclass);
420 if (ret)
421 return ret;
422
423 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
424 ret = init_test_thread(i);
425 if (ret)
426 break;
427 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
428 if (ret)
429 break;
430 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
431 if (ret)
432 break;
433 }
434
435 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
436
437 return ret;
438}
439
440device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..45d61016da57
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
1/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner.
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen
10 */
11#include <linux/spinlock.h>
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/timer.h>
15
16#include "rtmutex_common.h"
17
18#ifdef CONFIG_DEBUG_RT_MUTEXES
19# include "rtmutex-debug.h"
20#else
21# include "rtmutex.h"
22#endif
23
24/*
25 * lock->owner state tracking:
26 *
27 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
28 * are used to keep track of the "owner is pending" and "lock has
29 * waiters" state.
30 *
31 * owner bit1 bit0
32 * NULL 0 0 lock is free (fast acquire possible)
33 * NULL 0 1 invalid state
34 * NULL 1 0 Transitional State*
35 * NULL 1 1 invalid state
36 * taskpointer 0 0 lock is held (fast release possible)
37 * taskpointer 0 1 task is pending owner
38 * taskpointer 1 0 lock is held and has waiters
39 * taskpointer 1 1 task is pending owner and lock has more waiters
40 *
41 * Pending ownership is assigned to the top (highest priority)
42 * waiter of the lock, when the lock is released. The thread is woken
43 * up and can now take the lock. Until the lock is taken (bit 0
44 * cleared) a competing higher priority thread can steal the lock
45 * which puts the woken up thread back on the waiters list.
46 *
47 * The fast atomic compare exchange based acquire and release is only
48 * possible when bit 0 and 1 of lock->owner are 0.
49 *
50 * (*) There's a small time where the owner can be NULL and the
51 * "lock has waiters" bit is set. This can happen when grabbing the lock.
52 * To prevent a cmpxchg of the owner releasing the lock, we need to set this
53 * bit before looking at the lock, hence the reason this is a transitional
54 * state.
55 */
56
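
For reference while reading the table: both flag bits live inside the owner pointer itself, so decoding is a mask-and-cast. A sketch consistent with the RT_MUTEX_HAS_WAITERS use below; the PENDING name and the helper are assumptions (the real definitions presumably live in rtmutex_common.h from the diffstat):

    #define RT_MUTEX_OWNER_PENDING  1UL     /* bit 0 of the table */
    #define RT_MUTEX_HAS_WAITERS    2UL     /* bit 1 of the table */
    #define RT_MUTEX_OWNER_MASKALL  3UL

    static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
    {
            return (struct task_struct *)
                    ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
    }
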
57static void
58rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
59 unsigned long mask)
60{
61 unsigned long val = (unsigned long)owner | mask;
62
63 if (rt_mutex_has_waiters(lock))
64 val |= RT_MUTEX_HAS_WAITERS;
65
66 lock->owner = (struct task_struct *)val;
67}
68
69static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
70{
71 lock->owner = (struct task_struct *)
72 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
73}
74
75static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
76{
77 if (!rt_mutex_has_waiters(lock))
78 clear_rt_mutex_waiters(lock);
79}
80
81/*
82 * We can speed up the acquire/release, if the architecture
83 * supports cmpxchg and if there's no debugging state to be set up
84 */
85#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
86# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
87static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
88{
89 unsigned long owner, *p = (unsigned long *) &lock->owner;
90
91 do {
92 owner = *p;
93 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
94}
95#else
96# define rt_mutex_cmpxchg(l,c,n) (0)
97static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
98{
99 lock->owner = (struct task_struct *)
100 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
101}
102#endif
103
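
This macro is what makes the lock-free fast path possible: an uncontended acquire is a single cmpxchg of NULL -> current. An illustrative trylock built on it (not code from this file):

    static inline int rt_mutex_fasttrylock(struct rt_mutex *lock)
    {
            /* succeeds only when the lock is free with no waiters,
             * i.e. both low owner bits clear; otherwise the caller
             * must take the slow path */
            return rt_mutex_cmpxchg(lock, NULL, current);
    }
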
104/*
105 * Calculate task priority from the waiter list priority
106 *
107 * Return task->normal_prio when the waiter list is empty or when
108 * the waiter is not allowed to do priority boosting
109 */
110int rt_mutex_getprio(struct task_struct *task)
111{
112 if (likely(!task_has_pi_waiters(task)))
113 return task->normal_prio;
114
115 return min(task_top_pi_waiter(task)->pi_list_entry.prio,
116 task->normal_prio);
117}
118
119/*
120 * Adjust the priority of a task, after its pi_waiters got modified.
121 *
122 * This can be both boosting and unboosting. task->pi_lock must be held.
123 */
124static void __rt_mutex_adjust_prio(struct task_struct *task)
125{
126 int prio = rt_mutex_getprio(task);
127
128 if (task->prio != prio)
129 rt_mutex_setprio(task, prio);
130}
131
132/*
133 * Adjust task priority (undo boosting). Called from the exit path of
134 * rt_mutex_slowunlock() and rt_mutex_slowlock().
135 *
136 * (Note: We do this outside of the protection of lock->wait_lock to
137 * allow the lock to be taken while or before we readjust the priority
138 * of task. We do not use the spin_xx_mutex() variants here as we are
139 * outside of the debug path.)
140 */
141static void rt_mutex_adjust_prio(struct task_struct *task)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&task->pi_lock, flags);
146 __rt_mutex_adjust_prio(task);
147 spin_unlock_irqrestore(&task->pi_lock, flags);
148}
149
150/*
151 * Max number of times we'll walk the boosting chain:
152 */
153int max_lock_depth = 1024;
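
/*
 * Editor's note: assuming the accompanying sysctl.c change in this
 * series exports the limit, an admin could raise it at runtime, e.g.:
 *
 *	echo 2048 > /proc/sys/kernel/max_lock_depth
 *
 * The proc path is an assumption based on the usual kernel sysctl layout.
 */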
154
155/*
156 * Adjust the priority chain. Also used for deadlock detection.
157 * Drops the task's reference count by one - may thus free the task.
158 * Returns 0 or -EDEADLK.
159 */
160static int rt_mutex_adjust_prio_chain(task_t *task,
161 int deadlock_detect,
162 struct rt_mutex *orig_lock,
163 struct rt_mutex_waiter *orig_waiter,
164 struct task_struct *top_task
165 __IP_DECL__)
166{
167 struct rt_mutex *lock;
168 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
169 int detect_deadlock, ret = 0, depth = 0;
170 unsigned long flags;
171
172 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
173 deadlock_detect);
174
175 /*
176 * The (de)boosting is a step-by-step approach with a lot of
177 * pitfalls. We want this to be preemptible and we want to hold a
178 * maximum of two locks per step. So we have to check
179 * carefully whether things change under us.
180 */
181 again:
182 if (++depth > max_lock_depth) {
183 static int prev_max;
184
185 /*
186 * Print this only once. If the admin changes the limit,
187 * print a new message when reaching the limit again.
188 */
189 if (prev_max != max_lock_depth) {
190 prev_max = max_lock_depth;
191 printk(KERN_WARNING "Maximum lock depth %d reached "
192 "task: %s (%d)\n", max_lock_depth,
193 top_task->comm, top_task->pid);
194 }
195 put_task_struct(task);
196
197 return deadlock_detect ? -EDEADLK : 0;
198 }
199 retry:
200 /*
201 * The task cannot go away as we did a get_task_struct() before!
202 */
203 spin_lock_irqsave(&task->pi_lock, flags);
204
205 waiter = task->pi_blocked_on;
206 /*
207 * Check whether the end of the boosting chain has been
208 * reached or the state of the chain has changed while we
209 * dropped the locks.
210 */
211 if (!waiter || !waiter->task)
212 goto out_unlock_pi;
213
214 if (top_waiter && (!task_has_pi_waiters(task) ||
215 top_waiter != task_top_pi_waiter(task)))
216 goto out_unlock_pi;
217
218 /*
219 * When deadlock detection is off, we check whether further
220 * priority adjustment is necessary.
221 */
222 if (!detect_deadlock && waiter->list_entry.prio == task->prio)
223 goto out_unlock_pi;
224
225 lock = waiter->lock;
226 if (!spin_trylock(&lock->wait_lock)) {
227 spin_unlock_irqrestore(&task->pi_lock, flags);
228 cpu_relax();
229 goto retry;
230 }
231
232 /* Deadlock detection */
233 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
234 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
235 spin_unlock(&lock->wait_lock);
236 ret = deadlock_detect ? -EDEADLK : 0;
237 goto out_unlock_pi;
238 }
239
240 top_waiter = rt_mutex_top_waiter(lock);
241
242 /* Requeue the waiter */
243 plist_del(&waiter->list_entry, &lock->wait_list);
244 waiter->list_entry.prio = task->prio;
245 plist_add(&waiter->list_entry, &lock->wait_list);
246
247 /* Release the task */
248 spin_unlock_irqrestore(&task->pi_lock, flags);
249 put_task_struct(task);
250
251 /* Grab the next task */
252 task = rt_mutex_owner(lock);
253 spin_lock_irqsave(&task->pi_lock, flags);
254
255 if (waiter == rt_mutex_top_waiter(lock)) {
256 /* Boost the owner */
257 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
258 waiter->pi_list_entry.prio = waiter->list_entry.prio;
259 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
260 __rt_mutex_adjust_prio(task);
261
262 } else if (top_waiter == waiter) {
263 /* Deboost the owner */
264 plist_del(&waiter->pi_list_entry, &task->pi_waiters);
265 waiter = rt_mutex_top_waiter(lock);
266 waiter->pi_list_entry.prio = waiter->list_entry.prio;
267 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
268 __rt_mutex_adjust_prio(task);
269 }
270
271 get_task_struct(task);
272 spin_unlock_irqrestore(&task->pi_lock, flags);
273
274 top_waiter = rt_mutex_top_waiter(lock);
275 spin_unlock(&lock->wait_lock);
276
277 if (!detect_deadlock && waiter != top_waiter)
278 goto out_put_task;
279
280 goto again;
281
282 out_unlock_pi:
283 spin_unlock_irqrestore(&task->pi_lock, flags);
284 out_put_task:
285 put_task_struct(task);
286 return ret;
287}
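
/*
 * Editor's sketch of one walk, illustration only: A (prio 90) blocks on
 * L1 owned by B (prio 120); task_blocks_on_rt_mutex() boosts B to 90.
 * If B is itself blocked on L2 owned by C (prio 120), the walk starts
 * at B: it requeues B's waiter on L2's wait_list at prio 90 and boosts
 * C to 90. The walk stops when an owner is not blocked, when nothing
 * changed, or after max_lock_depth steps (-EDEADLK if detection is on).
 */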
288
289/*
290 * Optimization: check if we can steal the lock from the
291 * assigned pending owner [which might not have taken the
292 * lock yet]:
293 */
294static inline int try_to_steal_lock(struct rt_mutex *lock)
295{
296 struct task_struct *pendowner = rt_mutex_owner(lock);
297 struct rt_mutex_waiter *next;
298 unsigned long flags;
299
300 if (!rt_mutex_owner_pending(lock))
301 return 0;
302
303 if (pendowner == current)
304 return 1;
305
306 spin_lock_irqsave(&pendowner->pi_lock, flags);
307 if (current->prio >= pendowner->prio) {
308 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
309 return 0;
310 }
311
312 /*
313 * Check if a waiter is enqueued on the pending owner's
314 * pi_waiters list. Remove it and readjust the pending owner's
315 * priority.
316 */
317 if (likely(!rt_mutex_has_waiters(lock))) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 1;
320 }
321
322 /* No chain handling, pending owner is not blocked on anything: */
323 next = rt_mutex_top_waiter(lock);
324 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
325 __rt_mutex_adjust_prio(pendowner);
326 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
327
328 /*
329 * We are going to steal the lock and a waiter was
330 * enqueued on the pending owner's pi_waiters queue. So
331 * we have to enqueue this waiter into the
332 * current->pi_waiters list. This covers the case
333 * where current is boosted because it holds another
334 * lock and gets unboosted because the booster is
335 * interrupted, so we would delay a waiter with higher
336 * priority than current->normal_prio.
337 *
338 * Note: in the rare case of a SCHED_OTHER task changing
339 * its priority and thus stealing the lock, next->task
340 * might be current:
341 */
342 if (likely(next->task != current)) {
343 spin_lock_irqsave(&current->pi_lock, flags);
344 plist_add(&next->pi_list_entry, &current->pi_waiters);
345 __rt_mutex_adjust_prio(current);
346 spin_unlock_irqrestore(&current->pi_lock, flags);
347 }
348 return 1;
349}
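
/*
 * Worked example (editor's note): the pending owner was woken at prio
 * 110 but has not run yet. If current has prio 90, the test
 * "current->prio >= pendowner->prio" fails (90 < 110), so current
 * steals the lock and the pending owner goes back to waiting.
 */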
350
351/*
352 * Try to take an rt-mutex
353 *
354 * This fails
355 * - when the lock has a real owner
356 * - when a different pending owner exists and has higher priority than current
357 *
358 * Must be called with lock->wait_lock held.
359 */
360static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
361{
362 /*
363 * We have to be careful here if the atomic speedups are
364 * enabled: when
365 * - no other waiter is on the lock, and
366 * - the lock has been released since we did the cmpxchg,
367 * the lock can be released or taken while we are doing the
368 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
369 *
370 * The atomic acquire/release aware variant of
371 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
372 * the WAITERS bit, the atomic release / acquire can not
373 * happen anymore and lock->wait_lock protects us from the
374 * non-atomic case.
375 *
376 * Note that this might set lock->owner =
377 * RT_MUTEX_HAS_WAITERS even when the lock is not contended
378 * any more. This is fixed up when we take ownership.
379 * This is the transitional state explained at the top of this file.
380 */
381 mark_rt_mutex_waiters(lock);
382
383 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
384 return 0;
385
386 /* We got the lock. */
387 debug_rt_mutex_lock(lock __IP__);
388
389 rt_mutex_set_owner(lock, current, 0);
390
391 rt_mutex_deadlock_account_lock(lock, current);
392
393 return 1;
394}
395
396/*
397 * Task blocks on lock.
398 *
399 * Prepare waiter and propagate pi chain
400 *
401 * This must be called with lock->wait_lock held.
402 */
403static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
404 struct rt_mutex_waiter *waiter,
405 int detect_deadlock
406 __IP_DECL__)
407{
408 struct rt_mutex_waiter *top_waiter = waiter;
409 task_t *owner = rt_mutex_owner(lock);
410 int boost = 0, res;
411 unsigned long flags;
412
413 spin_lock_irqsave(&current->pi_lock, flags);
414 __rt_mutex_adjust_prio(current);
415 waiter->task = current;
416 waiter->lock = lock;
417 plist_node_init(&waiter->list_entry, current->prio);
418 plist_node_init(&waiter->pi_list_entry, current->prio);
419
420 /* Get the top priority waiter on the lock */
421 if (rt_mutex_has_waiters(lock))
422 top_waiter = rt_mutex_top_waiter(lock);
423 plist_add(&waiter->list_entry, &lock->wait_list);
424
425 current->pi_blocked_on = waiter;
426
427 spin_unlock_irqrestore(&current->pi_lock, flags);
428
429 if (waiter == rt_mutex_top_waiter(lock)) {
430 spin_lock_irqsave(&owner->pi_lock, flags);
431 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
432 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
433
434 __rt_mutex_adjust_prio(owner);
435 if (owner->pi_blocked_on) {
436 boost = 1;
437 /* gets dropped in rt_mutex_adjust_prio_chain()! */
438 get_task_struct(owner);
439 }
440 spin_unlock_irqrestore(&owner->pi_lock, flags);
441 }
442 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
443 spin_lock_irqsave(&owner->pi_lock, flags);
444 if (owner->pi_blocked_on) {
445 boost = 1;
446 /* gets dropped in rt_mutex_adjust_prio_chain()! */
447 get_task_struct(owner);
448 }
449 spin_unlock_irqrestore(&owner->pi_lock, flags);
450 }
451 if (!boost)
452 return 0;
453
454 spin_unlock(&lock->wait_lock);
455
456 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
457 current __IP__);
458
459 spin_lock(&lock->wait_lock);
460
461 return res;
462}
463
464/*
465 * Wake up the next waiter on the lock.
466 *
467 * Remove the top waiter from the current task's waiter list and from
468 * the lock waiter list. Set it as pending owner. Then wake it up.
469 *
470 * Called with lock->wait_lock held.
471 */
472static void wakeup_next_waiter(struct rt_mutex *lock)
473{
474 struct rt_mutex_waiter *waiter;
475 struct task_struct *pendowner;
476 unsigned long flags;
477
478 spin_lock_irqsave(&current->pi_lock, flags);
479
480 waiter = rt_mutex_top_waiter(lock);
481 plist_del(&waiter->list_entry, &lock->wait_list);
482
483 /*
484 * Remove it from current->pi_waiters. We do not adjust a
485 * possible priority boost right now. We execute wakeup in the
486 * boosted mode and go back to normal after releasing
487 * lock->wait_lock.
488 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
490 pendowner = waiter->task;
491 waiter->task = NULL;
492
493 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
494
495 spin_unlock_irqrestore(&current->pi_lock, flags);
496
497 /*
498 * Clear the pi_blocked_on variable and enqueue a possible
499 * waiter into the pi_waiters list of the pending owner. This
500 * prevents the case where, if the pending owner gets unboosted,
501 * a waiter with higher priority than pending-owner->normal_prio
502 * stays blocked on the unboosted (pending) owner.
503 */
504 spin_lock_irqsave(&pendowner->pi_lock, flags);
505
506 WARN_ON(!pendowner->pi_blocked_on);
507 WARN_ON(pendowner->pi_blocked_on != waiter);
508 WARN_ON(pendowner->pi_blocked_on->lock != lock);
509
510 pendowner->pi_blocked_on = NULL;
511
512 if (rt_mutex_has_waiters(lock)) {
513 struct rt_mutex_waiter *next;
514
515 next = rt_mutex_top_waiter(lock);
516 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
517 }
518 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
519
520 wake_up_process(pendowner);
521}
522
523/*
524 * Remove a waiter from a lock
525 *
526 * Must be called with lock->wait_lock held
527 */
528static void remove_waiter(struct rt_mutex *lock,
529 struct rt_mutex_waiter *waiter __IP_DECL__)
530{
531 int first = (waiter == rt_mutex_top_waiter(lock));
532 int boost = 0;
533 task_t *owner = rt_mutex_owner(lock);
534 unsigned long flags;
535
536 spin_lock_irqsave(&current->pi_lock, flags);
537 plist_del(&waiter->list_entry, &lock->wait_list);
538 waiter->task = NULL;
539 current->pi_blocked_on = NULL;
540 spin_unlock_irqrestore(&current->pi_lock, flags);
541
542 if (first && owner != current) {
543
544 spin_lock_irqsave(&owner->pi_lock, flags);
545
546 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
547
548 if (rt_mutex_has_waiters(lock)) {
549 struct rt_mutex_waiter *next;
550
551 next = rt_mutex_top_waiter(lock);
552 plist_add(&next->pi_list_entry, &owner->pi_waiters);
553 }
554 __rt_mutex_adjust_prio(owner);
555
556 if (owner->pi_blocked_on) {
557 boost = 1;
558 /* gets dropped in rt_mutex_adjust_prio_chain()! */
559 get_task_struct(owner);
560 }
561 spin_unlock_irqrestore(&owner->pi_lock, flags);
562 }
563
564 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
565
566 if (!boost)
567 return;
568
569 spin_unlock(&lock->wait_lock);
570
571 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
572
573 spin_lock(&lock->wait_lock);
574}
575
576/*
577 * Recheck the pi chain, in case we got a priority setting
578 *
579 * Called from sched_setscheduler
580 */
581void rt_mutex_adjust_pi(struct task_struct *task)
582{
583 struct rt_mutex_waiter *waiter;
584 unsigned long flags;
585
586 spin_lock_irqsave(&task->pi_lock, flags);
587
588 waiter = task->pi_blocked_on;
589 if (!waiter || waiter->list_entry.prio == task->prio) {
590 spin_unlock_irqrestore(&task->pi_lock, flags);
591 return;
592 }
593
594 /* gets dropped in rt_mutex_adjust_prio_chain()! */
595 get_task_struct(task);
596 spin_unlock_irqrestore(&task->pi_lock, flags);
597
598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
599}
600
601/*
602 * Slow path lock function:
603 */
604static int __sched
605rt_mutex_slowlock(struct rt_mutex *lock, int state,
606 struct hrtimer_sleeper *timeout,
607 int detect_deadlock __IP_DECL__)
608{
609 struct rt_mutex_waiter waiter;
610 int ret = 0;
611
612 debug_rt_mutex_init_waiter(&waiter);
613 waiter.task = NULL;
614
615 spin_lock(&lock->wait_lock);
616
617 /* Try to acquire the lock again: */
618 if (try_to_take_rt_mutex(lock __IP__)) {
619 spin_unlock(&lock->wait_lock);
620 return 0;
621 }
622
623 set_current_state(state);
624
625 /* Set up the timer when timeout != NULL */
626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS);
629
630 for (;;) {
631 /* Try to acquire the lock: */
632 if (try_to_take_rt_mutex(lock __IP__))
633 break;
634
635 /*
636 * TASK_INTERRUPTIBLE checks for signals and
637 * timeout. Ignored otherwise.
638 */
639 if (unlikely(state == TASK_INTERRUPTIBLE)) {
640 /* Signal pending? */
641 if (signal_pending(current))
642 ret = -EINTR;
643 if (timeout && !timeout->task)
644 ret = -ETIMEDOUT;
645 if (ret)
646 break;
647 }
648
649 /*
650 * waiter.task is NULL the first time we come here and
651 * when we have been woken up by the previous owner
652 * but the lock got stolen by a higher prio task.
653 */
654 if (!waiter.task) {
655 ret = task_blocks_on_rt_mutex(lock, &waiter,
656 detect_deadlock __IP__);
657 /*
658 * If we got woken up by the owner then start the loop
659 * all over without going into schedule() to try
660 * to get the lock now:
661 */
662 if (unlikely(!waiter.task))
663 continue;
664
665 if (unlikely(ret))
666 break;
667 }
668
669 spin_unlock(&lock->wait_lock);
670
671 debug_rt_mutex_print_deadlock(&waiter);
672
673 if (waiter.task)
674 schedule_rt_mutex(lock);
675
676 spin_lock(&lock->wait_lock);
677 set_current_state(state);
678 }
679
680 set_current_state(TASK_RUNNING);
681
682 if (unlikely(waiter.task))
683 remove_waiter(lock, &waiter __IP__);
684
685 /*
686 * try_to_take_rt_mutex() sets the waiter bit
687 * unconditionally. We might have to fix that up.
688 */
689 fixup_rt_mutex_waiters(lock);
690
691 spin_unlock(&lock->wait_lock);
692
693 /* Remove pending timer: */
694 if (unlikely(timeout))
695 hrtimer_cancel(&timeout->timer);
696
697 /*
698 * Readjust the priority when we did not get the lock. We might
699 * have been the pending owner and boosted. Since we did not
700 * take the lock, the PI boost has to go.
701 */
702 if (unlikely(ret))
703 rt_mutex_adjust_prio(current);
704
705 debug_rt_mutex_free_waiter(&waiter);
706
707 return ret;
708}
709
710/*
711 * Slow path try-lock function:
712 */
713static inline int
714rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
715{
716 int ret = 0;
717
718 spin_lock(&lock->wait_lock);
719
720 if (likely(rt_mutex_owner(lock) != current)) {
721
722 ret = try_to_take_rt_mutex(lock __IP__);
723 /*
724 * try_to_take_rt_mutex() sets the lock waiters
725 * bit unconditionally. Clean this up.
726 */
727 fixup_rt_mutex_waiters(lock);
728 }
729
730 spin_unlock(&lock->wait_lock);
731
732 return ret;
733}
734
735/*
736 * Slow path to release a rt-mutex:
737 */
738static void __sched
739rt_mutex_slowunlock(struct rt_mutex *lock)
740{
741 spin_lock(&lock->wait_lock);
742
743 debug_rt_mutex_unlock(lock);
744
745 rt_mutex_deadlock_account_unlock(current);
746
747 if (!rt_mutex_has_waiters(lock)) {
748 lock->owner = NULL;
749 spin_unlock(&lock->wait_lock);
750 return;
751 }
752
753 wakeup_next_waiter(lock);
754
755 spin_unlock(&lock->wait_lock);
756
757 /* Undo pi boosting if necessary: */
758 rt_mutex_adjust_prio(current);
759}
760
761/*
762 * Debug-aware fast/slow path lock, trylock and unlock
763 *
764 * The atomic acquire/release ops are compiled away, when either the
765 * architecture does not support cmpxchg or when debugging is enabled.
766 */
767static inline int
768rt_mutex_fastlock(struct rt_mutex *lock, int state,
769 int detect_deadlock,
770 int (*slowfn)(struct rt_mutex *lock, int state,
771 struct hrtimer_sleeper *timeout,
772 int detect_deadlock __IP_DECL__))
773{
774 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
775 rt_mutex_deadlock_account_lock(lock, current);
776 return 0;
777 } else
778 return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
779}
780
781static inline int
782rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
783 struct hrtimer_sleeper *timeout, int detect_deadlock,
784 int (*slowfn)(struct rt_mutex *lock, int state,
785 struct hrtimer_sleeper *timeout,
786 int detect_deadlock __IP_DECL__))
787{
788 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
789 rt_mutex_deadlock_account_lock(lock, current);
790 return 0;
791 } else
792 return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
793}
794
795static inline int
796rt_mutex_fasttrylock(struct rt_mutex *lock,
797 int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
798{
799 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
800 rt_mutex_deadlock_account_lock(lock, current);
801 return 1;
802 }
803 return slowfn(lock __RET_IP__);
804}
805
806static inline void
807rt_mutex_fastunlock(struct rt_mutex *lock,
808 void (*slowfn)(struct rt_mutex *lock))
809{
810 if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
811 rt_mutex_deadlock_account_unlock(current);
812 else
813 slowfn(lock);
814}
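
/*
 * Editor's note: on the fast path a single cmpxchg(&lock->owner,
 * current, NULL) releases the lock. It can only succeed while bits 0
 * and 1 of ->owner are clear, i.e. no waiters and no pending owner, so
 * contended unlocks always fall back to rt_mutex_slowunlock().
 */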
815
816/**
817 * rt_mutex_lock - lock a rt_mutex
818 *
819 * @lock: the rt_mutex to be locked
820 */
821void __sched rt_mutex_lock(struct rt_mutex *lock)
822{
823 might_sleep();
824
825 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
826}
827EXPORT_SYMBOL_GPL(rt_mutex_lock);
828
829/**
830 * rt_mutex_lock_interruptible - lock a rt_mutex, interruptible by signals
831 *
832 * @lock: the rt_mutex to be locked
833 * @detect_deadlock: deadlock detection on/off
834 *
835 * Returns:
836 * 0 on success
837 * -EINTR when interrupted by a signal
838 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
839 */
840int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
841 int detect_deadlock)
842{
843 might_sleep();
844
845 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
846 detect_deadlock, rt_mutex_slowlock);
847}
848EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
849
850/**
851 * rt_mutex_timed_lock - lock a rt_mutex interruptibly, with a timeout;
852 * the timeout structure is provided
853 * by the caller
854 *
855 * @lock: the rt_mutex to be locked
856 * @timeout: timeout structure or NULL (no timeout)
857 * @detect_deadlock: deadlock detection on/off
858 *
859 * Returns:
860 * 0 on success
861 * -EINTR when interrupted by a signal
862 * -ETIMEDOUT when the timeout expired
863 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
864 */
865int
866rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
867 int detect_deadlock)
868{
869 might_sleep();
870
871 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
872 detect_deadlock, rt_mutex_slowlock);
873}
874EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
875
876/**
877 * rt_mutex_trylock - try to lock a rt_mutex
878 *
879 * @lock: the rt_mutex to be locked
880 *
881 * Returns 1 on success and 0 on contention
882 */
883int __sched rt_mutex_trylock(struct rt_mutex *lock)
884{
885 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
886}
887EXPORT_SYMBOL_GPL(rt_mutex_trylock);
888
889/**
890 * rt_mutex_unlock - unlock a rt_mutex
891 *
892 * @lock: the rt_mutex to be unlocked
893 */
894void __sched rt_mutex_unlock(struct rt_mutex *lock)
895{
896 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
897}
898EXPORT_SYMBOL_GPL(rt_mutex_unlock);
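
/*
 * Editor's usage sketch (not part of the patch): the exported API above
 * in a minimal kernel context. DEFINE_RT_MUTEX() comes from
 * <linux/rtmutex.h>; names prefixed "example_" are hypothetical.
 */
static DEFINE_RT_MUTEX(example_lock);

static void example_critical_section(void)
{
	rt_mutex_lock(&example_lock);
	/* ... critical section; PI boosting active if we block others ... */
	rt_mutex_unlock(&example_lock);

	/* Non-blocking attempt: */
	if (rt_mutex_trylock(&example_lock)) {
		/* got the lock without sleeping */
		rt_mutex_unlock(&example_lock);
	}
}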
899
900/**
901 * rt_mutex_destroy - mark a mutex unusable
902 * @lock: the mutex to be destroyed
903 *
904 * This function marks the mutex uninitialized, and any subsequent
905 * use of the mutex is forbidden. The mutex must not be locked when
906 * this function is called.
907 */
908void rt_mutex_destroy(struct rt_mutex *lock)
909{
910 WARN_ON(rt_mutex_is_locked(lock));
911#ifdef CONFIG_DEBUG_RT_MUTEXES
912 lock->magic = NULL;
913#endif
914}
915
916EXPORT_SYMBOL_GPL(rt_mutex_destroy);
917
918/**
919 * __rt_mutex_init - initialize the rt lock
920 *
921 * @lock: the rt lock to be initialized
922 *
923 * Initialize the rt lock to unlocked state.
924 *
925 * Initializing a locked rt lock is not allowed.
926 */
927void __rt_mutex_init(struct rt_mutex *lock, const char *name)
928{
929 lock->owner = NULL;
930 spin_lock_init(&lock->wait_lock);
931 plist_head_init(&lock->wait_list, &lock->wait_lock);
932
933 debug_rt_mutex_init(lock, name);
934}
935EXPORT_SYMBOL_GPL(__rt_mutex_init);
936
937/**
938 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
939 * proxy owner
940 *
941 * @lock: the rt_mutex to be locked
942 * @proxy_owner: the task to set as owner
943 *
944 * No locking. The caller has to do the serializing itself.
945 * Special API call for PI-futex support
946 */
947void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
948 struct task_struct *proxy_owner)
949{
950 __rt_mutex_init(lock, NULL);
951 debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
952 rt_mutex_set_owner(lock, proxy_owner, 0);
953 rt_mutex_deadlock_account_lock(lock, proxy_owner);
954}
955
956/**
957 * rt_mutex_proxy_unlock - release a lock on behalf of owner
958 *
959 * @lock: the rt_mutex to be unlocked
960 *
961 * No locking. The caller has to do the serializing itself.
962 * Special API call for PI-futex support
963 */
964void rt_mutex_proxy_unlock(struct rt_mutex *lock,
965 struct task_struct *proxy_owner)
966{
967 debug_rt_mutex_proxy_unlock(lock);
968 rt_mutex_set_owner(lock, NULL, 0);
969 rt_mutex_deadlock_account_unlock(proxy_owner);
970}
971
972/**
973 * rt_mutex_next_owner - return the next owner of the lock
974 *
975 * @lock: the rt lock to query
976 *
977 * Returns the next owner of the lock or NULL
978 *
979 * Caller has to serialize against other accessors to the lock
980 * itself.
981 *
982 * Special API call for PI-futex support
983 */
984struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
985{
986 if (!rt_mutex_has_waiters(lock))
987 return NULL;
988
989 return rt_mutex_top_waiter(lock)->task;
990}
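
/*
 * Editor's sketch (illustration only): pairing of the proxy calls above
 * as the PI-futex code is expected to use them. The "example_" names
 * are hypothetical.
 */
static void example_proxy_cycle(struct rt_mutex *lock,
				struct task_struct *owner)
{
	struct task_struct *next;

	/* Take the lock on the (proxy) owner's behalf: */
	rt_mutex_init_proxy_locked(lock, owner);

	/* Query who would get the lock next; NULL when nobody waits: */
	next = rt_mutex_next_owner(lock);
	if (next)
		printk(KERN_DEBUG "next owner: %d\n", next->pid);

	/* Release it on the owner's behalf again: */
	rt_mutex_proxy_unlock(lock, owner);
}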
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..1e0fca13ff72
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c.
10 * Non-debug version.
11 */
12
13#define __IP_DECL__
14#define __IP__
15#define __RET_IP__
16#define rt_mutex_deadlock_check(l) (0)
17#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
18#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
19#define debug_rt_mutex_init_waiter(w) do { } while (0)
20#define debug_rt_mutex_free_waiter(w) do { } while (0)
21#define debug_rt_mutex_lock(l) do { } while (0)
22#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
23#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
24#define debug_rt_mutex_unlock(l) do { } while (0)
25#define debug_rt_mutex_init(m, n) do { } while (0)
26#define debug_rt_mutex_deadlock(d, a, l) do { } while (0)
27#define debug_rt_mutex_print_deadlock(w) do { } while (0)
28#define debug_rt_mutex_detect_deadlock(w,d) (d)
29#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
1/*
2 * RT Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains the private data structure and API definitions.
10 */
11
12#ifndef __KERNEL_RTMUTEX_COMMON_H
13#define __KERNEL_RTMUTEX_COMMON_H
14
15#include <linux/rtmutex.h>
16
17/*
18 * The in-kernel rtmutex tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack of the blocked task.
42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
45 * @task: task reference to the blocked task
46 */
47struct rt_mutex_waiter {
48 struct plist_node list_entry;
49 struct plist_node pi_list_entry;
50 struct task_struct *task;
51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip;
54 pid_t deadlock_task_pid;
55 struct rt_mutex *deadlock_lock;
56#endif
57};
58
59/*
60 * Various helpers to access the waiters-plist:
61 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{
64 return !plist_head_empty(&lock->wait_list);
65}
66
67static inline struct rt_mutex_waiter *
68rt_mutex_top_waiter(struct rt_mutex *lock)
69{
70 struct rt_mutex_waiter *w;
71
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
73 list_entry);
74 BUG_ON(w->lock != lock);
75
76 return w;
77}
78
79static inline int task_has_pi_waiters(struct task_struct *p)
80{
81 return !plist_head_empty(&p->pi_waiters);
82}
83
84static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p)
86{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
88 pi_list_entry);
89}
90
91/*
92 * lock->owner state tracking:
93 */
94#define RT_MUTEX_OWNER_PENDING 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{
100 return (struct task_struct *)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102}
103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
123#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index a856040c200a..2629c1711fd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
168 */ 168 */
169 169
170#define SCALE_PRIO(x, prio) \ 170#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 172
173static unsigned int task_timeslice(task_t *p) 173static unsigned int static_prio_timeslice(int static_prio)
174{ 174{
175 if (p->static_prio < NICE_TO_PRIO(0)) 175 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 176 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 177 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
179} 179}
180
181static inline unsigned int task_timeslice(task_t *p)
182{
183 return static_prio_timeslice(p->static_prio);
184}
185
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 186#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time) 187 < (long long) (sd)->cache_hot_time)
182 188
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p)
184 * These are the runqueue data structures: 190 * These are the runqueue data structures:
185 */ 191 */
186 192
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t; 193typedef struct runqueue runqueue_t;
190 194
191struct prio_array { 195struct prio_array {
192 unsigned int nr_active; 196 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 197 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 198 struct list_head queue[MAX_PRIO];
195}; 199};
196 200
@@ -209,6 +213,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 213 * remote CPUs use both these fields when doing load calculation.
210 */ 214 */
211 unsigned long nr_running; 215 unsigned long nr_running;
216 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 217#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 218 unsigned long cpu_load[3];
214#endif 219#endif
@@ -239,7 +244,6 @@ struct runqueue {
239 244
240 task_t *migration_thread; 245 task_t *migration_thread;
241 struct list_head migration_queue; 246 struct list_head migration_queue;
242 int cpu;
243#endif 247#endif
244 248
245#ifdef CONFIG_SCHEDSTATS 249#ifdef CONFIG_SCHEDSTATS
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 355#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 356
353/* 357/*
358 * __task_rq_lock - lock the runqueue a given task resides on.
359 * Must be called with interrupts disabled.
360 */
361static inline runqueue_t *__task_rq_lock(task_t *p)
362 __acquires(rq->lock)
363{
364 struct runqueue *rq;
365
366repeat_lock_task:
367 rq = task_rq(p);
368 spin_lock(&rq->lock);
369 if (unlikely(rq != task_rq(p))) {
370 spin_unlock(&rq->lock);
371 goto repeat_lock_task;
372 }
373 return rq;
374}
375
376/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 377 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 378 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 379 * explicitly disabling preemption.
357 */ 380 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 381static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
359 __acquires(rq->lock) 382 __acquires(rq->lock)
360{ 383{
361 struct runqueue *rq; 384 struct runqueue *rq;
@@ -371,6 +394,12 @@ repeat_lock_task:
371 return rq; 394 return rq;
372} 395}
373 396
397static inline void __task_rq_unlock(runqueue_t *rq)
398 __releases(rq->lock)
399{
400 spin_unlock(&rq->lock);
401}
402
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 403static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
375 __releases(rq->lock) 404 __releases(rq->lock)
376{ 405{
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 663}
635 664
636/* 665/*
637 * effective_prio - return the priority that is based on the static 666 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 667 * priority but is modified by bonuses/penalties.
639 * 668 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 669 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 676 *
648 * Both properties are important to certain workloads. 677 * Both properties are important to certain workloads.
649 */ 678 */
650static int effective_prio(task_t *p) 679
680static inline int __normal_prio(task_t *p)
651{ 681{
652 int bonus, prio; 682 int bonus, prio;
653 683
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 684 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 685
659 prio = p->static_prio - bonus; 686 prio = p->static_prio - bonus;
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p)
665} 692}
666 693
667/* 694/*
695 * To aid in avoiding the subversion of "niceness" due to uneven distribution
696 * of tasks with abnormal "nice" values across CPUs, the contribution that
697 * each task makes to its run queue's load is weighted according to its
698 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
699 * scaled version of the new time slice allocation that they receive on time
700 * slice expiry etc.
701 */
702
703/*
704 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
705 * If static_prio_timeslice() is ever changed to break this assumption then
706 * this code will need modification
707 */
708#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
709#define LOAD_WEIGHT(lp) \
710 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
711#define PRIO_TO_LOAD_WEIGHT(prio) \
712 LOAD_WEIGHT(static_prio_timeslice(prio))
713#define RTPRIO_TO_LOAD_WEIGHT(rp) \
714 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
715
716static void set_load_weight(task_t *p)
717{
718 if (has_rt_policy(p)) {
719#ifdef CONFIG_SMP
720 if (p == task_rq(p)->migration_thread)
721 /*
722 * The migration thread does the actual balancing.
723 * Giving its load any weight will skew balancing
724 * adversely.
725 */
726 p->load_weight = 0;
727 else
728#endif
729 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
730 } else
731 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
732}
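
/*
 * Worked example (editor's note, assuming SCHED_LOAD_SCALE == 128 and
 * DEF_TIMESLICE == 100 msecs as in this kernel): a nice-0 task has
 * static_prio_timeslice() == DEF_TIMESLICE, so its load weight is
 * LOAD_WEIGHT(DEF_TIMESLICE) == SCHED_LOAD_SCALE == 128. A realtime
 * task gets PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) plus LOAD_WEIGHT of its
 * rt_priority, so RT tasks always outweigh any SCHED_NORMAL task on
 * the same runqueue.
 */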
733
734static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
735{
736 rq->raw_weighted_load += p->load_weight;
737}
738
739static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
740{
741 rq->raw_weighted_load -= p->load_weight;
742}
743
744static inline void inc_nr_running(task_t *p, runqueue_t *rq)
745{
746 rq->nr_running++;
747 inc_raw_weighted_load(rq, p);
748}
749
750static inline void dec_nr_running(task_t *p, runqueue_t *rq)
751{
752 rq->nr_running--;
753 dec_raw_weighted_load(rq, p);
754}
755
756/*
757 * Calculate the expected normal priority: i.e. priority
758 * without taking RT-inheritance into account. Might be
759 * boosted by interactivity modifiers. Changes upon fork,
760 * setprio syscalls, and whenever the interactivity
761 * estimator recalculates.
762 */
763static inline int normal_prio(task_t *p)
764{
765 int prio;
766
767 if (has_rt_policy(p))
768 prio = MAX_RT_PRIO-1 - p->rt_priority;
769 else
770 prio = __normal_prio(p);
771 return prio;
772}
773
774/*
775 * Calculate the current priority, i.e. the priority
776 * taken into account by the scheduler. This value might
777 * be boosted by RT tasks, or might be boosted by
778 * interactivity modifiers. Will be RT if the task got
779 * RT-boosted. If not then it returns p->normal_prio.
780 */
781static int effective_prio(task_t *p)
782{
783 p->normal_prio = normal_prio(p);
784 /*
785 * If we are RT tasks or we were boosted to RT priority,
786 * keep the priority unchanged. Otherwise, update priority
787 * to the normal priority:
788 */
789 if (!rt_prio(p->prio))
790 return p->normal_prio;
791 return p->prio;
792}
793
794/*
668 * __activate_task - move a task to the runqueue. 795 * __activate_task - move a task to the runqueue.
669 */ 796 */
670static void __activate_task(task_t *p, runqueue_t *rq) 797static void __activate_task(task_t *p, runqueue_t *rq)
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
674 if (batch_task(p)) 801 if (batch_task(p))
675 target = rq->expired; 802 target = rq->expired;
676 enqueue_task(p, target); 803 enqueue_task(p, target);
677 rq->nr_running++; 804 inc_nr_running(p, rq);
678} 805}
679 806
680/* 807/*
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq)
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 810static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
684{ 811{
685 enqueue_task_head(p, rq->active); 812 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 813 inc_nr_running(p, rq);
687} 814}
688 815
816/*
817 * Recalculate p->normal_prio and p->prio after having slept,
818 * updating the sleep-average too:
819 */
689static int recalc_task_prio(task_t *p, unsigned long long now) 820static int recalc_task_prio(task_t *p, unsigned long long now)
690{ 821{
691 /* Caller must always ensure 'now >= p->timestamp' */ 822 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 823 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 824
695 if (batch_task(p)) 825 if (batch_task(p))
696 sleep_time = 0; 826 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 827
704 if (likely(sleep_time > 0)) { 828 if (likely(sleep_time > 0)) {
705 /* 829 /*
706 * User tasks that sleep a long time are categorised as 830 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 831 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 832 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 833 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 834 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 835
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 836 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 837 /*
717 if (p->sleep_avg < ceiling) 838 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 839 * with one single large enough sleep.
840 */
841 p->sleep_avg = ceiling;
842 /*
843 * Using INTERACTIVE_SLEEP() as a ceiling places a
844 * nice(0) task 1ms sleep away from promotion, and
845 * gives it 700ms to round-robin with no chance of
846 * being demoted. This is more than generous, so
847 * mark this sleep as non-interactive to prevent the
848 * on-runqueue bonus logic from intervening should
849 * this task not receive cpu immediately.
850 */
851 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 852 } else {
720 /* 853 /*
721 * Tasks waking from uninterruptible sleep are 854 * Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 856 * are likely to be waiting on I/O
724 */ 857 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 858 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 859 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 860 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 861 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 862 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 863 p->sleep_avg = ceiling;
731 sleep_time = 0; 864 sleep_time = 0;
732 } 865 }
733 } 866 }
734 867
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 875 */
743 p->sleep_avg += sleep_time; 876 p->sleep_avg += sleep_time;
744 877
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 878 }
879 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
880 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 881 }
749 882
750 return effective_prio(p); 883 return effective_prio(p);
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
805 */ 938 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 939static void deactivate_task(struct task_struct *p, runqueue_t *rq)
807{ 940{
808 rq->nr_running--; 941 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 942 dequeue_task(p, p->array);
810 p->array = NULL; 943 p->array = NULL;
811} 944}
@@ -860,6 +993,12 @@ inline int task_curr(const task_t *p)
860 return cpu_curr(task_cpu(p)) == p; 993 return cpu_curr(task_cpu(p)) == p;
861} 994}
862 995
996/* Used instead of source_load when we know the type == 0 */
997unsigned long weighted_cpuload(const int cpu)
998{
999 return cpu_rq(cpu)->raw_weighted_load;
1000}
1001
863#ifdef CONFIG_SMP 1002#ifdef CONFIG_SMP
864typedef struct { 1003typedef struct {
865 struct list_head list; 1004 struct list_head list;
@@ -949,7 +1088,8 @@ void kick_process(task_t *p)
949} 1088}
950 1089
951/* 1090/*
952 * Return a low guess at the load of a migration-source cpu. 1091 * Return a low guess at the load of a migration-source cpu weighted
1092 * according to the scheduling class and "nice" value.
953 * 1093 *
954 * We want to under-estimate the load of migration sources, to 1094 * We want to under-estimate the load of migration sources, to
955 * balance conservatively. 1095 * balance conservatively.
@@ -957,24 +1097,36 @@ void kick_process(task_t *p)
957static inline unsigned long source_load(int cpu, int type) 1097static inline unsigned long source_load(int cpu, int type)
958{ 1098{
959 runqueue_t *rq = cpu_rq(cpu); 1099 runqueue_t *rq = cpu_rq(cpu);
960 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1100
961 if (type == 0) 1101 if (type == 0)
962 return load_now; 1102 return rq->raw_weighted_load;
963 1103
964 return min(rq->cpu_load[type-1], load_now); 1104 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
965} 1105}
966 1106
967/* 1107/*
968 * Return a high guess at the load of a migration-target cpu 1108 * Return a high guess at the load of a migration-target cpu weighted
1109 * according to the scheduling class and "nice" value.
969 */ 1110 */
970static inline unsigned long target_load(int cpu, int type) 1111static inline unsigned long target_load(int cpu, int type)
971{ 1112{
972 runqueue_t *rq = cpu_rq(cpu); 1113 runqueue_t *rq = cpu_rq(cpu);
973 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1114
974 if (type == 0) 1115 if (type == 0)
975 return load_now; 1116 return rq->raw_weighted_load;
1117
1118 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1119}
1120
1121/*
1122 * Return the average load per task on the cpu's run queue
1123 */
1124static inline unsigned long cpu_avg_load_per_task(int cpu)
1125{
1126 runqueue_t *rq = cpu_rq(cpu);
1127 unsigned long n = rq->nr_running;
976 1128
977 return max(rq->cpu_load[type-1], load_now); 1129 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
978} 1130}
979 1131
980/* 1132/*
@@ -1047,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1047 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1199 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1048 1200
1049 for_each_cpu_mask(i, tmp) { 1201 for_each_cpu_mask(i, tmp) {
1050 load = source_load(i, 0); 1202 load = weighted_cpuload(i);
1051 1203
1052 if (load < min_load || (load == min_load && i == this_cpu)) { 1204 if (load < min_load || (load == min_load && i == this_cpu)) {
1053 min_load = load; 1205 min_load = load;
@@ -1074,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag)
1074 struct task_struct *t = current; 1226 struct task_struct *t = current;
1075 struct sched_domain *tmp, *sd = NULL; 1227 struct sched_domain *tmp, *sd = NULL;
1076 1228
1077 for_each_domain(cpu, tmp) 1229 for_each_domain(cpu, tmp) {
1230 /*
1231 * If power savings logic is enabled for a domain, stop there.
1232 */
1233 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1234 break;
1078 if (tmp->flags & flag) 1235 if (tmp->flags & flag)
1079 sd = tmp; 1236 sd = tmp;
1237 }
1080 1238
1081 while (sd) { 1239 while (sd) {
1082 cpumask_t span; 1240 cpumask_t span;
@@ -1226,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1226 1384
1227 if (this_sd->flags & SD_WAKE_AFFINE) { 1385 if (this_sd->flags & SD_WAKE_AFFINE) {
1228 unsigned long tl = this_load; 1386 unsigned long tl = this_load;
1387 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1388
1229 /* 1389 /*
1230 * If sync wakeup then subtract the (maximum possible) 1390 * If sync wakeup then subtract the (maximum possible)
1231 * effect of the currently running task from the load 1391 * effect of the currently running task from the load
1232 * of the current CPU: 1392 * of the current CPU:
1233 */ 1393 */
1234 if (sync) 1394 if (sync)
1235 tl -= SCHED_LOAD_SCALE; 1395 tl -= current->load_weight;
1236 1396
1237 if ((tl <= load && 1397 if ((tl <= load &&
1238 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1398 tl + target_load(cpu, idx) <= tl_per_task) ||
1239 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1399 100*(tl + p->load_weight) <= imbalance*load) {
1240 /* 1400 /*
1241 * This domain has SD_WAKE_AFFINE and 1401 * This domain has SD_WAKE_AFFINE and
1242 * p is cache cold in this domain, and 1402 * p is cache cold in this domain, and
@@ -1353,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1353 * event cannot wake it up and insert it on the runqueue either. 1513 * event cannot wake it up and insert it on the runqueue either.
1354 */ 1514 */
1355 p->state = TASK_RUNNING; 1515 p->state = TASK_RUNNING;
1516
1517 /*
1518 * Make sure we do not leak PI boosting priority to the child:
1519 */
1520 p->prio = current->normal_prio;
1521
1356 INIT_LIST_HEAD(&p->run_list); 1522 INIT_LIST_HEAD(&p->run_list);
1357 p->array = NULL; 1523 p->array = NULL;
1358#ifdef CONFIG_SCHEDSTATS 1524#ifdef CONFIG_SCHEDSTATS
@@ -1432,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1432 __activate_task(p, rq); 1598 __activate_task(p, rq);
1433 else { 1599 else {
1434 p->prio = current->prio; 1600 p->prio = current->prio;
1601 p->normal_prio = current->normal_prio;
1435 list_add_tail(&p->run_list, &current->run_list); 1602 list_add_tail(&p->run_list, &current->run_list);
1436 p->array = current->array; 1603 p->array = current->array;
1437 p->array->nr_active++; 1604 p->array->nr_active++;
1438 rq->nr_running++; 1605 inc_nr_running(p, rq);
1439 } 1606 }
1440 set_need_resched(); 1607 set_need_resched();
1441 } else 1608 } else
@@ -1653,7 +1820,8 @@ unsigned long nr_uninterruptible(void)
1653 1820
1654unsigned long long nr_context_switches(void) 1821unsigned long long nr_context_switches(void)
1655{ 1822{
1656 unsigned long long i, sum = 0; 1823 int i;
1824 unsigned long long sum = 0;
1657 1825
1658 for_each_possible_cpu(i) 1826 for_each_possible_cpu(i)
1659 sum += cpu_rq(i)->nr_switches; 1827 sum += cpu_rq(i)->nr_switches;
@@ -1691,9 +1859,6 @@ unsigned long nr_active(void)
1691/* 1859/*
1692 * double_rq_lock - safely lock two runqueues 1860 * double_rq_lock - safely lock two runqueues
1693 * 1861 *
1694 * We must take them in cpu order to match code in
1695 * dependent_sleeper and wake_dependent_sleeper.
1696 *
1697 * Note this does not disable interrupts like task_rq_lock, 1862 * Note this does not disable interrupts like task_rq_lock,
1698 * you need to do so manually before calling. 1863 * you need to do so manually before calling.
1699 */ 1864 */
@@ -1705,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1705 spin_lock(&rq1->lock); 1870 spin_lock(&rq1->lock);
1706 __acquire(rq2->lock); /* Fake it out ;) */ 1871 __acquire(rq2->lock); /* Fake it out ;) */
1707 } else { 1872 } else {
1708 if (rq1->cpu < rq2->cpu) { 1873 if (rq1 < rq2) {
1709 spin_lock(&rq1->lock); 1874 spin_lock(&rq1->lock);
1710 spin_lock(&rq2->lock); 1875 spin_lock(&rq2->lock);
1711 } else { 1876 } else {
@@ -1741,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1741 __acquires(this_rq->lock) 1906 __acquires(this_rq->lock)
1742{ 1907{
1743 if (unlikely(!spin_trylock(&busiest->lock))) { 1908 if (unlikely(!spin_trylock(&busiest->lock))) {
1744 if (busiest->cpu < this_rq->cpu) { 1909 if (busiest < this_rq) {
1745 spin_unlock(&this_rq->lock); 1910 spin_unlock(&this_rq->lock);
1746 spin_lock(&busiest->lock); 1911 spin_lock(&busiest->lock);
1747 spin_lock(&this_rq->lock); 1912 spin_lock(&this_rq->lock);
@@ -1804,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1804 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1969 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1805{ 1970{
1806 dequeue_task(p, src_array); 1971 dequeue_task(p, src_array);
1807 src_rq->nr_running--; 1972 dec_nr_running(p, src_rq);
1808 set_task_cpu(p, this_cpu); 1973 set_task_cpu(p, this_cpu);
1809 this_rq->nr_running++; 1974 inc_nr_running(p, this_rq);
1810 enqueue_task(p, this_array); 1975 enqueue_task(p, this_array);
1811 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1976 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1812 + this_rq->timestamp_last_tick; 1977 + this_rq->timestamp_last_tick;
@@ -1853,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1853 return 1; 2018 return 1;
1854} 2019}
1855 2020
2021#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1856/* 2022/*
1857 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2023 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1858 * as part of a balancing operation within "domain". Returns the number of 2024 * load from busiest to this_rq, as part of a balancing operation within
1859 * tasks moved. 2025 * "domain". Returns the number of tasks moved.
1860 * 2026 *
1861 * Called with both runqueues locked. 2027 * Called with both runqueues locked.
1862 */ 2028 */
1863static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2029static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1864 unsigned long max_nr_move, struct sched_domain *sd, 2030 unsigned long max_nr_move, unsigned long max_load_move,
1865 enum idle_type idle, int *all_pinned) 2031 struct sched_domain *sd, enum idle_type idle,
2032 int *all_pinned)
1866{ 2033{
1867 prio_array_t *array, *dst_array; 2034 prio_array_t *array, *dst_array;
1868 struct list_head *head, *curr; 2035 struct list_head *head, *curr;
1869 int idx, pulled = 0, pinned = 0; 2036 int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
2037 int busiest_best_prio_seen;
2038 int skip_for_load; /* skip the task based on weighted load issues */
2039 long rem_load_move;
1870 task_t *tmp; 2040 task_t *tmp;
1871 2041
1872 if (max_nr_move == 0) 2042 if (max_nr_move == 0 || max_load_move == 0)
1873 goto out; 2043 goto out;
1874 2044
2045 rem_load_move = max_load_move;
1875 pinned = 1; 2046 pinned = 1;
2047 this_best_prio = rq_best_prio(this_rq);
2048 busiest_best_prio = rq_best_prio(busiest);
2049 /*
2050 * Enable handling of the case where there is more than one task
2051 * with the best priority. If the current running task is one
2052 * of those with prio==busiest_best_prio we know it won't be moved
2053 * and therefore it's safe to override the skip (based on load) of
2054 * any task we find with that prio.
2055 */
2056 busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
1876 2057
1877 /* 2058 /*
1878 * We first consider expired tasks. Those will likely not be 2059 * We first consider expired tasks. Those will likely not be
@@ -1912,7 +2093,17 @@ skip_queue:
1912 2093
1913 curr = curr->prev; 2094 curr = curr->prev;
1914 2095
1915 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2096 /*
2097 * To help distribute high priority tasks accross CPUs we don't
2098 * skip a task if it will be the highest priority task (i.e. smallest
2099 * prio value) on its new queue regardless of its load weight
2100 */
2101 skip_for_load = tmp->load_weight > rem_load_move;
2102 if (skip_for_load && idx < this_best_prio)
2103 skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
2104 if (skip_for_load ||
2105 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2106 busiest_best_prio_seen |= idx == busiest_best_prio;
1916 if (curr != head) 2107 if (curr != head)
1917 goto skip_queue; 2108 goto skip_queue;
1918 idx++; 2109 idx++;
@@ -1926,9 +2117,15 @@ skip_queue:
1926 2117
1927 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2118 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1928 pulled++; 2119 pulled++;
2120 rem_load_move -= tmp->load_weight;
1929 2121
1930 /* We only want to steal up to the prescribed number of tasks. */ 2122 /*
1931 if (pulled < max_nr_move) { 2123 * We only want to steal up to the prescribed number of tasks
2124 * and the prescribed amount of weighted load.
2125 */
2126 if (pulled < max_nr_move && rem_load_move > 0) {
2127 if (idx < this_best_prio)
2128 this_best_prio = idx;
1932 if (curr != head) 2129 if (curr != head)
1933 goto skip_queue; 2130 goto skip_queue;
1934 idx++; 2131 idx++;
@@ -1949,7 +2146,7 @@ out:
1949 2146
1950/* 2147/*
1951 * find_busiest_group finds and returns the busiest CPU group within the 2148 * find_busiest_group finds and returns the busiest CPU group within the
1952 * domain. It calculates and returns the number of tasks which should be 2149 * domain. It calculates and returns the amount of weighted load which should be
1953 * moved to restore balance via the imbalance parameter. 2150 * moved to restore balance via the imbalance parameter.
1954 */ 2151 */
1955static struct sched_group * 2152static struct sched_group *
@@ -1959,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1959 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2156 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1960 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2157 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1961 unsigned long max_pull; 2158 unsigned long max_pull;
2159 unsigned long busiest_load_per_task, busiest_nr_running;
2160 unsigned long this_load_per_task, this_nr_running;
1962 int load_idx; 2161 int load_idx;
2162#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2163 int power_savings_balance = 1;
2164 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2165 unsigned long min_nr_running = ULONG_MAX;
2166 struct sched_group *group_min = NULL, *group_leader = NULL;
2167#endif
1963 2168
1964 max_load = this_load = total_load = total_pwr = 0; 2169 max_load = this_load = total_load = total_pwr = 0;
2170 busiest_load_per_task = busiest_nr_running = 0;
2171 this_load_per_task = this_nr_running = 0;
1965 if (idle == NOT_IDLE) 2172 if (idle == NOT_IDLE)
1966 load_idx = sd->busy_idx; 2173 load_idx = sd->busy_idx;
1967 else if (idle == NEWLY_IDLE) 2174 else if (idle == NEWLY_IDLE)
@@ -1970,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1970 load_idx = sd->idle_idx; 2177 load_idx = sd->idle_idx;
1971 2178
1972 do { 2179 do {
1973 unsigned long load; 2180 unsigned long load, group_capacity;
1974 int local_group; 2181 int local_group;
1975 int i; 2182 int i;
2183 unsigned long sum_nr_running, sum_weighted_load;
1976 2184
1977 local_group = cpu_isset(this_cpu, group->cpumask); 2185 local_group = cpu_isset(this_cpu, group->cpumask);
1978 2186
1979 /* Tally up the load of all CPUs in the group */ 2187 /* Tally up the load of all CPUs in the group */
1980 avg_load = 0; 2188 sum_weighted_load = sum_nr_running = avg_load = 0;
1981 2189
1982 for_each_cpu_mask(i, group->cpumask) { 2190 for_each_cpu_mask(i, group->cpumask) {
2191 runqueue_t *rq = cpu_rq(i);
2192
1983 if (*sd_idle && !idle_cpu(i)) 2193 if (*sd_idle && !idle_cpu(i))
1984 *sd_idle = 0; 2194 *sd_idle = 0;
1985 2195
@@ -1990,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1990 load = source_load(i, load_idx); 2200 load = source_load(i, load_idx);
1991 2201
1992 avg_load += load; 2202 avg_load += load;
2203 sum_nr_running += rq->nr_running;
2204 sum_weighted_load += rq->raw_weighted_load;
1993 } 2205 }
1994 2206
1995 total_load += avg_load; 2207 total_load += avg_load;
@@ -1998,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1998 /* Adjust by relative CPU power of the group */ 2210 /* Adjust by relative CPU power of the group */
1999 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2211 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2000 2212
2213 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2214
2001 if (local_group) { 2215 if (local_group) {
2002 this_load = avg_load; 2216 this_load = avg_load;
2003 this = group; 2217 this = group;
2004 } else if (avg_load > max_load) { 2218 this_nr_running = sum_nr_running;
2219 this_load_per_task = sum_weighted_load;
2220 } else if (avg_load > max_load &&
2221 sum_nr_running > group_capacity) {
2005 max_load = avg_load; 2222 max_load = avg_load;
2006 busiest = group; 2223 busiest = group;
2224 busiest_nr_running = sum_nr_running;
2225 busiest_load_per_task = sum_weighted_load;
2007 } 2226 }
2227
2228#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2229 /*
2230 * Busy processors will not participate in power savings
2231 * balance.
2232 */
2233 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2234 goto group_next;
2235
2236 /*
2237 * If the local group is idle or completely loaded
2238 * there is no need to do power savings balance at this domain
2239 */
2240 if (local_group && (this_nr_running >= group_capacity ||
2241 !this_nr_running))
2242 power_savings_balance = 0;
2243
2244 /*
2245 * If a group is already running at full capacity or idle,
2246 * don't include that group in power savings calculations
2247 */
2248 if (!power_savings_balance || sum_nr_running >= group_capacity
2249 || !sum_nr_running)
2250 goto group_next;
2251
2252 /*
2253 * Calculate the group which has the least non-idle load.
2254 * This is the group from which we need to pick up the load
2255 * for saving power
2256 */
2257 if ((sum_nr_running < min_nr_running) ||
2258 (sum_nr_running == min_nr_running &&
2259 first_cpu(group->cpumask) <
2260 first_cpu(group_min->cpumask))) {
2261 group_min = group;
2262 min_nr_running = sum_nr_running;
2263 min_load_per_task = sum_weighted_load /
2264 sum_nr_running;
2265 }
2266
2267 /*
2268 * Calculate the group which is nearly at its
2269 * capacity but still has some space to pick up some load
2270 * from other group and save more power
2271 */
2272 if (sum_nr_running <= group_capacity - 1)
2273 if (sum_nr_running > leader_nr_running ||
2274 (sum_nr_running == leader_nr_running &&
2275 first_cpu(group->cpumask) >
2276 first_cpu(group_leader->cpumask))) {
2277 group_leader = group;
2278 leader_nr_running = sum_nr_running;
2279 }
2280
2281group_next:
2282#endif
2008 group = group->next; 2283 group = group->next;
2009 } while (group != sd->groups); 2284 } while (group != sd->groups);
2010 2285
2011 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2286 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2012 goto out_balanced; 2287 goto out_balanced;
2013 2288
2014 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2289 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
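
Both the per-group and the domain-wide averages are expressed in SCHED_LOAD_SCALE units relative to cpu_power, so groups of different capacity compare on one scale. A quick numeric check of the two scalings above (SCHED_LOAD_SCALE is 128 here, i.e. 1 << 7; the raw loads are made up):

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

int main(void)
{
	unsigned long raw_group_load = 300, group_cpu_power = 256;
	unsigned long total_load = 900, total_pwr = 512;

	/* per-group: avg_load = (avg_load * SCHED_LOAD_SCALE) / cpu_power */
	printf("group avg_load  = %lu\n",
	       raw_group_load * SCHED_LOAD_SCALE / group_cpu_power);	/* 150 */

	/* domain-wide: avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr */
	printf("domain avg_load = %lu\n",
	       SCHED_LOAD_SCALE * total_load / total_pwr);		/* 225 */
	return 0;
}
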
@@ -2017,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2017 100*max_load <= sd->imbalance_pct*this_load) 2292 100*max_load <= sd->imbalance_pct*this_load)
2018 goto out_balanced; 2293 goto out_balanced;
2019 2294
2295 busiest_load_per_task /= busiest_nr_running;
2020 /* 2296 /*
2021 * We're trying to get all the cpus to the average_load, so we don't 2297 * We're trying to get all the cpus to the average_load, so we don't
2022 * want to push ourselves above the average load, nor do we wish to 2298 * want to push ourselves above the average load, nor do we wish to
@@ -2028,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2028 * by pulling tasks to us. Be careful of negative numbers as they'll 2304 * by pulling tasks to us. Be careful of negative numbers as they'll
2029 * appear as very large values with unsigned longs. 2305 * appear as very large values with unsigned longs.
2030 */ 2306 */
2307 if (max_load <= busiest_load_per_task)
2308 goto out_balanced;
2309
2310 /*
2311 * In the presence of smp nice balancing, certain scenarios can have
 2312 * max load less than avg load (as we skip the groups at or below
 2313 * their cpu_power while calculating max_load...)
2314 */
2315 if (max_load < avg_load) {
2316 *imbalance = 0;
2317 goto small_imbalance;
2318 }
2031 2319
2032 /* Don't want to pull so many tasks that a group would go idle */ 2320 /* Don't want to pull so many tasks that a group would go idle */
2033 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2321 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2034 2322
2035 /* How much load to actually move to equalise the imbalance */ 2323 /* How much load to actually move to equalise the imbalance */
2036 *imbalance = min(max_pull * busiest->cpu_power, 2324 *imbalance = min(max_pull * busiest->cpu_power,
2037 (avg_load - this_load) * this->cpu_power) 2325 (avg_load - this_load) * this->cpu_power)
2038 / SCHED_LOAD_SCALE; 2326 / SCHED_LOAD_SCALE;
2039 2327
2040 if (*imbalance < SCHED_LOAD_SCALE) { 2328 /*
2041 unsigned long pwr_now = 0, pwr_move = 0; 2329 * if *imbalance is less than the average load per runnable task
 2330 * there is no guarantee that any tasks will be moved so we'll have
2331 * a think about bumping its value to force at least one task to be
2332 * moved
2333 */
2334 if (*imbalance < busiest_load_per_task) {
2335 unsigned long pwr_now, pwr_move;
2042 unsigned long tmp; 2336 unsigned long tmp;
2337 unsigned int imbn;
2338
2339small_imbalance:
2340 pwr_move = pwr_now = 0;
2341 imbn = 2;
2342 if (this_nr_running) {
2343 this_load_per_task /= this_nr_running;
2344 if (busiest_load_per_task > this_load_per_task)
2345 imbn = 1;
2346 } else
2347 this_load_per_task = SCHED_LOAD_SCALE;
2043 2348
2044 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2349 if (max_load - this_load >= busiest_load_per_task * imbn) {
2045 *imbalance = 1; 2350 *imbalance = busiest_load_per_task;
2046 return busiest; 2351 return busiest;
2047 } 2352 }
2048 2353
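
When the computed imbalance falls below one busiest-queue task, the code above decides whether moving a single task still helps: imbn defaults to 2, but drops to 1 when the busiest queue's tasks are heavier than ours, so a single move is accepted on a smaller gap. A toy version of that test (hypothetical loads, SCHED_LOAD_SCALE as above):

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

/* returns nonzero if moving one task from busiest is worthwhile */
static int single_move_helps(unsigned long max_load, unsigned long this_load,
			     unsigned long busiest_load_per_task,
			     unsigned long this_load_per_task,
			     unsigned long this_nr_running)
{
	unsigned int imbn = 2;

	if (this_nr_running) {
		this_load_per_task /= this_nr_running;
		if (busiest_load_per_task > this_load_per_task)
			imbn = 1;	/* their tasks are heavier than ours */
	} else
		this_load_per_task = SCHED_LOAD_SCALE;

	return max_load - this_load >= busiest_load_per_task * imbn;
}

int main(void)
{
	/* a gap of one heavy task is enough when imbn drops to 1 */
	printf("%d\n", single_move_helps(400, 200, 200, 128, 1));	/* 1 */
	/* equal-weight tasks need a two-task gap */
	printf("%d\n", single_move_helps(300, 200, 128, 128, 1));	/* 0 */
	return 0;
}
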
@@ -2052,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2052 * moving them. 2357 * moving them.
2053 */ 2358 */
2054 2359
2055 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2360 pwr_now += busiest->cpu_power *
2056 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2361 min(busiest_load_per_task, max_load);
2362 pwr_now += this->cpu_power *
2363 min(this_load_per_task, this_load);
2057 pwr_now /= SCHED_LOAD_SCALE; 2364 pwr_now /= SCHED_LOAD_SCALE;
2058 2365
2059 /* Amount of load we'd subtract */ 2366 /* Amount of load we'd subtract */
2060 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2367 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2061 if (max_load > tmp) 2368 if (max_load > tmp)
2062 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2369 pwr_move += busiest->cpu_power *
2063 max_load - tmp); 2370 min(busiest_load_per_task, max_load - tmp);
2064 2371
2065 /* Amount of load we'd add */ 2372 /* Amount of load we'd add */
2066 if (max_load*busiest->cpu_power < 2373 if (max_load*busiest->cpu_power <
2067 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2374 busiest_load_per_task*SCHED_LOAD_SCALE)
2068 tmp = max_load*busiest->cpu_power/this->cpu_power; 2375 tmp = max_load*busiest->cpu_power/this->cpu_power;
2069 else 2376 else
2070 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2377 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2071 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2378 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2072 pwr_move /= SCHED_LOAD_SCALE; 2379 pwr_move /= SCHED_LOAD_SCALE;
2073 2380
2074 /* Move if we gain throughput */ 2381 /* Move if we gain throughput */
2075 if (pwr_move <= pwr_now) 2382 if (pwr_move <= pwr_now)
2076 goto out_balanced; 2383 goto out_balanced;
2077 2384
2078 *imbalance = 1; 2385 *imbalance = busiest_load_per_task;
2079 return busiest;
2080 } 2386 }
2081 2387
2082 /* Get rid of the scaling factor, rounding down as we divide */
2083 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2084 return busiest; 2388 return busiest;
2085 2389
2086out_balanced: 2390out_balanced:
2391#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2392 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2393 goto ret;
2087 2394
2395 if (this == group_leader && group_leader != group_min) {
2396 *imbalance = min_load_per_task;
2397 return group_min;
2398 }
2399ret:
2400#endif
2088 *imbalance = 0; 2401 *imbalance = 0;
2089 return NULL; 2402 return NULL;
2090} 2403}
@@ -2093,18 +2406,21 @@ out_balanced:
2093 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2406 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2094 */ 2407 */
2095static runqueue_t *find_busiest_queue(struct sched_group *group, 2408static runqueue_t *find_busiest_queue(struct sched_group *group,
2096 enum idle_type idle) 2409 enum idle_type idle, unsigned long imbalance)
2097{ 2410{
2098 unsigned long load, max_load = 0; 2411 unsigned long max_load = 0;
2099 runqueue_t *busiest = NULL; 2412 runqueue_t *busiest = NULL, *rqi;
2100 int i; 2413 int i;
2101 2414
2102 for_each_cpu_mask(i, group->cpumask) { 2415 for_each_cpu_mask(i, group->cpumask) {
2103 load = source_load(i, 0); 2416 rqi = cpu_rq(i);
2417
2418 if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
2419 continue;
2104 2420
2105 if (load > max_load) { 2421 if (rqi->raw_weighted_load > max_load) {
2106 max_load = load; 2422 max_load = rqi->raw_weighted_load;
2107 busiest = cpu_rq(i); 2423 busiest = rqi;
2108 } 2424 }
2109 } 2425 }
2110 2426
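
find_busiest_queue() now ranks runqueues by raw weighted load rather than source_load(), and refuses a queue whose single task already outweighs the requested imbalance, since pulling that task would only shift the overload. The selection logic in isolation (a toy struct stands in for runqueue_t):

#include <stddef.h>
#include <stdio.h>

struct toy_rq {
	unsigned long nr_running;
	unsigned long raw_weighted_load;
};

static struct toy_rq *pick_busiest(struct toy_rq *rqs, int n,
				   unsigned long imbalance)
{
	struct toy_rq *busiest = NULL;
	unsigned long max_load = 0;
	int i;

	for (i = 0; i < n; i++) {
		/* a lone task heavier than the imbalance cannot help */
		if (rqs[i].nr_running == 1 &&
		    rqs[i].raw_weighted_load > imbalance)
			continue;
		if (rqs[i].raw_weighted_load > max_load) {
			max_load = rqs[i].raw_weighted_load;
			busiest = &rqs[i];
		}
	}
	return busiest;
}

int main(void)
{
	struct toy_rq rqs[] = { {1, 400}, {3, 300}, {2, 100} };

	/* rqs[0] is heaviest but single-task; rqs[1] is chosen */
	printf("picked load %lu\n",
	       pick_busiest(rqs, 3, 128)->raw_weighted_load);	/* 300 */
	return 0;
}
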
@@ -2117,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2117 */ 2433 */
2118#define MAX_PINNED_INTERVAL 512 2434#define MAX_PINNED_INTERVAL 512
2119 2435
2436#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
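
minus_1_or_zero() is what lets the move_tasks() callers below pull at most nr_running - 1 tasks, so the source queue is never drained to empty. A two-case check:

#include <stdio.h>

#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)

int main(void)
{
	printf("%d %d\n", minus_1_or_zero(5), minus_1_or_zero(0));	/* 4 0 */
	return 0;
}
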
2120/* 2437/*
2121 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2438 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2122 * tasks if there is an imbalance. 2439 * tasks if there is an imbalance.
@@ -2133,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2133 int active_balance = 0; 2450 int active_balance = 0;
2134 int sd_idle = 0; 2451 int sd_idle = 0;
2135 2452
2136 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2453 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2454 !sched_smt_power_savings)
2137 sd_idle = 1; 2455 sd_idle = 1;
2138 2456
2139 schedstat_inc(sd, lb_cnt[idle]); 2457 schedstat_inc(sd, lb_cnt[idle]);
@@ -2144,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2144 goto out_balanced; 2462 goto out_balanced;
2145 } 2463 }
2146 2464
2147 busiest = find_busiest_queue(group, idle); 2465 busiest = find_busiest_queue(group, idle, imbalance);
2148 if (!busiest) { 2466 if (!busiest) {
2149 schedstat_inc(sd, lb_nobusyq[idle]); 2467 schedstat_inc(sd, lb_nobusyq[idle]);
2150 goto out_balanced; 2468 goto out_balanced;
@@ -2164,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2164 */ 2482 */
2165 double_rq_lock(this_rq, busiest); 2483 double_rq_lock(this_rq, busiest);
2166 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2484 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2485 minus_1_or_zero(busiest->nr_running),
2167 imbalance, sd, idle, &all_pinned); 2486 imbalance, sd, idle, &all_pinned);
2168 double_rq_unlock(this_rq, busiest); 2487 double_rq_unlock(this_rq, busiest);
2169 2488
@@ -2221,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2221 sd->balance_interval *= 2; 2540 sd->balance_interval *= 2;
2222 } 2541 }
2223 2542
2224 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2543 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2544 !sched_smt_power_savings)
2225 return -1; 2545 return -1;
2226 return nr_moved; 2546 return nr_moved;
2227 2547
@@ -2236,7 +2556,7 @@ out_one_pinned:
2236 (sd->balance_interval < sd->max_interval)) 2556 (sd->balance_interval < sd->max_interval))
2237 sd->balance_interval *= 2; 2557 sd->balance_interval *= 2;
2238 2558
2239 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2559 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2240 return -1; 2560 return -1;
2241 return 0; 2561 return 0;
2242} 2562}
@@ -2257,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2257 int nr_moved = 0; 2577 int nr_moved = 0;
2258 int sd_idle = 0; 2578 int sd_idle = 0;
2259 2579
2260 if (sd->flags & SD_SHARE_CPUPOWER) 2580 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2261 sd_idle = 1; 2581 sd_idle = 1;
2262 2582
2263 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2583 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2267,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2267 goto out_balanced; 2587 goto out_balanced;
2268 } 2588 }
2269 2589
2270 busiest = find_busiest_queue(group, NEWLY_IDLE); 2590 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
2271 if (!busiest) { 2591 if (!busiest) {
2272 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2592 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2273 goto out_balanced; 2593 goto out_balanced;
@@ -2282,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2282 /* Attempt to move tasks */ 2602 /* Attempt to move tasks */
2283 double_lock_balance(this_rq, busiest); 2603 double_lock_balance(this_rq, busiest);
2284 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2604 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2605 minus_1_or_zero(busiest->nr_running),
2285 imbalance, sd, NEWLY_IDLE, NULL); 2606 imbalance, sd, NEWLY_IDLE, NULL);
2286 spin_unlock(&busiest->lock); 2607 spin_unlock(&busiest->lock);
2287 } 2608 }
@@ -2297,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2297 2618
2298out_balanced: 2619out_balanced:
2299 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2620 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2300 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2621 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2301 return -1; 2622 return -1;
2302 sd->nr_balance_failed = 0; 2623 sd->nr_balance_failed = 0;
2303 return 0; 2624 return 0;
@@ -2352,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2352 double_lock_balance(busiest_rq, target_rq); 2673 double_lock_balance(busiest_rq, target_rq);
2353 2674
2354 /* Search for an sd spanning us and the target CPU. */ 2675 /* Search for an sd spanning us and the target CPU. */
2355 for_each_domain(target_cpu, sd) 2676 for_each_domain(target_cpu, sd) {
2356 if ((sd->flags & SD_LOAD_BALANCE) && 2677 if ((sd->flags & SD_LOAD_BALANCE) &&
2357 cpu_isset(busiest_cpu, sd->span)) 2678 cpu_isset(busiest_cpu, sd->span))
2358 break; 2679 break;
2680 }
2359 2681
2360 if (unlikely(sd == NULL)) 2682 if (unlikely(sd == NULL))
2361 goto out; 2683 goto out;
2362 2684
2363 schedstat_inc(sd, alb_cnt); 2685 schedstat_inc(sd, alb_cnt);
2364 2686
2365 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2687 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2688 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
2366 schedstat_inc(sd, alb_pushed); 2689 schedstat_inc(sd, alb_pushed);
2367 else 2690 else
2368 schedstat_inc(sd, alb_failed); 2691 schedstat_inc(sd, alb_failed);
@@ -2390,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2390 struct sched_domain *sd; 2713 struct sched_domain *sd;
2391 int i; 2714 int i;
2392 2715
2393 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2716 this_load = this_rq->raw_weighted_load;
2394 /* Update our load */ 2717 /* Update our load */
2395 for (i = 0; i < 3; i++) { 2718 for (i = 0; i < 3; i++) {
2396 unsigned long new_load = this_load; 2719 unsigned long new_load = this_load;
@@ -2691,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2691 resched_task(rq->idle); 3014 resched_task(rq->idle);
2692} 3015}
2693 3016
2694static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3017/*
 3018 * Called with interrupts disabled and this_cpu's runqueue locked.
3019 */
3020static void wake_sleeping_dependent(int this_cpu)
2695{ 3021{
2696 struct sched_domain *tmp, *sd = NULL; 3022 struct sched_domain *tmp, *sd = NULL;
2697 cpumask_t sibling_map;
2698 int i; 3023 int i;
2699 3024
2700 for_each_domain(this_cpu, tmp) 3025 for_each_domain(this_cpu, tmp) {
2701 if (tmp->flags & SD_SHARE_CPUPOWER) 3026 if (tmp->flags & SD_SHARE_CPUPOWER) {
2702 sd = tmp; 3027 sd = tmp;
3028 break;
3029 }
3030 }
2703 3031
2704 if (!sd) 3032 if (!sd)
2705 return; 3033 return;
2706 3034
2707 /* 3035 for_each_cpu_mask(i, sd->span) {
2708 * Unlock the current runqueue because we have to lock in
2709 * CPU order to avoid deadlocks. Caller knows that we might
2710 * unlock. We keep IRQs disabled.
2711 */
2712 spin_unlock(&this_rq->lock);
2713
2714 sibling_map = sd->span;
2715
2716 for_each_cpu_mask(i, sibling_map)
2717 spin_lock(&cpu_rq(i)->lock);
2718 /*
2719 * We clear this CPU from the mask. This both simplifies the
2720 * inner loop and keeps this_rq locked when we exit:
2721 */
2722 cpu_clear(this_cpu, sibling_map);
2723
2724 for_each_cpu_mask(i, sibling_map) {
2725 runqueue_t *smt_rq = cpu_rq(i); 3036 runqueue_t *smt_rq = cpu_rq(i);
2726 3037
3038 if (i == this_cpu)
3039 continue;
3040 if (unlikely(!spin_trylock(&smt_rq->lock)))
3041 continue;
3042
2727 wakeup_busy_runqueue(smt_rq); 3043 wakeup_busy_runqueue(smt_rq);
3044 spin_unlock(&smt_rq->lock);
2728 } 3045 }
2729
2730 for_each_cpu_mask(i, sibling_map)
2731 spin_unlock(&cpu_rq(i)->lock);
2732 /*
2733 * We exit with this_cpu's rq still held and IRQs
2734 * still disabled:
2735 */
2736} 3046}
2737 3047
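
The old version dropped this_rq->lock and re-took every sibling lock in CPU order to avoid deadlock; the rewrite keeps this_rq locked and simply trylocks each sibling, skipping any it cannot get. The same pattern in portable pthreads form (a hypothetical do_wakeup() stands in for wakeup_busy_runqueue()):

#include <pthread.h>
#include <stdio.h>

#define NSIBLINGS 4

static pthread_mutex_t sibling_lock[NSIBLINGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

static void do_wakeup(int i)	/* stands in for wakeup_busy_runqueue() */
{
	printf("woke sibling %d\n", i);
}

static void wake_siblings(int self)
{
	int i;

	for (i = 0; i < NSIBLINGS; i++) {
		if (i == self)
			continue;
		/* opportunistic: no lock ordering needed, we never block */
		if (pthread_mutex_trylock(&sibling_lock[i]) != 0)
			continue;
		do_wakeup(i);
		pthread_mutex_unlock(&sibling_lock[i]);
	}
}

int main(void)
{
	wake_siblings(0);
	return 0;
}
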
2738/* 3048/*
@@ -2745,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2745 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3055 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2746} 3056}
2747 3057
2748static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3058/*
 3059 * To minimise lock contention and avoid having to drop this_rq's runqueue
 3060 * lock we only trylock the sibling runqueues and bypass any whose lock we
 3061 * fail to acquire. As we only trylock, the normal locking order does not
 3062 * need to be obeyed.
3063 */
3064static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
2749{ 3065{
2750 struct sched_domain *tmp, *sd = NULL; 3066 struct sched_domain *tmp, *sd = NULL;
2751 cpumask_t sibling_map;
2752 prio_array_t *array;
2753 int ret = 0, i; 3067 int ret = 0, i;
2754 task_t *p;
2755 3068
2756 for_each_domain(this_cpu, tmp) 3069 /* kernel/rt threads do not participate in dependent sleeping */
2757 if (tmp->flags & SD_SHARE_CPUPOWER) 3070 if (!p->mm || rt_task(p))
3071 return 0;
3072
3073 for_each_domain(this_cpu, tmp) {
3074 if (tmp->flags & SD_SHARE_CPUPOWER) {
2758 sd = tmp; 3075 sd = tmp;
3076 break;
3077 }
3078 }
2759 3079
2760 if (!sd) 3080 if (!sd)
2761 return 0; 3081 return 0;
2762 3082
2763 /* 3083 for_each_cpu_mask(i, sd->span) {
2764 * The same locking rules and details apply as for 3084 runqueue_t *smt_rq;
2765 * wake_sleeping_dependent(): 3085 task_t *smt_curr;
2766 */
2767 spin_unlock(&this_rq->lock);
2768 sibling_map = sd->span;
2769 for_each_cpu_mask(i, sibling_map)
2770 spin_lock(&cpu_rq(i)->lock);
2771 cpu_clear(this_cpu, sibling_map);
2772 3086
2773 /* 3087 if (i == this_cpu)
2774 * Establish next task to be run - it might have gone away because 3088 continue;
2775 * we released the runqueue lock above:
2776 */
2777 if (!this_rq->nr_running)
2778 goto out_unlock;
2779 array = this_rq->active;
2780 if (!array->nr_active)
2781 array = this_rq->expired;
2782 BUG_ON(!array->nr_active);
2783 3089
2784 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3090 smt_rq = cpu_rq(i);
2785 task_t, run_list); 3091 if (unlikely(!spin_trylock(&smt_rq->lock)))
3092 continue;
2786 3093
2787 for_each_cpu_mask(i, sibling_map) { 3094 smt_curr = smt_rq->curr;
2788 runqueue_t *smt_rq = cpu_rq(i);
2789 task_t *smt_curr = smt_rq->curr;
2790 3095
2791 /* Kernel threads do not participate in dependent sleeping */ 3096 if (!smt_curr->mm)
2792 if (!p->mm || !smt_curr->mm || rt_task(p)) 3097 goto unlock;
2793 goto check_smt_task;
2794 3098
2795 /* 3099 /*
2796 * If a user task with lower static priority than the 3100 * If a user task with lower static priority than the
@@ -2808,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2808 if ((jiffies % DEF_TIMESLICE) > 3112 if ((jiffies % DEF_TIMESLICE) >
2809 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3113 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2810 ret = 1; 3114 ret = 1;
2811 } else 3115 } else {
2812 if (smt_curr->static_prio < p->static_prio && 3116 if (smt_curr->static_prio < p->static_prio &&
2813 !TASK_PREEMPTS_CURR(p, smt_rq) && 3117 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2814 smt_slice(smt_curr, sd) > task_timeslice(p)) 3118 smt_slice(smt_curr, sd) > task_timeslice(p))
2815 ret = 1; 3119 ret = 1;
2816
2817check_smt_task:
2818 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2819 rt_task(smt_curr))
2820 continue;
2821 if (!p->mm) {
2822 wakeup_busy_runqueue(smt_rq);
2823 continue;
2824 }
2825
2826 /*
2827 * Reschedule a lower priority task on the SMT sibling for
2828 * it to be put to sleep, or wake it up if it has been put to
2829 * sleep for priority reasons to see if it should run now.
2830 */
2831 if (rt_task(p)) {
2832 if ((jiffies % DEF_TIMESLICE) >
2833 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2834 resched_task(smt_curr);
2835 } else {
2836 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2837 smt_slice(p, sd) > task_timeslice(smt_curr))
2838 resched_task(smt_curr);
2839 else
2840 wakeup_busy_runqueue(smt_rq);
2841 } 3120 }
3121unlock:
3122 spin_unlock(&smt_rq->lock);
2842 } 3123 }
2843out_unlock:
2844 for_each_cpu_mask(i, sibling_map)
2845 spin_unlock(&cpu_rq(i)->lock);
2846 return ret; 3124 return ret;
2847} 3125}
2848#else 3126#else
2849static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3127static inline void wake_sleeping_dependent(int this_cpu)
2850{ 3128{
2851} 3129}
2852 3130
2853static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3131static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
3132 task_t *p)
2854{ 3133{
2855 return 0; 3134 return 0;
2856} 3135}
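
When the sibling runs an RT task, the check above idles this CPU whenever jiffies modulo DEF_TIMESLICE falls past per_cpu_gain percent of the slice, so the RT sibling gets the core to itself roughly (100 - per_cpu_gain)% of the time. A quick count of that duty cycle (the slice length and the 25% gain are assumed values for illustration):

#include <stdio.h>

#define DEF_TIMESLICE 100	/* assumed slice length in jiffies */

int main(void)
{
	int per_cpu_gain = 25;	/* assumed SMT gain, in percent */
	int jiffies, idled = 0;

	for (jiffies = 0; jiffies < DEF_TIMESLICE; jiffies++)
		if ((jiffies % DEF_TIMESLICE) >
		    (per_cpu_gain * DEF_TIMESLICE / 100))
			idled++;	/* this CPU yields to idle this tick */

	printf("this cpu idles %d of %d ticks\n", idled, DEF_TIMESLICE);
	return 0;
}
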
@@ -2972,32 +3251,13 @@ need_resched_nonpreemptible:
2972 3251
2973 cpu = smp_processor_id(); 3252 cpu = smp_processor_id();
2974 if (unlikely(!rq->nr_running)) { 3253 if (unlikely(!rq->nr_running)) {
2975go_idle:
2976 idle_balance(cpu, rq); 3254 idle_balance(cpu, rq);
2977 if (!rq->nr_running) { 3255 if (!rq->nr_running) {
2978 next = rq->idle; 3256 next = rq->idle;
2979 rq->expired_timestamp = 0; 3257 rq->expired_timestamp = 0;
2980 wake_sleeping_dependent(cpu, rq); 3258 wake_sleeping_dependent(cpu);
2981 /*
2982 * wake_sleeping_dependent() might have released
2983 * the runqueue, so break out if we got new
2984 * tasks meanwhile:
2985 */
2986 if (!rq->nr_running)
2987 goto switch_tasks;
2988 }
2989 } else {
2990 if (dependent_sleeper(cpu, rq)) {
2991 next = rq->idle;
2992 goto switch_tasks; 3259 goto switch_tasks;
2993 } 3260 }
2994 /*
2995 * dependent_sleeper() releases and reacquires the runqueue
2996 * lock, hence go into the idle loop if the rq went
2997 * empty meanwhile:
2998 */
2999 if (unlikely(!rq->nr_running))
3000 goto go_idle;
3001 } 3261 }
3002 3262
3003 array = rq->active; 3263 array = rq->active;
@@ -3035,6 +3295,8 @@ go_idle:
3035 } 3295 }
3036 } 3296 }
3037 next->sleep_type = SLEEP_NORMAL; 3297 next->sleep_type = SLEEP_NORMAL;
3298 if (dependent_sleeper(cpu, rq, next))
3299 next = rq->idle;
3038switch_tasks: 3300switch_tasks:
3039 if (next == rq->idle) 3301 if (next == rq->idle)
3040 schedstat_inc(rq, sched_goidle); 3302 schedstat_inc(rq, sched_goidle);
@@ -3478,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3478 3740
3479EXPORT_SYMBOL(sleep_on_timeout); 3741EXPORT_SYMBOL(sleep_on_timeout);
3480 3742
3743#ifdef CONFIG_RT_MUTEXES
3744
3745/*
3746 * rt_mutex_setprio - set the current priority of a task
3747 * @p: task
3748 * @prio: prio value (kernel-internal form)
3749 *
3750 * This function changes the 'effective' priority of a task. It does
3751 * not touch ->normal_prio like __setscheduler().
3752 *
3753 * Used by the rt_mutex code to implement priority inheritance logic.
3754 */
3755void rt_mutex_setprio(task_t *p, int prio)
3756{
3757 unsigned long flags;
3758 prio_array_t *array;
3759 runqueue_t *rq;
3760 int oldprio;
3761
3762 BUG_ON(prio < 0 || prio > MAX_PRIO);
3763
3764 rq = task_rq_lock(p, &flags);
3765
3766 oldprio = p->prio;
3767 array = p->array;
3768 if (array)
3769 dequeue_task(p, array);
3770 p->prio = prio;
3771
3772 if (array) {
3773 /*
3774 * If changing to an RT priority then queue it
3775 * in the active array!
3776 */
3777 if (rt_task(p))
3778 array = rq->active;
3779 enqueue_task(p, array);
3780 /*
3781 * Reschedule if we are currently running on this runqueue and
3782 * our priority decreased, or if we are not currently running on
3783 * this runqueue and our priority is higher than the current's
3784 */
3785 if (task_running(rq, p)) {
3786 if (p->prio > oldprio)
3787 resched_task(rq->curr);
3788 } else if (TASK_PREEMPTS_CURR(p, rq))
3789 resched_task(rq->curr);
3790 }
3791 task_rq_unlock(rq, &flags);
3792}
3793
3794#endif
3795
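
rt_mutex_getprio(), used by __setscheduler() further down, returns the task's normal priority or the priority of its highest-priority rt_mutex waiter, whichever is higher (numerically lower in kernel-internal form); rt_mutex_setprio() above then applies that value. A sketch of the boost computation with made-up priority values:

#include <stdio.h>

/*
 * Kernel-internal prio: 0..99 are RT (lower is higher priority),
 * 100..139 map the nice range. A PI boost can only raise priority.
 */
static int pi_boosted_prio(int normal_prio, int top_waiter_prio)
{
	return top_waiter_prio < normal_prio ? top_waiter_prio : normal_prio;
}

int main(void)
{
	/* nice-0 task (prio 120) blocked on by a waiter at kernel prio 50 */
	printf("effective prio %d\n", pi_boosted_prio(120, 50));	/* 50 */
	/* waiter no higher than us: no boost */
	printf("effective prio %d\n", pi_boosted_prio(120, 130));	/* 120 */
	return 0;
}
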
3481void set_user_nice(task_t *p, long nice) 3796void set_user_nice(task_t *p, long nice)
3482{ 3797{
3483 unsigned long flags; 3798 unsigned long flags;
3484 prio_array_t *array; 3799 prio_array_t *array;
3485 runqueue_t *rq; 3800 runqueue_t *rq;
3486 int old_prio, new_prio, delta; 3801 int old_prio, delta;
3487 3802
3488 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3803 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3489 return; 3804 return;
@@ -3498,22 +3813,25 @@ void set_user_nice(task_t *p, long nice)
 3498 * it won't have any effect on scheduling until the task is 3813
3499 * not SCHED_NORMAL/SCHED_BATCH: 3814 * not SCHED_NORMAL/SCHED_BATCH:
3500 */ 3815 */
3501 if (rt_task(p)) { 3816 if (has_rt_policy(p)) {
3502 p->static_prio = NICE_TO_PRIO(nice); 3817 p->static_prio = NICE_TO_PRIO(nice);
3503 goto out_unlock; 3818 goto out_unlock;
3504 } 3819 }
3505 array = p->array; 3820 array = p->array;
3506 if (array) 3821 if (array) {
3507 dequeue_task(p, array); 3822 dequeue_task(p, array);
3823 dec_raw_weighted_load(rq, p);
3824 }
3508 3825
3509 old_prio = p->prio;
3510 new_prio = NICE_TO_PRIO(nice);
3511 delta = new_prio - old_prio;
3512 p->static_prio = NICE_TO_PRIO(nice); 3826 p->static_prio = NICE_TO_PRIO(nice);
3513 p->prio += delta; 3827 set_load_weight(p);
3828 old_prio = p->prio;
3829 p->prio = effective_prio(p);
3830 delta = p->prio - old_prio;
3514 3831
3515 if (array) { 3832 if (array) {
3516 enqueue_task(p, array); 3833 enqueue_task(p, array);
3834 inc_raw_weighted_load(rq, p);
3517 /* 3835 /*
3518 * If the task increased its priority or is running and 3836 * If the task increased its priority or is running and
3519 * lowered its priority, then reschedule its CPU: 3837 * lowered its priority, then reschedule its CPU:
@@ -3524,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
3524out_unlock: 3842out_unlock:
3525 task_rq_unlock(rq, &flags); 3843 task_rq_unlock(rq, &flags);
3526} 3844}
3527
3528EXPORT_SYMBOL(set_user_nice); 3845EXPORT_SYMBOL(set_user_nice);
3529 3846
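
The nice value maps linearly onto static_prio via NICE_TO_PRIO(), which is MAX_RT_PRIO + nice + 20 in this kernel; set_load_weight() and effective_prio() then recompute the derived fields, replacing the old hand-carried prio delta. The mapping itself:

#include <stdio.h>

#define MAX_RT_PRIO 100
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)

int main(void)
{
	printf("nice -20 -> prio %d\n", NICE_TO_PRIO(-20));	/* 100 */
	printf("nice   0 -> prio %d\n", NICE_TO_PRIO(0));	/* 120 */
	printf("nice  19 -> prio %d\n", NICE_TO_PRIO(19));	/* 139 */
	return 0;
}
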
3530/* 3847/*
@@ -3639,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3639 BUG_ON(p->array); 3956 BUG_ON(p->array);
3640 p->policy = policy; 3957 p->policy = policy;
3641 p->rt_priority = prio; 3958 p->rt_priority = prio;
3642 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3959 p->normal_prio = normal_prio(p);
3643 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3960 /* we are holding p->pi_lock already */
3644 } else { 3961 p->prio = rt_mutex_getprio(p);
3645 p->prio = p->static_prio; 3962 /*
3646 /* 3963 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3647 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3964 */
3648 */ 3965 if (policy == SCHED_BATCH)
3649 if (policy == SCHED_BATCH) 3966 p->sleep_avg = 0;
3650 p->sleep_avg = 0; 3967 set_load_weight(p);
3651 }
3652} 3968}
3653 3969
3654/** 3970/**
@@ -3667,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
3667 unsigned long flags; 3983 unsigned long flags;
3668 runqueue_t *rq; 3984 runqueue_t *rq;
3669 3985
3986 /* may grab non-irq protected spin_locks */
3987 BUG_ON(in_interrupt());
3670recheck: 3988recheck:
3671 /* double check policy once rq lock held */ 3989 /* double check policy once rq lock held */
3672 if (policy < 0) 3990 if (policy < 0)
@@ -3715,14 +4033,20 @@ recheck:
3715 if (retval) 4033 if (retval)
3716 return retval; 4034 return retval;
3717 /* 4035 /*
4036 * make sure no PI-waiters arrive (or leave) while we are
4037 * changing the priority of the task:
4038 */
4039 spin_lock_irqsave(&p->pi_lock, flags);
4040 /*
 3718 * To be able to change p->policy safely, the appropriate 4041
3719 * runqueue lock must be held. 4042 * runqueue lock must be held.
3720 */ 4043 */
3721 rq = task_rq_lock(p, &flags); 4044 rq = __task_rq_lock(p);
3722 /* recheck policy now with rq lock held */ 4045 /* recheck policy now with rq lock held */
3723 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4046 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3724 policy = oldpolicy = -1; 4047 policy = oldpolicy = -1;
3725 task_rq_unlock(rq, &flags); 4048 __task_rq_unlock(rq);
4049 spin_unlock_irqrestore(&p->pi_lock, flags);
3726 goto recheck; 4050 goto recheck;
3727 } 4051 }
3728 array = p->array; 4052 array = p->array;
@@ -3743,7 +4067,11 @@ recheck:
3743 } else if (TASK_PREEMPTS_CURR(p, rq)) 4067 } else if (TASK_PREEMPTS_CURR(p, rq))
3744 resched_task(rq->curr); 4068 resched_task(rq->curr);
3745 } 4069 }
3746 task_rq_unlock(rq, &flags); 4070 __task_rq_unlock(rq);
4071 spin_unlock_irqrestore(&p->pi_lock, flags);
4072
4073 rt_mutex_adjust_pi(p);
4074
3747 return 0; 4075 return 0;
3748} 4076}
3749EXPORT_SYMBOL_GPL(sched_setscheduler); 4077EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3765,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3765 read_unlock_irq(&tasklist_lock); 4093 read_unlock_irq(&tasklist_lock);
3766 return -ESRCH; 4094 return -ESRCH;
3767 } 4095 }
3768 retval = sched_setscheduler(p, policy, &lparam); 4096 get_task_struct(p);
3769 read_unlock_irq(&tasklist_lock); 4097 read_unlock_irq(&tasklist_lock);
4098 retval = sched_setscheduler(p, policy, &lparam);
4099 put_task_struct(p);
3770 return retval; 4100 return retval;
3771} 4101}
3772 4102
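
The reorder above matters because sched_setscheduler() now takes p->pi_lock and can no longer run under read_lock_irq(&tasklist_lock); taking a task reference first keeps p alive after the lock is dropped. The lifetime pattern reduced to a toy refcount (hypothetical names, not the kernel API):

#include <stdio.h>

struct obj { int refcount; };

static void get_obj(struct obj *o) { o->refcount++; }

static void put_obj(struct obj *o)
{
	if (--o->refcount == 0)
		printf("freed\n");	/* kfree() in the real thing */
}

static void slow_operation(struct obj *o)
{
	printf("working on obj, refcount=%d\n", o->refcount);
}

int main(void)
{
	struct obj o = { .refcount = 1 };

	get_obj(&o);		/* pin before dropping the lookup lock */
	/* ...drop tasklist_lock here... */
	slow_operation(&o);	/* safe: our reference keeps it alive */
	put_obj(&o);
	put_obj(&o);		/* drop the original reference */
	return 0;
}
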
@@ -4378,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu)
4378 idle->timestamp = sched_clock(); 4708 idle->timestamp = sched_clock();
4379 idle->sleep_avg = 0; 4709 idle->sleep_avg = 0;
4380 idle->array = NULL; 4710 idle->array = NULL;
4381 idle->prio = MAX_PRIO; 4711 idle->prio = idle->normal_prio = MAX_PRIO;
4382 idle->state = TASK_RUNNING; 4712 idle->state = TASK_RUNNING;
4383 idle->cpus_allowed = cpumask_of_cpu(cpu); 4713 idle->cpus_allowed = cpumask_of_cpu(cpu);
4384 set_task_cpu(idle, cpu); 4714 set_task_cpu(idle, cpu);
@@ -4474,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4474 * 4804 *
4475 * So we race with normal scheduler movements, but that's OK, as long 4805 * So we race with normal scheduler movements, but that's OK, as long
4476 * as the task is no longer on this CPU. 4806 * as the task is no longer on this CPU.
4807 *
4808 * Returns non-zero if task was successfully migrated.
4477 */ 4809 */
4478static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4810static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4479{ 4811{
4480 runqueue_t *rq_dest, *rq_src; 4812 runqueue_t *rq_dest, *rq_src;
4813 int ret = 0;
4481 4814
4482 if (unlikely(cpu_is_offline(dest_cpu))) 4815 if (unlikely(cpu_is_offline(dest_cpu)))
4483 return; 4816 return ret;
4484 4817
4485 rq_src = cpu_rq(src_cpu); 4818 rq_src = cpu_rq(src_cpu);
4486 rq_dest = cpu_rq(dest_cpu); 4819 rq_dest = cpu_rq(dest_cpu);
@@ -4508,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4508 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4841 if (TASK_PREEMPTS_CURR(p, rq_dest))
4509 resched_task(rq_dest->curr); 4842 resched_task(rq_dest->curr);
4510 } 4843 }
4511 4844 ret = 1;
4512out: 4845out:
4513 double_rq_unlock(rq_src, rq_dest); 4846 double_rq_unlock(rq_src, rq_dest);
4847 return ret;
4514} 4848}
4515 4849
4516/* 4850/*
@@ -4580,9 +4914,12 @@ wait_to_die:
4580/* Figure out where a task on a dead CPU should go, use force if necessary. */ 4914
4581static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4915static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4582{ 4916{
4917 runqueue_t *rq;
4918 unsigned long flags;
4583 int dest_cpu; 4919 int dest_cpu;
4584 cpumask_t mask; 4920 cpumask_t mask;
4585 4921
4922restart:
4586 /* On same node? */ 4923 /* On same node? */
4587 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4924 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4588 cpus_and(mask, mask, tsk->cpus_allowed); 4925 cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4594,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4594 4931
4595 /* No more Mr. Nice Guy. */ 4932 /* No more Mr. Nice Guy. */
4596 if (dest_cpu == NR_CPUS) { 4933 if (dest_cpu == NR_CPUS) {
4934 rq = task_rq_lock(tsk, &flags);
4597 cpus_setall(tsk->cpus_allowed); 4935 cpus_setall(tsk->cpus_allowed);
4598 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4936 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4937 task_rq_unlock(rq, &flags);
4599 4938
4600 /* 4939 /*
4601 * Don't tell them about moving exiting tasks or 4940 * Don't tell them about moving exiting tasks or
@@ -4607,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4607 "longer affine to cpu%d\n", 4946 "longer affine to cpu%d\n",
4608 tsk->pid, tsk->comm, dead_cpu); 4947 tsk->pid, tsk->comm, dead_cpu);
4609 } 4948 }
4610 __migrate_task(tsk, dead_cpu, dest_cpu); 4949 if (!__migrate_task(tsk, dead_cpu, dest_cpu))
4950 goto restart;
4611} 4951}
4612 4952
4613/* 4953/*
@@ -4734,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4734 * migration_call - callback that gets triggered when a CPU is added. 5074 * migration_call - callback that gets triggered when a CPU is added.
4735 * Here we can start up the necessary migration thread for the new CPU. 5075 * Here we can start up the necessary migration thread for the new CPU.
4736 */ 5076 */
4737static int migration_call(struct notifier_block *nfb, unsigned long action, 5077static int __cpuinit migration_call(struct notifier_block *nfb,
4738 void *hcpu) 5078 unsigned long action,
5079 void *hcpu)
4739{ 5080{
4740 int cpu = (long)hcpu; 5081 int cpu = (long)hcpu;
4741 struct task_struct *p; 5082 struct task_struct *p;
@@ -4805,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4805/* Register at highest priority so that task migration (migrate_all_tasks) 5146/* Register at highest priority so that task migration (migrate_all_tasks)
4806 * happens before everything else. 5147 * happens before everything else.
4807 */ 5148 */
4808static struct notifier_block migration_notifier = { 5149static struct notifier_block __cpuinitdata migration_notifier = {
4809 .notifier_call = migration_call, 5150 .notifier_call = migration_call,
4810 .priority = 10 5151 .priority = 10
4811}; 5152};
@@ -5606,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node)
5606} 5947}
5607#endif 5948#endif
5608 5949
5950int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5609/* 5951/*
5610 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 5952 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5611 * can switch it on easily if needed. 5953 * can switch it on easily if needed.
@@ -5621,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu)
5621 5963
5622#ifdef CONFIG_SCHED_MC 5964#ifdef CONFIG_SCHED_MC
5623static DEFINE_PER_CPU(struct sched_domain, core_domains); 5965static DEFINE_PER_CPU(struct sched_domain, core_domains);
5624static struct sched_group sched_group_core[NR_CPUS]; 5966static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5625#endif 5967#endif
5626 5968
5627#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 5969#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5637,7 +5979,7 @@ static int cpu_to_core_group(int cpu)
5637#endif 5979#endif
5638 5980
5639static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5981static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5640static struct sched_group sched_group_phys[NR_CPUS]; 5982static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
5641static int cpu_to_phys_group(int cpu) 5983static int cpu_to_phys_group(int cpu)
5642{ 5984{
5643#if defined(CONFIG_SCHED_MC) 5985#if defined(CONFIG_SCHED_MC)
@@ -5694,13 +6036,74 @@ next_sg:
5694} 6036}
5695#endif 6037#endif
5696 6038
6039/* Free memory allocated for various sched_group structures */
6040static void free_sched_groups(const cpumask_t *cpu_map)
6041{
6042 int cpu;
6043#ifdef CONFIG_NUMA
6044 int i;
6045
6046 for_each_cpu_mask(cpu, *cpu_map) {
6047 struct sched_group *sched_group_allnodes
6048 = sched_group_allnodes_bycpu[cpu];
6049 struct sched_group **sched_group_nodes
6050 = sched_group_nodes_bycpu[cpu];
6051
6052 if (sched_group_allnodes) {
6053 kfree(sched_group_allnodes);
6054 sched_group_allnodes_bycpu[cpu] = NULL;
6055 }
6056
6057 if (!sched_group_nodes)
6058 continue;
6059
6060 for (i = 0; i < MAX_NUMNODES; i++) {
6061 cpumask_t nodemask = node_to_cpumask(i);
6062 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6063
6064 cpus_and(nodemask, nodemask, *cpu_map);
6065 if (cpus_empty(nodemask))
6066 continue;
6067
6068 if (sg == NULL)
6069 continue;
6070 sg = sg->next;
6071next_sg:
6072 oldsg = sg;
6073 sg = sg->next;
6074 kfree(oldsg);
6075 if (oldsg != sched_group_nodes[i])
6076 goto next_sg;
6077 }
6078 kfree(sched_group_nodes);
6079 sched_group_nodes_bycpu[cpu] = NULL;
6080 }
6081#endif
6082 for_each_cpu_mask(cpu, *cpu_map) {
6083 if (sched_group_phys_bycpu[cpu]) {
6084 kfree(sched_group_phys_bycpu[cpu]);
6085 sched_group_phys_bycpu[cpu] = NULL;
6086 }
6087#ifdef CONFIG_SCHED_MC
6088 if (sched_group_core_bycpu[cpu]) {
6089 kfree(sched_group_core_bycpu[cpu]);
6090 sched_group_core_bycpu[cpu] = NULL;
6091 }
6092#endif
6093 }
6094}
6095
5697/* 6096/*
5698 * Build sched domains for a given set of cpus and attach the sched domains 6097 * Build sched domains for a given set of cpus and attach the sched domains
5699 * to the individual cpus 6098 * to the individual cpus
5700 */ 6099 */
5701void build_sched_domains(const cpumask_t *cpu_map) 6100static int build_sched_domains(const cpumask_t *cpu_map)
5702{ 6101{
5703 int i; 6102 int i;
6103 struct sched_group *sched_group_phys = NULL;
6104#ifdef CONFIG_SCHED_MC
6105 struct sched_group *sched_group_core = NULL;
6106#endif
5704#ifdef CONFIG_NUMA 6107#ifdef CONFIG_NUMA
5705 struct sched_group **sched_group_nodes = NULL; 6108 struct sched_group **sched_group_nodes = NULL;
5706 struct sched_group *sched_group_allnodes = NULL; 6109 struct sched_group *sched_group_allnodes = NULL;
@@ -5708,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5708 /* 6111 /*
5709 * Allocate the per-node list of sched groups 6112 * Allocate the per-node list of sched groups
5710 */ 6113 */
5711 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6114 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5712 GFP_ATOMIC); 6115 GFP_KERNEL);
5713 if (!sched_group_nodes) { 6116 if (!sched_group_nodes) {
5714 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6117 printk(KERN_WARNING "Can not alloc sched group node list\n");
5715 return; 6118 return -ENOMEM;
5716 } 6119 }
5717 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6120 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5718#endif 6121#endif
@@ -5738,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5738 if (!sched_group_allnodes) { 6141 if (!sched_group_allnodes) {
5739 printk(KERN_WARNING 6142 printk(KERN_WARNING
5740 "Can not alloc allnodes sched group\n"); 6143 "Can not alloc allnodes sched group\n");
5741 break; 6144 goto error;
5742 } 6145 }
5743 sched_group_allnodes_bycpu[i] 6146 sched_group_allnodes_bycpu[i]
5744 = sched_group_allnodes; 6147 = sched_group_allnodes;
@@ -5759,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5759 cpus_and(sd->span, sd->span, *cpu_map); 6162 cpus_and(sd->span, sd->span, *cpu_map);
5760#endif 6163#endif
5761 6164
6165 if (!sched_group_phys) {
6166 sched_group_phys
6167 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6168 GFP_KERNEL);
6169 if (!sched_group_phys) {
 6170				printk(KERN_WARNING "Can not alloc phys sched "
 6171					"group\n");
6172 goto error;
6173 }
6174 sched_group_phys_bycpu[i] = sched_group_phys;
6175 }
6176
5762 p = sd; 6177 p = sd;
5763 sd = &per_cpu(phys_domains, i); 6178 sd = &per_cpu(phys_domains, i);
5764 group = cpu_to_phys_group(i); 6179 group = cpu_to_phys_group(i);
@@ -5768,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5768 sd->groups = &sched_group_phys[group]; 6183 sd->groups = &sched_group_phys[group];
5769 6184
5770#ifdef CONFIG_SCHED_MC 6185#ifdef CONFIG_SCHED_MC
6186 if (!sched_group_core) {
6187 sched_group_core
6188 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6189 GFP_KERNEL);
6190 if (!sched_group_core) {
 6191				printk(KERN_WARNING "Can not alloc core sched "
 6192					"group\n");
6193 goto error;
6194 }
6195 sched_group_core_bycpu[i] = sched_group_core;
6196 }
6197
5771 p = sd; 6198 p = sd;
5772 sd = &per_cpu(core_domains, i); 6199 sd = &per_cpu(core_domains, i);
5773 group = cpu_to_core_group(i); 6200 group = cpu_to_core_group(i);
@@ -5851,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5851 domainspan = sched_domain_node_span(i); 6278 domainspan = sched_domain_node_span(i);
5852 cpus_and(domainspan, domainspan, *cpu_map); 6279 cpus_and(domainspan, domainspan, *cpu_map);
5853 6280
5854 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6281 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6282 if (!sg) {
6283 printk(KERN_WARNING "Can not alloc domain group for "
6284 "node %d\n", i);
6285 goto error;
6286 }
5855 sched_group_nodes[i] = sg; 6287 sched_group_nodes[i] = sg;
5856 for_each_cpu_mask(j, nodemask) { 6288 for_each_cpu_mask(j, nodemask) {
5857 struct sched_domain *sd; 6289 struct sched_domain *sd;
5858 sd = &per_cpu(node_domains, j); 6290 sd = &per_cpu(node_domains, j);
5859 sd->groups = sg; 6291 sd->groups = sg;
5860 if (sd->groups == NULL) {
5861 /* Turn off balancing if we have no groups */
5862 sd->flags = 0;
5863 }
5864 }
5865 if (!sg) {
5866 printk(KERN_WARNING
5867 "Can not alloc domain group for node %d\n", i);
5868 continue;
5869 } 6292 }
5870 sg->cpu_power = 0; 6293 sg->cpu_power = 0;
5871 sg->cpumask = nodemask; 6294 sg->cpumask = nodemask;
6295 sg->next = sg;
5872 cpus_or(covered, covered, nodemask); 6296 cpus_or(covered, covered, nodemask);
5873 prev = sg; 6297 prev = sg;
5874 6298
@@ -5887,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5887 if (cpus_empty(tmp)) 6311 if (cpus_empty(tmp))
5888 continue; 6312 continue;
5889 6313
5890 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6314 sg = kmalloc_node(sizeof(struct sched_group),
6315 GFP_KERNEL, i);
5891 if (!sg) { 6316 if (!sg) {
5892 printk(KERN_WARNING 6317 printk(KERN_WARNING
5893 "Can not alloc domain group for node %d\n", j); 6318 "Can not alloc domain group for node %d\n", j);
5894 break; 6319 goto error;
5895 } 6320 }
5896 sg->cpu_power = 0; 6321 sg->cpu_power = 0;
5897 sg->cpumask = tmp; 6322 sg->cpumask = tmp;
6323 sg->next = prev->next;
5898 cpus_or(covered, covered, tmp); 6324 cpus_or(covered, covered, tmp);
5899 prev->next = sg; 6325 prev->next = sg;
5900 prev = sg; 6326 prev = sg;
5901 } 6327 }
5902 prev->next = sched_group_nodes[i];
5903 } 6328 }
5904#endif 6329#endif
5905 6330
5906 /* Calculate CPU power for physical packages and nodes */ 6331 /* Calculate CPU power for physical packages and nodes */
6332#ifdef CONFIG_SCHED_SMT
5907 for_each_cpu_mask(i, *cpu_map) { 6333 for_each_cpu_mask(i, *cpu_map) {
5908 int power;
5909 struct sched_domain *sd; 6334 struct sched_domain *sd;
5910#ifdef CONFIG_SCHED_SMT
5911 sd = &per_cpu(cpu_domains, i); 6335 sd = &per_cpu(cpu_domains, i);
5912 power = SCHED_LOAD_SCALE; 6336 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5913 sd->groups->cpu_power = power; 6337 }
5914#endif 6338#endif
5915#ifdef CONFIG_SCHED_MC 6339#ifdef CONFIG_SCHED_MC
6340 for_each_cpu_mask(i, *cpu_map) {
6341 int power;
6342 struct sched_domain *sd;
5916 sd = &per_cpu(core_domains, i); 6343 sd = &per_cpu(core_domains, i);
5917 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6344 if (sched_smt_power_savings)
6345 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6346 else
6347 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5918 * SCHED_LOAD_SCALE / 10; 6348 * SCHED_LOAD_SCALE / 10;
5919 sd->groups->cpu_power = power; 6349 sd->groups->cpu_power = power;
6350 }
6351#endif
5920 6352
6353 for_each_cpu_mask(i, *cpu_map) {
6354 struct sched_domain *sd;
6355#ifdef CONFIG_SCHED_MC
5921 sd = &per_cpu(phys_domains, i); 6356 sd = &per_cpu(phys_domains, i);
6357 if (i != first_cpu(sd->groups->cpumask))
6358 continue;
5922 6359
5923 /* 6360 sd->groups->cpu_power = 0;
5924 * This has to be < 2 * SCHED_LOAD_SCALE 6361 if (sched_mc_power_savings || sched_smt_power_savings) {
5925 * Lets keep it SCHED_LOAD_SCALE, so that 6362 int j;
5926 * while calculating NUMA group's cpu_power 6363
5927 * we can simply do 6364 for_each_cpu_mask(j, sd->groups->cpumask) {
5928 * numa_group->cpu_power += phys_group->cpu_power; 6365 struct sched_domain *sd1;
5929 * 6366 sd1 = &per_cpu(core_domains, j);
5930 * See "only add power once for each physical pkg" 6367 /*
5931 * comment below 6368 * for each core we will add once
5932 */ 6369 * to the group in physical domain
5933 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6370 */
6371 if (j != first_cpu(sd1->groups->cpumask))
6372 continue;
6373
6374 if (sched_smt_power_savings)
6375 sd->groups->cpu_power += sd1->groups->cpu_power;
6376 else
6377 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6378 }
6379 } else
6380 /*
6381 * This has to be < 2 * SCHED_LOAD_SCALE
6382 * Lets keep it SCHED_LOAD_SCALE, so that
6383 * while calculating NUMA group's cpu_power
6384 * we can simply do
6385 * numa_group->cpu_power += phys_group->cpu_power;
6386 *
6387 * See "only add power once for each physical pkg"
6388 * comment below
6389 */
6390 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5934#else 6391#else
6392 int power;
5935 sd = &per_cpu(phys_domains, i); 6393 sd = &per_cpu(phys_domains, i);
5936 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6394 if (sched_smt_power_savings)
5937 (cpus_weight(sd->groups->cpumask)-1) / 10; 6395 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6396 else
6397 power = SCHED_LOAD_SCALE;
5938 sd->groups->cpu_power = power; 6398 sd->groups->cpu_power = power;
5939#endif 6399#endif
5940 } 6400 }
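
The two power-savings modes change the group cpu_power arithmetic: by default an n-sibling SMT/MC group gets SCHED_LOAD_SCALE plus 10% per extra sibling, while with power savings enabled it gets a full SCHED_LOAD_SCALE per sibling, making the group look big enough that load concentrates there before another package is woken. The numbers for a two-sibling group:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

int main(void)
{
	unsigned long n = 2;	/* siblings in the group */

	/* default: full power for one CPU plus 10% per extra sibling */
	printf("default:       %lu\n",
	       SCHED_LOAD_SCALE + (n - 1) * SCHED_LOAD_SCALE / 10);	/* 140 */

	/* power savings: pretend every sibling is a full CPU */
	printf("power savings: %lu\n", SCHED_LOAD_SCALE * n);		/* 256 */
	return 0;
}
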
@@ -5962,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5962 * Tune cache-hot values: 6422 * Tune cache-hot values:
5963 */ 6423 */
5964 calibrate_migration_costs(cpu_map); 6424 calibrate_migration_costs(cpu_map);
6425
6426 return 0;
6427
6428error:
6429 free_sched_groups(cpu_map);
6430 return -ENOMEM;
5965} 6431}
5966/* 6432/*
5967 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6433 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5968 */ 6434 */
5969static void arch_init_sched_domains(const cpumask_t *cpu_map) 6435static int arch_init_sched_domains(const cpumask_t *cpu_map)
5970{ 6436{
5971 cpumask_t cpu_default_map; 6437 cpumask_t cpu_default_map;
6438 int err;
5972 6439
5973 /* 6440 /*
5974 * Setup mask for cpus without special case scheduling requirements. 6441 * Setup mask for cpus without special case scheduling requirements.
@@ -5977,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5977 */ 6444 */
5978 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6445 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5979 6446
5980 build_sched_domains(&cpu_default_map); 6447 err = build_sched_domains(&cpu_default_map);
6448
6449 return err;
5981} 6450}
5982 6451
5983static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6452static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5984{ 6453{
5985#ifdef CONFIG_NUMA 6454 free_sched_groups(cpu_map);
5986 int i;
5987 int cpu;
5988
5989 for_each_cpu_mask(cpu, *cpu_map) {
5990 struct sched_group *sched_group_allnodes
5991 = sched_group_allnodes_bycpu[cpu];
5992 struct sched_group **sched_group_nodes
5993 = sched_group_nodes_bycpu[cpu];
5994
5995 if (sched_group_allnodes) {
5996 kfree(sched_group_allnodes);
5997 sched_group_allnodes_bycpu[cpu] = NULL;
5998 }
5999
6000 if (!sched_group_nodes)
6001 continue;
6002
6003 for (i = 0; i < MAX_NUMNODES; i++) {
6004 cpumask_t nodemask = node_to_cpumask(i);
6005 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6006
6007 cpus_and(nodemask, nodemask, *cpu_map);
6008 if (cpus_empty(nodemask))
6009 continue;
6010
6011 if (sg == NULL)
6012 continue;
6013 sg = sg->next;
6014next_sg:
6015 oldsg = sg;
6016 sg = sg->next;
6017 kfree(oldsg);
6018 if (oldsg != sched_group_nodes[i])
6019 goto next_sg;
6020 }
6021 kfree(sched_group_nodes);
6022 sched_group_nodes_bycpu[cpu] = NULL;
6023 }
6024#endif
6025} 6455}
6026 6456
6027/* 6457/*
@@ -6046,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6046 * correct sched domains 6476 * correct sched domains
6047 * Call with hotplug lock held 6477 * Call with hotplug lock held
6048 */ 6478 */
6049void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6479int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6050{ 6480{
6051 cpumask_t change_map; 6481 cpumask_t change_map;
6482 int err = 0;
6052 6483
6053 cpus_and(*partition1, *partition1, cpu_online_map); 6484 cpus_and(*partition1, *partition1, cpu_online_map);
6054 cpus_and(*partition2, *partition2, cpu_online_map); 6485 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6057,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6057 /* Detach sched domains from all of the affected cpus */ 6488 /* Detach sched domains from all of the affected cpus */
6058 detach_destroy_domains(&change_map); 6489 detach_destroy_domains(&change_map);
6059 if (!cpus_empty(*partition1)) 6490 if (!cpus_empty(*partition1))
6060 build_sched_domains(partition1); 6491 err = build_sched_domains(partition1);
6061 if (!cpus_empty(*partition2)) 6492 if (!err && !cpus_empty(*partition2))
6062 build_sched_domains(partition2); 6493 err = build_sched_domains(partition2);
6494
6495 return err;
6496}
6497
6498#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6499int arch_reinit_sched_domains(void)
6500{
6501 int err;
6502
6503 lock_cpu_hotplug();
6504 detach_destroy_domains(&cpu_online_map);
6505 err = arch_init_sched_domains(&cpu_online_map);
6506 unlock_cpu_hotplug();
6507
6508 return err;
6509}
6510
6511static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6512{
6513 int ret;
6514
6515 if (buf[0] != '0' && buf[0] != '1')
6516 return -EINVAL;
6517
6518 if (smt)
6519 sched_smt_power_savings = (buf[0] == '1');
6520 else
6521 sched_mc_power_savings = (buf[0] == '1');
6522
6523 ret = arch_reinit_sched_domains();
6524
6525 return ret ? ret : count;
6526}
6527
6528int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6529{
6530 int err = 0;
6531#ifdef CONFIG_SCHED_SMT
6532 if (smt_capable())
6533 err = sysfs_create_file(&cls->kset.kobj,
6534 &attr_sched_smt_power_savings.attr);
6535#endif
6536#ifdef CONFIG_SCHED_MC
6537 if (!err && mc_capable())
6538 err = sysfs_create_file(&cls->kset.kobj,
6539 &attr_sched_mc_power_savings.attr);
6540#endif
6541 return err;
6542}
6543#endif
6544
6545#ifdef CONFIG_SCHED_MC
6546static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6547{
6548 return sprintf(page, "%u\n", sched_mc_power_savings);
6549}
6550static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6551{
6552 return sched_power_savings_store(buf, count, 0);
6553}
6554SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6555 sched_mc_power_savings_store);
6556#endif
6557
6558#ifdef CONFIG_SCHED_SMT
6559static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6560{
6561 return sprintf(page, "%u\n", sched_smt_power_savings);
6562}
6563static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6564{
6565 return sched_power_savings_store(buf, count, 1);
6063} 6566}
6567SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6568 sched_smt_power_savings_store);
6569#endif
6570
6064 6571
6065#ifdef CONFIG_HOTPLUG_CPU 6572#ifdef CONFIG_HOTPLUG_CPU
6066/* 6573/*
@@ -6143,7 +6650,6 @@ void __init sched_init(void)
6143 rq->push_cpu = 0; 6650 rq->push_cpu = 0;
6144 rq->migration_thread = NULL; 6651 rq->migration_thread = NULL;
6145 INIT_LIST_HEAD(&rq->migration_queue); 6652 INIT_LIST_HEAD(&rq->migration_queue);
6146 rq->cpu = i;
6147#endif 6653#endif
6148 atomic_set(&rq->nr_iowait, 0); 6654 atomic_set(&rq->nr_iowait, 0);
6149 6655
@@ -6158,6 +6664,7 @@ void __init sched_init(void)
6158 } 6664 }
6159 } 6665 }
6160 6666
6667 set_load_weight(&init_task);
6161 /* 6668 /*
6162 * The boot idle thread does lazy MMU switching as well: 6669 * The boot idle thread does lazy MMU switching as well:
6163 */ 6670 */
@@ -6204,11 +6711,12 @@ void normalize_rt_tasks(void)
6204 runqueue_t *rq; 6711 runqueue_t *rq;
6205 6712
6206 read_lock_irq(&tasklist_lock); 6713 read_lock_irq(&tasklist_lock);
6207 for_each_process (p) { 6714 for_each_process(p) {
6208 if (!rt_task(p)) 6715 if (!rt_task(p))
6209 continue; 6716 continue;
6210 6717
6211 rq = task_rq_lock(p, &flags); 6718 spin_lock_irqsave(&p->pi_lock, flags);
6719 rq = __task_rq_lock(p);
6212 6720
6213 array = p->array; 6721 array = p->array;
6214 if (array) 6722 if (array)
@@ -6219,7 +6727,8 @@ void normalize_rt_tasks(void)
6219 resched_task(rq->curr); 6727 resched_task(rq->curr);
6220 } 6728 }
6221 6729
6222 task_rq_unlock(rq, &flags); 6730 __task_rq_unlock(rq);
6731 spin_unlock_irqrestore(&p->pi_lock, flags);
6223 } 6732 }
6224 read_unlock_irq(&tasklist_lock); 6733 read_unlock_irq(&tasklist_lock);
6225} 6734}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9e2f1c6e73d7..8f03e3b89b55 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 446}
447#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
448 448
449static int cpu_callback(struct notifier_block *nfb, 449static int __devinit cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 450 unsigned long action,
451 void *hcpu) 451 void *hcpu)
452{ 452{
@@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb,
486 return NOTIFY_OK; 486 return NOTIFY_OK;
487} 487}
488 488
489static struct notifier_block cpu_nfb = { 489static struct notifier_block __devinitdata cpu_nfb = {
490 .notifier_call = cpu_callback 490 .notifier_call = cpu_callback
491}; 491};
492 492
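[Editor's note] This softirq.c change, and the identical ones in softlockup.c, timer.c and workqueue.c below, only add section annotations: __devinit on the callback and __devinitdata on the notifier_block let the linker discard both after boot on kernels with hotplug support configured out. A hedged sketch of the annotated pattern with hypothetical names:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/cpu.h>

static int __devinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu state for 'cpu' */
		break;
	case CPU_ONLINE:
		printk(KERN_DEBUG "example: cpu %ld online\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __devinitdata example_cpu_nfb = {
	.notifier_call = example_cpu_callback,
};

/* registered once at boot, e.g. from an __init function:
 *	register_cpu_notifier(&example_cpu_nfb);
 */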
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b5c3b94e01ce..6b76caa22981 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int 107static int __devinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
142 return NOTIFY_OK; 142 return NOTIFY_OK;
143} 143}
144 144
145static struct notifier_block cpu_nfb = { 145static struct notifier_block __devinitdata cpu_nfb = {
146 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
147}; 147};
148 148
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f1a4eb1a655e..93a2c5398648 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -133,6 +133,10 @@ extern int acct_parm[];
133extern int no_unaligned_warning; 133extern int no_unaligned_warning;
134#endif 134#endif
135 135
136#ifdef CONFIG_RT_MUTEXES
137extern int max_lock_depth;
138#endif
139
136static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 140static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
137 ctl_table *, void **); 141 ctl_table *, void **);
138static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 142static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -688,6 +692,17 @@ static ctl_table kern_table[] = {
688 .proc_handler = &proc_dointvec, 692 .proc_handler = &proc_dointvec,
689 }, 693 },
690#endif 694#endif
695#ifdef CONFIG_RT_MUTEXES
696 {
697 .ctl_name = KERN_MAX_LOCK_DEPTH,
698 .procname = "max_lock_depth",
699 .data = &max_lock_depth,
700 .maxlen = sizeof(int),
701 .mode = 0644,
702 .proc_handler = &proc_dointvec,
703 },
704#endif
705
691 { .ctl_name = 0 } 706 { .ctl_name = 0 }
692}; 707};
693 708
@@ -928,6 +943,18 @@ static ctl_table vm_table[] = {
928 .strategy = &sysctl_jiffies, 943 .strategy = &sysctl_jiffies,
929 }, 944 },
930#endif 945#endif
946#ifdef CONFIG_X86_32
947 {
948 .ctl_name = VM_VDSO_ENABLED,
949 .procname = "vdso_enabled",
950 .data = &vdso_enabled,
951 .maxlen = sizeof(vdso_enabled),
952 .mode = 0644,
953 .proc_handler = &proc_dointvec,
954 .strategy = &sysctl_intvec,
955 .extra1 = &zero,
956 },
957#endif
931 { .ctl_name = 0 } 958 { .ctl_name = 0 }
932}; 959};
933 960
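[Editor's note] The KERN_MAX_LOCK_DEPTH entry surfaces the rt-mutex chain-walk bound as /proc/sys/kernel/max_lock_depth; proc_dointvec makes it a plain read/write integer, so echo 512 > /proc/sys/kernel/max_lock_depth adjusts it at run time. How such a bound is typically consumed, as a hypothetical sketch (the lock type, helper and loop below are illustrations, not code from kernel/rtmutex.c):

#include <linux/errno.h>

extern int max_lock_depth;	/* the integer behind the ctl_table entry */

struct example_lock {			/* hypothetical lock type */
	struct example_lock *blocked_on;
};

static int example_walk_chain(struct example_lock *lock)
{
	int depth = 0;

	for (; lock; lock = lock->blocked_on) {
		/* give up rather than walk an unbounded (or circular)
		 * chain of blocked locks */
		if (++depth > max_lock_depth)
			return -EDEADLK;
	}
	return 0;
}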
diff --git a/kernel/timer.c b/kernel/timer.c
index 5bb6b7976eec..5a8960253063 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
1652} 1652}
1653#endif /* CONFIG_HOTPLUG_CPU */ 1653#endif /* CONFIG_HOTPLUG_CPU */
1654 1654
1655static int timer_cpu_notify(struct notifier_block *self, 1655static int __devinit timer_cpu_notify(struct notifier_block *self,
1656 unsigned long action, void *hcpu) 1656 unsigned long action, void *hcpu)
1657{ 1657{
1658 long cpu = (long)hcpu; 1658 long cpu = (long)hcpu;
@@ -1672,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self,
1672 return NOTIFY_OK; 1672 return NOTIFY_OK;
1673} 1673}
1674 1674
1675static struct notifier_block timers_nb = { 1675static struct notifier_block __devinitdata timers_nb = {
1676 .notifier_call = timer_cpu_notify, 1676 .notifier_call = timer_cpu_notify,
1677}; 1677};
1678 1678
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 565cf7a1febd..59f0b42bd89e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
559} 559}
560 560
561/* We're holding the cpucontrol mutex here */ 561/* We're holding the cpucontrol mutex here */
562static int workqueue_cpu_callback(struct notifier_block *nfb, 562static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
563 unsigned long action, 563 unsigned long action,
564 void *hcpu) 564 void *hcpu)
565{ 565{