Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |    3
-rw-r--r--  kernel/acct.c            |    3
-rw-r--r--  kernel/audit.c           |    2
-rw-r--r--  kernel/auditsc.c         |   10
-rw-r--r--  kernel/cpu.c             |    8
-rw-r--r--  kernel/exit.c            |    9
-rw-r--r--  kernel/fork.c            |   19
-rw-r--r--  kernel/futex.c           | 1067
-rw-r--r--  kernel/futex_compat.c    |   14
-rw-r--r--  kernel/hrtimer.c         |    4
-rw-r--r--  kernel/mutex-debug.c     |    5
-rw-r--r--  kernel/power/Kconfig     |   13
-rw-r--r--  kernel/profile.c         |    2
-rw-r--r--  kernel/rcupdate.c        |   14
-rw-r--r--  kernel/rcutorture.c      |  201
-rw-r--r--  kernel/resource.c        |   38
-rw-r--r--  kernel/rtmutex-debug.c   |  513
-rw-r--r--  kernel/rtmutex-debug.h   |   37
-rw-r--r--  kernel/rtmutex-tester.c  |  440
-rw-r--r--  kernel/rtmutex.c         |  990
-rw-r--r--  kernel/rtmutex.h         |   29
-rw-r--r--  kernel/rtmutex_common.h  |  123
-rw-r--r--  kernel/sched.c           | 1199
-rw-r--r--  kernel/softirq.c         |    4
-rw-r--r--  kernel/softlockup.c      |    4
-rw-r--r--  kernel/sysctl.c          |   27
-rw-r--r--  kernel/timer.c           |    4
-rw-r--r--  kernel/workqueue.c       |    2
28 files changed, 4215 insertions, 569 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 752bd7d383af..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -16,6 +16,9 @@ obj-$(CONFIG_FUTEX) += futex.o | |||
16 | ifeq ($(CONFIG_COMPAT),y) | 16 | ifeq ($(CONFIG_COMPAT),y) |
17 | obj-$(CONFIG_FUTEX) += futex_compat.o | 17 | obj-$(CONFIG_FUTEX) += futex_compat.o |
18 | endif | 18 | endif |
19 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | ||
20 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | ||
21 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
19 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 22 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
20 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 23 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
21 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 24 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
diff --git a/kernel/acct.c b/kernel/acct.c
index 368c4f03fe0e..126ca43d5d2b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -521,6 +521,7 @@ static void do_acct_process(struct file *file) | |||
521 | 521 | ||
522 | /** | 522 | /** |
523 | * acct_init_pacct - initialize a new pacct_struct | 523 | * acct_init_pacct - initialize a new pacct_struct |
524 | * @pacct: per-process accounting info struct to initialize | ||
524 | */ | 525 | */ |
525 | void acct_init_pacct(struct pacct_struct *pacct) | 526 | void acct_init_pacct(struct pacct_struct *pacct) |
526 | { | 527 | { |
@@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) | |||
576 | * | 577 | * |
577 | * handles process accounting for an exiting task | 578 | * handles process accounting for an exiting task |
578 | */ | 579 | */ |
579 | void acct_process() | 580 | void acct_process(void) |
580 | { | 581 | { |
581 | struct file *file = NULL; | 582 | struct file *file = NULL; |
582 | 583 | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd7..82443fb433ef 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -818,7 +818,7 @@ err: | |||
818 | */ | 818 | */ |
819 | unsigned int audit_serial(void) | 819 | unsigned int audit_serial(void) |
820 | { | 820 | { |
821 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 821 | static DEFINE_SPINLOCK(serial_lock); |
822 | static unsigned int serial = 0; | 822 | static unsigned int serial = 0; |
823 | 823 | ||
824 | unsigned long flags; | 824 | unsigned long flags; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda295..dc5e3f01efe7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab) | |||
658 | return; | 658 | return; |
659 | 659 | ||
660 | error_path: | 660 | error_path: |
661 | if (ctx) | 661 | kfree(ctx); |
662 | kfree(ctx); | ||
663 | audit_panic("error in audit_log_task_context"); | 662 | audit_panic("error in audit_log_task_context"); |
664 | return; | 663 | return; |
665 | } | 664 | } |
@@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) | |||
1367 | * @mqdes: MQ descriptor | 1366 | * @mqdes: MQ descriptor |
1368 | * @msg_len: Message length | 1367 | * @msg_len: Message length |
1369 | * @msg_prio: Message priority | 1368 | * @msg_prio: Message priority |
1370 | * @abs_timeout: Message timeout in absolute time | 1369 | * @u_abs_timeout: Message timeout in absolute time |
1371 | * | 1370 | * |
1372 | * Returns 0 for success or NULL context or < 0 on error. | 1371 | * Returns 0 for success or NULL context or < 0 on error. |
1373 | */ | 1372 | */ |
@@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, | |||
1409 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive | 1408 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive |
1410 | * @mqdes: MQ descriptor | 1409 | * @mqdes: MQ descriptor |
1411 | * @msg_len: Message length | 1410 | * @msg_len: Message length |
1412 | * @msg_prio: Message priority | 1411 | * @u_msg_prio: Message priority |
1413 | * @abs_timeout: Message timeout in absolute time | 1412 | * @u_abs_timeout: Message timeout in absolute time |
1414 | * | 1413 | * |
1415 | * Returns 0 for success or NULL context or < 0 on error. | 1414 | * Returns 0 for success or NULL context or < 0 on error. |
1416 | */ | 1415 | */ |
@@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
1558 | * @uid: msgq user id | 1557 | * @uid: msgq user id |
1559 | * @gid: msgq group id | 1558 | * @gid: msgq group id |
1560 | * @mode: msgq mode (permissions) | 1559 | * @mode: msgq mode (permissions) |
1561 | * @ipcp: in-kernel IPC permissions | ||
1562 | * | 1560 | * |
1563 | * Returns 0 for success or NULL context or < 0 on error. | 1561 | * Returns 0 for success or NULL context or < 0 on error. |
1564 | */ | 1562 | */ |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 03dcd981846a..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -18,7 +18,7 @@ | |||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | static DEFINE_MUTEX(cpucontrol); | 19 | static DEFINE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static BLOCKING_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | 23 | #ifdef CONFIG_HOTPLUG_CPU |
24 | static struct task_struct *lock_cpu_hotplug_owner; | 24 | static struct task_struct *lock_cpu_hotplug_owner; |
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | |||
69 | #endif /* CONFIG_HOTPLUG_CPU */ | 69 | #endif /* CONFIG_HOTPLUG_CPU */ |
70 | 70 | ||
71 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
72 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
73 | { | 73 | { |
74 | return blocking_notifier_chain_register(&cpu_chain, nb); | 74 | return blocking_notifier_chain_register(&cpu_chain, nb); |
75 | } | 75 | } |
76 | |||
77 | #ifdef CONFIG_HOTPLUG_CPU | ||
78 | |||
76 | EXPORT_SYMBOL(register_cpu_notifier); | 79 | EXPORT_SYMBOL(register_cpu_notifier); |
77 | 80 | ||
78 | void unregister_cpu_notifier(struct notifier_block *nb) | 81 | void unregister_cpu_notifier(struct notifier_block *nb) |
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) | |||
81 | } | 84 | } |
82 | EXPORT_SYMBOL(unregister_cpu_notifier); | 85 | EXPORT_SYMBOL(unregister_cpu_notifier); |
83 | 86 | ||
84 | #ifdef CONFIG_HOTPLUG_CPU | ||
85 | static inline void check_for_tasks(int cpu) | 87 | static inline void check_for_tasks(int cpu) |
86 | { | 88 | { |
87 | struct task_struct *p; | 89 | struct task_struct *p; |
diff --git a/kernel/exit.c b/kernel/exit.c
index 304ef637be6c..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code) | |||
926 | tsk->mempolicy = NULL; | 926 | tsk->mempolicy = NULL; |
927 | #endif | 927 | #endif |
928 | /* | 928 | /* |
929 | * This must happen late, after the PID is not | ||
930 | * hashed anymore: | ||
931 | */ | ||
932 | if (unlikely(!list_empty(&tsk->pi_state_list))) | ||
933 | exit_pi_state_list(tsk); | ||
934 | if (unlikely(current->pi_state_cache)) | ||
935 | kfree(current->pi_state_cache); | ||
936 | /* | ||
929 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: | 937 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: |
930 | */ | 938 | */ |
931 | mutex_debug_check_no_locks_held(tsk); | 939 | mutex_debug_check_no_locks_held(tsk); |
940 | rt_mutex_debug_check_no_locks_held(tsk); | ||
932 | 941 | ||
933 | if (tsk->io_context) | 942 | if (tsk->io_context) |
934 | exit_io_context(); | 943 | exit_io_context(); |
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b4e54ef0225..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; | |||
104 | void free_task(struct task_struct *tsk) | 104 | void free_task(struct task_struct *tsk) |
105 | { | 105 | { |
106 | free_thread_info(tsk->thread_info); | 106 | free_thread_info(tsk->thread_info); |
107 | rt_mutex_debug_task_free(tsk); | ||
107 | free_task_struct(tsk); | 108 | free_task_struct(tsk); |
108 | } | 109 | } |
109 | EXPORT_SYMBOL(free_task); | 110 | EXPORT_SYMBOL(free_task); |
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
913 | return current->pid; | 914 | return current->pid; |
914 | } | 915 | } |
915 | 916 | ||
917 | static inline void rt_mutex_init_task(struct task_struct *p) | ||
918 | { | ||
919 | #ifdef CONFIG_RT_MUTEXES | ||
920 | spin_lock_init(&p->pi_lock); | ||
921 | plist_head_init(&p->pi_waiters, &p->pi_lock); | ||
922 | p->pi_blocked_on = NULL; | ||
923 | # ifdef CONFIG_DEBUG_RT_MUTEXES | ||
924 | spin_lock_init(&p->held_list_lock); | ||
925 | INIT_LIST_HEAD(&p->held_list_head); | ||
926 | # endif | ||
927 | #endif | ||
928 | } | ||
929 | |||
916 | /* | 930 | /* |
917 | * This creates a new process as a copy of the old one, | 931 | * This creates a new process as a copy of the old one, |
918 | * but does not actually start it yet. | 932 | * but does not actually start it yet. |
@@ -1034,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1034 | mpol_fix_fork_child_flag(p); | 1048 | mpol_fix_fork_child_flag(p); |
1035 | #endif | 1049 | #endif |
1036 | 1050 | ||
1051 | rt_mutex_init_task(p); | ||
1052 | |||
1037 | #ifdef CONFIG_DEBUG_MUTEXES | 1053 | #ifdef CONFIG_DEBUG_MUTEXES |
1038 | p->blocked_on = NULL; /* not blocked yet */ | 1054 | p->blocked_on = NULL; /* not blocked yet */ |
1039 | #endif | 1055 | #endif |
@@ -1076,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1076 | #ifdef CONFIG_COMPAT | 1092 | #ifdef CONFIG_COMPAT |
1077 | p->compat_robust_list = NULL; | 1093 | p->compat_robust_list = NULL; |
1078 | #endif | 1094 | #endif |
1095 | INIT_LIST_HEAD(&p->pi_state_list); | ||
1096 | p->pi_state_cache = NULL; | ||
1097 | |||
1079 | /* | 1098 | /* |
1080 | * sigaltstack should be cleared when sharing the same VM | 1099 | * sigaltstack should be cleared when sharing the same VM |
1081 | */ | 1100 | */ |
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..6c91f938005d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@ | |||
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
14 | * | 14 | * |
15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | ||
16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
18 | * | ||
15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 19 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
16 | * enough at me, Linus for the original (flawed) idea, Matthew | 20 | * enough at me, Linus for the original (flawed) idea, Matthew |
17 | * Kirkwood for proof-of-concept implementation. | 21 | * Kirkwood for proof-of-concept implementation. |
@@ -46,6 +50,8 @@ | |||
46 | #include <linux/signal.h> | 50 | #include <linux/signal.h> |
47 | #include <asm/futex.h> | 51 | #include <asm/futex.h> |
48 | 52 | ||
53 | #include "rtmutex_common.h" | ||
54 | |||
49 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
50 | 56 | ||
51 | /* | 57 | /* |
@@ -63,7 +69,7 @@ union futex_key { | |||
63 | int offset; | 69 | int offset; |
64 | } shared; | 70 | } shared; |
65 | struct { | 71 | struct { |
66 | unsigned long uaddr; | 72 | unsigned long address; |
67 | struct mm_struct *mm; | 73 | struct mm_struct *mm; |
68 | int offset; | 74 | int offset; |
69 | } private; | 75 | } private; |
@@ -75,6 +81,27 @@ union futex_key { | |||
75 | }; | 81 | }; |
76 | 82 | ||
77 | /* | 83 | /* |
84 | * Priority Inheritance state: | ||
85 | */ | ||
86 | struct futex_pi_state { | ||
87 | /* | ||
88 | * list of 'owned' pi_state instances - these have to be | ||
89 | * cleaned up in do_exit() if the task exits prematurely: | ||
90 | */ | ||
91 | struct list_head list; | ||
92 | |||
93 | /* | ||
94 | * The PI object: | ||
95 | */ | ||
96 | struct rt_mutex pi_mutex; | ||
97 | |||
98 | struct task_struct *owner; | ||
99 | atomic_t refcount; | ||
100 | |||
101 | union futex_key key; | ||
102 | }; | ||
103 | |||
104 | /* | ||
78 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 105 | * We use this hashed waitqueue instead of a normal wait_queue_t, so |
79 | * we can wake only the relevant ones (hashed queues may be shared). | 106 | * we can wake only the relevant ones (hashed queues may be shared). |
80 | * | 107 | * |
@@ -87,15 +114,19 @@ struct futex_q { | |||
87 | struct list_head list; | 114 | struct list_head list; |
88 | wait_queue_head_t waiters; | 115 | wait_queue_head_t waiters; |
89 | 116 | ||
90 | /* Which hash list lock to use. */ | 117 | /* Which hash list lock to use: */ |
91 | spinlock_t *lock_ptr; | 118 | spinlock_t *lock_ptr; |
92 | 119 | ||
93 | /* Key which the futex is hashed on. */ | 120 | /* Key which the futex is hashed on: */ |
94 | union futex_key key; | 121 | union futex_key key; |
95 | 122 | ||
96 | /* For fd, sigio sent using these. */ | 123 | /* For fd, sigio sent using these: */ |
97 | int fd; | 124 | int fd; |
98 | struct file *filp; | 125 | struct file *filp; |
126 | |||
127 | /* Optional priority inheritance state: */ | ||
128 | struct futex_pi_state *pi_state; | ||
129 | struct task_struct *task; | ||
99 | }; | 130 | }; |
100 | 131 | ||
101 | /* | 132 | /* |
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
144 | * | 175 | * |
145 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. | 176 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. |
146 | */ | 177 | */ |
147 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | 178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) |
148 | { | 179 | { |
180 | unsigned long address = (unsigned long)uaddr; | ||
149 | struct mm_struct *mm = current->mm; | 181 | struct mm_struct *mm = current->mm; |
150 | struct vm_area_struct *vma; | 182 | struct vm_area_struct *vma; |
151 | struct page *page; | 183 | struct page *page; |
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
154 | /* | 186 | /* |
155 | * The futex address must be "naturally" aligned. | 187 | * The futex address must be "naturally" aligned. |
156 | */ | 188 | */ |
157 | key->both.offset = uaddr % PAGE_SIZE; | 189 | key->both.offset = address % PAGE_SIZE; |
158 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 190 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) |
159 | return -EINVAL; | 191 | return -EINVAL; |
160 | uaddr -= key->both.offset; | 192 | address -= key->both.offset; |
161 | 193 | ||
162 | /* | 194 | /* |
163 | * The futex is hashed differently depending on whether | 195 | * The futex is hashed differently depending on whether |
164 | * it's in a shared or private mapping. So check vma first. | 196 | * it's in a shared or private mapping. So check vma first. |
165 | */ | 197 | */ |
166 | vma = find_extend_vma(mm, uaddr); | 198 | vma = find_extend_vma(mm, address); |
167 | if (unlikely(!vma)) | 199 | if (unlikely(!vma)) |
168 | return -EFAULT; | 200 | return -EFAULT; |
169 | 201 | ||
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
184 | */ | 216 | */ |
185 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 217 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { |
186 | key->private.mm = mm; | 218 | key->private.mm = mm; |
187 | key->private.uaddr = uaddr; | 219 | key->private.address = address; |
188 | return 0; | 220 | return 0; |
189 | } | 221 | } |
190 | 222 | ||
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
194 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; |
195 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
196 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
197 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
198 | + vma->vm_pgoff); | 230 | + vma->vm_pgoff); |
199 | return 0; | 231 | return 0; |
200 | } | 232 | } |
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
205 | * from swap. But that's a lot of code to duplicate here | 237 | * from swap. But that's a lot of code to duplicate here |
206 | * for a rare case, so we simply fetch the page. | 238 | * for a rare case, so we simply fetch the page. |
207 | */ | 239 | */ |
208 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 240 | err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); |
209 | if (err >= 0) { | 241 | if (err >= 0) { |
210 | key->shared.pgoff = | 242 | key->shared.pgoff = |
211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 243 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key) | |||
246 | } | 278 | } |
247 | } | 279 | } |
248 | 280 | ||
249 | static inline int get_futex_value_locked(int *dest, int __user *from) | 281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) |
250 | { | 282 | { |
251 | int ret; | 283 | int ret; |
252 | 284 | ||
253 | inc_preempt_count(); | 285 | inc_preempt_count(); |
254 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
255 | dec_preempt_count(); | 287 | dec_preempt_count(); |
256 | 288 | ||
257 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
258 | } | 290 | } |
259 | 291 | ||
260 | /* | 292 | /* |
293 | * Fault handling. Called with current->mm->mmap_sem held. | ||
294 | */ | ||
295 | static int futex_handle_fault(unsigned long address, int attempt) | ||
296 | { | ||
297 | struct vm_area_struct * vma; | ||
298 | struct mm_struct *mm = current->mm; | ||
299 | |||
300 | if (attempt >= 2 || !(vma = find_vma(mm, address)) || | ||
301 | vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) | ||
302 | return -EFAULT; | ||
303 | |||
304 | switch (handle_mm_fault(mm, vma, address, 1)) { | ||
305 | case VM_FAULT_MINOR: | ||
306 | current->min_flt++; | ||
307 | break; | ||
308 | case VM_FAULT_MAJOR: | ||
309 | current->maj_flt++; | ||
310 | break; | ||
311 | default: | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * PI code: | ||
319 | */ | ||
320 | static int refill_pi_state_cache(void) | ||
321 | { | ||
322 | struct futex_pi_state *pi_state; | ||
323 | |||
324 | if (likely(current->pi_state_cache)) | ||
325 | return 0; | ||
326 | |||
327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | ||
328 | |||
329 | if (!pi_state) | ||
330 | return -ENOMEM; | ||
331 | |||
332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
333 | INIT_LIST_HEAD(&pi_state->list); | ||
334 | /* pi_mutex gets initialized later */ | ||
335 | pi_state->owner = NULL; | ||
336 | atomic_set(&pi_state->refcount, 1); | ||
337 | |||
338 | current->pi_state_cache = pi_state; | ||
339 | |||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | static struct futex_pi_state * alloc_pi_state(void) | ||
344 | { | ||
345 | struct futex_pi_state *pi_state = current->pi_state_cache; | ||
346 | |||
347 | WARN_ON(!pi_state); | ||
348 | current->pi_state_cache = NULL; | ||
349 | |||
350 | return pi_state; | ||
351 | } | ||
352 | |||
353 | static void free_pi_state(struct futex_pi_state *pi_state) | ||
354 | { | ||
355 | if (!atomic_dec_and_test(&pi_state->refcount)) | ||
356 | return; | ||
357 | |||
358 | /* | ||
359 | * If pi_state->owner is NULL, the owner is most probably dying | ||
360 | * and has cleaned up the pi_state already | ||
361 | */ | ||
362 | if (pi_state->owner) { | ||
363 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
364 | list_del_init(&pi_state->list); | ||
365 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
366 | |||
367 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | ||
368 | } | ||
369 | |||
370 | if (current->pi_state_cache) | ||
371 | kfree(pi_state); | ||
372 | else { | ||
373 | /* | ||
374 | * pi_state->list is already empty. | ||
375 | * clear pi_state->owner. | ||
376 | * refcount is at 0 - put it back to 1. | ||
377 | */ | ||
378 | pi_state->owner = NULL; | ||
379 | atomic_set(&pi_state->refcount, 1); | ||
380 | current->pi_state_cache = pi_state; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Look up the task based on what TID userspace gave us. | ||
386 | * We dont trust it. | ||
387 | */ | ||
388 | static struct task_struct * futex_find_get_task(pid_t pid) | ||
389 | { | ||
390 | struct task_struct *p; | ||
391 | |||
392 | read_lock(&tasklist_lock); | ||
393 | p = find_task_by_pid(pid); | ||
394 | if (!p) | ||
395 | goto out_unlock; | ||
396 | if ((current->euid != p->euid) && (current->euid != p->uid)) { | ||
397 | p = NULL; | ||
398 | goto out_unlock; | ||
399 | } | ||
400 | if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { | ||
401 | p = NULL; | ||
402 | goto out_unlock; | ||
403 | } | ||
404 | get_task_struct(p); | ||
405 | out_unlock: | ||
406 | read_unlock(&tasklist_lock); | ||
407 | |||
408 | return p; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * This task is holding PI mutexes at exit time => bad. | ||
413 | * Kernel cleans up PI-state, but userspace is likely hosed. | ||
414 | * (Robust-futex cleanup is separate and might save the day for userspace.) | ||
415 | */ | ||
416 | void exit_pi_state_list(struct task_struct *curr) | ||
417 | { | ||
418 | struct futex_hash_bucket *hb; | ||
419 | struct list_head *next, *head = &curr->pi_state_list; | ||
420 | struct futex_pi_state *pi_state; | ||
421 | union futex_key key; | ||
422 | |||
423 | /* | ||
424 | * We are a ZOMBIE and nobody can enqueue itself on | ||
425 | * pi_state_list anymore, but we have to be careful | ||
426 | * versus waiters unqueueing themselfs | ||
427 | */ | ||
428 | spin_lock_irq(&curr->pi_lock); | ||
429 | while (!list_empty(head)) { | ||
430 | |||
431 | next = head->next; | ||
432 | pi_state = list_entry(next, struct futex_pi_state, list); | ||
433 | key = pi_state->key; | ||
434 | spin_unlock_irq(&curr->pi_lock); | ||
435 | |||
436 | hb = hash_futex(&key); | ||
437 | spin_lock(&hb->lock); | ||
438 | |||
439 | spin_lock_irq(&curr->pi_lock); | ||
440 | if (head->next != next) { | ||
441 | spin_unlock(&hb->lock); | ||
442 | continue; | ||
443 | } | ||
444 | |||
445 | list_del_init(&pi_state->list); | ||
446 | |||
447 | WARN_ON(pi_state->owner != curr); | ||
448 | |||
449 | pi_state->owner = NULL; | ||
450 | spin_unlock_irq(&curr->pi_lock); | ||
451 | |||
452 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
453 | |||
454 | spin_unlock(&hb->lock); | ||
455 | |||
456 | spin_lock_irq(&curr->pi_lock); | ||
457 | } | ||
458 | spin_unlock_irq(&curr->pi_lock); | ||
459 | } | ||
460 | |||
461 | static int | ||
462 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) | ||
463 | { | ||
464 | struct futex_pi_state *pi_state = NULL; | ||
465 | struct futex_q *this, *next; | ||
466 | struct list_head *head; | ||
467 | struct task_struct *p; | ||
468 | pid_t pid; | ||
469 | |||
470 | head = &hb->chain; | ||
471 | |||
472 | list_for_each_entry_safe(this, next, head, list) { | ||
473 | if (match_futex (&this->key, &me->key)) { | ||
474 | /* | ||
475 | * Another waiter already exists - bump up | ||
476 | * the refcount and return its pi_state: | ||
477 | */ | ||
478 | pi_state = this->pi_state; | ||
479 | atomic_inc(&pi_state->refcount); | ||
480 | me->pi_state = pi_state; | ||
481 | |||
482 | return 0; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * We are the first waiter - try to look up the real owner and | ||
488 | * attach the new pi_state to it: | ||
489 | */ | ||
490 | pid = uval & FUTEX_TID_MASK; | ||
491 | p = futex_find_get_task(pid); | ||
492 | if (!p) | ||
493 | return -ESRCH; | ||
494 | |||
495 | pi_state = alloc_pi_state(); | ||
496 | |||
497 | /* | ||
498 | * Initialize the pi_mutex in locked state and make 'p' | ||
499 | * the owner of it: | ||
500 | */ | ||
501 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | ||
502 | |||
503 | /* Store the key for possible exit cleanups: */ | ||
504 | pi_state->key = me->key; | ||
505 | |||
506 | spin_lock_irq(&p->pi_lock); | ||
507 | list_add(&pi_state->list, &p->pi_state_list); | ||
508 | pi_state->owner = p; | ||
509 | spin_unlock_irq(&p->pi_lock); | ||
510 | |||
511 | put_task_struct(p); | ||
512 | |||
513 | me->pi_state = pi_state; | ||
514 | |||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | /* | ||
261 | * The hash bucket lock must be held when this is called. | 519 | * The hash bucket lock must be held when this is called. |
262 | * Afterwards, the futex_q must not be accessed. | 520 | * Afterwards, the futex_q must not be accessed. |
263 | */ | 521 | */ |
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q) | |||
284 | q->lock_ptr = NULL; | 542 | q->lock_ptr = NULL; |
285 | } | 543 | } |
286 | 544 | ||
545 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | ||
546 | { | ||
547 | struct task_struct *new_owner; | ||
548 | struct futex_pi_state *pi_state = this->pi_state; | ||
549 | u32 curval, newval; | ||
550 | |||
551 | if (!pi_state) | ||
552 | return -EINVAL; | ||
553 | |||
554 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | ||
555 | |||
556 | /* | ||
557 | * This happens when we have stolen the lock and the original | ||
558 | * pending owner did not enqueue itself back on the rt_mutex. | ||
559 | * Thats not a tragedy. We know that way, that a lock waiter | ||
560 | * is on the fly. We make the futex_q waiter the pending owner. | ||
561 | */ | ||
562 | if (!new_owner) | ||
563 | new_owner = this->task; | ||
564 | |||
565 | /* | ||
566 | * We pass it to the next owner. (The WAITERS bit is always | ||
567 | * kept enabled while there is PI state around. We must also | ||
568 | * preserve the owner died bit.) | ||
569 | */ | ||
570 | newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; | ||
571 | |||
572 | inc_preempt_count(); | ||
573 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
574 | dec_preempt_count(); | ||
575 | |||
576 | if (curval == -EFAULT) | ||
577 | return -EFAULT; | ||
578 | if (curval != uval) | ||
579 | return -EINVAL; | ||
580 | |||
581 | list_del_init(&pi_state->owner->pi_state_list); | ||
582 | list_add(&pi_state->list, &new_owner->pi_state_list); | ||
583 | pi_state->owner = new_owner; | ||
584 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
590 | { | ||
591 | u32 oldval; | ||
592 | |||
593 | /* | ||
594 | * There is no waiter, so we unlock the futex. The owner died | ||
595 | * bit has not to be preserved here. We are the owner: | ||
596 | */ | ||
597 | inc_preempt_count(); | ||
598 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
599 | dec_preempt_count(); | ||
600 | |||
601 | if (oldval == -EFAULT) | ||
602 | return oldval; | ||
603 | if (oldval != uval) | ||
604 | return -EAGAIN; | ||
605 | |||
606 | return 0; | ||
607 | } | ||
608 | |||
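[Editor's aside] The handoff in wake_futex_pi() above only makes sense against the layout of the 32-bit futex word itself. The minimal sketch below restates that layout in plain C; the bit values are assumptions taken from include/linux/futex.h as extended by this patch series, not something this hunk defines, and the helpers are illustrative only.

#include <stdint.h>
#include <sys/types.h>

/* Assumed bit layout of the 32-bit PI-futex word (see linux/futex.h): */
#define FUTEX_WAITERS           0x80000000      /* waiters are queued in the kernel */
#define FUTEX_OWNER_DIED        0x40000000      /* previous owner exited holding the lock */
#define FUTEX_TID_MASK          0x3fffffff      /* low bits hold the owner's TID */

/* The TID of the current owner, with the flag bits stripped. */
static inline pid_t pi_futex_owner_tid(uint32_t uval)
{
        return (pid_t)(uval & FUTEX_TID_MASK);
}

/*
 * The value wake_futex_pi() installs when handing the lock to the next
 * waiter: preserve the owner-died bit, keep WAITERS set (PI state still
 * exists), and store the new owner's TID.
 */
static inline uint32_t pi_futex_handoff(uint32_t uval, pid_t new_owner_tid)
{
        return (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | (uint32_t)new_owner_tid;
}

/* An uncontended, healthy lock is simply the owner's TID with no flag bits. */
static inline int pi_futex_is_uncontended(uint32_t uval, pid_t owner_tid)
{
        return uval == (uint32_t)owner_tid;
}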
287 | /* | 609 | /* |
288 | * Wake up all waiters hashed on the physical page that is mapped | 610 | * Wake up all waiters hashed on the physical page that is mapped |
289 | * to this virtual address: | 611 | * to this virtual address: |
290 | */ | 612 | */ |
291 | static int futex_wake(unsigned long uaddr, int nr_wake) | 613 | static int futex_wake(u32 __user *uaddr, int nr_wake) |
292 | { | 614 | { |
293 | union futex_key key; | 615 | struct futex_hash_bucket *hb; |
294 | struct futex_hash_bucket *bh; | ||
295 | struct list_head *head; | ||
296 | struct futex_q *this, *next; | 616 | struct futex_q *this, *next; |
617 | struct list_head *head; | ||
618 | union futex_key key; | ||
297 | int ret; | 619 | int ret; |
298 | 620 | ||
299 | down_read(&current->mm->mmap_sem); | 621 | down_read(&current->mm->mmap_sem); |
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake) | |||
302 | if (unlikely(ret != 0)) | 624 | if (unlikely(ret != 0)) |
303 | goto out; | 625 | goto out; |
304 | 626 | ||
305 | bh = hash_futex(&key); | 627 | hb = hash_futex(&key); |
306 | spin_lock(&bh->lock); | 628 | spin_lock(&hb->lock); |
307 | head = &bh->chain; | 629 | head = &hb->chain; |
308 | 630 | ||
309 | list_for_each_entry_safe(this, next, head, list) { | 631 | list_for_each_entry_safe(this, next, head, list) { |
310 | if (match_futex (&this->key, &key)) { | 632 | if (match_futex (&this->key, &key)) { |
633 | if (this->pi_state) | ||
634 | return -EINVAL; | ||
311 | wake_futex(this); | 635 | wake_futex(this); |
312 | if (++ret >= nr_wake) | 636 | if (++ret >= nr_wake) |
313 | break; | 637 | break; |
314 | } | 638 | } |
315 | } | 639 | } |
316 | 640 | ||
317 | spin_unlock(&bh->lock); | 641 | spin_unlock(&hb->lock); |
318 | out: | 642 | out: |
319 | up_read(&current->mm->mmap_sem); | 643 | up_read(&current->mm->mmap_sem); |
320 | return ret; | 644 | return ret; |
@@ -324,10 +648,12 @@ out: | |||
324 | * Wake up all waiters hashed on the physical page that is mapped | 648 | * Wake up all waiters hashed on the physical page that is mapped |
325 | * to this virtual address: | 649 | * to this virtual address: |
326 | */ | 650 | */ |
327 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | 651 | static int |
652 | futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, | ||
653 | int nr_wake, int nr_wake2, int op) | ||
328 | { | 654 | { |
329 | union futex_key key1, key2; | 655 | union futex_key key1, key2; |
330 | struct futex_hash_bucket *bh1, *bh2; | 656 | struct futex_hash_bucket *hb1, *hb2; |
331 | struct list_head *head; | 657 | struct list_head *head; |
332 | struct futex_q *this, *next; | 658 | struct futex_q *this, *next; |
333 | int ret, op_ret, attempt = 0; | 659 | int ret, op_ret, attempt = 0; |
@@ -342,27 +668,29 @@ retryfull: | |||
342 | if (unlikely(ret != 0)) | 668 | if (unlikely(ret != 0)) |
343 | goto out; | 669 | goto out; |
344 | 670 | ||
345 | bh1 = hash_futex(&key1); | 671 | hb1 = hash_futex(&key1); |
346 | bh2 = hash_futex(&key2); | 672 | hb2 = hash_futex(&key2); |
347 | 673 | ||
348 | retry: | 674 | retry: |
349 | if (bh1 < bh2) | 675 | if (hb1 < hb2) |
350 | spin_lock(&bh1->lock); | 676 | spin_lock(&hb1->lock); |
351 | spin_lock(&bh2->lock); | 677 | spin_lock(&hb2->lock); |
352 | if (bh1 > bh2) | 678 | if (hb1 > hb2) |
353 | spin_lock(&bh1->lock); | 679 | spin_lock(&hb1->lock); |
354 | 680 | ||
355 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | 681 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
356 | if (unlikely(op_ret < 0)) { | 682 | if (unlikely(op_ret < 0)) { |
357 | int dummy; | 683 | u32 dummy; |
358 | 684 | ||
359 | spin_unlock(&bh1->lock); | 685 | spin_unlock(&hb1->lock); |
360 | if (bh1 != bh2) | 686 | if (hb1 != hb2) |
361 | spin_unlock(&bh2->lock); | 687 | spin_unlock(&hb2->lock); |
362 | 688 | ||
363 | #ifndef CONFIG_MMU | 689 | #ifndef CONFIG_MMU |
364 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | 690 | /* |
365 | * but we might get them from range checking */ | 691 | * we don't get EFAULT from MMU faults if we don't have an MMU, |
692 | * but we might get them from range checking | ||
693 | */ | ||
366 | ret = op_ret; | 694 | ret = op_ret; |
367 | goto out; | 695 | goto out; |
368 | #endif | 696 | #endif |
@@ -372,47 +700,34 @@ retry: | |||
372 | goto out; | 700 | goto out; |
373 | } | 701 | } |
374 | 702 | ||
375 | /* futex_atomic_op_inuser needs to both read and write | 703 | /* |
704 | * futex_atomic_op_inuser needs to both read and write | ||
376 | * *(int __user *)uaddr2, but we can't modify it | 705 | * *(int __user *)uaddr2, but we can't modify it |
377 | * non-atomically. Therefore, if get_user below is not | 706 | * non-atomically. Therefore, if get_user below is not |
378 | * enough, we need to handle the fault ourselves, while | 707 | * enough, we need to handle the fault ourselves, while |
379 | * still holding the mmap_sem. */ | 708 | * still holding the mmap_sem. |
709 | */ | ||
380 | if (attempt++) { | 710 | if (attempt++) { |
381 | struct vm_area_struct * vma; | 711 | if (futex_handle_fault((unsigned long)uaddr2, |
382 | struct mm_struct *mm = current->mm; | 712 | attempt)) |
383 | |||
384 | ret = -EFAULT; | ||
385 | if (attempt >= 2 || | ||
386 | !(vma = find_vma(mm, uaddr2)) || | ||
387 | vma->vm_start > uaddr2 || | ||
388 | !(vma->vm_flags & VM_WRITE)) | ||
389 | goto out; | ||
390 | |||
391 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
392 | case VM_FAULT_MINOR: | ||
393 | current->min_flt++; | ||
394 | break; | ||
395 | case VM_FAULT_MAJOR: | ||
396 | current->maj_flt++; | ||
397 | break; | ||
398 | default: | ||
399 | goto out; | 713 | goto out; |
400 | } | ||
401 | goto retry; | 714 | goto retry; |
402 | } | 715 | } |
403 | 716 | ||
404 | /* If we would have faulted, release mmap_sem, | 717 | /* |
405 | * fault it in and start all over again. */ | 718 | * If we would have faulted, release mmap_sem, |
719 | * fault it in and start all over again. | ||
720 | */ | ||
406 | up_read(&current->mm->mmap_sem); | 721 | up_read(&current->mm->mmap_sem); |
407 | 722 | ||
408 | ret = get_user(dummy, (int __user *)uaddr2); | 723 | ret = get_user(dummy, uaddr2); |
409 | if (ret) | 724 | if (ret) |
410 | return ret; | 725 | return ret; |
411 | 726 | ||
412 | goto retryfull; | 727 | goto retryfull; |
413 | } | 728 | } |
414 | 729 | ||
415 | head = &bh1->chain; | 730 | head = &hb1->chain; |
416 | 731 | ||
417 | list_for_each_entry_safe(this, next, head, list) { | 732 | list_for_each_entry_safe(this, next, head, list) { |
418 | if (match_futex (&this->key, &key1)) { | 733 | if (match_futex (&this->key, &key1)) { |
@@ -423,7 +738,7 @@ retry: | |||
423 | } | 738 | } |
424 | 739 | ||
425 | if (op_ret > 0) { | 740 | if (op_ret > 0) { |
426 | head = &bh2->chain; | 741 | head = &hb2->chain; |
427 | 742 | ||
428 | op_ret = 0; | 743 | op_ret = 0; |
429 | list_for_each_entry_safe(this, next, head, list) { | 744 | list_for_each_entry_safe(this, next, head, list) { |
@@ -436,9 +751,9 @@ retry: | |||
436 | ret += op_ret; | 751 | ret += op_ret; |
437 | } | 752 | } |
438 | 753 | ||
439 | spin_unlock(&bh1->lock); | 754 | spin_unlock(&hb1->lock); |
440 | if (bh1 != bh2) | 755 | if (hb1 != hb2) |
441 | spin_unlock(&bh2->lock); | 756 | spin_unlock(&hb2->lock); |
442 | out: | 757 | out: |
443 | up_read(&current->mm->mmap_sem); | 758 | up_read(&current->mm->mmap_sem); |
444 | return ret; | 759 | return ret; |
@@ -448,11 +763,11 @@ out: | |||
448 | * Requeue all waiters hashed on one physical page to another | 763 | * Requeue all waiters hashed on one physical page to another |
449 | * physical page. | 764 | * physical page. |
450 | */ | 765 | */ |
451 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | 766 | static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, |
452 | int nr_wake, int nr_requeue, int *valp) | 767 | int nr_wake, int nr_requeue, u32 *cmpval) |
453 | { | 768 | { |
454 | union futex_key key1, key2; | 769 | union futex_key key1, key2; |
455 | struct futex_hash_bucket *bh1, *bh2; | 770 | struct futex_hash_bucket *hb1, *hb2; |
456 | struct list_head *head1; | 771 | struct list_head *head1; |
457 | struct futex_q *this, *next; | 772 | struct futex_q *this, *next; |
458 | int ret, drop_count = 0; | 773 | int ret, drop_count = 0; |
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | |||
467 | if (unlikely(ret != 0)) | 782 | if (unlikely(ret != 0)) |
468 | goto out; | 783 | goto out; |
469 | 784 | ||
470 | bh1 = hash_futex(&key1); | 785 | hb1 = hash_futex(&key1); |
471 | bh2 = hash_futex(&key2); | 786 | hb2 = hash_futex(&key2); |
472 | 787 | ||
473 | if (bh1 < bh2) | 788 | if (hb1 < hb2) |
474 | spin_lock(&bh1->lock); | 789 | spin_lock(&hb1->lock); |
475 | spin_lock(&bh2->lock); | 790 | spin_lock(&hb2->lock); |
476 | if (bh1 > bh2) | 791 | if (hb1 > hb2) |
477 | spin_lock(&bh1->lock); | 792 | spin_lock(&hb1->lock); |
478 | 793 | ||
479 | if (likely(valp != NULL)) { | 794 | if (likely(cmpval != NULL)) { |
480 | int curval; | 795 | u32 curval; |
481 | 796 | ||
482 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | 797 | ret = get_futex_value_locked(&curval, uaddr1); |
483 | 798 | ||
484 | if (unlikely(ret)) { | 799 | if (unlikely(ret)) { |
485 | spin_unlock(&bh1->lock); | 800 | spin_unlock(&hb1->lock); |
486 | if (bh1 != bh2) | 801 | if (hb1 != hb2) |
487 | spin_unlock(&bh2->lock); | 802 | spin_unlock(&hb2->lock); |
488 | 803 | ||
489 | /* If we would have faulted, release mmap_sem, fault | 804 | /* |
805 | * If we would have faulted, release mmap_sem, fault | ||
490 | * it in and start all over again. | 806 | * it in and start all over again. |
491 | */ | 807 | */ |
492 | up_read(&current->mm->mmap_sem); | 808 | up_read(&current->mm->mmap_sem); |
493 | 809 | ||
494 | ret = get_user(curval, (int __user *)uaddr1); | 810 | ret = get_user(curval, uaddr1); |
495 | 811 | ||
496 | if (!ret) | 812 | if (!ret) |
497 | goto retry; | 813 | goto retry; |
498 | 814 | ||
499 | return ret; | 815 | return ret; |
500 | } | 816 | } |
501 | if (curval != *valp) { | 817 | if (curval != *cmpval) { |
502 | ret = -EAGAIN; | 818 | ret = -EAGAIN; |
503 | goto out_unlock; | 819 | goto out_unlock; |
504 | } | 820 | } |
505 | } | 821 | } |
506 | 822 | ||
507 | head1 = &bh1->chain; | 823 | head1 = &hb1->chain; |
508 | list_for_each_entry_safe(this, next, head1, list) { | 824 | list_for_each_entry_safe(this, next, head1, list) { |
509 | if (!match_futex (&this->key, &key1)) | 825 | if (!match_futex (&this->key, &key1)) |
510 | continue; | 826 | continue; |
511 | if (++ret <= nr_wake) { | 827 | if (++ret <= nr_wake) { |
512 | wake_futex(this); | 828 | wake_futex(this); |
513 | } else { | 829 | } else { |
514 | list_move_tail(&this->list, &bh2->chain); | 830 | /* |
515 | this->lock_ptr = &bh2->lock; | 831 | * If key1 and key2 hash to the same bucket, no need to |
832 | * requeue. | ||
833 | */ | ||
834 | if (likely(head1 != &hb2->chain)) { | ||
835 | list_move_tail(&this->list, &hb2->chain); | ||
836 | this->lock_ptr = &hb2->lock; | ||
837 | } | ||
516 | this->key = key2; | 838 | this->key = key2; |
517 | get_key_refs(&key2); | 839 | get_key_refs(&key2); |
518 | drop_count++; | 840 | drop_count++; |
519 | 841 | ||
520 | if (ret - nr_wake >= nr_requeue) | 842 | if (ret - nr_wake >= nr_requeue) |
521 | break; | 843 | break; |
522 | /* Make sure to stop if key1 == key2 */ | ||
523 | if (head1 == &bh2->chain && head1 != &next->list) | ||
524 | head1 = &this->list; | ||
525 | } | 844 | } |
526 | } | 845 | } |
527 | 846 | ||
528 | out_unlock: | 847 | out_unlock: |
529 | spin_unlock(&bh1->lock); | 848 | spin_unlock(&hb1->lock); |
530 | if (bh1 != bh2) | 849 | if (hb1 != hb2) |
531 | spin_unlock(&bh2->lock); | 850 | spin_unlock(&hb2->lock); |
532 | 851 | ||
533 | /* drop_key_refs() must be called outside the spinlocks. */ | 852 | /* drop_key_refs() must be called outside the spinlocks. */ |
534 | while (--drop_count >= 0) | 853 | while (--drop_count >= 0) |
@@ -543,7 +862,7 @@ out: | |||
543 | static inline struct futex_hash_bucket * | 862 | static inline struct futex_hash_bucket * |
544 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 863 | queue_lock(struct futex_q *q, int fd, struct file *filp) |
545 | { | 864 | { |
546 | struct futex_hash_bucket *bh; | 865 | struct futex_hash_bucket *hb; |
547 | 866 | ||
548 | q->fd = fd; | 867 | q->fd = fd; |
549 | q->filp = filp; | 868 | q->filp = filp; |
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
551 | init_waitqueue_head(&q->waiters); | 870 | init_waitqueue_head(&q->waiters); |
552 | 871 | ||
553 | get_key_refs(&q->key); | 872 | get_key_refs(&q->key); |
554 | bh = hash_futex(&q->key); | 873 | hb = hash_futex(&q->key); |
555 | q->lock_ptr = &bh->lock; | 874 | q->lock_ptr = &hb->lock; |
556 | 875 | ||
557 | spin_lock(&bh->lock); | 876 | spin_lock(&hb->lock); |
558 | return bh; | 877 | return hb; |
559 | } | 878 | } |
560 | 879 | ||
561 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | 880 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
562 | { | 881 | { |
563 | list_add_tail(&q->list, &bh->chain); | 882 | list_add_tail(&q->list, &hb->chain); |
564 | spin_unlock(&bh->lock); | 883 | q->task = current; |
884 | spin_unlock(&hb->lock); | ||
565 | } | 885 | } |
566 | 886 | ||
567 | static inline void | 887 | static inline void |
568 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | 888 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
569 | { | 889 | { |
570 | spin_unlock(&bh->lock); | 890 | spin_unlock(&hb->lock); |
571 | drop_key_refs(&q->key); | 891 | drop_key_refs(&q->key); |
572 | } | 892 | } |
573 | 893 | ||
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | |||
579 | /* The key must be already stored in q->key. */ | 899 | /* The key must be already stored in q->key. */ |
580 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 900 | static void queue_me(struct futex_q *q, int fd, struct file *filp) |
581 | { | 901 | { |
582 | struct futex_hash_bucket *bh; | 902 | struct futex_hash_bucket *hb; |
583 | bh = queue_lock(q, fd, filp); | 903 | |
584 | __queue_me(q, bh); | 904 | hb = queue_lock(q, fd, filp); |
905 | __queue_me(q, hb); | ||
585 | } | 906 | } |
586 | 907 | ||
587 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 908 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ |
588 | static int unqueue_me(struct futex_q *q) | 909 | static int unqueue_me(struct futex_q *q) |
589 | { | 910 | { |
590 | int ret = 0; | ||
591 | spinlock_t *lock_ptr; | 911 | spinlock_t *lock_ptr; |
912 | int ret = 0; | ||
592 | 913 | ||
593 | /* In the common case we don't take the spinlock, which is nice. */ | 914 | /* In the common case we don't take the spinlock, which is nice. */ |
594 | retry: | 915 | retry: |
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q) | |||
614 | } | 935 | } |
615 | WARN_ON(list_empty(&q->list)); | 936 | WARN_ON(list_empty(&q->list)); |
616 | list_del(&q->list); | 937 | list_del(&q->list); |
938 | |||
939 | BUG_ON(q->pi_state); | ||
940 | |||
617 | spin_unlock(lock_ptr); | 941 | spin_unlock(lock_ptr); |
618 | ret = 1; | 942 | ret = 1; |
619 | } | 943 | } |
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q) | |||
622 | return ret; | 946 | return ret; |
623 | } | 947 | } |
624 | 948 | ||
625 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | 949 | /* |
950 | * PI futexes can not be requeued and must remove themself from the | ||
951 | * hash bucket. The hash bucket lock is held on entry and dropped here. | ||
952 | */ | ||
953 | static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | ||
626 | { | 954 | { |
627 | DECLARE_WAITQUEUE(wait, current); | 955 | WARN_ON(list_empty(&q->list)); |
628 | int ret, curval; | 956 | list_del(&q->list); |
957 | |||
958 | BUG_ON(!q->pi_state); | ||
959 | free_pi_state(q->pi_state); | ||
960 | q->pi_state = NULL; | ||
961 | |||
962 | spin_unlock(&hb->lock); | ||
963 | |||
964 | drop_key_refs(&q->key); | ||
965 | } | ||
966 | |||
967 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | ||
968 | { | ||
969 | struct task_struct *curr = current; | ||
970 | DECLARE_WAITQUEUE(wait, curr); | ||
971 | struct futex_hash_bucket *hb; | ||
629 | struct futex_q q; | 972 | struct futex_q q; |
630 | struct futex_hash_bucket *bh; | 973 | u32 uval; |
974 | int ret; | ||
631 | 975 | ||
976 | q.pi_state = NULL; | ||
632 | retry: | 977 | retry: |
633 | down_read(&current->mm->mmap_sem); | 978 | down_read(&curr->mm->mmap_sem); |
634 | 979 | ||
635 | ret = get_futex_key(uaddr, &q.key); | 980 | ret = get_futex_key(uaddr, &q.key); |
636 | if (unlikely(ret != 0)) | 981 | if (unlikely(ret != 0)) |
637 | goto out_release_sem; | 982 | goto out_release_sem; |
638 | 983 | ||
639 | bh = queue_lock(&q, -1, NULL); | 984 | hb = queue_lock(&q, -1, NULL); |
640 | 985 | ||
641 | /* | 986 | /* |
642 | * Access the page AFTER the futex is queued. | 987 | * Access the page AFTER the futex is queued. |
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
658 | * We hold the mmap semaphore, so the mapping cannot have changed | 1003 | * We hold the mmap semaphore, so the mapping cannot have changed |
659 | * since we looked it up in get_futex_key. | 1004 | * since we looked it up in get_futex_key. |
660 | */ | 1005 | */ |
661 | 1006 | ret = get_futex_value_locked(&uval, uaddr); | |
662 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
663 | 1007 | ||
664 | if (unlikely(ret)) { | 1008 | if (unlikely(ret)) { |
665 | queue_unlock(&q, bh); | 1009 | queue_unlock(&q, hb); |
666 | 1010 | ||
667 | /* If we would have faulted, release mmap_sem, fault it in and | 1011 | /* |
1012 | * If we would have faulted, release mmap_sem, fault it in and | ||
668 | * start all over again. | 1013 | * start all over again. |
669 | */ | 1014 | */ |
670 | up_read(&current->mm->mmap_sem); | 1015 | up_read(&curr->mm->mmap_sem); |
671 | 1016 | ||
672 | ret = get_user(curval, (int __user *)uaddr); | 1017 | ret = get_user(uval, uaddr); |
673 | 1018 | ||
674 | if (!ret) | 1019 | if (!ret) |
675 | goto retry; | 1020 | goto retry; |
676 | return ret; | 1021 | return ret; |
677 | } | 1022 | } |
678 | if (curval != val) { | 1023 | ret = -EWOULDBLOCK; |
679 | ret = -EWOULDBLOCK; | 1024 | if (uval != val) |
680 | queue_unlock(&q, bh); | 1025 | goto out_unlock_release_sem; |
681 | goto out_release_sem; | ||
682 | } | ||
683 | 1026 | ||
684 | /* Only actually queue if *uaddr contained val. */ | 1027 | /* Only actually queue if *uaddr contained val. */ |
685 | __queue_me(&q, bh); | 1028 | __queue_me(&q, hb); |
686 | 1029 | ||
687 | /* | 1030 | /* |
688 | * Now the futex is queued and we have checked the data, we | 1031 | * Now the futex is queued and we have checked the data, we |
689 | * don't want to hold mmap_sem while we sleep. | 1032 | * don't want to hold mmap_sem while we sleep. |
690 | */ | 1033 | */ |
691 | up_read(&current->mm->mmap_sem); | 1034 | up_read(&curr->mm->mmap_sem); |
692 | 1035 | ||
693 | /* | 1036 | /* |
694 | * There might have been scheduling since the queue_me(), as we | 1037 | * There might have been scheduling since the queue_me(), as we |
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
720 | return 0; | 1063 | return 0; |
721 | if (time == 0) | 1064 | if (time == 0) |
722 | return -ETIMEDOUT; | 1065 | return -ETIMEDOUT; |
723 | /* We expect signal_pending(current), but another thread may | 1066 | /* |
724 | * have handled it for us already. */ | 1067 | * We expect signal_pending(current), but another thread may |
1068 | * have handled it for us already. | ||
1069 | */ | ||
725 | return -EINTR; | 1070 | return -EINTR; |
726 | 1071 | ||
1072 | out_unlock_release_sem: | ||
1073 | queue_unlock(&q, hb); | ||
1074 | |||
727 | out_release_sem: | 1075 | out_release_sem: |
1076 | up_read(&curr->mm->mmap_sem); | ||
1077 | return ret; | ||
1078 | } | ||
1079 | |||
1080 | /* | ||
1081 | * Userspace tried a 0 -> TID atomic transition of the futex value | ||
1082 | * and failed. The kernel side here does the whole locking operation: | ||
1083 | * if there are waiters then it will block, it does PI, etc. (Due to | ||
1084 | * races the kernel might see a 0 value of the futex too.) | ||
1085 | */ | ||
1086 | static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, | ||
1087 | struct hrtimer_sleeper *to) | ||
1088 | { | ||
1089 | struct task_struct *curr = current; | ||
1090 | struct futex_hash_bucket *hb; | ||
1091 | u32 uval, newval, curval; | ||
1092 | struct futex_q q; | ||
1093 | int ret, attempt = 0; | ||
1094 | |||
1095 | if (refill_pi_state_cache()) | ||
1096 | return -ENOMEM; | ||
1097 | |||
1098 | q.pi_state = NULL; | ||
1099 | retry: | ||
1100 | down_read(&curr->mm->mmap_sem); | ||
1101 | |||
1102 | ret = get_futex_key(uaddr, &q.key); | ||
1103 | if (unlikely(ret != 0)) | ||
1104 | goto out_release_sem; | ||
1105 | |||
1106 | hb = queue_lock(&q, -1, NULL); | ||
1107 | |||
1108 | retry_locked: | ||
1109 | /* | ||
1110 | * To avoid races, we attempt to take the lock here again | ||
1111 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
1112 | * the locks. It will most likely not succeed. | ||
1113 | */ | ||
1114 | newval = current->pid; | ||
1115 | |||
1116 | inc_preempt_count(); | ||
1117 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
1118 | dec_preempt_count(); | ||
1119 | |||
1120 | if (unlikely(curval == -EFAULT)) | ||
1121 | goto uaddr_faulted; | ||
1122 | |||
1123 | /* We own the lock already */ | ||
1124 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | ||
1125 | if (!detect && 0) | ||
1126 | force_sig(SIGKILL, current); | ||
1127 | ret = -EDEADLK; | ||
1128 | goto out_unlock_release_sem; | ||
1129 | } | ||
1130 | |||
1131 | /* | ||
1132 | * Surprise - we got the lock. Just return | ||
1133 | * to userspace: | ||
1134 | */ | ||
1135 | if (unlikely(!curval)) | ||
1136 | goto out_unlock_release_sem; | ||
1137 | |||
1138 | uval = curval; | ||
1139 | newval = uval | FUTEX_WAITERS; | ||
1140 | |||
1141 | inc_preempt_count(); | ||
1142 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
1143 | dec_preempt_count(); | ||
1144 | |||
1145 | if (unlikely(curval == -EFAULT)) | ||
1146 | goto uaddr_faulted; | ||
1147 | if (unlikely(curval != uval)) | ||
1148 | goto retry_locked; | ||
1149 | |||
1150 | /* | ||
1151 | * We dont have the lock. Look up the PI state (or create it if | ||
1152 | * we are the first waiter): | ||
1153 | */ | ||
1154 | ret = lookup_pi_state(uval, hb, &q); | ||
1155 | |||
1156 | if (unlikely(ret)) { | ||
1157 | /* | ||
1158 | * There were no waiters and the owner task lookup | ||
1159 | * failed. When the OWNER_DIED bit is set, then we | ||
1160 | * know that this is a robust futex and we actually | ||
1161 | * take the lock. This is safe as we are protected by | ||
1162 | * the hash bucket lock. We also set the waiters bit | ||
1163 | * unconditionally here, to simplify glibc handling of | ||
1164 | * multiple tasks racing to acquire the lock and | ||
1165 | * cleanup the problems which were left by the dead | ||
1166 | * owner. | ||
1167 | */ | ||
1168 | if (curval & FUTEX_OWNER_DIED) { | ||
1169 | uval = newval; | ||
1170 | newval = current->pid | | ||
1171 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
1172 | |||
1173 | inc_preempt_count(); | ||
1174 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1175 | uval, newval); | ||
1176 | dec_preempt_count(); | ||
1177 | |||
1178 | if (unlikely(curval == -EFAULT)) | ||
1179 | goto uaddr_faulted; | ||
1180 | if (unlikely(curval != uval)) | ||
1181 | goto retry_locked; | ||
1182 | ret = 0; | ||
1183 | } | ||
1184 | goto out_unlock_release_sem; | ||
1185 | } | ||
1186 | |||
1187 | /* | ||
1188 | * Only actually queue now that the atomic ops are done: | ||
1189 | */ | ||
1190 | __queue_me(&q, hb); | ||
1191 | |||
1192 | /* | ||
1193 | * Now the futex is queued and we have checked the data, we | ||
1194 | * don't want to hold mmap_sem while we sleep. | ||
1195 | */ | ||
1196 | up_read(&curr->mm->mmap_sem); | ||
1197 | |||
1198 | WARN_ON(!q.pi_state); | ||
1199 | /* | ||
1200 | * Block on the PI mutex: | ||
1201 | */ | ||
1202 | if (!trylock) | ||
1203 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | ||
1204 | else { | ||
1205 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
1206 | /* Fixup the trylock return value: */ | ||
1207 | ret = ret ? 0 : -EWOULDBLOCK; | ||
1208 | } | ||
1209 | |||
1210 | down_read(&curr->mm->mmap_sem); | ||
1211 | hb = queue_lock(&q, -1, NULL); | ||
1212 | |||
1213 | /* | ||
1214 | * Got the lock. We might not be the anticipated owner if we | ||
1215 | * did a lock-steal - fix up the PI-state in that case. | ||
1216 | */ | ||
1217 | if (!ret && q.pi_state->owner != curr) { | ||
1218 | u32 newtid = current->pid | FUTEX_WAITERS; | ||
1219 | |||
1220 | /* Owner died? */ | ||
1221 | if (q.pi_state->owner != NULL) { | ||
1222 | spin_lock_irq(&q.pi_state->owner->pi_lock); | ||
1223 | list_del_init(&q.pi_state->list); | ||
1224 | spin_unlock_irq(&q.pi_state->owner->pi_lock); | ||
1225 | } else | ||
1226 | newtid |= FUTEX_OWNER_DIED; | ||
1227 | |||
1228 | q.pi_state->owner = current; | ||
1229 | |||
1230 | spin_lock_irq(&current->pi_lock); | ||
1231 | list_add(&q.pi_state->list, ¤t->pi_state_list); | ||
1232 | spin_unlock_irq(&current->pi_lock); | ||
1233 | |||
1234 | /* Unqueue and drop the lock */ | ||
1235 | unqueue_me_pi(&q, hb); | ||
1236 | up_read(&curr->mm->mmap_sem); | ||
1237 | /* | ||
1238 | * We own it, so we have to replace the pending owner | ||
1239 | * TID. This must be atomic as we have preserve the | ||
1240 | * owner died bit here. | ||
1241 | */ | ||
1242 | ret = get_user(uval, uaddr); | ||
1243 | while (!ret) { | ||
1244 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
1245 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1246 | uval, newval); | ||
1247 | if (curval == -EFAULT) | ||
1248 | ret = -EFAULT; | ||
1249 | if (curval == uval) | ||
1250 | break; | ||
1251 | uval = curval; | ||
1252 | } | ||
1253 | } else { | ||
1254 | /* | ||
1255 | * Catch the rare case, where the lock was released | ||
1256 | * when we were on the way back before we locked | ||
1257 | * the hash bucket. | ||
1258 | */ | ||
1259 | if (ret && q.pi_state->owner == curr) { | ||
1260 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1261 | ret = 0; | ||
1262 | } | ||
1263 | /* Unqueue and drop the lock */ | ||
1264 | unqueue_me_pi(&q, hb); | ||
1265 | up_read(&curr->mm->mmap_sem); | ||
1266 | } | ||
1267 | |||
1268 | if (!detect && ret == -EDEADLK && 0) | ||
1269 | force_sig(SIGKILL, current); | ||
1270 | |||
1271 | return ret; | ||
1272 | |||
1273 | out_unlock_release_sem: | ||
1274 | queue_unlock(&q, hb); | ||
1275 | |||
1276 | out_release_sem: | ||
1277 | up_read(&curr->mm->mmap_sem); | ||
1278 | return ret; | ||
1279 | |||
1280 | uaddr_faulted: | ||
1281 | /* | ||
1282 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1283 | * non-atomically. Therefore, if get_user below is not | ||
1284 | * enough, we need to handle the fault ourselves, while | ||
1285 | * still holding the mmap_sem. | ||
1286 | */ | ||
1287 | if (attempt++) { | ||
1288 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1289 | goto out_unlock_release_sem; | ||
1290 | |||
1291 | goto retry_locked; | ||
1292 | } | ||
1293 | |||
1294 | queue_unlock(&q, hb); | ||
1295 | up_read(&curr->mm->mmap_sem); | ||
1296 | |||
1297 | ret = get_user(uval, uaddr); | ||
1298 | if (!ret && (uval != -EFAULT)) | ||
1299 | goto retry; | ||
1300 | |||
1301 | return ret; | ||
1302 | } | ||
1303 | |||
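The fixup loop above installs a new owner TID in the futex word with a cmpxchg retry so the FUTEX_OWNER_DIED bit is never lost. A minimal user-space sketch of the same loop, using GCC __atomic builtins; the helper name is illustrative, the flag values mirror <linux/futex.h>:

	#include <stdint.h>

	#define FUTEX_WAITERS		0x80000000
	#define FUTEX_OWNER_DIED	0x40000000

	static void fixup_owner_tid(uint32_t *futex_word, uint32_t tid)
	{
		uint32_t uval = __atomic_load_n(futex_word, __ATOMIC_RELAXED);
		uint32_t newtid = tid | FUTEX_WAITERS;

		for (;;) {
			/* Keep the owner-died bit, replace the TID part. */
			uint32_t newval = (uval & FUTEX_OWNER_DIED) | newtid;

			/* On failure, uval is refreshed with the current value. */
			if (__atomic_compare_exchange_n(futex_word, &uval, newval, 0,
							__ATOMIC_ACQ_REL, __ATOMIC_RELAXED))
				break;
		}
	}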
1304 | /* | ||
1305 | * Restart handler | ||
1306 | */ | ||
1307 | static long futex_lock_pi_restart(struct restart_block *restart) | ||
1308 | { | ||
1309 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1310 | int ret; | ||
1311 | |||
1312 | restart->fn = do_no_restart_syscall; | ||
1313 | |||
1314 | if (restart->arg2 || restart->arg3) { | ||
1315 | to = &timeout; | ||
1316 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1317 | hrtimer_init_sleeper(to, current); | ||
1318 | to->timer.expires.tv64 = ((u64)restart->arg3 << 32) | | ||
1319 | (u64) restart->arg2; | ||
1320 | } | ||
1321 | |||
1322 | pr_debug("lock_pi restart: %p, %d (%d)\n", | ||
1323 | (u32 __user *)restart->arg0, current->pid); | ||
1324 | |||
1325 | ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, | ||
1326 | 0, to); | ||
1327 | |||
1328 | if (ret != -EINTR) | ||
1329 | return ret; | ||
1330 | |||
1331 | restart->fn = futex_lock_pi_restart; | ||
1332 | |||
1333 | /* The other values are filled in */ | ||
1334 | return -ERESTART_RESTARTBLOCK; | ||
1335 | } | ||
1336 | |||
1337 | /* | ||
1338 | * Called from the syscall entry below. | ||
1339 | */ | ||
1340 | static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | ||
1341 | long nsec, int trylock) | ||
1342 | { | ||
1343 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1344 | struct restart_block *restart; | ||
1345 | int ret; | ||
1346 | |||
1347 | if (sec != MAX_SCHEDULE_TIMEOUT) { | ||
1348 | to = &timeout; | ||
1349 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1350 | hrtimer_init_sleeper(to, current); | ||
1351 | to->timer.expires = ktime_set(sec, nsec); | ||
1352 | } | ||
1353 | |||
1354 | ret = do_futex_lock_pi(uaddr, detect, trylock, to); | ||
1355 | |||
1356 | if (ret != -EINTR) | ||
1357 | return ret; | ||
1358 | |||
1359 | pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); | ||
1360 | |||
1361 | restart = ¤t_thread_info()->restart_block; | ||
1362 | restart->fn = futex_lock_pi_restart; | ||
1363 | restart->arg0 = (unsigned long) uaddr; | ||
1364 | restart->arg1 = detect; | ||
1365 | if (to) { | ||
1366 | restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; | ||
1367 | restart->arg3 = to->timer.expires.tv64 >> 32; | ||
1368 | } else | ||
1369 | restart->arg2 = restart->arg3 = 0; | ||
1370 | |||
1371 | return -ERESTART_RESTARTBLOCK; | ||
1372 | } | ||
1373 | |||
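futex_lock_pi() above stashes the absolute 64-bit expiry in restart->arg2 (low half) and restart->arg3 (high half), and the restart handler reassembles it. A tiny sketch of that pack/unpack, with illustrative names:

	#include <stdint.h>

	/* Store a 64-bit expiry in two unsigned long restart slots. */
	static inline void pack_expiry(uint64_t expiry,
				       unsigned long *lo, unsigned long *hi)
	{
		*lo = (unsigned long)(expiry & 0xFFFFFFFFULL);
		*hi = (unsigned long)(expiry >> 32);
	}

	static inline uint64_t unpack_expiry(unsigned long lo, unsigned long hi)
	{
		return ((uint64_t)hi << 32) | (uint64_t)(uint32_t)lo;
	}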
1374 | /* | ||
1375 | * Userspace attempted a TID -> 0 atomic transition, and failed. | ||
1376 | * This is the in-kernel slowpath: we look up the PI state (if any), | ||
1377 | * and do the rt-mutex unlock. | ||
1378 | */ | ||
1379 | static int futex_unlock_pi(u32 __user *uaddr) | ||
1380 | { | ||
1381 | struct futex_hash_bucket *hb; | ||
1382 | struct futex_q *this, *next; | ||
1383 | u32 uval; | ||
1384 | struct list_head *head; | ||
1385 | union futex_key key; | ||
1386 | int ret, attempt = 0; | ||
1387 | |||
1388 | retry: | ||
1389 | if (get_user(uval, uaddr)) | ||
1390 | return -EFAULT; | ||
1391 | /* | ||
1392 | * We release only a lock we actually own: | ||
1393 | */ | ||
1394 | if ((uval & FUTEX_TID_MASK) != current->pid) | ||
1395 | return -EPERM; | ||
1396 | /* | ||
1397 | * First take all the futex related locks: | ||
1398 | */ | ||
1399 | down_read(¤t->mm->mmap_sem); | ||
1400 | |||
1401 | ret = get_futex_key(uaddr, &key); | ||
1402 | if (unlikely(ret != 0)) | ||
1403 | goto out; | ||
1404 | |||
1405 | hb = hash_futex(&key); | ||
1406 | spin_lock(&hb->lock); | ||
1407 | |||
1408 | retry_locked: | ||
1409 | /* | ||
1410 | * To avoid races, try to do the TID -> 0 atomic transition | ||
1411 | * again. If it succeeds then we can return without waking | ||
1412 | * anyone else up: | ||
1413 | */ | ||
1414 | inc_preempt_count(); | ||
1415 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | ||
1416 | dec_preempt_count(); | ||
1417 | |||
1418 | if (unlikely(uval == -EFAULT)) | ||
1419 | goto pi_faulted; | ||
1420 | /* | ||
1421 | * Rare case: we managed to release the lock atomically, | ||
1422 | * no need to wake anyone else up: | ||
1423 | */ | ||
1424 | if (unlikely(uval == current->pid)) | ||
1425 | goto out_unlock; | ||
1426 | |||
1427 | /* | ||
1428 | * Ok, other tasks may need to be woken up - check waiters | ||
1429 | * and do the wakeup if necessary: | ||
1430 | */ | ||
1431 | head = &hb->chain; | ||
1432 | |||
1433 | list_for_each_entry_safe(this, next, head, list) { | ||
1434 | if (!match_futex (&this->key, &key)) | ||
1435 | continue; | ||
1436 | ret = wake_futex_pi(uaddr, uval, this); | ||
1437 | /* | ||
1438 | * The atomic access to the futex value | ||
1439 | * generated a pagefault, so retry the | ||
1440 | * user-access and the wakeup: | ||
1441 | */ | ||
1442 | if (ret == -EFAULT) | ||
1443 | goto pi_faulted; | ||
1444 | goto out_unlock; | ||
1445 | } | ||
1446 | /* | ||
1447 | * No waiters - kernel unlocks the futex: | ||
1448 | */ | ||
1449 | ret = unlock_futex_pi(uaddr, uval); | ||
1450 | if (ret == -EFAULT) | ||
1451 | goto pi_faulted; | ||
1452 | |||
1453 | out_unlock: | ||
1454 | spin_unlock(&hb->lock); | ||
1455 | out: | ||
728 | up_read(¤t->mm->mmap_sem); | 1456 | up_read(¤t->mm->mmap_sem); |
1457 | |||
1458 | return ret; | ||
1459 | |||
1460 | pi_faulted: | ||
1461 | /* | ||
1462 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1463 | * non-atomically. Therefore, if get_user below is not | ||
1464 | * enough, we need to handle the fault ourselves, while | ||
1465 | * still holding the mmap_sem. | ||
1466 | */ | ||
1467 | if (attempt++) { | ||
1468 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1469 | goto out_unlock; | ||
1470 | |||
1471 | goto retry_locked; | ||
1472 | } | ||
1473 | |||
1474 | spin_unlock(&hb->lock); | ||
1475 | up_read(¤t->mm->mmap_sem); | ||
1476 | |||
1477 | ret = get_user(uval, uaddr); | ||
1478 | if (!ret && (uval != -EFAULT)) | ||
1479 | goto retry; | ||
1480 | |||
729 | return ret; | 1481 | return ret; |
730 | } | 1482 | } |
731 | 1483 | ||
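For context, a hedged user-space sketch of the fast/slow path that futex_lock_pi() and futex_unlock_pi() back up: the lock word makes the 0 -> TID and TID -> 0 transitions with a plain compare-and-swap, and only contended transitions enter the kernel. The syscall wrapper is an assumption; the op numbers are those introduced by this patch:

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	#ifndef FUTEX_LOCK_PI
	# define FUTEX_LOCK_PI		6
	# define FUTEX_UNLOCK_PI	7
	#endif

	static long sys_futex(uint32_t *uaddr, int op, uint32_t val, void *timeout)
	{
		return syscall(SYS_futex, uaddr, op, val, timeout, NULL, 0);
	}

	static void pi_lock(uint32_t *futex, uint32_t tid)
	{
		uint32_t expected = 0;

		/* Fast path: 0 -> TID in user space, no kernel entry. */
		if (__atomic_compare_exchange_n(futex, &expected, tid, 0,
						__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
			return;

		/* Contended: the kernel queues us and applies PI boosting. */
		sys_futex(futex, FUTEX_LOCK_PI, 0, NULL);
	}

	static void pi_unlock(uint32_t *futex, uint32_t tid)
	{
		uint32_t expected = tid;

		/* Fast path: TID -> 0, only succeeds if no flag bits are set. */
		if (__atomic_compare_exchange_n(futex, &expected, 0, 0,
						__ATOMIC_RELEASE, __ATOMIC_RELAXED))
			return;

		/* Waiters (or owner-died handling) present: kernel slow path. */
		sys_futex(futex, FUTEX_UNLOCK_PI, 0, NULL);
	}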
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp) | |||
735 | 1487 | ||
736 | unqueue_me(q); | 1488 | unqueue_me(q); |
737 | kfree(q); | 1489 | kfree(q); |
1490 | |||
738 | return 0; | 1491 | return 0; |
739 | } | 1492 | } |
740 | 1493 | ||
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = { | |||
766 | * Signal allows caller to avoid the race which would occur if they | 1519 | * Signal allows caller to avoid the race which would occur if they |
767 | * set the sigio stuff up afterwards. | 1520 | * set the sigio stuff up afterwards. |
768 | */ | 1521 | */ |
769 | static int futex_fd(unsigned long uaddr, int signal) | 1522 | static int futex_fd(u32 __user *uaddr, int signal) |
770 | { | 1523 | { |
771 | struct futex_q *q; | 1524 | struct futex_q *q; |
772 | struct file *filp; | 1525 | struct file *filp; |
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
803 | err = -ENOMEM; | 1556 | err = -ENOMEM; |
804 | goto error; | 1557 | goto error; |
805 | } | 1558 | } |
1559 | q->pi_state = NULL; | ||
806 | 1560 | ||
807 | down_read(¤t->mm->mmap_sem); | 1561 | down_read(¤t->mm->mmap_sem); |
808 | err = get_futex_key(uaddr, &q->key); | 1562 | err = get_futex_key(uaddr, &q->key); |
@@ -840,7 +1594,7 @@ error: | |||
840 | * Implementation: user-space maintains a per-thread list of locks it | 1594 | * Implementation: user-space maintains a per-thread list of locks it |
841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 1595 | * is holding. Upon do_exit(), the kernel carefully walks this list, |
842 | * and marks all locks that are owned by this thread with the | 1596 | * and marks all locks that are owned by this thread with the |
843 | * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is | 1597 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is |
844 | * always manipulated with the lock held, so the list is private and | 1598 | * always manipulated with the lock held, so the list is private and |
845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 1599 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' |
846 | * field, to allow the kernel to clean up if the thread dies after | 1600 | * field, to allow the kernel to clean up if the thread dies after |
@@ -915,7 +1669,7 @@ err_unlock: | |||
915 | */ | 1669 | */ |
916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | 1670 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) |
917 | { | 1671 | { |
918 | u32 uval; | 1672 | u32 uval, nval; |
919 | 1673 | ||
920 | retry: | 1674 | retry: |
921 | if (get_user(uval, uaddr)) | 1675 | if (get_user(uval, uaddr)) |
@@ -932,12 +1686,16 @@ retry: | |||
932 | * thread-death.) The rest of the cleanup is done in | 1686 | * thread-death.) The rest of the cleanup is done in |
933 | * userspace. | 1687 | * userspace. |
934 | */ | 1688 | */ |
935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | 1689 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, |
936 | uval | FUTEX_OWNER_DIED) != uval) | 1690 | uval | FUTEX_OWNER_DIED); |
1691 | if (nval == -EFAULT) | ||
1692 | return -1; | ||
1693 | |||
1694 | if (nval != uval) | ||
937 | goto retry; | 1695 | goto retry; |
938 | 1696 | ||
939 | if (uval & FUTEX_WAITERS) | 1697 | if (uval & FUTEX_WAITERS) |
940 | futex_wake((unsigned long)uaddr, 1); | 1698 | futex_wake(uaddr, 1); |
941 | } | 1699 | } |
942 | return 0; | 1700 | return 0; |
943 | } | 1701 | } |
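handle_futex_death() above is fed by a per-thread list that user space registers once; a minimal sketch of that registration, with layouts as in <linux/futex.h> (the helper name is illustrative, and a real thread library registers its own list):

	#include <stddef.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	static struct robust_list_head head = {
		.list            = { .next = &head.list },  /* empty circular list */
		.futex_offset    = 0,     /* offset from a list entry to its futex word */
		.list_op_pending = NULL,  /* lock currently being acquired/released */
	};

	static int register_robust_list(void)
	{
		/* exit_robust_list() walks this list when the thread dies. */
		return syscall(SYS_set_robust_list, &head, sizeof(head));
	}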
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr) | |||
978 | while (entry != &head->list) { | 1736 | while (entry != &head->list) { |
979 | /* | 1737 | /* |
980 | * A pending lock might already be on the list, so | 1738 | * A pending lock might already be on the list, so |
981 | * dont process it twice: | 1739 | * don't process it twice: |
982 | */ | 1740 | */ |
983 | if (entry != pending) | 1741 | if (entry != pending) |
984 | if (handle_futex_death((void *)entry + futex_offset, | 1742 | if (handle_futex_death((void *)entry + futex_offset, |
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr) | |||
999 | } | 1757 | } |
1000 | } | 1758 | } |
1001 | 1759 | ||
1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1760 | long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, |
1003 | unsigned long uaddr2, int val2, int val3) | 1761 | u32 __user *uaddr2, u32 val2, u32 val3) |
1004 | { | 1762 | { |
1005 | int ret; | 1763 | int ret; |
1006 | 1764 | ||
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1024 | case FUTEX_WAKE_OP: | 1782 | case FUTEX_WAKE_OP: |
1025 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 1783 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); |
1026 | break; | 1784 | break; |
1785 | case FUTEX_LOCK_PI: | ||
1786 | ret = futex_lock_pi(uaddr, val, timeout, val2, 0); | ||
1787 | break; | ||
1788 | case FUTEX_UNLOCK_PI: | ||
1789 | ret = futex_unlock_pi(uaddr); | ||
1790 | break; | ||
1791 | case FUTEX_TRYLOCK_PI: | ||
1792 | ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); | ||
1793 | break; | ||
1027 | default: | 1794 | default: |
1028 | ret = -ENOSYS; | 1795 | ret = -ENOSYS; |
1029 | } | 1796 | } |
@@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1031 | } | 1798 | } |
1032 | 1799 | ||
1033 | 1800 | ||
1034 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | 1801 | asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, |
1035 | struct timespec __user *utime, u32 __user *uaddr2, | 1802 | struct timespec __user *utime, u32 __user *uaddr2, |
1036 | int val3) | 1803 | u32 val3) |
1037 | { | 1804 | { |
1038 | struct timespec t; | 1805 | struct timespec t; |
1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1806 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
1040 | int val2 = 0; | 1807 | u32 val2 = 0; |
1041 | 1808 | ||
1042 | if (utime && (op == FUTEX_WAIT)) { | 1809 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1810 | if (copy_from_user(&t, utime, sizeof(t)) != 0) |
1044 | return -EFAULT; | 1811 | return -EFAULT; |
1045 | if (!timespec_valid(&t)) | 1812 | if (!timespec_valid(&t)) |
1046 | return -EINVAL; | 1813 | return -EINVAL; |
1047 | timeout = timespec_to_jiffies(&t) + 1; | 1814 | if (op == FUTEX_WAIT) |
1815 | timeout = timespec_to_jiffies(&t) + 1; | ||
1816 | else { | ||
1817 | timeout = t.tv_sec; | ||
1818 | val2 = t.tv_nsec; | ||
1819 | } | ||
1048 | } | 1820 | } |
1049 | /* | 1821 | /* |
1050 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 1822 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. |
1051 | */ | 1823 | */ |
1052 | if (op >= FUTEX_REQUEUE) | 1824 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
1053 | val2 = (int) (unsigned long) utime; | 1825 | val2 = (u32) (unsigned long) utime; |
1054 | 1826 | ||
1055 | return do_futex((unsigned long)uaddr, op, val, timeout, | 1827 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
1056 | (unsigned long)uaddr2, val2, val3); | ||
1057 | } | 1828 | } |
1058 | 1829 | ||
1059 | static int futexfs_get_sb(struct file_system_type *fs_type, | 1830 | static int futexfs_get_sb(struct file_system_type *fs_type, |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d14..d1d92b441fb7 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | |||
129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
130 | int val2 = 0; | 130 | int val2 = 0; |
131 | 131 | ||
132 | if (utime && (op == FUTEX_WAIT)) { | 132 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
133 | if (get_compat_timespec(&t, utime)) | 133 | if (get_compat_timespec(&t, utime)) |
134 | return -EFAULT; | 134 | return -EFAULT; |
135 | if (!timespec_valid(&t)) | 135 | if (!timespec_valid(&t)) |
136 | return -EINVAL; | 136 | return -EINVAL; |
137 | timeout = timespec_to_jiffies(&t) + 1; | 137 | if (op == FUTEX_WAIT) |
138 | timeout = timespec_to_jiffies(&t) + 1; | ||
139 | else { | ||
140 | timeout = t.tv_sec; | ||
141 | val2 = t.tv_nsec; | ||
142 | } | ||
138 | } | 143 | } |
139 | if (op >= FUTEX_REQUEUE) | 144 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
140 | val2 = (int) (unsigned long) utime; | 145 | val2 = (int) (unsigned long) utime; |
141 | 146 | ||
142 | return do_futex((unsigned long)uaddr, op, val, timeout, | 147 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
143 | (unsigned long)uaddr2, val2, val3); | ||
144 | } | 148 | } |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 55601b3ce60e..8d3dc29ef41a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu) | |||
833 | } | 833 | } |
834 | #endif /* CONFIG_HOTPLUG_CPU */ | 834 | #endif /* CONFIG_HOTPLUG_CPU */ |
835 | 835 | ||
836 | static int hrtimer_cpu_notify(struct notifier_block *self, | 836 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, |
837 | unsigned long action, void *hcpu) | 837 | unsigned long action, void *hcpu) |
838 | { | 838 | { |
839 | long cpu = (long)hcpu; | 839 | long cpu = (long)hcpu; |
@@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
857 | return NOTIFY_OK; | 857 | return NOTIFY_OK; |
858 | } | 858 | } |
859 | 859 | ||
860 | static struct notifier_block hrtimers_nb = { | 860 | static struct notifier_block __devinitdata hrtimers_nb = { |
861 | .notifier_call = hrtimer_cpu_notify, | 861 | .notifier_call = hrtimer_cpu_notify, |
862 | }; | 862 | }; |
863 | 863 | ||
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 036b6285b15c..e38e4bac97ca 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/poison.h> | ||
19 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
21 | #include <linux/interrupt.h> | 22 | #include <linux/interrupt.h> |
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock, | |||
381 | 382 | ||
382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | 383 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) |
383 | { | 384 | { |
384 | memset(waiter, 0x11, sizeof(*waiter)); | 385 | memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); |
385 | waiter->magic = waiter; | 386 | waiter->magic = waiter; |
386 | INIT_LIST_HEAD(&waiter->list); | 387 | INIT_LIST_HEAD(&waiter->list); |
387 | } | 388 | } |
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | |||
397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | 398 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) |
398 | { | 399 | { |
399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | 400 | DEBUG_WARN_ON(!list_empty(&waiter->list)); |
400 | memset(waiter, 0x22, sizeof(*waiter)); | 401 | memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); |
401 | } | 402 | } |
402 | 403 | ||
403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 404 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
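The two memsets above stamp a waiter with recognizable poison bytes so that a stale pointer shows an obvious pattern in a crash dump; <linux/poison.h> now gives those bytes names. A stand-alone sketch of the idea (constant and struct names are illustrative):

	#include <string.h>

	#define DEBUG_POISON_INIT 0x11
	#define DEBUG_POISON_FREE 0x22

	struct waiter_dbg {
		void *magic;
		char payload[32];
	};

	static void waiter_init(struct waiter_dbg *w)
	{
		memset(w, DEBUG_POISON_INIT, sizeof(*w));   /* catch reads before setup */
		w->magic = w;                               /* self-pointer sanity check */
	}

	static void waiter_free(struct waiter_dbg *w)
	{
		memset(w, DEBUG_POISON_FREE, sizeof(*w));   /* catch use-after-free */
	}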
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index fc311a4673a2..857b4fa09124 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -38,13 +38,22 @@ config PM_DEBUG | |||
38 | 38 | ||
39 | config PM_TRACE | 39 | config PM_TRACE |
40 | bool "Suspend/resume event tracing" | 40 | bool "Suspend/resume event tracing" |
41 | depends on PM && PM_DEBUG && X86_32 | 41 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL |
42 | default y | 42 | default n |
43 | ---help--- | 43 | ---help--- |
44 | This enables some cheesy code to save the last PM event point in the | 44 | This enables some cheesy code to save the last PM event point in the |
45 | RTC across reboots, so that you can debug a machine that just hangs | 45 | RTC across reboots, so that you can debug a machine that just hangs |
46 | during suspend (or more commonly, during resume). | 46 | during suspend (or more commonly, during resume). |
47 | 47 | ||
48 | To use this debugging feature you should attempt to suspend the machine, | ||
49 | then reboot it, then run | ||
50 | |||
51 | dmesg -s 1000000 | grep 'hash matches' | ||
52 | |||
53 | CAUTION: this option will cause your machine's real-time clock to be | ||
54 | set to an invalid time after a resume. | ||
55 | |||
56 | |||
48 | config SOFTWARE_SUSPEND | 57 | config SOFTWARE_SUSPEND |
49 | bool "Software Suspend" | 58 | bool "Software Suspend" |
50 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) | 59 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) |
diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e507..5a730fdb1a2c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -299,7 +299,7 @@ out: | |||
299 | } | 299 | } |
300 | 300 | ||
301 | #ifdef CONFIG_HOTPLUG_CPU | 301 | #ifdef CONFIG_HOTPLUG_CPU |
302 | static int profile_cpu_callback(struct notifier_block *info, | 302 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
303 | unsigned long action, void *__cpu) | 303 | unsigned long action, void *__cpu) |
304 | { | 304 | { |
305 | int node, cpu = (unsigned long)__cpu; | 305 | int node, cpu = (unsigned long)__cpu; |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 20e9710fc21c..f464f5ae3f11 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -182,6 +182,15 @@ long rcu_batches_completed(void) | |||
182 | return rcu_ctrlblk.completed; | 182 | return rcu_ctrlblk.completed; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* | ||
186 | * Return the number of RCU batches processed thus far. Useful | ||
187 | * for debug and statistics. | ||
188 | */ | ||
189 | long rcu_batches_completed_bh(void) | ||
190 | { | ||
191 | return rcu_bh_ctrlblk.completed; | ||
192 | } | ||
193 | |||
185 | static void rcu_barrier_callback(struct rcu_head *notused) | 194 | static void rcu_barrier_callback(struct rcu_head *notused) |
186 | { | 195 | { |
187 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 196 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) | |||
539 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | 548 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); |
540 | } | 549 | } |
541 | 550 | ||
542 | static int rcu_cpu_notify(struct notifier_block *self, | 551 | static int __devinit rcu_cpu_notify(struct notifier_block *self, |
543 | unsigned long action, void *hcpu) | 552 | unsigned long action, void *hcpu) |
544 | { | 553 | { |
545 | long cpu = (long)hcpu; | 554 | long cpu = (long)hcpu; |
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
556 | return NOTIFY_OK; | 565 | return NOTIFY_OK; |
557 | } | 566 | } |
558 | 567 | ||
559 | static struct notifier_block rcu_nb = { | 568 | static struct notifier_block __devinitdata rcu_nb = { |
560 | .notifier_call = rcu_cpu_notify, | 569 | .notifier_call = rcu_cpu_notify, |
561 | }; | 570 | }; |
562 | 571 | ||
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0); | |||
619 | module_param(rsinterval, int, 0); | 628 | module_param(rsinterval, int, 0); |
620 | #endif | 629 | #endif |
621 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 630 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
631 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
622 | EXPORT_SYMBOL_GPL(call_rcu); | 632 | EXPORT_SYMBOL_GPL(call_rcu); |
623 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 633 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
624 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 634 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d12..4d1c3d247127 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update module-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ | |||
53 | static int verbose; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ |
56 | static char *torture_type = "rcu"; /* What to torture. */ | ||
56 | 57 | ||
57 | module_param(nreaders, int, 0); | 58 | module_param(nreaders, int, 0); |
58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 59 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); | |||
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 65 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
65 | module_param(shuffle_interval, int, 0); | 66 | module_param(shuffle_interval, int, 0); |
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 67 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
67 | #define TORTURE_FLAG "rcutorture: " | 68 | module_param(torture_type, charp, 0); |
69 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); | ||
70 | |||
71 | #define TORTURE_FLAG "-torture:" | ||
68 | #define PRINTK_STRING(s) \ | 72 | #define PRINTK_STRING(s) \ |
69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 73 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
70 | #define VERBOSE_PRINTK_STRING(s) \ | 74 | #define VERBOSE_PRINTK_STRING(s) \ |
71 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 75 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
72 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 76 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
73 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 77 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
74 | 78 | ||
75 | static char printk_buf[4096]; | 79 | static char printk_buf[4096]; |
76 | 80 | ||
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) | |||
139 | spin_unlock_bh(&rcu_torture_lock); | 143 | spin_unlock_bh(&rcu_torture_lock); |
140 | } | 144 | } |
141 | 145 | ||
142 | static void | ||
143 | rcu_torture_cb(struct rcu_head *p) | ||
144 | { | ||
145 | int i; | ||
146 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
147 | |||
148 | if (fullstop) { | ||
149 | /* Test is ending, just drop callbacks on the floor. */ | ||
150 | /* The next initialization will pick up the pieces. */ | ||
151 | return; | ||
152 | } | ||
153 | i = rp->rtort_pipe_count; | ||
154 | if (i > RCU_TORTURE_PIPE_LEN) | ||
155 | i = RCU_TORTURE_PIPE_LEN; | ||
156 | atomic_inc(&rcu_torture_wcount[i]); | ||
157 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
158 | rp->rtort_mbtest = 0; | ||
159 | rcu_torture_free(rp); | ||
160 | } else | ||
161 | call_rcu(p, rcu_torture_cb); | ||
162 | } | ||
163 | |||
164 | struct rcu_random_state { | 146 | struct rcu_random_state { |
165 | unsigned long rrs_state; | 147 | unsigned long rrs_state; |
166 | unsigned long rrs_count; | 148 | unsigned long rrs_count; |
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) | |||
191 | } | 173 | } |
192 | 174 | ||
193 | /* | 175 | /* |
176 | * Operations vector for selecting different types of tests. | ||
177 | */ | ||
178 | |||
179 | struct rcu_torture_ops { | ||
180 | void (*init)(void); | ||
181 | void (*cleanup)(void); | ||
182 | int (*readlock)(void); | ||
183 | void (*readunlock)(int idx); | ||
184 | int (*completed)(void); | ||
185 | void (*deferredfree)(struct rcu_torture *p); | ||
186 | int (*stats)(char *page); | ||
187 | char *name; | ||
188 | }; | ||
189 | static struct rcu_torture_ops *cur_ops = NULL; | ||
190 | |||
191 | /* | ||
192 | * Definitions for rcu torture testing. | ||
193 | */ | ||
194 | |||
195 | static int rcu_torture_read_lock(void) | ||
196 | { | ||
197 | rcu_read_lock(); | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void rcu_torture_read_unlock(int idx) | ||
202 | { | ||
203 | rcu_read_unlock(); | ||
204 | } | ||
205 | |||
206 | static int rcu_torture_completed(void) | ||
207 | { | ||
208 | return rcu_batches_completed(); | ||
209 | } | ||
210 | |||
211 | static void | ||
212 | rcu_torture_cb(struct rcu_head *p) | ||
213 | { | ||
214 | int i; | ||
215 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
216 | |||
217 | if (fullstop) { | ||
218 | /* Test is ending, just drop callbacks on the floor. */ | ||
219 | /* The next initialization will pick up the pieces. */ | ||
220 | return; | ||
221 | } | ||
222 | i = rp->rtort_pipe_count; | ||
223 | if (i > RCU_TORTURE_PIPE_LEN) | ||
224 | i = RCU_TORTURE_PIPE_LEN; | ||
225 | atomic_inc(&rcu_torture_wcount[i]); | ||
226 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
227 | rp->rtort_mbtest = 0; | ||
228 | rcu_torture_free(rp); | ||
229 | } else | ||
230 | cur_ops->deferredfree(rp); | ||
231 | } | ||
232 | |||
233 | static void rcu_torture_deferred_free(struct rcu_torture *p) | ||
234 | { | ||
235 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | ||
236 | } | ||
237 | |||
238 | static struct rcu_torture_ops rcu_ops = { | ||
239 | .init = NULL, | ||
240 | .cleanup = NULL, | ||
241 | .readlock = rcu_torture_read_lock, | ||
242 | .readunlock = rcu_torture_read_unlock, | ||
243 | .completed = rcu_torture_completed, | ||
244 | .deferredfree = rcu_torture_deferred_free, | ||
245 | .stats = NULL, | ||
246 | .name = "rcu" | ||
247 | }; | ||
248 | |||
249 | /* | ||
250 | * Definitions for rcu_bh torture testing. | ||
251 | */ | ||
252 | |||
253 | static int rcu_bh_torture_read_lock(void) | ||
254 | { | ||
255 | rcu_read_lock_bh(); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static void rcu_bh_torture_read_unlock(int idx) | ||
260 | { | ||
261 | rcu_read_unlock_bh(); | ||
262 | } | ||
263 | |||
264 | static int rcu_bh_torture_completed(void) | ||
265 | { | ||
266 | return rcu_batches_completed_bh(); | ||
267 | } | ||
268 | |||
269 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | ||
270 | { | ||
271 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | ||
272 | } | ||
273 | |||
274 | static struct rcu_torture_ops rcu_bh_ops = { | ||
275 | .init = NULL, | ||
276 | .cleanup = NULL, | ||
277 | .readlock = rcu_bh_torture_read_lock, | ||
278 | .readunlock = rcu_bh_torture_read_unlock, | ||
279 | .completed = rcu_bh_torture_completed, | ||
280 | .deferredfree = rcu_bh_torture_deferred_free, | ||
281 | .stats = NULL, | ||
282 | .name = "rcu_bh" | ||
283 | }; | ||
284 | |||
285 | static struct rcu_torture_ops *torture_ops[] = | ||
286 | { &rcu_ops, &rcu_bh_ops, NULL }; | ||
287 | |||
288 | /* | ||
194 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 289 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
195 | * for that pointed to by rcu_torture_current, freeing the old structure | 290 | * for that pointed to by rcu_torture_current, freeing the old structure |
196 | * after a series of grace periods (the "pipeline"). | 291 | * after a series of grace periods (the "pipeline"). |
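The torture_ops[] selection above is a plain ops-vector pattern: a NULL-terminated table of named operation structures, picked by the torture_type module parameter. A stand-alone sketch of the same pattern (all names illustrative):

	#include <string.h>

	struct demo_ops {
		const char *name;
		int (*readlock)(void);
		void (*readunlock)(int idx);
	};

	static int demo_lock(void) { return 0; }
	static void demo_unlock(int idx) { (void)idx; }

	static struct demo_ops rcu_like_ops    = { "rcu",    demo_lock, demo_unlock };
	static struct demo_ops rcu_bh_like_ops = { "rcu_bh", demo_lock, demo_unlock };

	static struct demo_ops *ops_table[] = { &rcu_like_ops, &rcu_bh_like_ops, NULL };

	static struct demo_ops *select_ops(const char *type)
	{
		for (int i = 0; ops_table[i] != NULL; i++)
			if (strcmp(type, ops_table[i]->name) == 0)
				return ops_table[i];
		return NULL;	/* caller turns this into -EINVAL */
	}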
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) | |||
209 | 304 | ||
210 | do { | 305 | do { |
211 | schedule_timeout_uninterruptible(1); | 306 | schedule_timeout_uninterruptible(1); |
212 | if (rcu_batches_completed() == oldbatch) | ||
213 | continue; | ||
214 | if ((rp = rcu_torture_alloc()) == NULL) | 307 | if ((rp = rcu_torture_alloc()) == NULL) |
215 | continue; | 308 | continue; |
216 | rp->rtort_pipe_count = 0; | 309 | rp->rtort_pipe_count = 0; |
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) | |||
225 | i = RCU_TORTURE_PIPE_LEN; | 318 | i = RCU_TORTURE_PIPE_LEN; |
226 | atomic_inc(&rcu_torture_wcount[i]); | 319 | atomic_inc(&rcu_torture_wcount[i]); |
227 | old_rp->rtort_pipe_count++; | 320 | old_rp->rtort_pipe_count++; |
228 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 321 | cur_ops->deferredfree(old_rp); |
229 | } | 322 | } |
230 | rcu_torture_current_version++; | 323 | rcu_torture_current_version++; |
231 | oldbatch = rcu_batches_completed(); | 324 | oldbatch = cur_ops->completed(); |
232 | } while (!kthread_should_stop() && !fullstop); | 325 | } while (!kthread_should_stop() && !fullstop); |
233 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 326 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
234 | while (!kthread_should_stop()) | 327 | while (!kthread_should_stop()) |
@@ -246,6 +339,7 @@ static int | |||
246 | rcu_torture_reader(void *arg) | 339 | rcu_torture_reader(void *arg) |
247 | { | 340 | { |
248 | int completed; | 341 | int completed; |
342 | int idx; | ||
249 | DEFINE_RCU_RANDOM(rand); | 343 | DEFINE_RCU_RANDOM(rand); |
250 | struct rcu_torture *p; | 344 | struct rcu_torture *p; |
251 | int pipe_count; | 345 | int pipe_count; |
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) | |||
254 | set_user_nice(current, 19); | 348 | set_user_nice(current, 19); |
255 | 349 | ||
256 | do { | 350 | do { |
257 | rcu_read_lock(); | 351 | idx = cur_ops->readlock(); |
258 | completed = rcu_batches_completed(); | 352 | completed = cur_ops->completed(); |
259 | p = rcu_dereference(rcu_torture_current); | 353 | p = rcu_dereference(rcu_torture_current); |
260 | if (p == NULL) { | 354 | if (p == NULL) { |
261 | /* Wait for rcu_torture_writer to get underway */ | 355 | /* Wait for rcu_torture_writer to get underway */ |
262 | rcu_read_unlock(); | 356 | cur_ops->readunlock(idx); |
263 | schedule_timeout_interruptible(HZ); | 357 | schedule_timeout_interruptible(HZ); |
264 | continue; | 358 | continue; |
265 | } | 359 | } |
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) | |||
273 | pipe_count = RCU_TORTURE_PIPE_LEN; | 367 | pipe_count = RCU_TORTURE_PIPE_LEN; |
274 | } | 368 | } |
275 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 369 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
276 | completed = rcu_batches_completed() - completed; | 370 | completed = cur_ops->completed() - completed; |
277 | if (completed > RCU_TORTURE_PIPE_LEN) { | 371 | if (completed > RCU_TORTURE_PIPE_LEN) { |
278 | /* Should not happen, but... */ | 372 | /* Should not happen, but... */ |
279 | completed = RCU_TORTURE_PIPE_LEN; | 373 | completed = RCU_TORTURE_PIPE_LEN; |
280 | } | 374 | } |
281 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 375 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
282 | preempt_enable(); | 376 | preempt_enable(); |
283 | rcu_read_unlock(); | 377 | cur_ops->readunlock(idx); |
284 | schedule(); | 378 | schedule(); |
285 | } while (!kthread_should_stop() && !fullstop); | 379 | } while (!kthread_should_stop() && !fullstop); |
286 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 380 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page) | |||
311 | if (pipesummary[i] != 0) | 405 | if (pipesummary[i] != 0) |
312 | break; | 406 | break; |
313 | } | 407 | } |
314 | cnt += sprintf(&page[cnt], "rcutorture: "); | 408 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
315 | cnt += sprintf(&page[cnt], | 409 | cnt += sprintf(&page[cnt], |
316 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 410 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
317 | "rtmbe: %d", | 411 | "rtmbe: %d", |
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page) | |||
324 | atomic_read(&n_rcu_torture_mberror)); | 418 | atomic_read(&n_rcu_torture_mberror)); |
325 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 419 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
326 | cnt += sprintf(&page[cnt], " !!!"); | 420 | cnt += sprintf(&page[cnt], " !!!"); |
327 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 421 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
328 | if (i > 1) { | 422 | if (i > 1) { |
329 | cnt += sprintf(&page[cnt], "!!! "); | 423 | cnt += sprintf(&page[cnt], "!!! "); |
330 | atomic_inc(&n_rcu_torture_error); | 424 | atomic_inc(&n_rcu_torture_error); |
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page) | |||
332 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 426 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
333 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 427 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
334 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 428 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
335 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 429 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
336 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 430 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
337 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 431 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
338 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 432 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
339 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 433 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
340 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 434 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
341 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
342 | cnt += sprintf(&page[cnt], " %d", | 436 | cnt += sprintf(&page[cnt], " %d", |
343 | atomic_read(&rcu_torture_wcount[i])); | 437 | atomic_read(&rcu_torture_wcount[i])); |
344 | } | 438 | } |
345 | cnt += sprintf(&page[cnt], "\n"); | 439 | cnt += sprintf(&page[cnt], "\n"); |
440 | if (cur_ops->stats != NULL) | ||
441 | cnt += cur_ops->stats(&page[cnt]); | ||
346 | return cnt; | 442 | return cnt; |
347 | } | 443 | } |
348 | 444 | ||
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) | |||
444 | static inline void | 540 | static inline void |
445 | rcu_torture_print_module_parms(char *tag) | 541 | rcu_torture_print_module_parms(char *tag) |
446 | { | 542 | { |
447 | printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " | 543 | printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " |
448 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 544 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
449 | "shuffle_interval = %d\n", | 545 | "shuffle_interval = %d\n", |
450 | tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, | 546 | torture_type, tag, nrealreaders, stat_interval, verbose, |
451 | shuffle_interval); | 547 | test_no_idle_hz, shuffle_interval); |
452 | } | 548 | } |
453 | 549 | ||
454 | static void | 550 | static void |
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void) | |||
493 | rcu_barrier(); | 589 | rcu_barrier(); |
494 | 590 | ||
495 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 591 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
592 | |||
593 | if (cur_ops->cleanup != NULL) | ||
594 | cur_ops->cleanup(); | ||
496 | if (atomic_read(&n_rcu_torture_error)) | 595 | if (atomic_read(&n_rcu_torture_error)) |
497 | rcu_torture_print_module_parms("End of test: FAILURE"); | 596 | rcu_torture_print_module_parms("End of test: FAILURE"); |
498 | else | 597 | else |
@@ -508,6 +607,20 @@ rcu_torture_init(void) | |||
508 | 607 | ||
509 | /* Process args and tell the world that the torturer is on the job. */ | 608 | /* Process args and tell the world that the torturer is on the job. */ |
510 | 609 | ||
610 | for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { | ||
611 | cur_ops = torture_ops[i]; | ||
612 | if (strcmp(torture_type, cur_ops->name) == 0) { | ||
613 | break; | ||
614 | } | ||
615 | } | ||
616 | if (cur_ops == NULL) { | ||
617 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", | ||
618 | torture_type); | ||
619 | return (-EINVAL); | ||
620 | } | ||
621 | if (cur_ops->init != NULL) | ||
622 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
623 | |||
511 | if (nreaders >= 0) | 624 | if (nreaders >= 0) |
512 | nrealreaders = nreaders; | 625 | nrealreaders = nreaders; |
513 | else | 626 | else |
diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a3..2404f9b0bc47 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -232,6 +232,44 @@ int release_resource(struct resource *old) | |||
232 | 232 | ||
233 | EXPORT_SYMBOL(release_resource); | 233 | EXPORT_SYMBOL(release_resource); |
234 | 234 | ||
235 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
236 | /* | ||
237 | * Finds the lowest memory resource that exists within [res->start, res->end). | ||
238 | * The caller must specify res->start, res->end and res->flags. | ||
239 | * If found, returns 0 and overwrites res; if not found, returns -1. | ||
240 | */ | ||
241 | int find_next_system_ram(struct resource *res) | ||
242 | { | ||
243 | resource_size_t start, end; | ||
244 | struct resource *p; | ||
245 | |||
246 | BUG_ON(!res); | ||
247 | |||
248 | start = res->start; | ||
249 | end = res->end; | ||
250 | |||
251 | read_lock(&resource_lock); | ||
252 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
253 | /* system ram is just marked as IORESOURCE_MEM */ | ||
254 | if (p->flags != res->flags) | ||
255 | continue; | ||
256 | if (p->start > end) { | ||
257 | p = NULL; | ||
258 | break; | ||
259 | } | ||
260 | if (p->start >= start) | ||
261 | break; | ||
262 | } | ||
263 | read_unlock(&resource_lock); | ||
264 | if (!p) | ||
265 | return -1; | ||
266 | /* copy data */ | ||
267 | res->start = p->start; | ||
268 | res->end = p->end; | ||
269 | return 0; | ||
270 | } | ||
271 | #endif | ||
272 | |||
235 | /* | 273 | /* |
236 | * Find empty slot in the resource tree given range and alignment. | 274 | * Find empty slot in the resource tree given range and alignment. |
237 | */ | 275 | */ |
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 000000000000..4aa8a2c9f453 --- /dev/null +++ b/kernel/rtmutex-debug.c | |||
@@ -0,0 +1,513 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This code is based on the rt.c implementation in the preempt-rt tree. | ||
10 | * Portions of said code are | ||
11 | * | ||
12 | * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
13 | * Copyright (C) 2006 Esben Nielsen | ||
14 | * Copyright (C) 2006 Kihon Technologies Inc., | ||
15 | * Steven Rostedt <rostedt@goodmis.org> | ||
16 | * | ||
17 | * See rt.c in preempt-rt for proper credits and further information | ||
18 | */ | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/delay.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/spinlock.h> | ||
24 | #include <linux/kallsyms.h> | ||
25 | #include <linux/syscalls.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/plist.h> | ||
28 | #include <linux/fs.h> | ||
29 | |||
30 | #include "rtmutex_common.h" | ||
31 | |||
32 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
33 | # include "rtmutex-debug.h" | ||
34 | #else | ||
35 | # include "rtmutex.h" | ||
36 | #endif | ||
37 | |||
38 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
39 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
40 | |||
41 | # define TRACE_OFF() \ | ||
42 | do { \ | ||
43 | if (rt_trace_on) { \ | ||
44 | rt_trace_on = 0; \ | ||
45 | console_verbose(); \ | ||
46 | if (spin_is_locked(¤t->pi_lock)) \ | ||
47 | spin_unlock(¤t->pi_lock); \ | ||
48 | if (spin_is_locked(¤t->held_list_lock)) \ | ||
49 | spin_unlock(¤t->held_list_lock); \ | ||
50 | } \ | ||
51 | } while (0) | ||
52 | |||
53 | # define TRACE_OFF_NOLOCK() \ | ||
54 | do { \ | ||
55 | if (rt_trace_on) { \ | ||
56 | rt_trace_on = 0; \ | ||
57 | console_verbose(); \ | ||
58 | } \ | ||
59 | } while (0) | ||
60 | |||
61 | # define TRACE_BUG_LOCKED() \ | ||
62 | do { \ | ||
63 | TRACE_OFF(); \ | ||
64 | BUG(); \ | ||
65 | } while (0) | ||
66 | |||
67 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
68 | do { \ | ||
69 | if (unlikely(c)) { \ | ||
70 | TRACE_OFF(); \ | ||
71 | WARN_ON(1); \ | ||
72 | } \ | ||
73 | } while (0) | ||
74 | |||
75 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
76 | do { \ | ||
77 | if (unlikely(c)) \ | ||
78 | TRACE_BUG_LOCKED(); \ | ||
79 | } while (0) | ||
80 | |||
81 | #ifdef CONFIG_SMP | ||
82 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
83 | #else | ||
84 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
85 | #endif | ||
86 | |||
87 | /* | ||
88 | * deadlock detection flag. We turn it off when we detect | ||
89 | * the first problem because we don't want to recurse back | ||
90 | * into the tracing code when doing an error printk or | ||
91 | * executing a BUG(): | ||
92 | */ | ||
93 | int rt_trace_on = 1; | ||
94 | |||
95 | void deadlock_trace_off(void) | ||
96 | { | ||
97 | rt_trace_on = 0; | ||
98 | } | ||
99 | |||
100 | static void printk_task(task_t *p) | ||
101 | { | ||
102 | if (p) | ||
103 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
104 | else | ||
105 | printk("<none>"); | ||
106 | } | ||
107 | |||
108 | static void printk_task_short(task_t *p) | ||
109 | { | ||
110 | if (p) | ||
111 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
112 | else | ||
113 | printk("<none>"); | ||
114 | } | ||
115 | |||
116 | static void printk_lock(struct rt_mutex *lock, int print_owner) | ||
117 | { | ||
118 | if (lock->name) | ||
119 | printk(" [%p] {%s}\n", | ||
120 | lock, lock->name); | ||
121 | else | ||
122 | printk(" [%p] {%s:%d}\n", | ||
123 | lock, lock->file, lock->line); | ||
124 | |||
125 | if (print_owner && rt_mutex_owner(lock)) { | ||
126 | printk(".. ->owner: %p\n", lock->owner); | ||
127 | printk(".. held by: "); | ||
128 | printk_task(rt_mutex_owner(lock)); | ||
129 | printk("\n"); | ||
130 | } | ||
131 | if (rt_mutex_owner(lock)) { | ||
132 | printk("... acquired at: "); | ||
133 | print_symbol("%s\n", lock->acquire_ip); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static void printk_waiter(struct rt_mutex_waiter *w) | ||
138 | { | ||
139 | printk("-------------------------\n"); | ||
140 | printk("| waiter struct %p:\n", w); | ||
141 | printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", | ||
142 | w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, | ||
143 | w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, | ||
144 | w->list_entry.prio); | ||
145 | printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", | ||
146 | w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, | ||
147 | w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, | ||
148 | w->pi_list_entry.prio); | ||
149 | printk("\n| lock:\n"); | ||
150 | printk_lock(w->lock, 1); | ||
151 | printk("| w->ti->task:\n"); | ||
152 | printk_task(w->task); | ||
153 | printk("| blocked at: "); | ||
154 | print_symbol("%s\n", w->ip); | ||
155 | printk("-------------------------\n"); | ||
156 | } | ||
157 | |||
158 | static void show_task_locks(task_t *p) | ||
159 | { | ||
160 | switch (p->state) { | ||
161 | case TASK_RUNNING: printk("R"); break; | ||
162 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
163 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
164 | case TASK_STOPPED: printk("T"); break; | ||
165 | case EXIT_ZOMBIE: printk("Z"); break; | ||
166 | case EXIT_DEAD: printk("X"); break; | ||
167 | default: printk("?"); break; | ||
168 | } | ||
169 | printk_task(p); | ||
170 | if (p->pi_blocked_on) { | ||
171 | struct rt_mutex *lock = p->pi_blocked_on->lock; | ||
172 | |||
173 | printk(" blocked on:"); | ||
174 | printk_lock(lock, 1); | ||
175 | } else | ||
176 | printk(" (not blocked)\n"); | ||
177 | } | ||
178 | |||
179 | void rt_mutex_show_held_locks(task_t *task, int verbose) | ||
180 | { | ||
181 | struct list_head *curr, *cursor = NULL; | ||
182 | struct rt_mutex *lock; | ||
183 | task_t *t; | ||
184 | unsigned long flags; | ||
185 | int count = 0; | ||
186 | |||
187 | if (!rt_trace_on) | ||
188 | return; | ||
189 | |||
190 | if (verbose) { | ||
191 | printk("------------------------------\n"); | ||
192 | printk("| showing all locks held by: | ("); | ||
193 | printk_task_short(task); | ||
194 | printk("):\n"); | ||
195 | printk("------------------------------\n"); | ||
196 | } | ||
197 | |||
198 | next: | ||
199 | spin_lock_irqsave(&task->held_list_lock, flags); | ||
200 | list_for_each(curr, &task->held_list_head) { | ||
201 | if (cursor && curr != cursor) | ||
202 | continue; | ||
203 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
204 | t = rt_mutex_owner(lock); | ||
205 | WARN_ON(t != task); | ||
206 | count++; | ||
207 | cursor = curr->next; | ||
208 | spin_unlock_irqrestore(&task->held_list_lock, flags); | ||
209 | |||
210 | printk("\n#%03d: ", count); | ||
211 | printk_lock(lock, 0); | ||
212 | goto next; | ||
213 | } | ||
214 | spin_unlock_irqrestore(&task->held_list_lock, flags); | ||
215 | |||
216 | printk("\n"); | ||
217 | } | ||
218 | |||
219 | void rt_mutex_show_all_locks(void) | ||
220 | { | ||
221 | task_t *g, *p; | ||
222 | int count = 10; | ||
223 | int unlock = 1; | ||
224 | |||
225 | printk("\n"); | ||
226 | printk("----------------------\n"); | ||
227 | printk("| showing all tasks: |\n"); | ||
228 | printk("----------------------\n"); | ||
229 | |||
230 | /* | ||
231 | * Here we try to get the tasklist_lock as hard as possible; | ||
232 | * if not successful after 2 seconds we ignore it (but keep | ||
233 | * trying). This is to enable a debug printout even if a | ||
234 | * tasklist_lock-holding task deadlocks or crashes. | ||
235 | */ | ||
236 | retry: | ||
237 | if (!read_trylock(&tasklist_lock)) { | ||
238 | if (count == 10) | ||
239 | printk("hm, tasklist_lock locked, retrying... "); | ||
240 | if (count) { | ||
241 | count--; | ||
242 | printk(" #%d", 10-count); | ||
243 | mdelay(200); | ||
244 | goto retry; | ||
245 | } | ||
246 | printk(" ignoring it.\n"); | ||
247 | unlock = 0; | ||
248 | } | ||
249 | if (count != 10) | ||
250 | printk(" locked it.\n"); | ||
251 | |||
252 | do_each_thread(g, p) { | ||
253 | show_task_locks(p); | ||
254 | if (!unlock) | ||
255 | if (read_trylock(&tasklist_lock)) | ||
256 | unlock = 1; | ||
257 | } while_each_thread(g, p); | ||
258 | |||
259 | printk("\n"); | ||
260 | |||
261 | printk("-----------------------------------------\n"); | ||
262 | printk("| showing all locks held in the system: |\n"); | ||
263 | printk("-----------------------------------------\n"); | ||
264 | |||
265 | do_each_thread(g, p) { | ||
266 | rt_mutex_show_held_locks(p, 0); | ||
267 | if (!unlock) | ||
268 | if (read_trylock(&tasklist_lock)) | ||
269 | unlock = 1; | ||
270 | } while_each_thread(g, p); | ||
271 | |||
272 | |||
273 | printk("=============================================\n\n"); | ||
274 | |||
275 | if (unlock) | ||
276 | read_unlock(&tasklist_lock); | ||
277 | } | ||
278 | |||
279 | void rt_mutex_debug_check_no_locks_held(task_t *task) | ||
280 | { | ||
281 | struct rt_mutex_waiter *w; | ||
282 | struct list_head *curr; | ||
283 | struct rt_mutex *lock; | ||
284 | |||
285 | if (!rt_trace_on) | ||
286 | return; | ||
287 | if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { | ||
288 | printk("BUG: PI priority boost leaked!\n"); | ||
289 | printk_task(task); | ||
290 | printk("\n"); | ||
291 | } | ||
292 | if (list_empty(&task->held_list_head)) | ||
293 | return; | ||
294 | |||
295 | spin_lock(&task->pi_lock); | ||
296 | plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { | ||
297 | TRACE_OFF(); | ||
298 | |||
299 | printk("hm, PI interest held at exit time? Task:\n"); | ||
300 | printk_task(task); | ||
301 | printk_waiter(w); | ||
302 | return; | ||
303 | } | ||
304 | spin_unlock(&task->pi_lock); | ||
305 | |||
306 | list_for_each(curr, &task->held_list_head) { | ||
307 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
308 | |||
309 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
310 | task->comm, task->pid); | ||
311 | printk_lock(lock, 1); | ||
312 | if (rt_mutex_owner(lock) != task) | ||
313 | printk("exiting task is not even the owner??\n"); | ||
314 | } | ||
315 | } | ||
316 | |||
317 | int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
318 | { | ||
319 | const void *to = from + len; | ||
320 | struct list_head *curr; | ||
321 | struct rt_mutex *lock; | ||
322 | unsigned long flags; | ||
323 | void *lock_addr; | ||
324 | |||
325 | if (!rt_trace_on) | ||
326 | return 0; | ||
327 | |||
328 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
329 | list_for_each(curr, ¤t->held_list_head) { | ||
330 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
331 | lock_addr = lock; | ||
332 | if (lock_addr < from || lock_addr >= to) | ||
333 | continue; | ||
334 | TRACE_OFF(); | ||
335 | |||
336 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
337 | current->comm, current->pid, lock, from, to); | ||
338 | dump_stack(); | ||
339 | printk_lock(lock, 1); | ||
340 | if (rt_mutex_owner(lock) != current) | ||
341 | printk("freeing task is not even the owner??\n"); | ||
342 | return 1; | ||
343 | } | ||
344 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
345 | |||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | void rt_mutex_debug_task_free(struct task_struct *task) | ||
350 | { | ||
351 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | ||
352 | WARN_ON(task->pi_blocked_on); | ||
353 | } | ||
354 | |||
355 | /* | ||
356 | * We fill out the fields in the waiter to store the information about | ||
357 | * the deadlock. We print when we return. act_waiter can be NULL in | ||
358 | * case of a remove waiter operation. | ||
359 | */ | ||
360 | void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | ||
361 | struct rt_mutex *lock) | ||
362 | { | ||
363 | struct task_struct *task; | ||
364 | |||
365 | if (!rt_trace_on || detect || !act_waiter) | ||
366 | return; | ||
367 | |||
368 | task = rt_mutex_owner(act_waiter->lock); | ||
369 | if (task && task != current) { | ||
370 | act_waiter->deadlock_task_pid = task->pid; | ||
371 | act_waiter->deadlock_lock = lock; | ||
372 | } | ||
373 | } | ||
374 | |||
375 | void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | ||
376 | { | ||
377 | struct task_struct *task; | ||
378 | |||
379 | if (!waiter->deadlock_lock || !rt_trace_on) | ||
380 | return; | ||
381 | |||
382 | task = find_task_by_pid(waiter->deadlock_task_pid); | ||
383 | if (!task) | ||
384 | return; | ||
385 | |||
386 | TRACE_OFF_NOLOCK(); | ||
387 | |||
388 | printk("\n============================================\n"); | ||
389 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
390 | printk( "--------------------------------------------\n"); | ||
391 | printk("%s/%d is deadlocking current task %s/%d\n\n", | ||
392 | task->comm, task->pid, current->comm, current->pid); | ||
393 | |||
394 | printk("\n1) %s/%d is trying to acquire this lock:\n", | ||
395 | current->comm, current->pid); | ||
396 | printk_lock(waiter->lock, 1); | ||
397 | |||
398 | printk("... trying at: "); | ||
399 | print_symbol("%s\n", waiter->ip); | ||
400 | |||
401 | printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); | ||
402 | printk_lock(waiter->deadlock_lock, 1); | ||
403 | |||
404 | rt_mutex_show_held_locks(current, 1); | ||
405 | rt_mutex_show_held_locks(task, 1); | ||
406 | |||
407 | printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); | ||
408 | show_stack(task, NULL); | ||
409 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
410 | current->comm, current->pid); | ||
411 | dump_stack(); | ||
412 | rt_mutex_show_all_locks(); | ||
413 | printk("[ turning off deadlock detection." | ||
414 | "Please report this trace. ]\n\n"); | ||
415 | local_irq_disable(); | ||
416 | } | ||
417 | |||
418 | void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) | ||
419 | { | ||
420 | unsigned long flags; | ||
421 | |||
422 | if (rt_trace_on) { | ||
423 | TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); | ||
424 | |||
425 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
426 | list_add_tail(&lock->held_list_entry, ¤t->held_list_head); | ||
427 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
428 | |||
429 | lock->acquire_ip = ip; | ||
430 | } | ||
431 | } | ||
432 | |||
433 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | ||
434 | { | ||
435 | unsigned long flags; | ||
436 | |||
437 | if (rt_trace_on) { | ||
438 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | ||
439 | TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); | ||
440 | |||
441 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
442 | list_del_init(&lock->held_list_entry); | ||
443 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
448 | struct task_struct *powner __IP_DECL__) | ||
449 | { | ||
450 | unsigned long flags; | ||
451 | |||
452 | if (rt_trace_on) { | ||
453 | TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); | ||
454 | |||
455 | spin_lock_irqsave(&powner->held_list_lock, flags); | ||
456 | list_add_tail(&lock->held_list_entry, &powner->held_list_head); | ||
457 | spin_unlock_irqrestore(&powner->held_list_lock, flags); | ||
458 | |||
459 | lock->acquire_ip = ip; | ||
460 | } | ||
461 | } | ||
462 | |||
463 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | ||
464 | { | ||
465 | unsigned long flags; | ||
466 | |||
467 | if (rt_trace_on) { | ||
468 | struct task_struct *owner = rt_mutex_owner(lock); | ||
469 | |||
470 | TRACE_WARN_ON_LOCKED(!owner); | ||
471 | TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); | ||
472 | |||
473 | spin_lock_irqsave(&owner->held_list_lock, flags); | ||
474 | list_del_init(&lock->held_list_entry); | ||
475 | spin_unlock_irqrestore(&owner->held_list_lock, flags); | ||
476 | } | ||
477 | } | ||
478 | |||
479 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | ||
480 | { | ||
481 | memset(waiter, 0x11, sizeof(*waiter)); | ||
482 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
483 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
484 | } | ||
485 | |||
486 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | ||
487 | { | ||
488 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
489 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
490 | TRACE_WARN_ON(waiter->task); | ||
491 | memset(waiter, 0x22, sizeof(*waiter)); | ||
492 | } | ||
493 | |||
494 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
495 | { | ||
496 | void *addr = lock; | ||
497 | |||
498 | if (rt_trace_on) { | ||
499 | rt_mutex_debug_check_no_locks_freed(addr, | ||
500 | sizeof(struct rt_mutex)); | ||
501 | INIT_LIST_HEAD(&lock->held_list_entry); | ||
502 | lock->name = name; | ||
503 | } | ||
504 | } | ||
505 | |||
506 | void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | void rt_mutex_deadlock_account_unlock(struct task_struct *task) | ||
511 | { | ||
512 | } | ||
513 | |||
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 000000000000..7612fbc62d70 --- /dev/null +++ b/kernel/rtmutex-debug.h | |||
@@ -0,0 +1,37 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. Debug version. | ||
10 | */ | ||
11 | |||
12 | #define __IP_DECL__ , unsigned long ip | ||
13 | #define __IP__ , ip | ||
14 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
15 | |||
16 | extern void | ||
17 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); | ||
18 | extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); | ||
19 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | ||
20 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | ||
21 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | ||
22 | extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); | ||
23 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | ||
24 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
25 | struct task_struct *powner __IP_DECL__); | ||
26 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); | ||
27 | extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, | ||
28 | struct rt_mutex *lock); | ||
29 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); | ||
30 | # define debug_rt_mutex_reset_waiter(w) \ | ||
31 | do { (w)->deadlock_lock = NULL; } while (0) | ||
32 | |||
33 | static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | ||
34 | int detect) | ||
35 | { | ||
36 | return (waiter != NULL); | ||
37 | } | ||
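
The three __IP_DECL__ / __IP__ / __RET_IP__ macros above exist so the public locking API can capture its caller's return address once and thread it down to the debug helpers, which then report the real call site. A minimal user-space sketch of the same pattern (illustrative only, not part of the patch):

#include <stdio.h>

#define __IP_DECL__ , unsigned long ip
#define __IP__      , ip
#define __RET_IP__  , (unsigned long)__builtin_return_address(0)

/* Innermost debug helper: only consumes the recorded ip. */
static void debug_lock(void *lock __IP_DECL__)
{
        printf("lock %p acquired at ip=0x%lx\n", lock, ip);
}

/* Internal slow path: passes the ip straight through. */
static void do_lock(void *lock __IP_DECL__)
{
        /* ... real locking work would go here ... */
        debug_lock(lock __IP__);
}

/* Public API: records the address of its own caller. */
void api_lock(void *lock)
{
        do_lock(lock __RET_IP__);
}

int main(void)
{
        int dummy;

        api_lock(&dummy);
        return 0;
}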
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 000000000000..e82c2f848249 --- /dev/null +++ b/kernel/rtmutex-tester.c | |||
@@ -0,0 +1,440 @@ | |||
1 | /* | ||
2 | * RT-Mutex-tester: scriptable tester for rt mutexes | ||
3 | * | ||
4 | * started by Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
7 | * | ||
8 | */ | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/kthread.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/smp_lock.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/timer.h> | ||
17 | |||
18 | #include "rtmutex.h" | ||
19 | |||
20 | #define MAX_RT_TEST_THREADS 8 | ||
21 | #define MAX_RT_TEST_MUTEXES 8 | ||
22 | |||
23 | static spinlock_t rttest_lock; | ||
24 | static atomic_t rttest_event; | ||
25 | |||
26 | struct test_thread_data { | ||
27 | int opcode; | ||
28 | int opdata; | ||
29 | int mutexes[MAX_RT_TEST_MUTEXES]; | ||
30 | int bkl; | ||
31 | int event; | ||
32 | struct sys_device sysdev; | ||
33 | }; | ||
34 | |||
35 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | ||
36 | static task_t *threads[MAX_RT_TEST_THREADS]; | ||
37 | static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; | ||
38 | |||
39 | enum test_opcodes { | ||
40 | RTTEST_NOP = 0, | ||
41 | RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ | ||
42 | RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ | ||
43 | RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ | ||
44 | RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ | ||
45 | RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ | ||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | ||
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | ||
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | ||
49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | ||
50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | ||
51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | ||
53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | ||
54 | }; | ||
55 | |||
56 | static int handle_op(struct test_thread_data *td, int lockwakeup) | ||
57 | { | ||
58 | int i, id, ret = -EINVAL; | ||
59 | |||
60 | switch(td->opcode) { | ||
61 | |||
62 | case RTTEST_NOP: | ||
63 | return 0; | ||
64 | |||
65 | case RTTEST_LOCKCONT: | ||
66 | td->mutexes[td->opdata] = 1; | ||
67 | td->event = atomic_add_return(1, &rttest_event); | ||
68 | return 0; | ||
69 | |||
70 | case RTTEST_RESET: | ||
71 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { | ||
72 | if (td->mutexes[i] == 4) { | ||
73 | rt_mutex_unlock(&mutexes[i]); | ||
74 | td->mutexes[i] = 0; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | if (!lockwakeup && td->bkl == 4) { | ||
79 | unlock_kernel(); | ||
80 | td->bkl = 0; | ||
81 | } | ||
82 | return 0; | ||
83 | |||
84 | case RTTEST_RESETEVENT: | ||
85 | atomic_set(&rttest_event, 0); | ||
86 | return 0; | ||
87 | |||
88 | default: | ||
89 | if (lockwakeup) | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | switch(td->opcode) { | ||
94 | |||
95 | case RTTEST_LOCK: | ||
96 | case RTTEST_LOCKNOWAIT: | ||
97 | id = td->opdata; | ||
98 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
99 | return ret; | ||
100 | |||
101 | td->mutexes[id] = 1; | ||
102 | td->event = atomic_add_return(1, &rttest_event); | ||
103 | rt_mutex_lock(&mutexes[id]); | ||
104 | td->event = atomic_add_return(1, &rttest_event); | ||
105 | td->mutexes[id] = 4; | ||
106 | return 0; | ||
107 | |||
108 | case RTTEST_LOCKINT: | ||
109 | case RTTEST_LOCKINTNOWAIT: | ||
110 | id = td->opdata; | ||
111 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
112 | return ret; | ||
113 | |||
114 | td->mutexes[id] = 1; | ||
115 | td->event = atomic_add_return(1, &rttest_event); | ||
116 | ret = rt_mutex_lock_interruptible(&mutexes[id], 0); | ||
117 | td->event = atomic_add_return(1, &rttest_event); | ||
118 | td->mutexes[id] = ret ? 0 : 4; | ||
119 | return ret ? -EINTR : 0; | ||
120 | |||
121 | case RTTEST_UNLOCK: | ||
122 | id = td->opdata; | ||
123 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) | ||
124 | return ret; | ||
125 | |||
126 | td->event = atomic_add_return(1, &rttest_event); | ||
127 | rt_mutex_unlock(&mutexes[id]); | ||
128 | td->event = atomic_add_return(1, &rttest_event); | ||
129 | td->mutexes[id] = 0; | ||
130 | return 0; | ||
131 | |||
132 | case RTTEST_LOCKBKL: | ||
133 | if (td->bkl) | ||
134 | return 0; | ||
135 | td->bkl = 1; | ||
136 | lock_kernel(); | ||
137 | td->bkl = 4; | ||
138 | return 0; | ||
139 | |||
140 | case RTTEST_UNLOCKBKL: | ||
141 | if (td->bkl != 4) | ||
142 | break; | ||
143 | unlock_kernel(); | ||
144 | td->bkl = 0; | ||
145 | return 0; | ||
146 | |||
147 | default: | ||
148 | break; | ||
149 | } | ||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Schedule replacement for rtsem_down(). Only called for threads with | ||
155 | * PF_MUTEX_TESTER set. | ||
156 | * | ||
157 | * This allows us to have fine-grained control over the event flow. | ||
158 | * | ||
159 | */ | ||
160 | void schedule_rt_mutex_test(struct rt_mutex *mutex) | ||
161 | { | ||
162 | int tid, op, dat; | ||
163 | struct test_thread_data *td; | ||
164 | |||
165 | /* We have to lookup the task */ | ||
166 | for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { | ||
167 | if (threads[tid] == current) | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | BUG_ON(tid == MAX_RT_TEST_THREADS); | ||
172 | |||
173 | td = &thread_data[tid]; | ||
174 | |||
175 | op = td->opcode; | ||
176 | dat = td->opdata; | ||
177 | |||
178 | switch (op) { | ||
179 | case RTTEST_LOCK: | ||
180 | case RTTEST_LOCKINT: | ||
181 | case RTTEST_LOCKNOWAIT: | ||
182 | case RTTEST_LOCKINTNOWAIT: | ||
183 | if (mutex != &mutexes[dat]) | ||
184 | break; | ||
185 | |||
186 | if (td->mutexes[dat] != 1) | ||
187 | break; | ||
188 | |||
189 | td->mutexes[dat] = 2; | ||
190 | td->event = atomic_add_return(1, &rttest_event); | ||
191 | break; | ||
192 | |||
193 | case RTTEST_LOCKBKL: | ||
194 | default: | ||
195 | break; | ||
196 | } | ||
197 | |||
198 | schedule(); | ||
199 | |||
200 | |||
201 | switch (op) { | ||
202 | case RTTEST_LOCK: | ||
203 | case RTTEST_LOCKINT: | ||
204 | if (mutex != &mutexes[dat]) | ||
205 | return; | ||
206 | |||
207 | if (td->mutexes[dat] != 2) | ||
208 | return; | ||
209 | |||
210 | td->mutexes[dat] = 3; | ||
211 | td->event = atomic_add_return(1, &rttest_event); | ||
212 | break; | ||
213 | |||
214 | case RTTEST_LOCKNOWAIT: | ||
215 | case RTTEST_LOCKINTNOWAIT: | ||
216 | if (mutex != &mutexes[dat]) | ||
217 | return; | ||
218 | |||
219 | if (td->mutexes[dat] != 2) | ||
220 | return; | ||
221 | |||
222 | td->mutexes[dat] = 1; | ||
223 | td->event = atomic_add_return(1, &rttest_event); | ||
224 | return; | ||
225 | |||
226 | case RTTEST_LOCKBKL: | ||
227 | return; | ||
228 | default: | ||
229 | return; | ||
230 | } | ||
231 | |||
232 | td->opcode = 0; | ||
233 | |||
234 | for (;;) { | ||
235 | set_current_state(TASK_INTERRUPTIBLE); | ||
236 | |||
237 | if (td->opcode > 0) { | ||
238 | int ret; | ||
239 | |||
240 | set_current_state(TASK_RUNNING); | ||
241 | ret = handle_op(td, 1); | ||
242 | set_current_state(TASK_INTERRUPTIBLE); | ||
243 | if (td->opcode == RTTEST_LOCKCONT) | ||
244 | break; | ||
245 | td->opcode = ret; | ||
246 | } | ||
247 | |||
248 | /* Wait for the next command to be executed */ | ||
249 | schedule(); | ||
250 | } | ||
251 | |||
252 | /* Restore previous command and data */ | ||
253 | td->opcode = op; | ||
254 | td->opdata = dat; | ||
255 | } | ||
256 | |||
257 | static int test_func(void *data) | ||
258 | { | ||
259 | struct test_thread_data *td = data; | ||
260 | int ret; | ||
261 | |||
262 | current->flags |= PF_MUTEX_TESTER; | ||
263 | allow_signal(SIGHUP); | ||
264 | |||
265 | for(;;) { | ||
266 | |||
267 | set_current_state(TASK_INTERRUPTIBLE); | ||
268 | |||
269 | if (td->opcode > 0) { | ||
270 | set_current_state(TASK_RUNNING); | ||
271 | ret = handle_op(td, 0); | ||
272 | set_current_state(TASK_INTERRUPTIBLE); | ||
273 | td->opcode = ret; | ||
274 | } | ||
275 | |||
276 | /* Wait for the next command to be executed */ | ||
277 | schedule(); | ||
278 | |||
279 | if (signal_pending(current)) | ||
280 | flush_signals(current); | ||
281 | |||
282 | if(kthread_should_stop()) | ||
283 | break; | ||
284 | } | ||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * sysfs_test_command - interface for test commands | ||
290 | * @dev: thread reference | ||
291 | * @buf: command for actual step | ||
292 | * @count: length of buffer | ||
293 | * | ||
294 | * command syntax: | ||
295 | * | ||
296 | * opcode:data | ||
297 | */ | ||
298 | static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, | ||
299 | size_t count) | ||
300 | { | ||
301 | struct sched_param schedpar; | ||
302 | struct test_thread_data *td; | ||
303 | char cmdbuf[32]; | ||
304 | int op, dat, tid, ret; | ||
305 | |||
306 | td = container_of(dev, struct test_thread_data, sysdev); | ||
307 | tid = td->sysdev.id; | ||
308 | |||
309 | /* strings from sysfs write are not 0 terminated! */ | ||
310 | if (count >= sizeof(cmdbuf)) | ||
311 | return -EINVAL; | ||
312 | |||
313 | /* strip off \n: */ | ||
314 | if (buf[count-1] == '\n') | ||
315 | count--; | ||
316 | if (count < 1) | ||
317 | return -EINVAL; | ||
318 | |||
319 | memcpy(cmdbuf, buf, count); | ||
320 | cmdbuf[count] = 0; | ||
321 | |||
322 | if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) | ||
323 | return -EINVAL; | ||
324 | |||
325 | switch (op) { | ||
326 | case RTTEST_SCHEDOT: | ||
327 | schedpar.sched_priority = 0; | ||
328 | ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); | ||
329 | if (ret) | ||
330 | return ret; | ||
331 | set_user_nice(current, 0); | ||
332 | break; | ||
333 | |||
334 | case RTTEST_SCHEDRT: | ||
335 | schedpar.sched_priority = dat; | ||
336 | ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); | ||
337 | if (ret) | ||
338 | return ret; | ||
339 | break; | ||
340 | |||
341 | case RTTEST_SIGNAL: | ||
342 | send_sig(SIGHUP, threads[tid], 0); | ||
343 | break; | ||
344 | |||
345 | default: | ||
346 | if (td->opcode > 0) | ||
347 | return -EBUSY; | ||
348 | td->opdata = dat; | ||
349 | td->opcode = op; | ||
350 | wake_up_process(threads[tid]); | ||
351 | } | ||
352 | |||
353 | return count; | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * sysfs_test_status - sysfs interface for rt tester | ||
358 | * @dev: thread to query | ||
359 | * @buf: char buffer to be filled with thread status info | ||
360 | */ | ||
361 | static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) | ||
362 | { | ||
363 | struct test_thread_data *td; | ||
364 | char *curr = buf; | ||
365 | task_t *tsk; | ||
366 | int i; | ||
367 | |||
368 | td = container_of(dev, struct test_thread_data, sysdev); | ||
369 | tsk = threads[td->sysdev.id]; | ||
370 | |||
371 | spin_lock(&rttest_lock); | ||
372 | |||
373 | curr += sprintf(curr, | ||
374 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | ||
375 | td->opcode, td->event, tsk->state, | ||
376 | (MAX_RT_PRIO - 1) - tsk->prio, | ||
377 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | ||
378 | tsk->pi_blocked_on, td->bkl); | ||
379 | |||
380 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | ||
381 | curr += sprintf(curr, "%d", td->mutexes[i]); | ||
382 | |||
383 | spin_unlock(&rttest_lock); | ||
384 | |||
385 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | ||
386 | mutexes[td->sysdev.id].owner); | ||
387 | |||
388 | return curr - buf; | ||
389 | } | ||
390 | |||
391 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | ||
392 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | ||
393 | |||
394 | static struct sysdev_class rttest_sysclass = { | ||
395 | set_kset_name("rttest"), | ||
396 | }; | ||
397 | |||
398 | static int init_test_thread(int id) | ||
399 | { | ||
400 | thread_data[id].sysdev.cls = &rttest_sysclass; | ||
401 | thread_data[id].sysdev.id = id; | ||
402 | |||
403 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | ||
404 | if (IS_ERR(threads[id])) | ||
405 | return PTR_ERR(threads[id]); | ||
406 | |||
407 | return sysdev_register(&thread_data[id].sysdev); | ||
408 | } | ||
409 | |||
410 | static int init_rttest(void) | ||
411 | { | ||
412 | int ret, i; | ||
413 | |||
414 | spin_lock_init(&rttest_lock); | ||
415 | |||
416 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | ||
417 | rt_mutex_init(&mutexes[i]); | ||
418 | |||
419 | ret = sysdev_class_register(&rttest_sysclass); | ||
420 | if (ret) | ||
421 | return ret; | ||
422 | |||
423 | for (i = 0; i < MAX_RT_TEST_THREADS; i++) { | ||
424 | ret = init_test_thread(i); | ||
425 | if (ret) | ||
426 | break; | ||
427 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | ||
428 | if (ret) | ||
429 | break; | ||
430 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | ||
431 | if (ret) | ||
432 | break; | ||
433 | } | ||
434 | |||
435 | printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); | ||
436 | |||
437 | return ret; | ||
438 | } | ||
439 | |||
440 | device_initcall(init_rttest); | ||
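
Each tester thread exposes a writable "command" attribute (taking "opcode:data" pairs from the enum above) and a readable "status" attribute through the "rttest" sysdev class. A hedged user-space sketch of driving thread 0 follows; the /sys/devices/system/rttest/rttest<N>/ paths are an assumption based on the usual sysdev layout, not something stated in the patch:

#include <stdio.h>

/* Send "opcode:data" to tester thread 'tid' (path is assumed, see above). */
static int rttest_command(int tid, int opcode, int data)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/rttest/rttest%d/command", tid);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%d:%d\n", opcode, data);
        fclose(f);
        return 0;
}

/* Print one line of the thread's status attribute. */
static void rttest_status(int tid)
{
        char path[128], line[256];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/rttest/rttest%d/status", tid);
        f = fopen(path, "r");
        if (f && fgets(line, sizeof(line), f))
                printf("thread %d: %s", tid, line);
        if (f)
                fclose(f);
}

int main(void)
{
        rttest_command(0, 2, 80);       /* RTTEST_SCHEDRT, prio 80 */
        rttest_command(0, 3, 0);        /* RTTEST_LOCK on mutex 0  */
        rttest_status(0);
        rttest_command(0, 8, 0);        /* RTTEST_UNLOCK mutex 0   */
        return 0;
}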
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 000000000000..45d61016da57 --- /dev/null +++ b/kernel/rtmutex.c | |||
@@ -0,0 +1,990 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner. | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | ||
9 | * Copyright (C) 2006 Esben Nielsen | ||
10 | */ | ||
11 | #include <linux/spinlock.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/timer.h> | ||
15 | |||
16 | #include "rtmutex_common.h" | ||
17 | |||
18 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
19 | # include "rtmutex-debug.h" | ||
20 | #else | ||
21 | # include "rtmutex.h" | ||
22 | #endif | ||
23 | |||
24 | /* | ||
25 | * lock->owner state tracking: | ||
26 | * | ||
27 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | ||
28 | * are used to keep track of the "owner is pending" and "lock has | ||
29 | * waiters" state. | ||
30 | * | ||
31 | * owner bit1 bit0 | ||
32 | * NULL 0 0 lock is free (fast acquire possible) | ||
33 | * NULL 0 1 invalid state | ||
34 | * NULL 1 0 Transitional State* | ||
35 | * NULL 1 1 invalid state | ||
36 | * taskpointer 0 0 lock is held (fast release possible) | ||
37 | * taskpointer 0 1 task is pending owner | ||
38 | * taskpointer 1 0 lock is held and has waiters | ||
39 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
40 | * | ||
41 | * Pending ownership is assigned to the top (highest priority) | ||
42 | * waiter of the lock, when the lock is released. The thread is woken | ||
43 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
44 | * cleared) a competing higher priority thread can steal the lock | ||
45 | * which puts the woken up thread back on the waiters list. | ||
46 | * | ||
47 | * The fast atomic compare exchange based acquire and release is only | ||
48 | * possible when bit 0 and 1 of lock->owner are 0. | ||
49 | * | ||
50 | * (*) There is a short window where the owner can be NULL and the | ||
51 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | ||
52 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | ||
53 | * bit before looking at the lock, hence the reason this is a transitional | ||
54 | * state. | ||
55 | */ | ||
56 | |||
57 | static void | ||
58 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | ||
59 | unsigned long mask) | ||
60 | { | ||
61 | unsigned long val = (unsigned long)owner | mask; | ||
62 | |||
63 | if (rt_mutex_has_waiters(lock)) | ||
64 | val |= RT_MUTEX_HAS_WAITERS; | ||
65 | |||
66 | lock->owner = (struct task_struct *)val; | ||
67 | } | ||
68 | |||
69 | static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) | ||
70 | { | ||
71 | lock->owner = (struct task_struct *) | ||
72 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
73 | } | ||
74 | |||
75 | static void fixup_rt_mutex_waiters(struct rt_mutex *lock) | ||
76 | { | ||
77 | if (!rt_mutex_has_waiters(lock)) | ||
78 | clear_rt_mutex_waiters(lock); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * We can speed up the acquire/release, if the architecture | ||
83 | * supports cmpxchg and if there's no debugging state to be set up | ||
84 | */ | ||
85 | #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) | ||
86 | # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) | ||
87 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
88 | { | ||
89 | unsigned long owner, *p = (unsigned long *) &lock->owner; | ||
90 | |||
91 | do { | ||
92 | owner = *p; | ||
93 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | ||
94 | } | ||
95 | #else | ||
96 | # define rt_mutex_cmpxchg(l,c,n) (0) | ||
97 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
98 | { | ||
99 | lock->owner = (struct task_struct *) | ||
100 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); | ||
101 | } | ||
102 | #endif | ||
103 | |||
104 | /* | ||
105 | * Calculate task priority from the waiter list priority | ||
106 | * | ||
107 | * Return task->normal_prio when the waiter list is empty or when | ||
108 | * the waiter is not allowed to do priority boosting | ||
109 | */ | ||
110 | int rt_mutex_getprio(struct task_struct *task) | ||
111 | { | ||
112 | if (likely(!task_has_pi_waiters(task))) | ||
113 | return task->normal_prio; | ||
114 | |||
115 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | ||
116 | task->normal_prio); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Adjust the priority of a task, after its pi_waiters got modified. | ||
121 | * | ||
122 | * This can be both boosting and unboosting. task->pi_lock must be held. | ||
123 | */ | ||
124 | static void __rt_mutex_adjust_prio(struct task_struct *task) | ||
125 | { | ||
126 | int prio = rt_mutex_getprio(task); | ||
127 | |||
128 | if (task->prio != prio) | ||
129 | rt_mutex_setprio(task, prio); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Adjust task priority (undo boosting). Called from the exit path of | ||
134 | * rt_mutex_slowunlock() and rt_mutex_slowlock(). | ||
135 | * | ||
136 | * (Note: We do this outside of the protection of lock->wait_lock to | ||
137 | * allow the lock to be taken while or before we readjust the priority | ||
138 | * of task. We do not use the spin_xx_mutex() variants here as we are | ||
139 | * outside of the debug path.) | ||
140 | */ | ||
141 | static void rt_mutex_adjust_prio(struct task_struct *task) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | |||
145 | spin_lock_irqsave(&task->pi_lock, flags); | ||
146 | __rt_mutex_adjust_prio(task); | ||
147 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Max number of times we'll walk the boosting chain: | ||
152 | */ | ||
153 | int max_lock_depth = 1024; | ||
154 | |||
155 | /* | ||
156 | * Adjust the priority chain. Also used for deadlock detection. | ||
157 | * Decreases task's usage by one - may thus free the task. | ||
158 | * Returns 0 or -EDEADLK. | ||
159 | */ | ||
160 | static int rt_mutex_adjust_prio_chain(task_t *task, | ||
161 | int deadlock_detect, | ||
162 | struct rt_mutex *orig_lock, | ||
163 | struct rt_mutex_waiter *orig_waiter, | ||
164 | struct task_struct *top_task | ||
165 | __IP_DECL__) | ||
166 | { | ||
167 | struct rt_mutex *lock; | ||
168 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; | ||
169 | int detect_deadlock, ret = 0, depth = 0; | ||
170 | unsigned long flags; | ||
171 | |||
172 | detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, | ||
173 | deadlock_detect); | ||
174 | |||
175 | /* | ||
176 | * The (de)boosting is a step by step approach with a lot of | ||
177 | * pitfalls. We want this to be preemptible and we want to hold a | ||
178 | * maximum of two locks per step. So we have to check | ||
179 | * carefully whether things change under us. | ||
180 | */ | ||
181 | again: | ||
182 | if (++depth > max_lock_depth) { | ||
183 | static int prev_max; | ||
184 | |||
185 | /* | ||
186 | * Print this only once. If the admin changes the limit, | ||
187 | * print a new message when reaching the limit again. | ||
188 | */ | ||
189 | if (prev_max != max_lock_depth) { | ||
190 | prev_max = max_lock_depth; | ||
191 | printk(KERN_WARNING "Maximum lock depth %d reached " | ||
192 | "task: %s (%d)\n", max_lock_depth, | ||
193 | top_task->comm, top_task->pid); | ||
194 | } | ||
195 | put_task_struct(task); | ||
196 | |||
197 | return deadlock_detect ? -EDEADLK : 0; | ||
198 | } | ||
199 | retry: | ||
200 | /* | ||
201 | * Task cannot go away as we did a get_task() before! | ||
202 | */ | ||
203 | spin_lock_irqsave(&task->pi_lock, flags); | ||
204 | |||
205 | waiter = task->pi_blocked_on; | ||
206 | /* | ||
207 | * Check whether the end of the boosting chain has been | ||
208 | * reached or the state of the chain has changed while we | ||
209 | * dropped the locks. | ||
210 | */ | ||
211 | if (!waiter || !waiter->task) | ||
212 | goto out_unlock_pi; | ||
213 | |||
214 | if (top_waiter && (!task_has_pi_waiters(task) || | ||
215 | top_waiter != task_top_pi_waiter(task))) | ||
216 | goto out_unlock_pi; | ||
217 | |||
218 | /* | ||
219 | * When deadlock detection is off, we check whether further | ||
220 | * priority adjustment is necessary. | ||
221 | */ | ||
222 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | ||
223 | goto out_unlock_pi; | ||
224 | |||
225 | lock = waiter->lock; | ||
226 | if (!spin_trylock(&lock->wait_lock)) { | ||
227 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
228 | cpu_relax(); | ||
229 | goto retry; | ||
230 | } | ||
231 | |||
232 | /* Deadlock detection */ | ||
233 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | ||
234 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | ||
235 | spin_unlock(&lock->wait_lock); | ||
236 | ret = deadlock_detect ? -EDEADLK : 0; | ||
237 | goto out_unlock_pi; | ||
238 | } | ||
239 | |||
240 | top_waiter = rt_mutex_top_waiter(lock); | ||
241 | |||
242 | /* Requeue the waiter */ | ||
243 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
244 | waiter->list_entry.prio = task->prio; | ||
245 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
246 | |||
247 | /* Release the task */ | ||
248 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
249 | put_task_struct(task); | ||
250 | |||
251 | /* Grab the next task */ | ||
252 | task = rt_mutex_owner(lock); | ||
253 | spin_lock_irqsave(&task->pi_lock, flags); | ||
254 | |||
255 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
256 | /* Boost the owner */ | ||
257 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | ||
258 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
259 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
260 | __rt_mutex_adjust_prio(task); | ||
261 | |||
262 | } else if (top_waiter == waiter) { | ||
263 | /* Deboost the owner */ | ||
264 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | ||
265 | waiter = rt_mutex_top_waiter(lock); | ||
266 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
267 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
268 | __rt_mutex_adjust_prio(task); | ||
269 | } | ||
270 | |||
271 | get_task_struct(task); | ||
272 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
273 | |||
274 | top_waiter = rt_mutex_top_waiter(lock); | ||
275 | spin_unlock(&lock->wait_lock); | ||
276 | |||
277 | if (!detect_deadlock && waiter != top_waiter) | ||
278 | goto out_put_task; | ||
279 | |||
280 | goto again; | ||
281 | |||
282 | out_unlock_pi: | ||
283 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
284 | out_put_task: | ||
285 | put_task_struct(task); | ||
286 | return ret; | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Optimization: check if we can steal the lock from the | ||
291 | * assigned pending owner [which might not have taken the | ||
292 | * lock yet]: | ||
293 | */ | ||
294 | static inline int try_to_steal_lock(struct rt_mutex *lock) | ||
295 | { | ||
296 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
297 | struct rt_mutex_waiter *next; | ||
298 | unsigned long flags; | ||
299 | |||
300 | if (!rt_mutex_owner_pending(lock)) | ||
301 | return 0; | ||
302 | |||
303 | if (pendowner == current) | ||
304 | return 1; | ||
305 | |||
306 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
307 | if (current->prio >= pendowner->prio) { | ||
308 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Check if a waiter is enqueued on the pending owner's | ||
314 | * pi_waiters list. Remove it and readjust the pending owner's | ||
315 | * priority. | ||
316 | */ | ||
317 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
318 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
319 | return 1; | ||
320 | } | ||
321 | |||
322 | /* No chain handling, pending owner is not blocked on anything: */ | ||
323 | next = rt_mutex_top_waiter(lock); | ||
324 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
325 | __rt_mutex_adjust_prio(pendowner); | ||
326 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
327 | |||
328 | /* | ||
329 | * We are going to steal the lock and a waiter was | ||
330 | * enqueued on the pending owner's pi_waiters queue. So | ||
331 | * we have to enqueue this waiter into the | ||
332 | * current->pi_waiters list. This covers the case | ||
333 | * where current is boosted because it holds another | ||
334 | * lock and gets unboosted because the booster is | ||
335 | * interrupted; otherwise we would delay a waiter with | ||
336 | * higher priority than current->normal_prio. | ||
337 | * | ||
338 | * Note: in the rare case of a SCHED_OTHER task changing | ||
339 | * its priority and thus stealing the lock, next->task | ||
340 | * might be current: | ||
341 | */ | ||
342 | if (likely(next->task != current)) { | ||
343 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
344 | plist_add(&next->pi_list_entry, ¤t->pi_waiters); | ||
345 | __rt_mutex_adjust_prio(current); | ||
346 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
347 | } | ||
348 | return 1; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Try to take an rt-mutex | ||
353 | * | ||
354 | * This fails | ||
355 | * - when the lock has a real owner | ||
356 | * - when a different pending owner exists and has higher priority than current | ||
357 | * | ||
358 | * Must be called with lock->wait_lock held. | ||
359 | */ | ||
360 | static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) | ||
361 | { | ||
362 | /* | ||
363 | * We have to be careful here if the atomic speedups are | ||
364 | * enabled, such that, when | ||
365 | * - no other waiter is on the lock | ||
366 | * - the lock has been released since we did the cmpxchg | ||
367 | * the lock can be released or taken while we are doing the | ||
368 | * checks and marking the lock with RT_MUTEX_HAS_WAITERS. | ||
369 | * | ||
370 | * The atomic acquire/release aware variant of | ||
371 | * mark_rt_mutex_waiters uses a cmpxchg loop. After setting | ||
372 | * the WAITERS bit, the atomic release / acquire can not | ||
373 | * happen anymore and lock->wait_lock protects us from the | ||
374 | * non-atomic case. | ||
375 | * | ||
376 | * Note that this might set lock->owner = | ||
377 | * RT_MUTEX_HAS_WAITERS in the case the lock is not contended | ||
378 | * any more. This is fixed up when we take the ownership. | ||
379 | * This is the transitional state explained at the top of this file. | ||
380 | */ | ||
381 | mark_rt_mutex_waiters(lock); | ||
382 | |||
383 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | ||
384 | return 0; | ||
385 | |||
386 | /* We got the lock. */ | ||
387 | debug_rt_mutex_lock(lock __IP__); | ||
388 | |||
389 | rt_mutex_set_owner(lock, current, 0); | ||
390 | |||
391 | rt_mutex_deadlock_account_lock(lock, current); | ||
392 | |||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * Task blocks on lock. | ||
398 | * | ||
399 | * Prepare waiter and propagate pi chain | ||
400 | * | ||
401 | * This must be called with lock->wait_lock held. | ||
402 | */ | ||
403 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | ||
404 | struct rt_mutex_waiter *waiter, | ||
405 | int detect_deadlock | ||
406 | __IP_DECL__) | ||
407 | { | ||
408 | struct rt_mutex_waiter *top_waiter = waiter; | ||
409 | task_t *owner = rt_mutex_owner(lock); | ||
410 | int boost = 0, res; | ||
411 | unsigned long flags; | ||
412 | |||
413 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
414 | __rt_mutex_adjust_prio(current); | ||
415 | waiter->task = current; | ||
416 | waiter->lock = lock; | ||
417 | plist_node_init(&waiter->list_entry, current->prio); | ||
418 | plist_node_init(&waiter->pi_list_entry, current->prio); | ||
419 | |||
420 | /* Get the top priority waiter on the lock */ | ||
421 | if (rt_mutex_has_waiters(lock)) | ||
422 | top_waiter = rt_mutex_top_waiter(lock); | ||
423 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
424 | |||
425 | current->pi_blocked_on = waiter; | ||
426 | |||
427 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
428 | |||
429 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
430 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
431 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | ||
432 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | ||
433 | |||
434 | __rt_mutex_adjust_prio(owner); | ||
435 | if (owner->pi_blocked_on) { | ||
436 | boost = 1; | ||
437 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
438 | get_task_struct(owner); | ||
439 | } | ||
440 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
441 | } | ||
442 | else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { | ||
443 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
444 | if (owner->pi_blocked_on) { | ||
445 | boost = 1; | ||
446 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
447 | get_task_struct(owner); | ||
448 | } | ||
449 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
450 | } | ||
451 | if (!boost) | ||
452 | return 0; | ||
453 | |||
454 | spin_unlock(&lock->wait_lock); | ||
455 | |||
456 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | ||
457 | current __IP__); | ||
458 | |||
459 | spin_lock(&lock->wait_lock); | ||
460 | |||
461 | return res; | ||
462 | } | ||
463 | |||
464 | /* | ||
465 | * Wake up the next waiter on the lock. | ||
466 | * | ||
467 | * Remove the top waiter from the current tasks waiter list and from | ||
468 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
469 | * | ||
470 | * Called with lock->wait_lock held. | ||
471 | */ | ||
472 | static void wakeup_next_waiter(struct rt_mutex *lock) | ||
473 | { | ||
474 | struct rt_mutex_waiter *waiter; | ||
475 | struct task_struct *pendowner; | ||
476 | unsigned long flags; | ||
477 | |||
478 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
479 | |||
480 | waiter = rt_mutex_top_waiter(lock); | ||
481 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
482 | |||
483 | /* | ||
484 | * Remove it from current->pi_waiters. We do not adjust a | ||
485 | * possible priority boost right now. We execute wakeup in the | ||
486 | * boosted mode and go back to normal after releasing | ||
487 | * lock->wait_lock. | ||
488 | */ | ||
489 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | ||
490 | pendowner = waiter->task; | ||
491 | waiter->task = NULL; | ||
492 | |||
493 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | ||
494 | |||
495 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
496 | |||
497 | /* | ||
498 | * Clear the pi_blocked_on variable and enqueue a possible | ||
499 | * waiter into the pi_waiters list of the pending owner. This | ||
500 | * ensures that, if the pending owner gets unboosted, a waiter | ||
501 | * with higher priority than pending-owner->normal_prio is not | ||
502 | * left blocked on the unboosted (pending) owner. | ||
503 | */ | ||
504 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
505 | |||
506 | WARN_ON(!pendowner->pi_blocked_on); | ||
507 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
508 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
509 | |||
510 | pendowner->pi_blocked_on = NULL; | ||
511 | |||
512 | if (rt_mutex_has_waiters(lock)) { | ||
513 | struct rt_mutex_waiter *next; | ||
514 | |||
515 | next = rt_mutex_top_waiter(lock); | ||
516 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
517 | } | ||
518 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
519 | |||
520 | wake_up_process(pendowner); | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * Remove a waiter from a lock | ||
525 | * | ||
526 | * Must be called with lock->wait_lock held | ||
527 | */ | ||
528 | static void remove_waiter(struct rt_mutex *lock, | ||
529 | struct rt_mutex_waiter *waiter __IP_DECL__) | ||
530 | { | ||
531 | int first = (waiter == rt_mutex_top_waiter(lock)); | ||
532 | int boost = 0; | ||
533 | task_t *owner = rt_mutex_owner(lock); | ||
534 | unsigned long flags; | ||
535 | |||
536 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
537 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
538 | waiter->task = NULL; | ||
539 | current->pi_blocked_on = NULL; | ||
540 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
541 | |||
542 | if (first && owner != current) { | ||
543 | |||
544 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
545 | |||
546 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | ||
547 | |||
548 | if (rt_mutex_has_waiters(lock)) { | ||
549 | struct rt_mutex_waiter *next; | ||
550 | |||
551 | next = rt_mutex_top_waiter(lock); | ||
552 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | ||
553 | } | ||
554 | __rt_mutex_adjust_prio(owner); | ||
555 | |||
556 | if (owner->pi_blocked_on) { | ||
557 | boost = 1; | ||
558 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
559 | get_task_struct(owner); | ||
560 | } | ||
561 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
562 | } | ||
563 | |||
564 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
565 | |||
566 | if (!boost) | ||
567 | return; | ||
568 | |||
569 | spin_unlock(&lock->wait_lock); | ||
570 | |||
571 | rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); | ||
572 | |||
573 | spin_lock(&lock->wait_lock); | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * Recheck the pi chain, in case we got a priority setting | ||
578 | * | ||
579 | * Called from sched_setscheduler | ||
580 | */ | ||
581 | void rt_mutex_adjust_pi(struct task_struct *task) | ||
582 | { | ||
583 | struct rt_mutex_waiter *waiter; | ||
584 | unsigned long flags; | ||
585 | |||
586 | spin_lock_irqsave(&task->pi_lock, flags); | ||
587 | |||
588 | waiter = task->pi_blocked_on; | ||
589 | if (!waiter || waiter->list_entry.prio == task->prio) { | ||
590 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
591 | return; | ||
592 | } | ||
593 | |||
594 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
595 | get_task_struct(task); | ||
596 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
597 | |||
598 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); | ||
599 | } | ||
600 | |||
601 | /* | ||
602 | * Slow path lock function: | ||
603 | */ | ||
604 | static int __sched | ||
605 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
606 | struct hrtimer_sleeper *timeout, | ||
607 | int detect_deadlock __IP_DECL__) | ||
608 | { | ||
609 | struct rt_mutex_waiter waiter; | ||
610 | int ret = 0; | ||
611 | |||
612 | debug_rt_mutex_init_waiter(&waiter); | ||
613 | waiter.task = NULL; | ||
614 | |||
615 | spin_lock(&lock->wait_lock); | ||
616 | |||
617 | /* Try to acquire the lock again: */ | ||
618 | if (try_to_take_rt_mutex(lock __IP__)) { | ||
619 | spin_unlock(&lock->wait_lock); | ||
620 | return 0; | ||
621 | } | ||
622 | |||
623 | set_current_state(state); | ||
624 | |||
625 | /* Setup the timer, when timeout != NULL */ | ||
626 | if (unlikely(timeout)) | ||
627 | hrtimer_start(&timeout->timer, timeout->timer.expires, | ||
628 | HRTIMER_ABS); | ||
629 | |||
630 | for (;;) { | ||
631 | /* Try to acquire the lock: */ | ||
632 | if (try_to_take_rt_mutex(lock __IP__)) | ||
633 | break; | ||
634 | |||
635 | /* | ||
636 | * TASK_INTERRUPTIBLE checks for signals and | ||
637 | * timeout. Ignored otherwise. | ||
638 | */ | ||
639 | if (unlikely(state == TASK_INTERRUPTIBLE)) { | ||
640 | /* Signal pending? */ | ||
641 | if (signal_pending(current)) | ||
642 | ret = -EINTR; | ||
643 | if (timeout && !timeout->task) | ||
644 | ret = -ETIMEDOUT; | ||
645 | if (ret) | ||
646 | break; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * waiter.task is NULL the first time we come here and | ||
651 | * when we have been woken up by the previous owner | ||
652 | * but the lock got stolen by a higher prio task. | ||
653 | */ | ||
654 | if (!waiter.task) { | ||
655 | ret = task_blocks_on_rt_mutex(lock, &waiter, | ||
656 | detect_deadlock __IP__); | ||
657 | /* | ||
658 | * If we got woken up by the owner then start loop | ||
659 | * all over without going into schedule to try | ||
660 | * to get the lock now: | ||
661 | */ | ||
662 | if (unlikely(!waiter.task)) | ||
663 | continue; | ||
664 | |||
665 | if (unlikely(ret)) | ||
666 | break; | ||
667 | } | ||
668 | |||
669 | spin_unlock(&lock->wait_lock); | ||
670 | |||
671 | debug_rt_mutex_print_deadlock(&waiter); | ||
672 | |||
673 | if (waiter.task) | ||
674 | schedule_rt_mutex(lock); | ||
675 | |||
676 | spin_lock(&lock->wait_lock); | ||
677 | set_current_state(state); | ||
678 | } | ||
679 | |||
680 | set_current_state(TASK_RUNNING); | ||
681 | |||
682 | if (unlikely(waiter.task)) | ||
683 | remove_waiter(lock, &waiter __IP__); | ||
684 | |||
685 | /* | ||
686 | * try_to_take_rt_mutex() sets the waiter bit | ||
687 | * unconditionally. We might have to fix that up. | ||
688 | */ | ||
689 | fixup_rt_mutex_waiters(lock); | ||
690 | |||
691 | spin_unlock(&lock->wait_lock); | ||
692 | |||
693 | /* Remove pending timer: */ | ||
694 | if (unlikely(timeout)) | ||
695 | hrtimer_cancel(&timeout->timer); | ||
696 | |||
697 | /* | ||
698 | * Readjust priority, when we did not get the lock. We might | ||
699 | * have been the pending owner and boosted. Since we did not | ||
700 | * take the lock, the PI boost has to go. | ||
701 | */ | ||
702 | if (unlikely(ret)) | ||
703 | rt_mutex_adjust_prio(current); | ||
704 | |||
705 | debug_rt_mutex_free_waiter(&waiter); | ||
706 | |||
707 | return ret; | ||
708 | } | ||
709 | |||
710 | /* | ||
711 | * Slow path try-lock function: | ||
712 | */ | ||
713 | static inline int | ||
714 | rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) | ||
715 | { | ||
716 | int ret = 0; | ||
717 | |||
718 | spin_lock(&lock->wait_lock); | ||
719 | |||
720 | if (likely(rt_mutex_owner(lock) != current)) { | ||
721 | |||
722 | ret = try_to_take_rt_mutex(lock __IP__); | ||
723 | /* | ||
724 | * try_to_take_rt_mutex() sets the lock waiters | ||
725 | * bit unconditionally. Clean this up. | ||
726 | */ | ||
727 | fixup_rt_mutex_waiters(lock); | ||
728 | } | ||
729 | |||
730 | spin_unlock(&lock->wait_lock); | ||
731 | |||
732 | return ret; | ||
733 | } | ||
734 | |||
735 | /* | ||
736 | * Slow path to release a rt-mutex: | ||
737 | */ | ||
738 | static void __sched | ||
739 | rt_mutex_slowunlock(struct rt_mutex *lock) | ||
740 | { | ||
741 | spin_lock(&lock->wait_lock); | ||
742 | |||
743 | debug_rt_mutex_unlock(lock); | ||
744 | |||
745 | rt_mutex_deadlock_account_unlock(current); | ||
746 | |||
747 | if (!rt_mutex_has_waiters(lock)) { | ||
748 | lock->owner = NULL; | ||
749 | spin_unlock(&lock->wait_lock); | ||
750 | return; | ||
751 | } | ||
752 | |||
753 | wakeup_next_waiter(lock); | ||
754 | |||
755 | spin_unlock(&lock->wait_lock); | ||
756 | |||
757 | /* Undo pi boosting if necessary: */ | ||
758 | rt_mutex_adjust_prio(current); | ||
759 | } | ||
760 | |||
761 | /* | ||
762 | * debug aware fast / slowpath lock, trylock, unlock | ||
763 | * | ||
764 | * The atomic acquire/release ops are compiled away, when either the | ||
765 | * architecture does not support cmpxchg or when debugging is enabled. | ||
766 | */ | ||
767 | static inline int | ||
768 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | ||
769 | int detect_deadlock, | ||
770 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
771 | struct hrtimer_sleeper *timeout, | ||
772 | int detect_deadlock __IP_DECL__)) | ||
773 | { | ||
774 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
775 | rt_mutex_deadlock_account_lock(lock, current); | ||
776 | return 0; | ||
777 | } else | ||
778 | return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); | ||
779 | } | ||
780 | |||
781 | static inline int | ||
782 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | ||
783 | struct hrtimer_sleeper *timeout, int detect_deadlock, | ||
784 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
785 | struct hrtimer_sleeper *timeout, | ||
786 | int detect_deadlock __IP_DECL__)) | ||
787 | { | ||
788 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
789 | rt_mutex_deadlock_account_lock(lock, current); | ||
790 | return 0; | ||
791 | } else | ||
792 | return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); | ||
793 | } | ||
794 | |||
795 | static inline int | ||
796 | rt_mutex_fasttrylock(struct rt_mutex *lock, | ||
797 | int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) | ||
798 | { | ||
799 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
800 | rt_mutex_deadlock_account_lock(lock, current); | ||
801 | return 1; | ||
802 | } | ||
803 | return slowfn(lock __RET_IP__); | ||
804 | } | ||
805 | |||
806 | static inline void | ||
807 | rt_mutex_fastunlock(struct rt_mutex *lock, | ||
808 | void (*slowfn)(struct rt_mutex *lock)) | ||
809 | { | ||
810 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) | ||
811 | rt_mutex_deadlock_account_unlock(current); | ||
812 | else | ||
813 | slowfn(lock); | ||
814 | } | ||
815 | |||
816 | /** | ||
817 | * rt_mutex_lock - lock a rt_mutex | ||
818 | * | ||
819 | * @lock: the rt_mutex to be locked | ||
820 | */ | ||
821 | void __sched rt_mutex_lock(struct rt_mutex *lock) | ||
822 | { | ||
823 | might_sleep(); | ||
824 | |||
825 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); | ||
826 | } | ||
827 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | ||
828 | |||
829 | /** | ||
830 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptible | ||
831 | * | ||
832 | * @lock: the rt_mutex to be locked | ||
833 | * @detect_deadlock: deadlock detection on/off | ||
834 | * | ||
835 | * Returns: | ||
836 | * 0 on success | ||
837 | * -EINTR when interrupted by a signal | ||
838 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
839 | */ | ||
840 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | ||
841 | int detect_deadlock) | ||
842 | { | ||
843 | might_sleep(); | ||
844 | |||
845 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, | ||
846 | detect_deadlock, rt_mutex_slowlock); | ||
847 | } | ||
848 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | ||
849 | |||
850 | /** | ||
851 | * rt_mutex_timed_lock - lock a rt_mutex interruptibly; the | ||
852 | * timeout structure is provided | ||
853 | * by the caller | ||
854 | * | ||
855 | * @lock: the rt_mutex to be locked | ||
856 | * @timeout: timeout structure or NULL (no timeout) | ||
857 | * @detect_deadlock: deadlock detection on/off | ||
858 | * | ||
859 | * Returns: | ||
860 | * 0 on success | ||
861 | * -EINTR when interrupted by a signal | ||
862 | * -ETIMEDOUT when the timeout expired | ||
863 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
864 | */ | ||
865 | int | ||
866 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, | ||
867 | int detect_deadlock) | ||
868 | { | ||
869 | might_sleep(); | ||
870 | |||
871 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
872 | detect_deadlock, rt_mutex_slowlock); | ||
873 | } | ||
874 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | ||
875 | |||
876 | /** | ||
877 | * rt_mutex_trylock - try to lock a rt_mutex | ||
878 | * | ||
879 | * @lock: the rt_mutex to be locked | ||
880 | * | ||
881 | * Returns 1 on success and 0 on contention | ||
882 | */ | ||
883 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | ||
884 | { | ||
885 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | ||
886 | } | ||
887 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | ||
888 | |||
889 | /** | ||
890 | * rt_mutex_unlock - unlock a rt_mutex | ||
891 | * | ||
892 | * @lock: the rt_mutex to be unlocked | ||
893 | */ | ||
894 | void __sched rt_mutex_unlock(struct rt_mutex *lock) | ||
895 | { | ||
896 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | ||
897 | } | ||
898 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | ||
899 | |||
900 | /*** | ||
901 | * rt_mutex_destroy - mark a mutex unusable | ||
902 | * @lock: the mutex to be destroyed | ||
903 | * | ||
904 | * This function marks the mutex uninitialized, and any subsequent | ||
905 | * use of the mutex is forbidden. The mutex must not be locked when | ||
906 | * this function is called. | ||
907 | */ | ||
908 | void rt_mutex_destroy(struct rt_mutex *lock) | ||
909 | { | ||
910 | WARN_ON(rt_mutex_is_locked(lock)); | ||
911 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
912 | lock->magic = NULL; | ||
913 | #endif | ||
914 | } | ||
915 | |||
916 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); | ||
917 | |||
918 | /** | ||
919 | * __rt_mutex_init - initialize the rt lock | ||
920 | * | ||
921 | * @lock: the rt lock to be initialized | ||
922 | * | ||
923 | * Initialize the rt lock to unlocked state. | ||
924 | * | ||
925 | * Initializing a locked rt lock is not allowed | ||
926 | */ | ||
927 | void __rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
928 | { | ||
929 | lock->owner = NULL; | ||
930 | spin_lock_init(&lock->wait_lock); | ||
931 | plist_head_init(&lock->wait_list, &lock->wait_lock); | ||
932 | |||
933 | debug_rt_mutex_init(lock, name); | ||
934 | } | ||
935 | EXPORT_SYMBOL_GPL(__rt_mutex_init); | ||
936 | |||
937 | /** | ||
938 | * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a | ||
939 | * proxy owner | ||
940 | * | ||
941 | * @lock: the rt_mutex to be locked | ||
942 | * @proxy_owner: the task to set as owner | ||
943 | * | ||
944 | * No locking. The caller has to do the serializing itself. | ||
945 | * Special API call for PI-futex support | ||
946 | */ | ||
947 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
948 | struct task_struct *proxy_owner) | ||
949 | { | ||
950 | __rt_mutex_init(lock, NULL); | ||
951 | debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); | ||
952 | rt_mutex_set_owner(lock, proxy_owner, 0); | ||
953 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | ||
954 | } | ||
955 | |||
956 | /** | ||
957 | * rt_mutex_proxy_unlock - release a lock on behalf of owner | ||
958 | * | ||
959 | * @lock: the rt_mutex to be unlocked | ||
960 | * | ||
961 | * No locking. The caller has to do the serializing itself. | ||
962 | * Special API call for PI-futex support | ||
963 | */ | ||
964 | void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
965 | struct task_struct *proxy_owner) | ||
966 | { | ||
967 | debug_rt_mutex_proxy_unlock(lock); | ||
968 | rt_mutex_set_owner(lock, NULL, 0); | ||
969 | rt_mutex_deadlock_account_unlock(proxy_owner); | ||
970 | } | ||
971 | |||
972 | /** | ||
973 | * rt_mutex_next_owner - return the next owner of the lock | ||
974 | * | ||
975 | * @lock: the rt lock to query | ||
976 | * | ||
977 | * Returns the next owner of the lock or NULL | ||
978 | * | ||
979 | * Caller has to serialize against other accessors to the lock | ||
980 | * itself. | ||
981 | * | ||
982 | * Special API call for PI-futex support | ||
983 | */ | ||
984 | struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | ||
985 | { | ||
986 | if (!rt_mutex_has_waiters(lock)) | ||
987 | return NULL; | ||
988 | |||
989 | return rt_mutex_top_waiter(lock)->task; | ||
990 | } | ||
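
For reference, a hedged kernel-side sketch of how a user of the API introduced above would exercise it (__rt_mutex_init, rt_mutex_lock, rt_mutex_lock_interruptible, rt_mutex_trylock, rt_mutex_unlock, rt_mutex_destroy); the module boilerplate and the "demo" names are illustrative only, not part of the patch:

#include <linux/module.h>
#include <linux/rtmutex.h>

static struct rt_mutex demo_lock;

static int __init rtmutex_demo_init(void)
{
        __rt_mutex_init(&demo_lock, "demo_lock");

        /* Uninterruptible lock/unlock pair: */
        rt_mutex_lock(&demo_lock);
        rt_mutex_unlock(&demo_lock);

        /* Interruptible variant, deadlock detection disabled: */
        if (rt_mutex_lock_interruptible(&demo_lock, 0) == 0)
                rt_mutex_unlock(&demo_lock);

        /* Trylock returns 1 on success, 0 on contention: */
        if (rt_mutex_trylock(&demo_lock))
                rt_mutex_unlock(&demo_lock);

        return 0;
}

static void __exit rtmutex_demo_exit(void)
{
        rt_mutex_destroy(&demo_lock);
}

module_init(rtmutex_demo_init);
module_exit(rtmutex_demo_exit);
MODULE_LICENSE("GPL");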
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 000000000000..1e0fca13ff72 --- /dev/null +++ b/kernel/rtmutex.h | |||
@@ -0,0 +1,29 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. | ||
10 | * Non-debug version. | ||
11 | */ | ||
12 | |||
13 | #define __IP_DECL__ | ||
14 | #define __IP__ | ||
15 | #define __RET_IP__ | ||
16 | #define rt_mutex_deadlock_check(l) (0) | ||
17 | #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) | ||
18 | #define rt_mutex_deadlock_account_unlock(l) do { } while (0) | ||
19 | #define debug_rt_mutex_init_waiter(w) do { } while (0) | ||
20 | #define debug_rt_mutex_free_waiter(w) do { } while (0) | ||
21 | #define debug_rt_mutex_lock(l) do { } while (0) | ||
22 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) | ||
23 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) | ||
24 | #define debug_rt_mutex_unlock(l) do { } while (0) | ||
25 | #define debug_rt_mutex_init(m, n) do { } while (0) | ||
26 | #define debug_rt_mutex_deadlock(d, a, l) do { } while (0) | ||
27 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | ||
28 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | ||
29 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | ||
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 000000000000..9c75856e791e --- /dev/null +++ b/kernel/rtmutex_common.h | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * RT Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains the private data structure and API definitions. | ||
10 | */ | ||
11 | |||
12 | #ifndef __KERNEL_RTMUTEX_COMMON_H | ||
13 | #define __KERNEL_RTMUTEX_COMMON_H | ||
14 | |||
15 | #include <linux/rtmutex.h> | ||
16 | |||
17 | /* | ||
18 | * The in-kernel rtmutex tester is independent of rtmutex debugging. We | ||
19 | * call schedule_rt_mutex_test() instead of schedule() for the tasks which | ||
20 | * belong to the tester. That way we can delay the wakeup path of those | ||
21 | * threads to provoke lock stealing and testing of complex boosting scenarios. | ||
22 | */ | ||
23 | #ifdef CONFIG_RT_MUTEX_TESTER | ||
24 | |||
25 | extern void schedule_rt_mutex_test(struct rt_mutex *lock); | ||
26 | |||
27 | #define schedule_rt_mutex(_lock) \ | ||
28 | do { \ | ||
29 | if (!(current->flags & PF_MUTEX_TESTER)) \ | ||
30 | schedule(); \ | ||
31 | else \ | ||
32 | schedule_rt_mutex_test(_lock); \ | ||
33 | } while (0) | ||
34 | |||
35 | #else | ||
36 | # define schedule_rt_mutex(_lock) schedule() | ||
37 | #endif | ||
38 | |||
39 | /* | ||
40 | * This is the control structure for tasks blocked on a rt_mutex, | ||
41 | * which is allocated on the kernel stack of the blocked task. | ||
42 | * | ||
43 | * @list_entry: pi node to enqueue into the mutex waiters list | ||
44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | ||
45 | * @task: task reference to the blocked task | ||
46 | */ | ||
47 | struct rt_mutex_waiter { | ||
48 | struct plist_node list_entry; | ||
49 | struct plist_node pi_list_entry; | ||
50 | struct task_struct *task; | ||
51 | struct rt_mutex *lock; | ||
52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
53 | unsigned long ip; | ||
54 | pid_t deadlock_task_pid; | ||
55 | struct rt_mutex *deadlock_lock; | ||
56 | #endif | ||
57 | }; | ||
58 | |||
59 | /* | ||
60 | * Various helpers to access the waiters-plist: | ||
61 | */ | ||
62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | ||
63 | { | ||
64 | return !plist_head_empty(&lock->wait_list); | ||
65 | } | ||
66 | |||
67 | static inline struct rt_mutex_waiter * | ||
68 | rt_mutex_top_waiter(struct rt_mutex *lock) | ||
69 | { | ||
70 | struct rt_mutex_waiter *w; | ||
71 | |||
72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | ||
73 | list_entry); | ||
74 | BUG_ON(w->lock != lock); | ||
75 | |||
76 | return w; | ||
77 | } | ||
78 | |||
79 | static inline int task_has_pi_waiters(struct task_struct *p) | ||
80 | { | ||
81 | return !plist_head_empty(&p->pi_waiters); | ||
82 | } | ||
83 | |||
84 | static inline struct rt_mutex_waiter * | ||
85 | task_top_pi_waiter(struct task_struct *p) | ||
86 | { | ||
87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | ||
88 | pi_list_entry); | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * lock->owner state tracking: | ||
93 | */ | ||
94 | #define RT_MUTEX_OWNER_PENDING 1UL | ||
95 | #define RT_MUTEX_HAS_WAITERS 2UL | ||
96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
97 | |||
98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | ||
99 | { | ||
100 | return (struct task_struct *) | ||
101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | ||
102 | } | ||
103 | |||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
105 | { | ||
106 | return (struct task_struct *) | ||
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
108 | } | ||
109 | |||
110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
111 | { | ||
112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
113 | } | ||
114 | |||
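The helpers above rely on task_struct pointers being at least 4-byte aligned, so the two low bits of lock->owner are free to carry the pending-owner and has-waiters state. A standalone sketch of the same encoding (the types and names below are illustrative stand-ins, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define RT_MUTEX_OWNER_PENDING	1UL
#define RT_MUTEX_HAS_WAITERS	2UL
#define RT_MUTEX_OWNER_MASKALL	3UL

struct task { char name[16]; };			/* stand-in for task_struct */
struct lock { struct task *owner; };		/* stand-in for rt_mutex */

static struct task *lock_owner(struct lock *l)
{
	/* Mask off both state bits to recover the real owner pointer. */
	return (struct task *)((uintptr_t)l->owner &
			       ~(uintptr_t)RT_MUTEX_OWNER_MASKALL);
}

static int owner_pending(struct lock *l)
{
	return ((uintptr_t)l->owner & RT_MUTEX_OWNER_PENDING) != 0;
}

int main(void)
{
	/* Force >= 4-byte alignment so the two low bits are guaranteed free,
	 * just as they are for a real task_struct. */
	static _Alignas(4) struct task t = { "task-A" };
	struct lock l;

	/* Store the owner with both state bits set in the low bits. */
	l.owner = (struct task *)((uintptr_t)&t |
				  RT_MUTEX_OWNER_PENDING | RT_MUTEX_HAS_WAITERS);

	printf("owner=%s pending=%d\n", lock_owner(&l)->name, owner_pending(&l));
	return 0;
}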
115 | /* | ||
116 | * PI-futex support (proxy locking functions, etc.): | ||
117 | */ | ||
118 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | ||
119 | extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
120 | struct task_struct *proxy_owner); | ||
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
122 | struct task_struct *proxy_owner); | ||
123 | #endif | ||
diff --git a/kernel/sched.c b/kernel/sched.c index a856040c200a..2629c1711fd6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -168,15 +168,21 @@ | |||
168 | */ | 168 | */ |
169 | 169 | ||
170 | #define SCALE_PRIO(x, prio) \ | 170 | #define SCALE_PRIO(x, prio) \ |
171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
172 | 172 | ||
173 | static unsigned int task_timeslice(task_t *p) | 173 | static unsigned int static_prio_timeslice(int static_prio) |
174 | { | 174 | { |
175 | if (p->static_prio < NICE_TO_PRIO(0)) | 175 | if (static_prio < NICE_TO_PRIO(0)) |
176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 176 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
177 | else | 177 | else |
178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
179 | } | 179 | } |
180 | |||
181 | static inline unsigned int task_timeslice(task_t *p) | ||
182 | { | ||
183 | return static_prio_timeslice(p->static_prio); | ||
184 | } | ||
185 | |||
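static_prio_timeslice() maps the nice-derived static priority onto a timeslice, with negative-nice tasks scaled from a 4x base. A worked example using the usual 2.6-era defaults (DEF_TIMESLICE 100 ms, MIN_TIMESLICE 5 ms, MAX_PRIO 140); these constants are assumptions for illustration, not taken from this patch:

#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	140			/* 100 RT levels + 40 nice levels */
#define MAX_USER_PRIO	(MAX_PRIO - MAX_RT_PRIO)
#define NICE_TO_PRIO(n)	(MAX_RT_PRIO + (n) + 20)

#define MIN_TIMESLICE	5			/* ms */
#define DEF_TIMESLICE	100			/* ms, the nice-0 timeslice */

#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
	MAX((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	int nice[] = { -20, -10, 0, 10, 19 };
	int i;

	/* Prints 800, 600, 100, 50 and 5 ms respectively. */
	for (i = 0; i < 5; i++)
		printf("nice %3d -> %u ms\n", nice[i],
		       static_prio_timeslice(NICE_TO_PRIO(nice[i])));
	return 0;
}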
180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 186 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
181 | < (long long) (sd)->cache_hot_time) | 187 | < (long long) (sd)->cache_hot_time) |
182 | 188 | ||
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p) | |||
184 | * These are the runqueue data structures: | 190 | * These are the runqueue data structures: |
185 | */ | 191 | */ |
186 | 192 | ||
187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
188 | |||
189 | typedef struct runqueue runqueue_t; | 193 | typedef struct runqueue runqueue_t; |
190 | 194 | ||
191 | struct prio_array { | 195 | struct prio_array { |
192 | unsigned int nr_active; | 196 | unsigned int nr_active; |
193 | unsigned long bitmap[BITMAP_SIZE]; | 197 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
194 | struct list_head queue[MAX_PRIO]; | 198 | struct list_head queue[MAX_PRIO]; |
195 | }; | 199 | }; |
196 | 200 | ||
@@ -209,6 +213,7 @@ struct runqueue { | |||
209 | * remote CPUs use both these fields when doing load calculation. | 213 | * remote CPUs use both these fields when doing load calculation. |
210 | */ | 214 | */ |
211 | unsigned long nr_running; | 215 | unsigned long nr_running; |
216 | unsigned long raw_weighted_load; | ||
212 | #ifdef CONFIG_SMP | 217 | #ifdef CONFIG_SMP |
213 | unsigned long cpu_load[3]; | 218 | unsigned long cpu_load[3]; |
214 | #endif | 219 | #endif |
@@ -239,7 +244,6 @@ struct runqueue { | |||
239 | 244 | ||
240 | task_t *migration_thread; | 245 | task_t *migration_thread; |
241 | struct list_head migration_queue; | 246 | struct list_head migration_queue; |
242 | int cpu; | ||
243 | #endif | 247 | #endif |
244 | 248 | ||
245 | #ifdef CONFIG_SCHEDSTATS | 249 | #ifdef CONFIG_SCHEDSTATS |
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 355 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
352 | 356 | ||
353 | /* | 357 | /* |
358 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
359 | * Must be called with interrupts disabled. | ||
360 | */ | ||
361 | static inline runqueue_t *__task_rq_lock(task_t *p) | ||
362 | __acquires(rq->lock) | ||
363 | { | ||
364 | struct runqueue *rq; | ||
365 | |||
366 | repeat_lock_task: | ||
367 | rq = task_rq(p); | ||
368 | spin_lock(&rq->lock); | ||
369 | if (unlikely(rq != task_rq(p))) { | ||
370 | spin_unlock(&rq->lock); | ||
371 | goto repeat_lock_task; | ||
372 | } | ||
373 | return rq; | ||
374 | } | ||
375 | |||
376 | /* | ||
354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 377 | * task_rq_lock - lock the runqueue a given task resides on and disable |
355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 378 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
356 | * explicitly disabling preemption. | 379 | * explicitly disabling preemption. |
357 | */ | 380 | */ |
358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 381 | static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) |
359 | __acquires(rq->lock) | 382 | __acquires(rq->lock) |
360 | { | 383 | { |
361 | struct runqueue *rq; | 384 | struct runqueue *rq; |
@@ -371,6 +394,12 @@ repeat_lock_task: | |||
371 | return rq; | 394 | return rq; |
372 | } | 395 | } |
373 | 396 | ||
397 | static inline void __task_rq_unlock(runqueue_t *rq) | ||
398 | __releases(rq->lock) | ||
399 | { | ||
400 | spin_unlock(&rq->lock); | ||
401 | } | ||
402 | |||
374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 403 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) |
375 | __releases(rq->lock) | 404 | __releases(rq->lock) |
376 | { | 405 | { |
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
634 | } | 663 | } |
635 | 664 | ||
636 | /* | 665 | /* |
637 | * effective_prio - return the priority that is based on the static | 666 | * __normal_prio - return the priority that is based on the static |
638 | * priority but is modified by bonuses/penalties. | 667 | * priority but is modified by bonuses/penalties. |
639 | * | 668 | * |
640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 669 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
647 | * | 676 | * |
648 | * Both properties are important to certain workloads. | 677 | * Both properties are important to certain workloads. |
649 | */ | 678 | */ |
650 | static int effective_prio(task_t *p) | 679 | |
680 | static inline int __normal_prio(task_t *p) | ||
651 | { | 681 | { |
652 | int bonus, prio; | 682 | int bonus, prio; |
653 | 683 | ||
654 | if (rt_task(p)) | ||
655 | return p->prio; | ||
656 | |||
657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 684 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
658 | 685 | ||
659 | prio = p->static_prio - bonus; | 686 | prio = p->static_prio - bonus; |
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p) | |||
665 | } | 692 | } |
666 | 693 | ||
667 | /* | 694 | /* |
695 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
696 | * of tasks with abnormal "nice" values across CPUs, the contribution that | ||
697 | * each task makes to its run queue's load is weighted according to its | ||
698 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
699 | * scaled version of the new time slice allocation that they receive on time | ||
700 | * slice expiry etc. | ||
701 | */ | ||
702 | |||
703 | /* | ||
704 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
705 | * If static_prio_timeslice() is ever changed to break this assumption, then | ||
706 | * this code will need modification. | ||
707 | */ | ||
708 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
709 | #define LOAD_WEIGHT(lp) \ | ||
710 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
711 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
712 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
713 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
714 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
715 | |||
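LOAD_WEIGHT() scales a task's timeslice so that a nice-0 task contributes exactly SCHED_LOAD_SCALE to its runqueue's raw_weighted_load, while RT tasks land above every nice level. Rough numbers, assuming SCHED_LOAD_SCALE of 128 and the timeslices produced by static_prio_timeslice() (illustrative values, not taken from this patch):

#include <stdio.h>

#define SCHED_LOAD_SCALE	128UL	/* assumed; typical value of that era */
#define DEF_TIMESLICE		100	/* ms */
#define TIME_SLICE_NICE_ZERO	DEF_TIMESLICE

#define LOAD_WEIGHT(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
	/* Timeslices for nice -20, 0 and +19 (see static_prio_timeslice). */
	unsigned long slice[] = { 800, 100, 5 };
	int nice[] = { -20, 0, 19 };
	int i;

	/* Prints load weights 1024, 128 and 6. */
	for (i = 0; i < 3; i++)
		printf("nice %3d: %3lu ms timeslice -> load_weight %lu\n",
		       nice[i], slice[i], LOAD_WEIGHT(slice[i]));
	return 0;
}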
716 | static void set_load_weight(task_t *p) | ||
717 | { | ||
718 | if (has_rt_policy(p)) { | ||
719 | #ifdef CONFIG_SMP | ||
720 | if (p == task_rq(p)->migration_thread) | ||
721 | /* | ||
722 | * The migration thread does the actual balancing. | ||
723 | * Giving its load any weight will skew balancing | ||
724 | * adversely. | ||
725 | */ | ||
726 | p->load_weight = 0; | ||
727 | else | ||
728 | #endif | ||
729 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
730 | } else | ||
731 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
732 | } | ||
733 | |||
734 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
735 | { | ||
736 | rq->raw_weighted_load += p->load_weight; | ||
737 | } | ||
738 | |||
739 | static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
740 | { | ||
741 | rq->raw_weighted_load -= p->load_weight; | ||
742 | } | ||
743 | |||
744 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
745 | { | ||
746 | rq->nr_running++; | ||
747 | inc_raw_weighted_load(rq, p); | ||
748 | } | ||
749 | |||
750 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
751 | { | ||
752 | rq->nr_running--; | ||
753 | dec_raw_weighted_load(rq, p); | ||
754 | } | ||
755 | |||
756 | /* | ||
757 | * Calculate the expected normal priority: i.e. priority | ||
758 | * without taking RT-inheritance into account. Might be | ||
759 | * boosted by interactivity modifiers. Changes upon fork, | ||
760 | * setprio syscalls, and whenever the interactivity | ||
761 | * estimator recalculates. | ||
762 | */ | ||
763 | static inline int normal_prio(task_t *p) | ||
764 | { | ||
765 | int prio; | ||
766 | |||
767 | if (has_rt_policy(p)) | ||
768 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
769 | else | ||
770 | prio = __normal_prio(p); | ||
771 | return prio; | ||
772 | } | ||
773 | |||
774 | /* | ||
775 | * Calculate the current priority, i.e. the priority | ||
776 | * taken into account by the scheduler. This value might | ||
777 | * be boosted by RT tasks, or might be boosted by | ||
778 | * interactivity modifiers. Will be RT if the task got | ||
779 | * RT-boosted. If not then it returns p->normal_prio. | ||
780 | */ | ||
781 | static int effective_prio(task_t *p) | ||
782 | { | ||
783 | p->normal_prio = normal_prio(p); | ||
784 | /* | ||
785 | * If we are RT tasks or we were boosted to RT priority, | ||
786 | * keep the priority unchanged. Otherwise, update priority | ||
787 | * to the normal priority: | ||
788 | */ | ||
789 | if (!rt_prio(p->prio)) | ||
790 | return p->normal_prio; | ||
791 | return p->prio; | ||
792 | } | ||
793 | |||
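The point of the split is that effective_prio() recomputes normal_prio but leaves an RT-boosted prio alone, so a priority-inheritance boost survives the interactivity recalculation. A minimal model of that decision (the task layout and constants are stand-ins, and the interactivity bonus is left out):

#include <stdio.h>

#define MAX_RT_PRIO	100
#define rt_prio(p)	((p) < MAX_RT_PRIO)

struct task {
	int prio;		/* priority the scheduler acts on */
	int normal_prio;	/* priority without PI boosting */
	int static_prio;	/* nice-derived priority */
};

/* Interactivity bonus omitted: normal_prio is just static_prio here. */
static int normal_prio(struct task *p)
{
	return p->static_prio;
}

static int effective_prio(struct task *p)
{
	p->normal_prio = normal_prio(p);
	if (!rt_prio(p->prio))
		return p->normal_prio;	/* not boosted: follow normal prio */
	return p->prio;			/* PI-boosted into RT range: keep it */
}

int main(void)
{
	struct task t = { .prio = 120, .normal_prio = 120, .static_prio = 120 };

	printf("unboosted: %d\n", effective_prio(&t));	/* 120 */
	t.prio = 50;		/* boosted into the RT range by a waiter */
	printf("boosted:   %d\n", effective_prio(&t));	/* 50 */
	return 0;
}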
794 | /* | ||
668 | * __activate_task - move a task to the runqueue. | 795 | * __activate_task - move a task to the runqueue. |
669 | */ | 796 | */ |
670 | static void __activate_task(task_t *p, runqueue_t *rq) | 797 | static void __activate_task(task_t *p, runqueue_t *rq) |
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) | |||
674 | if (batch_task(p)) | 801 | if (batch_task(p)) |
675 | target = rq->expired; | 802 | target = rq->expired; |
676 | enqueue_task(p, target); | 803 | enqueue_task(p, target); |
677 | rq->nr_running++; | 804 | inc_nr_running(p, rq); |
678 | } | 805 | } |
679 | 806 | ||
680 | /* | 807 | /* |
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq) | |||
683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 810 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
684 | { | 811 | { |
685 | enqueue_task_head(p, rq->active); | 812 | enqueue_task_head(p, rq->active); |
686 | rq->nr_running++; | 813 | inc_nr_running(p, rq); |
687 | } | 814 | } |
688 | 815 | ||
816 | /* | ||
817 | * Recalculate p->normal_prio and p->prio after having slept, | ||
818 | * updating the sleep-average too: | ||
819 | */ | ||
689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 820 | static int recalc_task_prio(task_t *p, unsigned long long now) |
690 | { | 821 | { |
691 | /* Caller must always ensure 'now >= p->timestamp' */ | 822 | /* Caller must always ensure 'now >= p->timestamp' */ |
692 | unsigned long long __sleep_time = now - p->timestamp; | 823 | unsigned long sleep_time = now - p->timestamp; |
693 | unsigned long sleep_time; | ||
694 | 824 | ||
695 | if (batch_task(p)) | 825 | if (batch_task(p)) |
696 | sleep_time = 0; | 826 | sleep_time = 0; |
697 | else { | ||
698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
700 | else | ||
701 | sleep_time = (unsigned long)__sleep_time; | ||
702 | } | ||
703 | 827 | ||
704 | if (likely(sleep_time > 0)) { | 828 | if (likely(sleep_time > 0)) { |
705 | /* | 829 | /* |
706 | * User tasks that sleep a long time are categorised as | 830 | * This ceiling is set to the lowest priority that would allow |
707 | * idle. They will only have their sleep_avg increased to a | 831 | * a task to be reinserted into the active array on timeslice |
708 | * level that makes them just interactive priority to stay | 832 | * completion. |
709 | * active yet prevent them suddenly becoming cpu hogs and | ||
710 | * starving other processes. | ||
711 | */ | 833 | */ |
712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 834 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
713 | unsigned long ceiling; | ||
714 | 835 | ||
715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 836 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
716 | DEF_TIMESLICE); | 837 | /* |
717 | if (p->sleep_avg < ceiling) | 838 | * Prevents user tasks from achieving best priority |
718 | p->sleep_avg = ceiling; | 839 | * with one single large enough sleep. |
840 | */ | ||
841 | p->sleep_avg = ceiling; | ||
842 | /* | ||
843 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
844 | * nice(0) task 1ms sleep away from promotion, and | ||
845 | * gives it 700ms to round-robin with no chance of | ||
846 | * being demoted. This is more than generous, so | ||
847 | * mark this sleep as non-interactive to prevent the | ||
848 | * on-runqueue bonus logic from intervening should | ||
849 | * this task not receive cpu immediately. | ||
850 | */ | ||
851 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
719 | } else { | 852 | } else { |
720 | /* | 853 | /* |
721 | * Tasks waking from uninterruptible sleep are | 854 | * Tasks waking from uninterruptible sleep are |
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
723 | * are likely to be waiting on I/O | 856 | * are likely to be waiting on I/O |
724 | */ | 857 | */ |
725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 858 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 859 | if (p->sleep_avg >= ceiling) |
727 | sleep_time = 0; | 860 | sleep_time = 0; |
728 | else if (p->sleep_avg + sleep_time >= | 861 | else if (p->sleep_avg + sleep_time >= |
729 | INTERACTIVE_SLEEP(p)) { | 862 | ceiling) { |
730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 863 | p->sleep_avg = ceiling; |
731 | sleep_time = 0; | 864 | sleep_time = 0; |
732 | } | 865 | } |
733 | } | 866 | } |
734 | 867 | ||
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
742 | */ | 875 | */ |
743 | p->sleep_avg += sleep_time; | 876 | p->sleep_avg += sleep_time; |
744 | 877 | ||
745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
747 | } | 878 | } |
879 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
880 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
748 | } | 881 | } |
749 | 882 | ||
750 | return effective_prio(p); | 883 | return effective_prio(p); |
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
805 | */ | 938 | */ |
806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 939 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
807 | { | 940 | { |
808 | rq->nr_running--; | 941 | dec_nr_running(p, rq); |
809 | dequeue_task(p, p->array); | 942 | dequeue_task(p, p->array); |
810 | p->array = NULL; | 943 | p->array = NULL; |
811 | } | 944 | } |
@@ -860,6 +993,12 @@ inline int task_curr(const task_t *p) | |||
860 | return cpu_curr(task_cpu(p)) == p; | 993 | return cpu_curr(task_cpu(p)) == p; |
861 | } | 994 | } |
862 | 995 | ||
996 | /* Used instead of source_load when we know the type == 0 */ | ||
997 | unsigned long weighted_cpuload(const int cpu) | ||
998 | { | ||
999 | return cpu_rq(cpu)->raw_weighted_load; | ||
1000 | } | ||
1001 | |||
863 | #ifdef CONFIG_SMP | 1002 | #ifdef CONFIG_SMP |
864 | typedef struct { | 1003 | typedef struct { |
865 | struct list_head list; | 1004 | struct list_head list; |
@@ -949,7 +1088,8 @@ void kick_process(task_t *p) | |||
949 | } | 1088 | } |
950 | 1089 | ||
951 | /* | 1090 | /* |
952 | * Return a low guess at the load of a migration-source cpu. | 1091 | * Return a low guess at the load of a migration-source cpu weighted |
1092 | * according to the scheduling class and "nice" value. | ||
953 | * | 1093 | * |
954 | * We want to under-estimate the load of migration sources, to | 1094 | * We want to under-estimate the load of migration sources, to |
955 | * balance conservatively. | 1095 | * balance conservatively. |
@@ -957,24 +1097,36 @@ void kick_process(task_t *p) | |||
957 | static inline unsigned long source_load(int cpu, int type) | 1097 | static inline unsigned long source_load(int cpu, int type) |
958 | { | 1098 | { |
959 | runqueue_t *rq = cpu_rq(cpu); | 1099 | runqueue_t *rq = cpu_rq(cpu); |
960 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1100 | |
961 | if (type == 0) | 1101 | if (type == 0) |
962 | return load_now; | 1102 | return rq->raw_weighted_load; |
963 | 1103 | ||
964 | return min(rq->cpu_load[type-1], load_now); | 1104 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
965 | } | 1105 | } |
966 | 1106 | ||
967 | /* | 1107 | /* |
968 | * Return a high guess at the load of a migration-target cpu | 1108 | * Return a high guess at the load of a migration-target cpu weighted |
1109 | * according to the scheduling class and "nice" value. | ||
969 | */ | 1110 | */ |
970 | static inline unsigned long target_load(int cpu, int type) | 1111 | static inline unsigned long target_load(int cpu, int type) |
971 | { | 1112 | { |
972 | runqueue_t *rq = cpu_rq(cpu); | 1113 | runqueue_t *rq = cpu_rq(cpu); |
973 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1114 | |
974 | if (type == 0) | 1115 | if (type == 0) |
975 | return load_now; | 1116 | return rq->raw_weighted_load; |
1117 | |||
1118 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | ||
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * Return the average load per task on the cpu's run queue | ||
1123 | */ | ||
1124 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
1125 | { | ||
1126 | runqueue_t *rq = cpu_rq(cpu); | ||
1127 | unsigned long n = rq->nr_running; | ||
976 | 1128 | ||
977 | return max(rq->cpu_load[type-1], load_now); | 1129 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; |
978 | } | 1130 | } |
979 | 1131 | ||
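cpu_avg_load_per_task() divides the weighted load by the number of runnable tasks, falling back to SCHED_LOAD_SCALE for an empty queue; the wake-affine path below uses it as its imbalance threshold. A toy version with assumed weights (nice +19 about 6, nice -20 about 1024, SCHED_LOAD_SCALE 128):

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

struct rq { unsigned long nr_running, raw_weighted_load; };

static unsigned long cpu_avg_load_per_task(const struct rq *rq)
{
	return rq->nr_running ? rq->raw_weighted_load / rq->nr_running
			      : SCHED_LOAD_SCALE;
}

int main(void)
{
	struct rq nice19 = { 4, 4 * 6 };	/* four nice +19 tasks   */
	struct rq nice_20 = { 1, 1024 };	/* one nice -20 task     */
	struct rq idle = { 0, 0 };		/* empty queue: fallback */

	printf("%lu %lu %lu\n",
	       cpu_avg_load_per_task(&nice19),
	       cpu_avg_load_per_task(&nice_20),
	       cpu_avg_load_per_task(&idle));	/* 6 1024 128 */
	return 0;
}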
980 | /* | 1132 | /* |
@@ -1047,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1047 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1199 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1048 | 1200 | ||
1049 | for_each_cpu_mask(i, tmp) { | 1201 | for_each_cpu_mask(i, tmp) { |
1050 | load = source_load(i, 0); | 1202 | load = weighted_cpuload(i); |
1051 | 1203 | ||
1052 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1204 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1053 | min_load = load; | 1205 | min_load = load; |
@@ -1074,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag) | |||
1074 | struct task_struct *t = current; | 1226 | struct task_struct *t = current; |
1075 | struct sched_domain *tmp, *sd = NULL; | 1227 | struct sched_domain *tmp, *sd = NULL; |
1076 | 1228 | ||
1077 | for_each_domain(cpu, tmp) | 1229 | for_each_domain(cpu, tmp) { |
1230 | /* | ||
1231 | * If power savings logic is enabled for a domain, stop there. | ||
1232 | */ | ||
1233 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
1234 | break; | ||
1078 | if (tmp->flags & flag) | 1235 | if (tmp->flags & flag) |
1079 | sd = tmp; | 1236 | sd = tmp; |
1237 | } | ||
1080 | 1238 | ||
1081 | while (sd) { | 1239 | while (sd) { |
1082 | cpumask_t span; | 1240 | cpumask_t span; |
@@ -1226,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1226 | 1384 | ||
1227 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1385 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1228 | unsigned long tl = this_load; | 1386 | unsigned long tl = this_load; |
1387 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1388 | |||
1229 | /* | 1389 | /* |
1230 | * If sync wakeup then subtract the (maximum possible) | 1390 | * If sync wakeup then subtract the (maximum possible) |
1231 | * effect of the currently running task from the load | 1391 | * effect of the currently running task from the load |
1232 | * of the current CPU: | 1392 | * of the current CPU: |
1233 | */ | 1393 | */ |
1234 | if (sync) | 1394 | if (sync) |
1235 | tl -= SCHED_LOAD_SCALE; | 1395 | tl -= current->load_weight; |
1236 | 1396 | ||
1237 | if ((tl <= load && | 1397 | if ((tl <= load && |
1238 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1398 | tl + target_load(cpu, idx) <= tl_per_task) || |
1239 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1399 | 100*(tl + p->load_weight) <= imbalance*load) { |
1240 | /* | 1400 | /* |
1241 | * This domain has SD_WAKE_AFFINE and | 1401 | * This domain has SD_WAKE_AFFINE and |
1242 | * p is cache cold in this domain, and | 1402 | * p is cache cold in this domain, and |
@@ -1353,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1353 | * event cannot wake it up and insert it on the runqueue either. | 1513 | * event cannot wake it up and insert it on the runqueue either. |
1354 | */ | 1514 | */ |
1355 | p->state = TASK_RUNNING; | 1515 | p->state = TASK_RUNNING; |
1516 | |||
1517 | /* | ||
1518 | * Make sure we do not leak PI boosting priority to the child: | ||
1519 | */ | ||
1520 | p->prio = current->normal_prio; | ||
1521 | |||
1356 | INIT_LIST_HEAD(&p->run_list); | 1522 | INIT_LIST_HEAD(&p->run_list); |
1357 | p->array = NULL; | 1523 | p->array = NULL; |
1358 | #ifdef CONFIG_SCHEDSTATS | 1524 | #ifdef CONFIG_SCHEDSTATS |
@@ -1432,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1432 | __activate_task(p, rq); | 1598 | __activate_task(p, rq); |
1433 | else { | 1599 | else { |
1434 | p->prio = current->prio; | 1600 | p->prio = current->prio; |
1601 | p->normal_prio = current->normal_prio; | ||
1435 | list_add_tail(&p->run_list, ¤t->run_list); | 1602 | list_add_tail(&p->run_list, ¤t->run_list); |
1436 | p->array = current->array; | 1603 | p->array = current->array; |
1437 | p->array->nr_active++; | 1604 | p->array->nr_active++; |
1438 | rq->nr_running++; | 1605 | inc_nr_running(p, rq); |
1439 | } | 1606 | } |
1440 | set_need_resched(); | 1607 | set_need_resched(); |
1441 | } else | 1608 | } else |
@@ -1653,7 +1820,8 @@ unsigned long nr_uninterruptible(void) | |||
1653 | 1820 | ||
1654 | unsigned long long nr_context_switches(void) | 1821 | unsigned long long nr_context_switches(void) |
1655 | { | 1822 | { |
1656 | unsigned long long i, sum = 0; | 1823 | int i; |
1824 | unsigned long long sum = 0; | ||
1657 | 1825 | ||
1658 | for_each_possible_cpu(i) | 1826 | for_each_possible_cpu(i) |
1659 | sum += cpu_rq(i)->nr_switches; | 1827 | sum += cpu_rq(i)->nr_switches; |
@@ -1691,9 +1859,6 @@ unsigned long nr_active(void) | |||
1691 | /* | 1859 | /* |
1692 | * double_rq_lock - safely lock two runqueues | 1860 | * double_rq_lock - safely lock two runqueues |
1693 | * | 1861 | * |
1694 | * We must take them in cpu order to match code in | ||
1695 | * dependent_sleeper and wake_dependent_sleeper. | ||
1696 | * | ||
1697 | * Note this does not disable interrupts like task_rq_lock, | 1862 | * Note this does not disable interrupts like task_rq_lock, |
1698 | * you need to do so manually before calling. | 1863 | * you need to do so manually before calling. |
1699 | */ | 1864 | */ |
@@ -1705,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1705 | spin_lock(&rq1->lock); | 1870 | spin_lock(&rq1->lock); |
1706 | __acquire(rq2->lock); /* Fake it out ;) */ | 1871 | __acquire(rq2->lock); /* Fake it out ;) */ |
1707 | } else { | 1872 | } else { |
1708 | if (rq1->cpu < rq2->cpu) { | 1873 | if (rq1 < rq2) { |
1709 | spin_lock(&rq1->lock); | 1874 | spin_lock(&rq1->lock); |
1710 | spin_lock(&rq2->lock); | 1875 | spin_lock(&rq2->lock); |
1711 | } else { | 1876 | } else { |
@@ -1741,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1741 | __acquires(this_rq->lock) | 1906 | __acquires(this_rq->lock) |
1742 | { | 1907 | { |
1743 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1908 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1744 | if (busiest->cpu < this_rq->cpu) { | 1909 | if (busiest < this_rq) { |
1745 | spin_unlock(&this_rq->lock); | 1910 | spin_unlock(&this_rq->lock); |
1746 | spin_lock(&busiest->lock); | 1911 | spin_lock(&busiest->lock); |
1747 | spin_lock(&this_rq->lock); | 1912 | spin_lock(&this_rq->lock); |
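With the runqueue's cpu field gone, double_rq_lock() and double_lock_balance() order the two spinlocks by runqueue address instead of CPU number; any single, consistent order is enough to rule out an AB/BA deadlock. A userspace sketch of the same discipline, with pthread mutexes standing in for the runqueue spinlocks (not kernel code):

#include <pthread.h>
#include <stdio.h>

struct rq { pthread_mutex_t lock; };

static void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);
	} else if (rq1 < rq2) {		/* lower address first, always */
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct rq a = { PTHREAD_MUTEX_INITIALIZER };
	struct rq b = { PTHREAD_MUTEX_INITIALIZER };

	/* Both call orders acquire the locks in the same (address) order. */
	double_rq_lock(&a, &b);
	double_rq_unlock(&a, &b);
	double_rq_lock(&b, &a);
	double_rq_unlock(&b, &a);
	printf("consistent lock order, no AB/BA deadlock possible\n");
	return 0;
}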
@@ -1804,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1804 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1969 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1805 | { | 1970 | { |
1806 | dequeue_task(p, src_array); | 1971 | dequeue_task(p, src_array); |
1807 | src_rq->nr_running--; | 1972 | dec_nr_running(p, src_rq); |
1808 | set_task_cpu(p, this_cpu); | 1973 | set_task_cpu(p, this_cpu); |
1809 | this_rq->nr_running++; | 1974 | inc_nr_running(p, this_rq); |
1810 | enqueue_task(p, this_array); | 1975 | enqueue_task(p, this_array); |
1811 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1976 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1812 | + this_rq->timestamp_last_tick; | 1977 | + this_rq->timestamp_last_tick; |
@@ -1853,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1853 | return 1; | 2018 | return 1; |
1854 | } | 2019 | } |
1855 | 2020 | ||
2021 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
1856 | /* | 2022 | /* |
1857 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2023 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1858 | * as part of a balancing operation within "domain". Returns the number of | 2024 | * load from busiest to this_rq, as part of a balancing operation within |
1859 | * tasks moved. | 2025 | * "domain". Returns the number of tasks moved. |
1860 | * | 2026 | * |
1861 | * Called with both runqueues locked. | 2027 | * Called with both runqueues locked. |
1862 | */ | 2028 | */ |
1863 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2029 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
1864 | unsigned long max_nr_move, struct sched_domain *sd, | 2030 | unsigned long max_nr_move, unsigned long max_load_move, |
1865 | enum idle_type idle, int *all_pinned) | 2031 | struct sched_domain *sd, enum idle_type idle, |
2032 | int *all_pinned) | ||
1866 | { | 2033 | { |
1867 | prio_array_t *array, *dst_array; | 2034 | prio_array_t *array, *dst_array; |
1868 | struct list_head *head, *curr; | 2035 | struct list_head *head, *curr; |
1869 | int idx, pulled = 0, pinned = 0; | 2036 | int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; |
2037 | int busiest_best_prio_seen; | ||
2038 | int skip_for_load; /* skip the task based on weighted load issues */ | ||
2039 | long rem_load_move; | ||
1870 | task_t *tmp; | 2040 | task_t *tmp; |
1871 | 2041 | ||
1872 | if (max_nr_move == 0) | 2042 | if (max_nr_move == 0 || max_load_move == 0) |
1873 | goto out; | 2043 | goto out; |
1874 | 2044 | ||
2045 | rem_load_move = max_load_move; | ||
1875 | pinned = 1; | 2046 | pinned = 1; |
2047 | this_best_prio = rq_best_prio(this_rq); | ||
2048 | busiest_best_prio = rq_best_prio(busiest); | ||
2049 | /* | ||
2050 | * Enable handling of the case where there is more than one task | ||
2051 | * with the best priority. If the current running task is one | ||
2052 | * of those with prio==busiest_best_prio we know it won't be moved | ||
2053 | * and therefore it's safe to override the skip (based on load) of | ||
2054 | * any task we find with that prio. | ||
2055 | */ | ||
2056 | busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; | ||
1876 | 2057 | ||
1877 | /* | 2058 | /* |
1878 | * We first consider expired tasks. Those will likely not be | 2059 | * We first consider expired tasks. Those will likely not be |
@@ -1912,7 +2093,17 @@ skip_queue: | |||
1912 | 2093 | ||
1913 | curr = curr->prev; | 2094 | curr = curr->prev; |
1914 | 2095 | ||
1915 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2096 | /* |
2097 | * To help distribute high priority tasks across CPUs, we don't | ||
2098 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2099 | * prio value) on its new queue regardless of its load weight | ||
2100 | */ | ||
2101 | skip_for_load = tmp->load_weight > rem_load_move; | ||
2102 | if (skip_for_load && idx < this_best_prio) | ||
2103 | skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; | ||
2104 | if (skip_for_load || | ||
2105 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
2106 | busiest_best_prio_seen |= idx == busiest_best_prio; | ||
1916 | if (curr != head) | 2107 | if (curr != head) |
1917 | goto skip_queue; | 2108 | goto skip_queue; |
1918 | idx++; | 2109 | idx++; |
@@ -1926,9 +2117,15 @@ skip_queue: | |||
1926 | 2117 | ||
1927 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2118 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1928 | pulled++; | 2119 | pulled++; |
2120 | rem_load_move -= tmp->load_weight; | ||
1929 | 2121 | ||
1930 | /* We only want to steal up to the prescribed number of tasks. */ | 2122 | /* |
1931 | if (pulled < max_nr_move) { | 2123 | * We only want to steal up to the prescribed number of tasks |
2124 | * and the prescribed amount of weighted load. | ||
2125 | */ | ||
2126 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
2127 | if (idx < this_best_prio) | ||
2128 | this_best_prio = idx; | ||
1932 | if (curr != head) | 2129 | if (curr != head) |
1933 | goto skip_queue; | 2130 | goto skip_queue; |
1934 | idx++; | 2131 | idx++; |
@@ -1949,7 +2146,7 @@ out: | |||
1949 | 2146 | ||
1950 | /* | 2147 | /* |
1951 | * find_busiest_group finds and returns the busiest CPU group within the | 2148 | * find_busiest_group finds and returns the busiest CPU group within the |
1952 | * domain. It calculates and returns the number of tasks which should be | 2149 | * domain. It calculates and returns the amount of weighted load which should be |
1953 | * moved to restore balance via the imbalance parameter. | 2150 | * moved to restore balance via the imbalance parameter. |
1954 | */ | 2151 | */ |
1955 | static struct sched_group * | 2152 | static struct sched_group * |
@@ -1959,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1959 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2156 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1960 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2157 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1961 | unsigned long max_pull; | 2158 | unsigned long max_pull; |
2159 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
2160 | unsigned long this_load_per_task, this_nr_running; | ||
1962 | int load_idx; | 2161 | int load_idx; |
2162 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2163 | int power_savings_balance = 1; | ||
2164 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
2165 | unsigned long min_nr_running = ULONG_MAX; | ||
2166 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
2167 | #endif | ||
1963 | 2168 | ||
1964 | max_load = this_load = total_load = total_pwr = 0; | 2169 | max_load = this_load = total_load = total_pwr = 0; |
2170 | busiest_load_per_task = busiest_nr_running = 0; | ||
2171 | this_load_per_task = this_nr_running = 0; | ||
1965 | if (idle == NOT_IDLE) | 2172 | if (idle == NOT_IDLE) |
1966 | load_idx = sd->busy_idx; | 2173 | load_idx = sd->busy_idx; |
1967 | else if (idle == NEWLY_IDLE) | 2174 | else if (idle == NEWLY_IDLE) |
@@ -1970,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1970 | load_idx = sd->idle_idx; | 2177 | load_idx = sd->idle_idx; |
1971 | 2178 | ||
1972 | do { | 2179 | do { |
1973 | unsigned long load; | 2180 | unsigned long load, group_capacity; |
1974 | int local_group; | 2181 | int local_group; |
1975 | int i; | 2182 | int i; |
2183 | unsigned long sum_nr_running, sum_weighted_load; | ||
1976 | 2184 | ||
1977 | local_group = cpu_isset(this_cpu, group->cpumask); | 2185 | local_group = cpu_isset(this_cpu, group->cpumask); |
1978 | 2186 | ||
1979 | /* Tally up the load of all CPUs in the group */ | 2187 | /* Tally up the load of all CPUs in the group */ |
1980 | avg_load = 0; | 2188 | sum_weighted_load = sum_nr_running = avg_load = 0; |
1981 | 2189 | ||
1982 | for_each_cpu_mask(i, group->cpumask) { | 2190 | for_each_cpu_mask(i, group->cpumask) { |
2191 | runqueue_t *rq = cpu_rq(i); | ||
2192 | |||
1983 | if (*sd_idle && !idle_cpu(i)) | 2193 | if (*sd_idle && !idle_cpu(i)) |
1984 | *sd_idle = 0; | 2194 | *sd_idle = 0; |
1985 | 2195 | ||
@@ -1990,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1990 | load = source_load(i, load_idx); | 2200 | load = source_load(i, load_idx); |
1991 | 2201 | ||
1992 | avg_load += load; | 2202 | avg_load += load; |
2203 | sum_nr_running += rq->nr_running; | ||
2204 | sum_weighted_load += rq->raw_weighted_load; | ||
1993 | } | 2205 | } |
1994 | 2206 | ||
1995 | total_load += avg_load; | 2207 | total_load += avg_load; |
@@ -1998,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1998 | /* Adjust by relative CPU power of the group */ | 2210 | /* Adjust by relative CPU power of the group */ |
1999 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2211 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2000 | 2212 | ||
2213 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
2214 | |||
2001 | if (local_group) { | 2215 | if (local_group) { |
2002 | this_load = avg_load; | 2216 | this_load = avg_load; |
2003 | this = group; | 2217 | this = group; |
2004 | } else if (avg_load > max_load) { | 2218 | this_nr_running = sum_nr_running; |
2219 | this_load_per_task = sum_weighted_load; | ||
2220 | } else if (avg_load > max_load && | ||
2221 | sum_nr_running > group_capacity) { | ||
2005 | max_load = avg_load; | 2222 | max_load = avg_load; |
2006 | busiest = group; | 2223 | busiest = group; |
2224 | busiest_nr_running = sum_nr_running; | ||
2225 | busiest_load_per_task = sum_weighted_load; | ||
2007 | } | 2226 | } |
2227 | |||
2228 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2229 | /* | ||
2230 | * Busy processors will not participate in power savings | ||
2231 | * balance. | ||
2232 | */ | ||
2233 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2234 | goto group_next; | ||
2235 | |||
2236 | /* | ||
2237 | * If the local group is idle or completely loaded, there is | ||
2238 | * no need to do power savings balance at this domain. | ||
2239 | */ | ||
2240 | if (local_group && (this_nr_running >= group_capacity || | ||
2241 | !this_nr_running)) | ||
2242 | power_savings_balance = 0; | ||
2243 | |||
2244 | /* | ||
2245 | * If a group is already running at full capacity or idle, | ||
2246 | * don't include that group in power savings calculations | ||
2247 | */ | ||
2248 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
2249 | || !sum_nr_running) | ||
2250 | goto group_next; | ||
2251 | |||
2252 | /* | ||
2253 | * Calculate the group which has the least non-idle load. | ||
2254 | * This is the group from which we need to pick up the load | ||
2255 | * for saving power. | ||
2256 | */ | ||
2257 | if ((sum_nr_running < min_nr_running) || | ||
2258 | (sum_nr_running == min_nr_running && | ||
2259 | first_cpu(group->cpumask) < | ||
2260 | first_cpu(group_min->cpumask))) { | ||
2261 | group_min = group; | ||
2262 | min_nr_running = sum_nr_running; | ||
2263 | min_load_per_task = sum_weighted_load / | ||
2264 | sum_nr_running; | ||
2265 | } | ||
2266 | |||
2267 | /* | ||
2268 | * Calculate the group which is near its | ||
2269 | * capacity but still has some space to pick up some load | ||
2270 | * from other groups and save more power. | ||
2271 | */ | ||
2272 | if (sum_nr_running <= group_capacity - 1) | ||
2273 | if (sum_nr_running > leader_nr_running || | ||
2274 | (sum_nr_running == leader_nr_running && | ||
2275 | first_cpu(group->cpumask) > | ||
2276 | first_cpu(group_leader->cpumask))) { | ||
2277 | group_leader = group; | ||
2278 | leader_nr_running = sum_nr_running; | ||
2279 | } | ||
2280 | |||
2281 | group_next: | ||
2282 | #endif | ||
2008 | group = group->next; | 2283 | group = group->next; |
2009 | } while (group != sd->groups); | 2284 | } while (group != sd->groups); |
2010 | 2285 | ||
2011 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2286 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2012 | goto out_balanced; | 2287 | goto out_balanced; |
2013 | 2288 | ||
2014 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2289 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
@@ -2017,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2017 | 100*max_load <= sd->imbalance_pct*this_load) | 2292 | 100*max_load <= sd->imbalance_pct*this_load) |
2018 | goto out_balanced; | 2293 | goto out_balanced; |
2019 | 2294 | ||
2295 | busiest_load_per_task /= busiest_nr_running; | ||
2020 | /* | 2296 | /* |
2021 | * We're trying to get all the cpus to the average_load, so we don't | 2297 | * We're trying to get all the cpus to the average_load, so we don't |
2022 | * want to push ourselves above the average load, nor do we wish to | 2298 | * want to push ourselves above the average load, nor do we wish to |
@@ -2028,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2028 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2304 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2029 | * appear as very large values with unsigned longs. | 2305 | * appear as very large values with unsigned longs. |
2030 | */ | 2306 | */ |
2307 | if (max_load <= busiest_load_per_task) | ||
2308 | goto out_balanced; | ||
2309 | |||
2310 | /* | ||
2311 | * In the presence of smp nice balancing, certain scenarios can have | ||
2312 | * max load less than avg load (as we skip the groups at or below | ||
2313 | * their cpu_power while calculating max_load). | ||
2314 | */ | ||
2315 | if (max_load < avg_load) { | ||
2316 | *imbalance = 0; | ||
2317 | goto small_imbalance; | ||
2318 | } | ||
2031 | 2319 | ||
2032 | /* Don't want to pull so many tasks that a group would go idle */ | 2320 | /* Don't want to pull so many tasks that a group would go idle */ |
2033 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2321 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2034 | 2322 | ||
2035 | /* How much load to actually move to equalise the imbalance */ | 2323 | /* How much load to actually move to equalise the imbalance */ |
2036 | *imbalance = min(max_pull * busiest->cpu_power, | 2324 | *imbalance = min(max_pull * busiest->cpu_power, |
2037 | (avg_load - this_load) * this->cpu_power) | 2325 | (avg_load - this_load) * this->cpu_power) |
2038 | / SCHED_LOAD_SCALE; | 2326 | / SCHED_LOAD_SCALE; |
2039 | 2327 | ||
2040 | if (*imbalance < SCHED_LOAD_SCALE) { | 2328 | /* |
2041 | unsigned long pwr_now = 0, pwr_move = 0; | 2329 | * if *imbalance is less than the average load per runnable task |
2330 | * there is no gaurantee that any tasks will be moved so we'll have | ||
2331 | * a think about bumping its value to force at least one task to be | ||
2332 | * moved | ||
2333 | */ | ||
2334 | if (*imbalance < busiest_load_per_task) { | ||
2335 | unsigned long pwr_now, pwr_move; | ||
2042 | unsigned long tmp; | 2336 | unsigned long tmp; |
2337 | unsigned int imbn; | ||
2338 | |||
2339 | small_imbalance: | ||
2340 | pwr_move = pwr_now = 0; | ||
2341 | imbn = 2; | ||
2342 | if (this_nr_running) { | ||
2343 | this_load_per_task /= this_nr_running; | ||
2344 | if (busiest_load_per_task > this_load_per_task) | ||
2345 | imbn = 1; | ||
2346 | } else | ||
2347 | this_load_per_task = SCHED_LOAD_SCALE; | ||
2043 | 2348 | ||
2044 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2349 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
2045 | *imbalance = 1; | 2350 | *imbalance = busiest_load_per_task; |
2046 | return busiest; | 2351 | return busiest; |
2047 | } | 2352 | } |
2048 | 2353 | ||
@@ -2052,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2052 | * moving them. | 2357 | * moving them. |
2053 | */ | 2358 | */ |
2054 | 2359 | ||
2055 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2360 | pwr_now += busiest->cpu_power * |
2056 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2361 | min(busiest_load_per_task, max_load); |
2362 | pwr_now += this->cpu_power * | ||
2363 | min(this_load_per_task, this_load); | ||
2057 | pwr_now /= SCHED_LOAD_SCALE; | 2364 | pwr_now /= SCHED_LOAD_SCALE; |
2058 | 2365 | ||
2059 | /* Amount of load we'd subtract */ | 2366 | /* Amount of load we'd subtract */ |
2060 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2367 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
2061 | if (max_load > tmp) | 2368 | if (max_load > tmp) |
2062 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2369 | pwr_move += busiest->cpu_power * |
2063 | max_load - tmp); | 2370 | min(busiest_load_per_task, max_load - tmp); |
2064 | 2371 | ||
2065 | /* Amount of load we'd add */ | 2372 | /* Amount of load we'd add */ |
2066 | if (max_load*busiest->cpu_power < | 2373 | if (max_load*busiest->cpu_power < |
2067 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2374 | busiest_load_per_task*SCHED_LOAD_SCALE) |
2068 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2375 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
2069 | else | 2376 | else |
2070 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2377 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
2071 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2378 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
2072 | pwr_move /= SCHED_LOAD_SCALE; | 2379 | pwr_move /= SCHED_LOAD_SCALE; |
2073 | 2380 | ||
2074 | /* Move if we gain throughput */ | 2381 | /* Move if we gain throughput */ |
2075 | if (pwr_move <= pwr_now) | 2382 | if (pwr_move <= pwr_now) |
2076 | goto out_balanced; | 2383 | goto out_balanced; |
2077 | 2384 | ||
2078 | *imbalance = 1; | 2385 | *imbalance = busiest_load_per_task; |
2079 | return busiest; | ||
2080 | } | 2386 | } |
2081 | 2387 | ||
2082 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
2083 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
2084 | return busiest; | 2388 | return busiest; |
2085 | 2389 | ||
2086 | out_balanced: | 2390 | out_balanced: |
2391 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2392 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2393 | goto ret; | ||
2087 | 2394 | ||
2395 | if (this == group_leader && group_leader != group_min) { | ||
2396 | *imbalance = min_load_per_task; | ||
2397 | return group_min; | ||
2398 | } | ||
2399 | ret: | ||
2400 | #endif | ||
2088 | *imbalance = 0; | 2401 | *imbalance = 0; |
2089 | return NULL; | 2402 | return NULL; |
2090 | } | 2403 | } |
@@ -2093,18 +2406,21 @@ out_balanced: | |||
2093 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2406 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2094 | */ | 2407 | */ |
2095 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2408 | static runqueue_t *find_busiest_queue(struct sched_group *group, |
2096 | enum idle_type idle) | 2409 | enum idle_type idle, unsigned long imbalance) |
2097 | { | 2410 | { |
2098 | unsigned long load, max_load = 0; | 2411 | unsigned long max_load = 0; |
2099 | runqueue_t *busiest = NULL; | 2412 | runqueue_t *busiest = NULL, *rqi; |
2100 | int i; | 2413 | int i; |
2101 | 2414 | ||
2102 | for_each_cpu_mask(i, group->cpumask) { | 2415 | for_each_cpu_mask(i, group->cpumask) { |
2103 | load = source_load(i, 0); | 2416 | rqi = cpu_rq(i); |
2417 | |||
2418 | if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) | ||
2419 | continue; | ||
2104 | 2420 | ||
2105 | if (load > max_load) { | 2421 | if (rqi->raw_weighted_load > max_load) { |
2106 | max_load = load; | 2422 | max_load = rqi->raw_weighted_load; |
2107 | busiest = cpu_rq(i); | 2423 | busiest = rqi; |
2108 | } | 2424 | } |
2109 | } | 2425 | } |
2110 | 2426 | ||
@@ -2117,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
2117 | */ | 2433 | */ |
2118 | #define MAX_PINNED_INTERVAL 512 | 2434 | #define MAX_PINNED_INTERVAL 512 |
2119 | 2435 | ||
2436 | #define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) | ||
2120 | /* | 2437 | /* |
2121 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2438 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2122 | * tasks if there is an imbalance. | 2439 | * tasks if there is an imbalance. |
@@ -2133,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2133 | int active_balance = 0; | 2450 | int active_balance = 0; |
2134 | int sd_idle = 0; | 2451 | int sd_idle = 0; |
2135 | 2452 | ||
2136 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2453 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2454 | !sched_smt_power_savings) | ||
2137 | sd_idle = 1; | 2455 | sd_idle = 1; |
2138 | 2456 | ||
2139 | schedstat_inc(sd, lb_cnt[idle]); | 2457 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -2144,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2144 | goto out_balanced; | 2462 | goto out_balanced; |
2145 | } | 2463 | } |
2146 | 2464 | ||
2147 | busiest = find_busiest_queue(group, idle); | 2465 | busiest = find_busiest_queue(group, idle, imbalance); |
2148 | if (!busiest) { | 2466 | if (!busiest) { |
2149 | schedstat_inc(sd, lb_nobusyq[idle]); | 2467 | schedstat_inc(sd, lb_nobusyq[idle]); |
2150 | goto out_balanced; | 2468 | goto out_balanced; |
@@ -2164,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2164 | */ | 2482 | */ |
2165 | double_rq_lock(this_rq, busiest); | 2483 | double_rq_lock(this_rq, busiest); |
2166 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2484 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2485 | minus_1_or_zero(busiest->nr_running), | ||
2167 | imbalance, sd, idle, &all_pinned); | 2486 | imbalance, sd, idle, &all_pinned); |
2168 | double_rq_unlock(this_rq, busiest); | 2487 | double_rq_unlock(this_rq, busiest); |
2169 | 2488 | ||
@@ -2221,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2221 | sd->balance_interval *= 2; | 2540 | sd->balance_interval *= 2; |
2222 | } | 2541 | } |
2223 | 2542 | ||
2224 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2543 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2544 | !sched_smt_power_savings) | ||
2225 | return -1; | 2545 | return -1; |
2226 | return nr_moved; | 2546 | return nr_moved; |
2227 | 2547 | ||
@@ -2236,7 +2556,7 @@ out_one_pinned: | |||
2236 | (sd->balance_interval < sd->max_interval)) | 2556 | (sd->balance_interval < sd->max_interval)) |
2237 | sd->balance_interval *= 2; | 2557 | sd->balance_interval *= 2; |
2238 | 2558 | ||
2239 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2559 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2240 | return -1; | 2560 | return -1; |
2241 | return 0; | 2561 | return 0; |
2242 | } | 2562 | } |
@@ -2257,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2257 | int nr_moved = 0; | 2577 | int nr_moved = 0; |
2258 | int sd_idle = 0; | 2578 | int sd_idle = 0; |
2259 | 2579 | ||
2260 | if (sd->flags & SD_SHARE_CPUPOWER) | 2580 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2261 | sd_idle = 1; | 2581 | sd_idle = 1; |
2262 | 2582 | ||
2263 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2583 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
@@ -2267,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2267 | goto out_balanced; | 2587 | goto out_balanced; |
2268 | } | 2588 | } |
2269 | 2589 | ||
2270 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2590 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); |
2271 | if (!busiest) { | 2591 | if (!busiest) { |
2272 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2592 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2273 | goto out_balanced; | 2593 | goto out_balanced; |
@@ -2282,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2282 | /* Attempt to move tasks */ | 2602 | /* Attempt to move tasks */ |
2283 | double_lock_balance(this_rq, busiest); | 2603 | double_lock_balance(this_rq, busiest); |
2284 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2604 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2605 | minus_1_or_zero(busiest->nr_running), | ||
2285 | imbalance, sd, NEWLY_IDLE, NULL); | 2606 | imbalance, sd, NEWLY_IDLE, NULL); |
2286 | spin_unlock(&busiest->lock); | 2607 | spin_unlock(&busiest->lock); |
2287 | } | 2608 | } |
@@ -2297,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2297 | 2618 | ||
2298 | out_balanced: | 2619 | out_balanced: |
2299 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2620 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2300 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2621 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2301 | return -1; | 2622 | return -1; |
2302 | sd->nr_balance_failed = 0; | 2623 | sd->nr_balance_failed = 0; |
2303 | return 0; | 2624 | return 0; |
@@ -2352,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
2352 | double_lock_balance(busiest_rq, target_rq); | 2673 | double_lock_balance(busiest_rq, target_rq); |
2353 | 2674 | ||
2354 | /* Search for an sd spanning us and the target CPU. */ | 2675 | /* Search for an sd spanning us and the target CPU. */ |
2355 | for_each_domain(target_cpu, sd) | 2676 | for_each_domain(target_cpu, sd) { |
2356 | if ((sd->flags & SD_LOAD_BALANCE) && | 2677 | if ((sd->flags & SD_LOAD_BALANCE) && |
2357 | cpu_isset(busiest_cpu, sd->span)) | 2678 | cpu_isset(busiest_cpu, sd->span)) |
2358 | break; | 2679 | break; |
2680 | } | ||
2359 | 2681 | ||
2360 | if (unlikely(sd == NULL)) | 2682 | if (unlikely(sd == NULL)) |
2361 | goto out; | 2683 | goto out; |
2362 | 2684 | ||
2363 | schedstat_inc(sd, alb_cnt); | 2685 | schedstat_inc(sd, alb_cnt); |
2364 | 2686 | ||
2365 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2687 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
2688 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) | ||
2366 | schedstat_inc(sd, alb_pushed); | 2689 | schedstat_inc(sd, alb_pushed); |
2367 | else | 2690 | else |
2368 | schedstat_inc(sd, alb_failed); | 2691 | schedstat_inc(sd, alb_failed); |
@@ -2390,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2390 | struct sched_domain *sd; | 2713 | struct sched_domain *sd; |
2391 | int i; | 2714 | int i; |
2392 | 2715 | ||
2393 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2716 | this_load = this_rq->raw_weighted_load; |
2394 | /* Update our load */ | 2717 | /* Update our load */ |
2395 | for (i = 0; i < 3; i++) { | 2718 | for (i = 0; i < 3; i++) { |
2396 | unsigned long new_load = this_load; | 2719 | unsigned long new_load = this_load; |
@@ -2691,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) | |||
2691 | resched_task(rq->idle); | 3014 | resched_task(rq->idle); |
2692 | } | 3015 | } |
2693 | 3016 | ||
2694 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3017 | /* |
3018 | * Called with interrupt disabled and this_rq's runqueue locked. | ||
3019 | */ | ||
3020 | static void wake_sleeping_dependent(int this_cpu) | ||
2695 | { | 3021 | { |
2696 | struct sched_domain *tmp, *sd = NULL; | 3022 | struct sched_domain *tmp, *sd = NULL; |
2697 | cpumask_t sibling_map; | ||
2698 | int i; | 3023 | int i; |
2699 | 3024 | ||
2700 | for_each_domain(this_cpu, tmp) | 3025 | for_each_domain(this_cpu, tmp) { |
2701 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3026 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
2702 | sd = tmp; | 3027 | sd = tmp; |
3028 | break; | ||
3029 | } | ||
3030 | } | ||
2703 | 3031 | ||
2704 | if (!sd) | 3032 | if (!sd) |
2705 | return; | 3033 | return; |
2706 | 3034 | ||
2707 | /* | 3035 | for_each_cpu_mask(i, sd->span) { |
2708 | * Unlock the current runqueue because we have to lock in | ||
2709 | * CPU order to avoid deadlocks. Caller knows that we might | ||
2710 | * unlock. We keep IRQs disabled. | ||
2711 | */ | ||
2712 | spin_unlock(&this_rq->lock); | ||
2713 | |||
2714 | sibling_map = sd->span; | ||
2715 | |||
2716 | for_each_cpu_mask(i, sibling_map) | ||
2717 | spin_lock(&cpu_rq(i)->lock); | ||
2718 | /* | ||
2719 | * We clear this CPU from the mask. This both simplifies the | ||
2720 | * inner loop and keps this_rq locked when we exit: | ||
2721 | */ | ||
2722 | cpu_clear(this_cpu, sibling_map); | ||
2723 | |||
2724 | for_each_cpu_mask(i, sibling_map) { | ||
2725 | runqueue_t *smt_rq = cpu_rq(i); | 3036 | runqueue_t *smt_rq = cpu_rq(i); |
2726 | 3037 | ||
3038 | if (i == this_cpu) | ||
3039 | continue; | ||
3040 | if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
3041 | continue; | ||
3042 | |||
2727 | wakeup_busy_runqueue(smt_rq); | 3043 | wakeup_busy_runqueue(smt_rq); |
3044 | spin_unlock(&smt_rq->lock); | ||
2728 | } | 3045 | } |
2729 | |||
2730 | for_each_cpu_mask(i, sibling_map) | ||
2731 | spin_unlock(&cpu_rq(i)->lock); | ||
2732 | /* | ||
2733 | * We exit with this_cpu's rq still held and IRQs | ||
2734 | * still disabled: | ||
2735 | */ | ||
2736 | } | 3046 | } |
2737 | 3047 | ||
2738 | /* | 3048 | /* |
@@ -2745,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | |||
2745 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3055 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2746 | } | 3056 | } |
2747 | 3057 | ||
2748 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3058 | /* |
3059 | * To minimise lock contention and not have to drop this_rq's runlock we only | ||
3060 | * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
3061 | * acquire their lock. As we only trylock the normal locking order does not | ||
3062 | * need to be obeyed. | ||
3063 | */ | ||
3064 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) | ||
2749 | { | 3065 | { |
2750 | struct sched_domain *tmp, *sd = NULL; | 3066 | struct sched_domain *tmp, *sd = NULL; |
2751 | cpumask_t sibling_map; | ||
2752 | prio_array_t *array; | ||
2753 | int ret = 0, i; | 3067 | int ret = 0, i; |
2754 | task_t *p; | ||
2755 | 3068 | ||
2756 | for_each_domain(this_cpu, tmp) | 3069 | /* kernel/rt threads do not participate in dependent sleeping */ |
2757 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3070 | if (!p->mm || rt_task(p)) |
3071 | return 0; | ||
3072 | |||
3073 | for_each_domain(this_cpu, tmp) { | ||
3074 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
2758 | sd = tmp; | 3075 | sd = tmp; |
3076 | break; | ||
3077 | } | ||
3078 | } | ||
2759 | 3079 | ||
2760 | if (!sd) | 3080 | if (!sd) |
2761 | return 0; | 3081 | return 0; |
2762 | 3082 | ||
2763 | /* | 3083 | for_each_cpu_mask(i, sd->span) { |
2764 | * The same locking rules and details apply as for | 3084 | runqueue_t *smt_rq; |
2765 | * wake_sleeping_dependent(): | 3085 | task_t *smt_curr; |
2766 | */ | ||
2767 | spin_unlock(&this_rq->lock); | ||
2768 | sibling_map = sd->span; | ||
2769 | for_each_cpu_mask(i, sibling_map) | ||
2770 | spin_lock(&cpu_rq(i)->lock); | ||
2771 | cpu_clear(this_cpu, sibling_map); | ||
2772 | 3086 | ||
2773 | /* | 3087 | if (i == this_cpu) |
2774 | * Establish next task to be run - it might have gone away because | 3088 | continue; |
2775 | * we released the runqueue lock above: | ||
2776 | */ | ||
2777 | if (!this_rq->nr_running) | ||
2778 | goto out_unlock; | ||
2779 | array = this_rq->active; | ||
2780 | if (!array->nr_active) | ||
2781 | array = this_rq->expired; | ||
2782 | BUG_ON(!array->nr_active); | ||
2783 | 3089 | ||
2784 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3090 | smt_rq = cpu_rq(i); |
2785 | task_t, run_list); | 3091 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
3092 | continue; | ||
2786 | 3093 | ||
2787 | for_each_cpu_mask(i, sibling_map) { | 3094 | smt_curr = smt_rq->curr; |
2788 | runqueue_t *smt_rq = cpu_rq(i); | ||
2789 | task_t *smt_curr = smt_rq->curr; | ||
2790 | 3095 | ||
2791 | /* Kernel threads do not participate in dependent sleeping */ | 3096 | if (!smt_curr->mm) |
2792 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3097 | goto unlock; |
2793 | goto check_smt_task; | ||
2794 | 3098 | ||
2795 | /* | 3099 | /* |
2796 | * If a user task with lower static priority than the | 3100 | * If a user task with lower static priority than the |
@@ -2808,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2808 | if ((jiffies % DEF_TIMESLICE) > | 3112 | if ((jiffies % DEF_TIMESLICE) > |
2809 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3113 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2810 | ret = 1; | 3114 | ret = 1; |
2811 | } else | 3115 | } else { |
2812 | if (smt_curr->static_prio < p->static_prio && | 3116 | if (smt_curr->static_prio < p->static_prio && |
2813 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3117 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
2814 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3118 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
2815 | ret = 1; | 3119 | ret = 1; |
2816 | |||
2817 | check_smt_task: | ||
2818 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
2819 | rt_task(smt_curr)) | ||
2820 | continue; | ||
2821 | if (!p->mm) { | ||
2822 | wakeup_busy_runqueue(smt_rq); | ||
2823 | continue; | ||
2824 | } | ||
2825 | |||
2826 | /* | ||
2827 | * Reschedule a lower priority task on the SMT sibling for | ||
2828 | * it to be put to sleep, or wake it up if it has been put to | ||
2829 | * sleep for priority reasons to see if it should run now. | ||
2830 | */ | ||
2831 | if (rt_task(p)) { | ||
2832 | if ((jiffies % DEF_TIMESLICE) > | ||
2833 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
2834 | resched_task(smt_curr); | ||
2835 | } else { | ||
2836 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2837 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
2838 | resched_task(smt_curr); | ||
2839 | else | ||
2840 | wakeup_busy_runqueue(smt_rq); | ||
2841 | } | 3120 | } |
3121 | unlock: | ||
3122 | spin_unlock(&smt_rq->lock); | ||
2842 | } | 3123 | } |
2843 | out_unlock: | ||
2844 | for_each_cpu_mask(i, sibling_map) | ||
2845 | spin_unlock(&cpu_rq(i)->lock); | ||
2846 | return ret; | 3124 | return ret; |
2847 | } | 3125 | } |
2848 | #else | 3126 | #else |
2849 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3127 | static inline void wake_sleeping_dependent(int this_cpu) |
2850 | { | 3128 | { |
2851 | } | 3129 | } |
2852 | 3130 | ||
2853 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3131 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, |
3132 | task_t *p) | ||
2854 | { | 3133 | { |
2855 | return 0; | 3134 | return 0; |
2856 | } | 3135 | } |
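The rewritten wake_sleeping_dependent() and dependent_sleeper() above no longer drop this_rq->lock and re-acquire every sibling lock in CPU order; they simply spin_trylock() each sibling runqueue and skip it on contention, which cannot deadlock because no lock is ever waited for while another is held. A minimal userspace sketch of the same trylock-and-bypass pattern (pthread spinlocks stand in for runqueue locks; all names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

static pthread_spinlock_t rq_lock[NR_SIBLINGS];

/* visit every sibling we can lock without blocking; bypass the rest */
static void poke_siblings(int this_cpu)
{
        int i;

        for (i = 0; i < NR_SIBLINGS; i++) {
                if (i == this_cpu)
                        continue;
                if (pthread_spin_trylock(&rq_lock[i]) != 0)
                        continue;       /* contended: skip instead of waiting */
                printf("cpu%d: inspected sibling %d\n", this_cpu, i);
                pthread_spin_unlock(&rq_lock[i]);
        }
}

int main(void)
{
        int i;

        for (i = 0; i < NR_SIBLINGS; i++)
                pthread_spin_init(&rq_lock[i], PTHREAD_PROCESS_PRIVATE);
        poke_siblings(0);
        return 0;
}

Because the caller's runqueue lock is never released, dependent_sleeper() can also be called late in schedule(), after 'next' has been chosen, instead of before the runqueue is inspected.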
@@ -2972,32 +3251,13 @@ need_resched_nonpreemptible: | |||
2972 | 3251 | ||
2973 | cpu = smp_processor_id(); | 3252 | cpu = smp_processor_id(); |
2974 | if (unlikely(!rq->nr_running)) { | 3253 | if (unlikely(!rq->nr_running)) { |
2975 | go_idle: | ||
2976 | idle_balance(cpu, rq); | 3254 | idle_balance(cpu, rq); |
2977 | if (!rq->nr_running) { | 3255 | if (!rq->nr_running) { |
2978 | next = rq->idle; | 3256 | next = rq->idle; |
2979 | rq->expired_timestamp = 0; | 3257 | rq->expired_timestamp = 0; |
2980 | wake_sleeping_dependent(cpu, rq); | 3258 | wake_sleeping_dependent(cpu); |
2981 | /* | ||
2982 | * wake_sleeping_dependent() might have released | ||
2983 | * the runqueue, so break out if we got new | ||
2984 | * tasks meanwhile: | ||
2985 | */ | ||
2986 | if (!rq->nr_running) | ||
2987 | goto switch_tasks; | ||
2988 | } | ||
2989 | } else { | ||
2990 | if (dependent_sleeper(cpu, rq)) { | ||
2991 | next = rq->idle; | ||
2992 | goto switch_tasks; | 3259 | goto switch_tasks; |
2993 | } | 3260 | } |
2994 | /* | ||
2995 | * dependent_sleeper() releases and reacquires the runqueue | ||
2996 | * lock, hence go into the idle loop if the rq went | ||
2997 | * empty meanwhile: | ||
2998 | */ | ||
2999 | if (unlikely(!rq->nr_running)) | ||
3000 | goto go_idle; | ||
3001 | } | 3261 | } |
3002 | 3262 | ||
3003 | array = rq->active; | 3263 | array = rq->active; |
@@ -3035,6 +3295,8 @@ go_idle: | |||
3035 | } | 3295 | } |
3036 | } | 3296 | } |
3037 | next->sleep_type = SLEEP_NORMAL; | 3297 | next->sleep_type = SLEEP_NORMAL; |
3298 | if (dependent_sleeper(cpu, rq, next)) | ||
3299 | next = rq->idle; | ||
3038 | switch_tasks: | 3300 | switch_tasks: |
3039 | if (next == rq->idle) | 3301 | if (next == rq->idle) |
3040 | schedstat_inc(rq, sched_goidle); | 3302 | schedstat_inc(rq, sched_goidle); |
@@ -3478,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3478 | 3740 | ||
3479 | EXPORT_SYMBOL(sleep_on_timeout); | 3741 | EXPORT_SYMBOL(sleep_on_timeout); |
3480 | 3742 | ||
3743 | #ifdef CONFIG_RT_MUTEXES | ||
3744 | |||
3745 | /* | ||
3746 | * rt_mutex_setprio - set the current priority of a task | ||
3747 | * @p: task | ||
3748 | * @prio: prio value (kernel-internal form) | ||
3749 | * | ||
3750 | * This function changes the 'effective' priority of a task. It does | ||
3751 | * not touch ->normal_prio like __setscheduler(). | ||
3752 | * | ||
3753 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
3754 | */ | ||
3755 | void rt_mutex_setprio(task_t *p, int prio) | ||
3756 | { | ||
3757 | unsigned long flags; | ||
3758 | prio_array_t *array; | ||
3759 | runqueue_t *rq; | ||
3760 | int oldprio; | ||
3761 | |||
3762 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
3763 | |||
3764 | rq = task_rq_lock(p, &flags); | ||
3765 | |||
3766 | oldprio = p->prio; | ||
3767 | array = p->array; | ||
3768 | if (array) | ||
3769 | dequeue_task(p, array); | ||
3770 | p->prio = prio; | ||
3771 | |||
3772 | if (array) { | ||
3773 | /* | ||
3774 | * If changing to an RT priority then queue it | ||
3775 | * in the active array! | ||
3776 | */ | ||
3777 | if (rt_task(p)) | ||
3778 | array = rq->active; | ||
3779 | enqueue_task(p, array); | ||
3780 | /* | ||
3781 | * Reschedule if we are currently running on this runqueue and | ||
3782 | * our priority decreased, or if we are not currently running on | ||
3783 | * this runqueue and our priority is higher than the current's | ||
3784 | */ | ||
3785 | if (task_running(rq, p)) { | ||
3786 | if (p->prio > oldprio) | ||
3787 | resched_task(rq->curr); | ||
3788 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3789 | resched_task(rq->curr); | ||
3790 | } | ||
3791 | task_rq_unlock(rq, &flags); | ||
3792 | } | ||
3793 | |||
3794 | #endif | ||
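rt_mutex_setprio() above gives the rt_mutex code a hook to raise or restore a task's effective priority without disturbing the policy-derived normal_prio. The boost/restore sequence it enables looks roughly like the sketch below; the structs and helpers are simplified stand-ins for illustration, not the locking-aware kernel code:

#include <stdio.h>

struct demo_task {
        int normal_prio;        /* what the scheduling policy dictates */
        int prio;               /* effective priority, possibly boosted */
};

/* boost the lock owner to the top waiter's priority if that is higher */
static void demo_pi_boost(struct demo_task *owner, const struct demo_task *waiter)
{
        if (waiter->prio < owner->prio)         /* lower value == higher priority */
                owner->prio = waiter->prio;     /* rt_mutex_setprio(owner, ...) */
}

/* on unlock, fall back to whatever the remaining waiters or the policy dictate */
static void demo_pi_restore(struct demo_task *owner)
{
        owner->prio = owner->normal_prio;       /* rt_mutex_setprio(owner, rt_mutex_getprio(owner)) */
}

int main(void)
{
        struct demo_task owner  = { .normal_prio = 120, .prio = 120 };
        struct demo_task waiter = { .normal_prio = 50,  .prio = 50 };

        demo_pi_boost(&owner, &waiter);
        printf("boosted prio:  %d\n", owner.prio);      /* 50 */
        demo_pi_restore(&owner);
        printf("restored prio: %d\n", owner.prio);      /* 120 */
        return 0;
}

The dequeue/requeue and resched_task() logic inside rt_mutex_setprio() is what makes such a boost visible to the scheduler immediately.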
3795 | |||
3481 | void set_user_nice(task_t *p, long nice) | 3796 | void set_user_nice(task_t *p, long nice) |
3482 | { | 3797 | { |
3483 | unsigned long flags; | 3798 | unsigned long flags; |
3484 | prio_array_t *array; | 3799 | prio_array_t *array; |
3485 | runqueue_t *rq; | 3800 | runqueue_t *rq; |
3486 | int old_prio, new_prio, delta; | 3801 | int old_prio, delta; |
3487 | 3802 | ||
3488 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3803 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3489 | return; | 3804 | return; |
@@ -3498,22 +3813,25 @@ void set_user_nice(task_t *p, long nice) | |||
3498 | * it wont have any effect on scheduling until the task is | 3813 | * it wont have any effect on scheduling until the task is |
3499 | * not SCHED_NORMAL/SCHED_BATCH: | 3814 | * not SCHED_NORMAL/SCHED_BATCH: |
3500 | */ | 3815 | */ |
3501 | if (rt_task(p)) { | 3816 | if (has_rt_policy(p)) { |
3502 | p->static_prio = NICE_TO_PRIO(nice); | 3817 | p->static_prio = NICE_TO_PRIO(nice); |
3503 | goto out_unlock; | 3818 | goto out_unlock; |
3504 | } | 3819 | } |
3505 | array = p->array; | 3820 | array = p->array; |
3506 | if (array) | 3821 | if (array) { |
3507 | dequeue_task(p, array); | 3822 | dequeue_task(p, array); |
3823 | dec_raw_weighted_load(rq, p); | ||
3824 | } | ||
3508 | 3825 | ||
3509 | old_prio = p->prio; | ||
3510 | new_prio = NICE_TO_PRIO(nice); | ||
3511 | delta = new_prio - old_prio; | ||
3512 | p->static_prio = NICE_TO_PRIO(nice); | 3826 | p->static_prio = NICE_TO_PRIO(nice); |
3513 | p->prio += delta; | 3827 | set_load_weight(p); |
3828 | old_prio = p->prio; | ||
3829 | p->prio = effective_prio(p); | ||
3830 | delta = p->prio - old_prio; | ||
3514 | 3831 | ||
3515 | if (array) { | 3832 | if (array) { |
3516 | enqueue_task(p, array); | 3833 | enqueue_task(p, array); |
3834 | inc_raw_weighted_load(rq, p); | ||
3517 | /* | 3835 | /* |
3518 | * If the task increased its priority or is running and | 3836 | * If the task increased its priority or is running and |
3519 | * lowered its priority, then reschedule its CPU: | 3837 | * lowered its priority, then reschedule its CPU: |
@@ -3524,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) | |||
3524 | out_unlock: | 3842 | out_unlock: |
3525 | task_rq_unlock(rq, &flags); | 3843 | task_rq_unlock(rq, &flags); |
3526 | } | 3844 | } |
3527 | |||
3528 | EXPORT_SYMBOL(set_user_nice); | 3845 | EXPORT_SYMBOL(set_user_nice); |
3529 | 3846 | ||
3530 | /* | 3847 | /* |
@@ -3639,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3639 | BUG_ON(p->array); | 3956 | BUG_ON(p->array); |
3640 | p->policy = policy; | 3957 | p->policy = policy; |
3641 | p->rt_priority = prio; | 3958 | p->rt_priority = prio; |
3642 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 3959 | p->normal_prio = normal_prio(p); |
3643 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 3960 | /* we are holding p->pi_lock already */ |
3644 | } else { | 3961 | p->prio = rt_mutex_getprio(p); |
3645 | p->prio = p->static_prio; | 3962 | /* |
3646 | /* | 3963 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
3647 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 3964 | */ |
3648 | */ | 3965 | if (policy == SCHED_BATCH) |
3649 | if (policy == SCHED_BATCH) | 3966 | p->sleep_avg = 0; |
3650 | p->sleep_avg = 0; | 3967 | set_load_weight(p); |
3651 | } | ||
3652 | } | 3968 | } |
3653 | 3969 | ||
3654 | /** | 3970 | /** |
@@ -3667,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
3667 | unsigned long flags; | 3983 | unsigned long flags; |
3668 | runqueue_t *rq; | 3984 | runqueue_t *rq; |
3669 | 3985 | ||
3986 | /* may grab non-irq protected spin_locks */ | ||
3987 | BUG_ON(in_interrupt()); | ||
3670 | recheck: | 3988 | recheck: |
3671 | /* double check policy once rq lock held */ | 3989 | /* double check policy once rq lock held */ |
3672 | if (policy < 0) | 3990 | if (policy < 0) |
@@ -3715,14 +4033,20 @@ recheck: | |||
3715 | if (retval) | 4033 | if (retval) |
3716 | return retval; | 4034 | return retval; |
3717 | /* | 4035 | /* |
4036 | * make sure no PI-waiters arrive (or leave) while we are | ||
4037 | * changing the priority of the task: | ||
4038 | */ | ||
4039 | spin_lock_irqsave(&p->pi_lock, flags); | ||
4040 | /* | ||
3718 | * To be able to change p->policy safely, the apropriate | 4041 | * To be able to change p->policy safely, the apropriate |
3719 | * runqueue lock must be held. | 4042 | * runqueue lock must be held. |
3720 | */ | 4043 | */ |
3721 | rq = task_rq_lock(p, &flags); | 4044 | rq = __task_rq_lock(p); |
3722 | /* recheck policy now with rq lock held */ | 4045 | /* recheck policy now with rq lock held */ |
3723 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4046 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
3724 | policy = oldpolicy = -1; | 4047 | policy = oldpolicy = -1; |
3725 | task_rq_unlock(rq, &flags); | 4048 | __task_rq_unlock(rq); |
4049 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
3726 | goto recheck; | 4050 | goto recheck; |
3727 | } | 4051 | } |
3728 | array = p->array; | 4052 | array = p->array; |
@@ -3743,7 +4067,11 @@ recheck: | |||
3743 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4067 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3744 | resched_task(rq->curr); | 4068 | resched_task(rq->curr); |
3745 | } | 4069 | } |
3746 | task_rq_unlock(rq, &flags); | 4070 | __task_rq_unlock(rq); |
4071 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4072 | |||
4073 | rt_mutex_adjust_pi(p); | ||
4074 | |||
3747 | return 0; | 4075 | return 0; |
3748 | } | 4076 | } |
3749 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4077 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
@@ -3765,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3765 | read_unlock_irq(&tasklist_lock); | 4093 | read_unlock_irq(&tasklist_lock); |
3766 | return -ESRCH; | 4094 | return -ESRCH; |
3767 | } | 4095 | } |
3768 | retval = sched_setscheduler(p, policy, &lparam); | 4096 | get_task_struct(p); |
3769 | read_unlock_irq(&tasklist_lock); | 4097 | read_unlock_irq(&tasklist_lock); |
4098 | retval = sched_setscheduler(p, policy, &lparam); | ||
4099 | put_task_struct(p); | ||
3770 | return retval; | 4100 | return retval; |
3771 | } | 4101 | } |
3772 | 4102 | ||
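The change above pins the task with get_task_struct() so that tasklist_lock can be dropped before sched_setscheduler() runs, which may now grab locks that are not safe under the tasklist read lock (the new BUG_ON(in_interrupt()) hints at the same constraint). The general pattern, pin the object, drop the lookup lock, operate, then unpin, can be sketched with C11 atomics standing in for the kernel's task reference counting (names are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_task {
        atomic_int usage;
        int policy;
};

static void demo_get_task(struct demo_task *p)
{
        atomic_fetch_add(&p->usage, 1);
}

static void demo_put_task(struct demo_task *p)
{
        if (atomic_fetch_sub(&p->usage, 1) == 1)
                free(p);                        /* last reference dropped */
}

/* may block; must not run under the lookup lock */
static int demo_setscheduler(struct demo_task *p, int policy)
{
        p->policy = policy;
        return 0;
}

int main(void)
{
        struct demo_task *p = calloc(1, sizeof(*p));
        int ret;

        atomic_init(&p->usage, 1);              /* reference held by the creator */

        /* lookup phase: the real code holds tasklist_lock here */
        demo_get_task(p);                       /* pin so the task cannot vanish */
        /* ...drop the lookup lock... */

        ret = demo_setscheduler(p, 1);          /* safe to block now */
        demo_put_task(p);                       /* matches demo_get_task() */

        printf("ret=%d\n", ret);
        demo_put_task(p);                       /* drop the creator's reference */
        return 0;
}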
@@ -4378,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4378 | idle->timestamp = sched_clock(); | 4708 | idle->timestamp = sched_clock(); |
4379 | idle->sleep_avg = 0; | 4709 | idle->sleep_avg = 0; |
4380 | idle->array = NULL; | 4710 | idle->array = NULL; |
4381 | idle->prio = MAX_PRIO; | 4711 | idle->prio = idle->normal_prio = MAX_PRIO; |
4382 | idle->state = TASK_RUNNING; | 4712 | idle->state = TASK_RUNNING; |
4383 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4713 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4384 | set_task_cpu(idle, cpu); | 4714 | set_task_cpu(idle, cpu); |
@@ -4474,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
4474 | * | 4804 | * |
4475 | * So we race with normal scheduler movements, but that's OK, as long | 4805 | * So we race with normal scheduler movements, but that's OK, as long |
4476 | * as the task is no longer on this CPU. | 4806 | * as the task is no longer on this CPU. |
4807 | * | ||
4808 | * Returns non-zero if task was successfully migrated. | ||
4477 | */ | 4809 | */ |
4478 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4810 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4479 | { | 4811 | { |
4480 | runqueue_t *rq_dest, *rq_src; | 4812 | runqueue_t *rq_dest, *rq_src; |
4813 | int ret = 0; | ||
4481 | 4814 | ||
4482 | if (unlikely(cpu_is_offline(dest_cpu))) | 4815 | if (unlikely(cpu_is_offline(dest_cpu))) |
4483 | return; | 4816 | return ret; |
4484 | 4817 | ||
4485 | rq_src = cpu_rq(src_cpu); | 4818 | rq_src = cpu_rq(src_cpu); |
4486 | rq_dest = cpu_rq(dest_cpu); | 4819 | rq_dest = cpu_rq(dest_cpu); |
@@ -4508,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4508 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4841 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4509 | resched_task(rq_dest->curr); | 4842 | resched_task(rq_dest->curr); |
4510 | } | 4843 | } |
4511 | 4844 | ret = 1; | |
4512 | out: | 4845 | out: |
4513 | double_rq_unlock(rq_src, rq_dest); | 4846 | double_rq_unlock(rq_src, rq_dest); |
4847 | return ret; | ||
4514 | } | 4848 | } |
4515 | 4849 | ||
4516 | /* | 4850 | /* |
@@ -4580,9 +4914,12 @@ wait_to_die: | |||
4580 | /* Figure out where task on dead CPU should go, use force if neccessary. */ | 4914 | /* Figure out where task on dead CPU should go, use force if neccessary. */ |
4581 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 4915 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) |
4582 | { | 4916 | { |
4917 | runqueue_t *rq; | ||
4918 | unsigned long flags; | ||
4583 | int dest_cpu; | 4919 | int dest_cpu; |
4584 | cpumask_t mask; | 4920 | cpumask_t mask; |
4585 | 4921 | ||
4922 | restart: | ||
4586 | /* On same node? */ | 4923 | /* On same node? */ |
4587 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 4924 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4588 | cpus_and(mask, mask, tsk->cpus_allowed); | 4925 | cpus_and(mask, mask, tsk->cpus_allowed); |
@@ -4594,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4594 | 4931 | ||
4595 | /* No more Mr. Nice Guy. */ | 4932 | /* No more Mr. Nice Guy. */ |
4596 | if (dest_cpu == NR_CPUS) { | 4933 | if (dest_cpu == NR_CPUS) { |
4934 | rq = task_rq_lock(tsk, &flags); | ||
4597 | cpus_setall(tsk->cpus_allowed); | 4935 | cpus_setall(tsk->cpus_allowed); |
4598 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 4936 | dest_cpu = any_online_cpu(tsk->cpus_allowed); |
4937 | task_rq_unlock(rq, &flags); | ||
4599 | 4938 | ||
4600 | /* | 4939 | /* |
4601 | * Don't tell them about moving exiting tasks or | 4940 | * Don't tell them about moving exiting tasks or |
@@ -4607,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4607 | "longer affine to cpu%d\n", | 4946 | "longer affine to cpu%d\n", |
4608 | tsk->pid, tsk->comm, dead_cpu); | 4947 | tsk->pid, tsk->comm, dead_cpu); |
4609 | } | 4948 | } |
4610 | __migrate_task(tsk, dead_cpu, dest_cpu); | 4949 | if (!__migrate_task(tsk, dead_cpu, dest_cpu)) |
4950 | goto restart; | ||
4611 | } | 4951 | } |
4612 | 4952 | ||
4613 | /* | 4953 | /* |
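Since __migrate_task() can now report failure (returning 0 when the chosen destination went offline in the meantime), move_task_off_dead_cpu() restarts the whole destination search instead of silently losing the task. The control flow is a plain retry loop; a compact sketch with hypothetical helpers:

#include <stdio.h>

static int cpu_online[4] = { 0, 1, 1, 1 };      /* cpu0 is the one going down */

static int demo_pick_dest_cpu(int dead_cpu)
{
        int i;

        for (i = 0; i < 4; i++)
                if (i != dead_cpu && cpu_online[i])
                        return i;
        return -1;
}

/* returns non-zero on success, 0 if the destination vanished meanwhile */
static int demo_migrate_task(int src_cpu, int dest_cpu)
{
        if (dest_cpu < 0 || !cpu_online[dest_cpu])
                return 0;
        printf("moved task from cpu%d to cpu%d\n", src_cpu, dest_cpu);
        return 1;
}

static void demo_move_task_off_dead_cpu(int dead_cpu)
{
        int dest_cpu;

restart:
        dest_cpu = demo_pick_dest_cpu(dead_cpu);
        if (!demo_migrate_task(dead_cpu, dest_cpu))
                goto restart;                   /* re-evaluate with fresh CPU state */
}

int main(void)
{
        demo_move_task_off_dead_cpu(0);
        return 0;
}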
@@ -4734,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
4734 | * migration_call - callback that gets triggered when a CPU is added. | 5074 | * migration_call - callback that gets triggered when a CPU is added. |
4735 | * Here we can start up the necessary migration thread for the new CPU. | 5075 | * Here we can start up the necessary migration thread for the new CPU. |
4736 | */ | 5076 | */ |
4737 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5077 | static int __cpuinit migration_call(struct notifier_block *nfb, |
4738 | void *hcpu) | 5078 | unsigned long action, |
5079 | void *hcpu) | ||
4739 | { | 5080 | { |
4740 | int cpu = (long)hcpu; | 5081 | int cpu = (long)hcpu; |
4741 | struct task_struct *p; | 5082 | struct task_struct *p; |
@@ -4805,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4805 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5146 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4806 | * happens before everything else. | 5147 | * happens before everything else. |
4807 | */ | 5148 | */ |
4808 | static struct notifier_block migration_notifier = { | 5149 | static struct notifier_block __cpuinitdata migration_notifier = { |
4809 | .notifier_call = migration_call, | 5150 | .notifier_call = migration_call, |
4810 | .priority = 10 | 5151 | .priority = 10 |
4811 | }; | 5152 | }; |
@@ -5606,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
5606 | } | 5947 | } |
5607 | #endif | 5948 | #endif |
5608 | 5949 | ||
5950 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5609 | /* | 5951 | /* |
5610 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 5952 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we |
5611 | * can switch it on easily if needed. | 5953 | * can switch it on easily if needed. |
@@ -5621,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu) | |||
5621 | 5963 | ||
5622 | #ifdef CONFIG_SCHED_MC | 5964 | #ifdef CONFIG_SCHED_MC |
5623 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 5965 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
5624 | static struct sched_group sched_group_core[NR_CPUS]; | 5966 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
5625 | #endif | 5967 | #endif |
5626 | 5968 | ||
5627 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 5969 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
@@ -5637,7 +5979,7 @@ static int cpu_to_core_group(int cpu) | |||
5637 | #endif | 5979 | #endif |
5638 | 5980 | ||
5639 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 5981 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5640 | static struct sched_group sched_group_phys[NR_CPUS]; | 5982 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
5641 | static int cpu_to_phys_group(int cpu) | 5983 | static int cpu_to_phys_group(int cpu) |
5642 | { | 5984 | { |
5643 | #if defined(CONFIG_SCHED_MC) | 5985 | #if defined(CONFIG_SCHED_MC) |
@@ -5694,13 +6036,74 @@ next_sg: | |||
5694 | } | 6036 | } |
5695 | #endif | 6037 | #endif |
5696 | 6038 | ||
6039 | /* Free memory allocated for various sched_group structures */ | ||
6040 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
6041 | { | ||
6042 | int cpu; | ||
6043 | #ifdef CONFIG_NUMA | ||
6044 | int i; | ||
6045 | |||
6046 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6047 | struct sched_group *sched_group_allnodes | ||
6048 | = sched_group_allnodes_bycpu[cpu]; | ||
6049 | struct sched_group **sched_group_nodes | ||
6050 | = sched_group_nodes_bycpu[cpu]; | ||
6051 | |||
6052 | if (sched_group_allnodes) { | ||
6053 | kfree(sched_group_allnodes); | ||
6054 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6055 | } | ||
6056 | |||
6057 | if (!sched_group_nodes) | ||
6058 | continue; | ||
6059 | |||
6060 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
6061 | cpumask_t nodemask = node_to_cpumask(i); | ||
6062 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6063 | |||
6064 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6065 | if (cpus_empty(nodemask)) | ||
6066 | continue; | ||
6067 | |||
6068 | if (sg == NULL) | ||
6069 | continue; | ||
6070 | sg = sg->next; | ||
6071 | next_sg: | ||
6072 | oldsg = sg; | ||
6073 | sg = sg->next; | ||
6074 | kfree(oldsg); | ||
6075 | if (oldsg != sched_group_nodes[i]) | ||
6076 | goto next_sg; | ||
6077 | } | ||
6078 | kfree(sched_group_nodes); | ||
6079 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6080 | } | ||
6081 | #endif | ||
6082 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6083 | if (sched_group_phys_bycpu[cpu]) { | ||
6084 | kfree(sched_group_phys_bycpu[cpu]); | ||
6085 | sched_group_phys_bycpu[cpu] = NULL; | ||
6086 | } | ||
6087 | #ifdef CONFIG_SCHED_MC | ||
6088 | if (sched_group_core_bycpu[cpu]) { | ||
6089 | kfree(sched_group_core_bycpu[cpu]); | ||
6090 | sched_group_core_bycpu[cpu] = NULL; | ||
6091 | } | ||
6092 | #endif | ||
6093 | } | ||
6094 | } | ||
6095 | |||
5697 | /* | 6096 | /* |
5698 | * Build sched domains for a given set of cpus and attach the sched domains | 6097 | * Build sched domains for a given set of cpus and attach the sched domains |
5699 | * to the individual cpus | 6098 | * to the individual cpus |
5700 | */ | 6099 | */ |
5701 | void build_sched_domains(const cpumask_t *cpu_map) | 6100 | static int build_sched_domains(const cpumask_t *cpu_map) |
5702 | { | 6101 | { |
5703 | int i; | 6102 | int i; |
6103 | struct sched_group *sched_group_phys = NULL; | ||
6104 | #ifdef CONFIG_SCHED_MC | ||
6105 | struct sched_group *sched_group_core = NULL; | ||
6106 | #endif | ||
5704 | #ifdef CONFIG_NUMA | 6107 | #ifdef CONFIG_NUMA |
5705 | struct sched_group **sched_group_nodes = NULL; | 6108 | struct sched_group **sched_group_nodes = NULL; |
5706 | struct sched_group *sched_group_allnodes = NULL; | 6109 | struct sched_group *sched_group_allnodes = NULL; |
@@ -5708,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5708 | /* | 6111 | /* |
5709 | * Allocate the per-node list of sched groups | 6112 | * Allocate the per-node list of sched groups |
5710 | */ | 6113 | */ |
5711 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6114 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
5712 | GFP_ATOMIC); | 6115 | GFP_KERNEL); |
5713 | if (!sched_group_nodes) { | 6116 | if (!sched_group_nodes) { |
5714 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6117 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
5715 | return; | 6118 | return -ENOMEM; |
5716 | } | 6119 | } |
5717 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6120 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
5718 | #endif | 6121 | #endif |
@@ -5738,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5738 | if (!sched_group_allnodes) { | 6141 | if (!sched_group_allnodes) { |
5739 | printk(KERN_WARNING | 6142 | printk(KERN_WARNING |
5740 | "Can not alloc allnodes sched group\n"); | 6143 | "Can not alloc allnodes sched group\n"); |
5741 | break; | 6144 | goto error; |
5742 | } | 6145 | } |
5743 | sched_group_allnodes_bycpu[i] | 6146 | sched_group_allnodes_bycpu[i] |
5744 | = sched_group_allnodes; | 6147 | = sched_group_allnodes; |
@@ -5759,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5759 | cpus_and(sd->span, sd->span, *cpu_map); | 6162 | cpus_and(sd->span, sd->span, *cpu_map); |
5760 | #endif | 6163 | #endif |
5761 | 6164 | ||
6165 | if (!sched_group_phys) { | ||
6166 | sched_group_phys | ||
6167 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6168 | GFP_KERNEL); | ||
6169 | if (!sched_group_phys) { | ||
6170 | printk(KERN_WARNING "Can not alloc phys sched " | ||
6171 | "group\n"); | ||
6172 | goto error; | ||
6173 | } | ||
6174 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
6175 | } | ||
6176 | |||
5762 | p = sd; | 6177 | p = sd; |
5763 | sd = &per_cpu(phys_domains, i); | 6178 | sd = &per_cpu(phys_domains, i); |
5764 | group = cpu_to_phys_group(i); | 6179 | group = cpu_to_phys_group(i); |
@@ -5768,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5768 | sd->groups = &sched_group_phys[group]; | 6183 | sd->groups = &sched_group_phys[group]; |
5769 | 6184 | ||
5770 | #ifdef CONFIG_SCHED_MC | 6185 | #ifdef CONFIG_SCHED_MC |
6186 | if (!sched_group_core) { | ||
6187 | sched_group_core | ||
6188 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6189 | GFP_KERNEL); | ||
6190 | if (!sched_group_core) { | ||
6191 | printk(KERN_WARNING "Can not alloc core sched " | ||
6192 | "group\n"); | ||
6193 | goto error; | ||
6194 | } | ||
6195 | sched_group_core_bycpu[i] = sched_group_core; | ||
6196 | } | ||
6197 | |||
5771 | p = sd; | 6198 | p = sd; |
5772 | sd = &per_cpu(core_domains, i); | 6199 | sd = &per_cpu(core_domains, i); |
5773 | group = cpu_to_core_group(i); | 6200 | group = cpu_to_core_group(i); |
@@ -5851,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5851 | domainspan = sched_domain_node_span(i); | 6278 | domainspan = sched_domain_node_span(i); |
5852 | cpus_and(domainspan, domainspan, *cpu_map); | 6279 | cpus_and(domainspan, domainspan, *cpu_map); |
5853 | 6280 | ||
5854 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6281 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6282 | if (!sg) { | ||
6283 | printk(KERN_WARNING "Can not alloc domain group for " | ||
6284 | "node %d\n", i); | ||
6285 | goto error; | ||
6286 | } | ||
5855 | sched_group_nodes[i] = sg; | 6287 | sched_group_nodes[i] = sg; |
5856 | for_each_cpu_mask(j, nodemask) { | 6288 | for_each_cpu_mask(j, nodemask) { |
5857 | struct sched_domain *sd; | 6289 | struct sched_domain *sd; |
5858 | sd = &per_cpu(node_domains, j); | 6290 | sd = &per_cpu(node_domains, j); |
5859 | sd->groups = sg; | 6291 | sd->groups = sg; |
5860 | if (sd->groups == NULL) { | ||
5861 | /* Turn off balancing if we have no groups */ | ||
5862 | sd->flags = 0; | ||
5863 | } | ||
5864 | } | ||
5865 | if (!sg) { | ||
5866 | printk(KERN_WARNING | ||
5867 | "Can not alloc domain group for node %d\n", i); | ||
5868 | continue; | ||
5869 | } | 6292 | } |
5870 | sg->cpu_power = 0; | 6293 | sg->cpu_power = 0; |
5871 | sg->cpumask = nodemask; | 6294 | sg->cpumask = nodemask; |
6295 | sg->next = sg; | ||
5872 | cpus_or(covered, covered, nodemask); | 6296 | cpus_or(covered, covered, nodemask); |
5873 | prev = sg; | 6297 | prev = sg; |
5874 | 6298 | ||
@@ -5887,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5887 | if (cpus_empty(tmp)) | 6311 | if (cpus_empty(tmp)) |
5888 | continue; | 6312 | continue; |
5889 | 6313 | ||
5890 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6314 | sg = kmalloc_node(sizeof(struct sched_group), |
6315 | GFP_KERNEL, i); | ||
5891 | if (!sg) { | 6316 | if (!sg) { |
5892 | printk(KERN_WARNING | 6317 | printk(KERN_WARNING |
5893 | "Can not alloc domain group for node %d\n", j); | 6318 | "Can not alloc domain group for node %d\n", j); |
5894 | break; | 6319 | goto error; |
5895 | } | 6320 | } |
5896 | sg->cpu_power = 0; | 6321 | sg->cpu_power = 0; |
5897 | sg->cpumask = tmp; | 6322 | sg->cpumask = tmp; |
6323 | sg->next = prev->next; | ||
5898 | cpus_or(covered, covered, tmp); | 6324 | cpus_or(covered, covered, tmp); |
5899 | prev->next = sg; | 6325 | prev->next = sg; |
5900 | prev = sg; | 6326 | prev = sg; |
5901 | } | 6327 | } |
5902 | prev->next = sched_group_nodes[i]; | ||
5903 | } | 6328 | } |
5904 | #endif | 6329 | #endif |
5905 | 6330 | ||
5906 | /* Calculate CPU power for physical packages and nodes */ | 6331 | /* Calculate CPU power for physical packages and nodes */ |
6332 | #ifdef CONFIG_SCHED_SMT | ||
5907 | for_each_cpu_mask(i, *cpu_map) { | 6333 | for_each_cpu_mask(i, *cpu_map) { |
5908 | int power; | ||
5909 | struct sched_domain *sd; | 6334 | struct sched_domain *sd; |
5910 | #ifdef CONFIG_SCHED_SMT | ||
5911 | sd = &per_cpu(cpu_domains, i); | 6335 | sd = &per_cpu(cpu_domains, i); |
5912 | power = SCHED_LOAD_SCALE; | 6336 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
5913 | sd->groups->cpu_power = power; | 6337 | } |
5914 | #endif | 6338 | #endif |
5915 | #ifdef CONFIG_SCHED_MC | 6339 | #ifdef CONFIG_SCHED_MC |
6340 | for_each_cpu_mask(i, *cpu_map) { | ||
6341 | int power; | ||
6342 | struct sched_domain *sd; | ||
5916 | sd = &per_cpu(core_domains, i); | 6343 | sd = &per_cpu(core_domains, i); |
5917 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6344 | if (sched_smt_power_savings) |
6345 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
6346 | else | ||
6347 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
5918 | * SCHED_LOAD_SCALE / 10; | 6348 | * SCHED_LOAD_SCALE / 10; |
5919 | sd->groups->cpu_power = power; | 6349 | sd->groups->cpu_power = power; |
6350 | } | ||
6351 | #endif | ||
5920 | 6352 | ||
6353 | for_each_cpu_mask(i, *cpu_map) { | ||
6354 | struct sched_domain *sd; | ||
6355 | #ifdef CONFIG_SCHED_MC | ||
5921 | sd = &per_cpu(phys_domains, i); | 6356 | sd = &per_cpu(phys_domains, i); |
6357 | if (i != first_cpu(sd->groups->cpumask)) | ||
6358 | continue; | ||
5922 | 6359 | ||
5923 | /* | 6360 | sd->groups->cpu_power = 0; |
5924 | * This has to be < 2 * SCHED_LOAD_SCALE | 6361 | if (sched_mc_power_savings || sched_smt_power_savings) { |
5925 | * Lets keep it SCHED_LOAD_SCALE, so that | 6362 | int j; |
5926 | * while calculating NUMA group's cpu_power | 6363 | |
5927 | * we can simply do | 6364 | for_each_cpu_mask(j, sd->groups->cpumask) { |
5928 | * numa_group->cpu_power += phys_group->cpu_power; | 6365 | struct sched_domain *sd1; |
5929 | * | 6366 | sd1 = &per_cpu(core_domains, j); |
5930 | * See "only add power once for each physical pkg" | 6367 | /* |
5931 | * comment below | 6368 | * for each core we will add once |
5932 | */ | 6369 | * to the group in physical domain |
5933 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6370 | */ |
6371 | if (j != first_cpu(sd1->groups->cpumask)) | ||
6372 | continue; | ||
6373 | |||
6374 | if (sched_smt_power_savings) | ||
6375 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
6376 | else | ||
6377 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
6378 | } | ||
6379 | } else | ||
6380 | /* | ||
6381 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
6382 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
6383 | * while calculating NUMA group's cpu_power | ||
6384 | * we can simply do | ||
6385 | * numa_group->cpu_power += phys_group->cpu_power; | ||
6386 | * | ||
6387 | * See "only add power once for each physical pkg" | ||
6388 | * comment below | ||
6389 | */ | ||
6390 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
5934 | #else | 6391 | #else |
6392 | int power; | ||
5935 | sd = &per_cpu(phys_domains, i); | 6393 | sd = &per_cpu(phys_domains, i); |
5936 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6394 | if (sched_smt_power_savings) |
5937 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6395 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
6396 | else | ||
6397 | power = SCHED_LOAD_SCALE; | ||
5938 | sd->groups->cpu_power = power; | 6398 | sd->groups->cpu_power = power; |
5939 | #endif | 6399 | #endif |
5940 | } | 6400 | } |
@@ -5962,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5962 | * Tune cache-hot values: | 6422 | * Tune cache-hot values: |
5963 | */ | 6423 | */ |
5964 | calibrate_migration_costs(cpu_map); | 6424 | calibrate_migration_costs(cpu_map); |
6425 | |||
6426 | return 0; | ||
6427 | |||
6428 | error: | ||
6429 | free_sched_groups(cpu_map); | ||
6430 | return -ENOMEM; | ||
5965 | } | 6431 | } |
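build_sched_domains() now fails cleanly: every allocation failure jumps to the single error label, free_sched_groups() walks the per-CPU ownership arrays (sched_group_phys_bycpu and friends) to release whatever was already set up, and -ENOMEM propagates to the callers, which were changed to return int as well. A condensed sketch of that allocate-or-unwind shape (names are illustrative):

#include <stdio.h>
#include <stdlib.h>

#define DEMO_NCPUS 4

static void *demo_group_bycpu[DEMO_NCPUS];      /* records what each cpu allocated */

static void demo_free_groups(void)
{
        int cpu;

        for (cpu = 0; cpu < DEMO_NCPUS; cpu++) {
                free(demo_group_bycpu[cpu]);    /* free(NULL) is a no-op */
                demo_group_bycpu[cpu] = NULL;
        }
}

static int demo_build_domains(void)
{
        int cpu;

        for (cpu = 0; cpu < DEMO_NCPUS; cpu++) {
                demo_group_bycpu[cpu] = calloc(64, sizeof(long));
                if (!demo_group_bycpu[cpu])
                        goto error;             /* unwind everything done so far */
        }
        return 0;

error:
        demo_free_groups();
        return -1;                              /* stands in for -ENOMEM */
}

int main(void)
{
        printf("build: %d\n", demo_build_domains());
        demo_free_groups();
        return 0;
}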
5966 | /* | 6432 | /* |
5967 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6433 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5968 | */ | 6434 | */ |
5969 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6435 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
5970 | { | 6436 | { |
5971 | cpumask_t cpu_default_map; | 6437 | cpumask_t cpu_default_map; |
6438 | int err; | ||
5972 | 6439 | ||
5973 | /* | 6440 | /* |
5974 | * Setup mask for cpus without special case scheduling requirements. | 6441 | * Setup mask for cpus without special case scheduling requirements. |
@@ -5977,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
5977 | */ | 6444 | */ |
5978 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6445 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
5979 | 6446 | ||
5980 | build_sched_domains(&cpu_default_map); | 6447 | err = build_sched_domains(&cpu_default_map); |
6448 | |||
6449 | return err; | ||
5981 | } | 6450 | } |
5982 | 6451 | ||
5983 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6452 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5984 | { | 6453 | { |
5985 | #ifdef CONFIG_NUMA | 6454 | free_sched_groups(cpu_map); |
5986 | int i; | ||
5987 | int cpu; | ||
5988 | |||
5989 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5990 | struct sched_group *sched_group_allnodes | ||
5991 | = sched_group_allnodes_bycpu[cpu]; | ||
5992 | struct sched_group **sched_group_nodes | ||
5993 | = sched_group_nodes_bycpu[cpu]; | ||
5994 | |||
5995 | if (sched_group_allnodes) { | ||
5996 | kfree(sched_group_allnodes); | ||
5997 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
5998 | } | ||
5999 | |||
6000 | if (!sched_group_nodes) | ||
6001 | continue; | ||
6002 | |||
6003 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
6004 | cpumask_t nodemask = node_to_cpumask(i); | ||
6005 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6006 | |||
6007 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6008 | if (cpus_empty(nodemask)) | ||
6009 | continue; | ||
6010 | |||
6011 | if (sg == NULL) | ||
6012 | continue; | ||
6013 | sg = sg->next; | ||
6014 | next_sg: | ||
6015 | oldsg = sg; | ||
6016 | sg = sg->next; | ||
6017 | kfree(oldsg); | ||
6018 | if (oldsg != sched_group_nodes[i]) | ||
6019 | goto next_sg; | ||
6020 | } | ||
6021 | kfree(sched_group_nodes); | ||
6022 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6023 | } | ||
6024 | #endif | ||
6025 | } | 6455 | } |
6026 | 6456 | ||
6027 | /* | 6457 | /* |
@@ -6046,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6046 | * correct sched domains | 6476 | * correct sched domains |
6047 | * Call with hotplug lock held | 6477 | * Call with hotplug lock held |
6048 | */ | 6478 | */ |
6049 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6479 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6050 | { | 6480 | { |
6051 | cpumask_t change_map; | 6481 | cpumask_t change_map; |
6482 | int err = 0; | ||
6052 | 6483 | ||
6053 | cpus_and(*partition1, *partition1, cpu_online_map); | 6484 | cpus_and(*partition1, *partition1, cpu_online_map); |
6054 | cpus_and(*partition2, *partition2, cpu_online_map); | 6485 | cpus_and(*partition2, *partition2, cpu_online_map); |
@@ -6057,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6057 | /* Detach sched domains from all of the affected cpus */ | 6488 | /* Detach sched domains from all of the affected cpus */ |
6058 | detach_destroy_domains(&change_map); | 6489 | detach_destroy_domains(&change_map); |
6059 | if (!cpus_empty(*partition1)) | 6490 | if (!cpus_empty(*partition1)) |
6060 | build_sched_domains(partition1); | 6491 | err = build_sched_domains(partition1); |
6061 | if (!cpus_empty(*partition2)) | 6492 | if (!err && !cpus_empty(*partition2)) |
6062 | build_sched_domains(partition2); | 6493 | err = build_sched_domains(partition2); |
6494 | |||
6495 | return err; | ||
6496 | } | ||
6497 | |||
6498 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6499 | int arch_reinit_sched_domains(void) | ||
6500 | { | ||
6501 | int err; | ||
6502 | |||
6503 | lock_cpu_hotplug(); | ||
6504 | detach_destroy_domains(&cpu_online_map); | ||
6505 | err = arch_init_sched_domains(&cpu_online_map); | ||
6506 | unlock_cpu_hotplug(); | ||
6507 | |||
6508 | return err; | ||
6509 | } | ||
6510 | |||
6511 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6512 | { | ||
6513 | int ret; | ||
6514 | |||
6515 | if (buf[0] != '0' && buf[0] != '1') | ||
6516 | return -EINVAL; | ||
6517 | |||
6518 | if (smt) | ||
6519 | sched_smt_power_savings = (buf[0] == '1'); | ||
6520 | else | ||
6521 | sched_mc_power_savings = (buf[0] == '1'); | ||
6522 | |||
6523 | ret = arch_reinit_sched_domains(); | ||
6524 | |||
6525 | return ret ? ret : count; | ||
6526 | } | ||
6527 | |||
6528 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6529 | { | ||
6530 | int err = 0; | ||
6531 | #ifdef CONFIG_SCHED_SMT | ||
6532 | if (smt_capable()) | ||
6533 | err = sysfs_create_file(&cls->kset.kobj, | ||
6534 | &attr_sched_smt_power_savings.attr); | ||
6535 | #endif | ||
6536 | #ifdef CONFIG_SCHED_MC | ||
6537 | if (!err && mc_capable()) | ||
6538 | err = sysfs_create_file(&cls->kset.kobj, | ||
6539 | &attr_sched_mc_power_savings.attr); | ||
6540 | #endif | ||
6541 | return err; | ||
6542 | } | ||
6543 | #endif | ||
6544 | |||
6545 | #ifdef CONFIG_SCHED_MC | ||
6546 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
6547 | { | ||
6548 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
6549 | } | ||
6550 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) | ||
6551 | { | ||
6552 | return sched_power_savings_store(buf, count, 0); | ||
6553 | } | ||
6554 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
6555 | sched_mc_power_savings_store); | ||
6556 | #endif | ||
6557 | |||
6558 | #ifdef CONFIG_SCHED_SMT | ||
6559 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
6560 | { | ||
6561 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
6562 | } | ||
6563 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) | ||
6564 | { | ||
6565 | return sched_power_savings_store(buf, count, 1); | ||
6063 | } | 6566 | } |
6567 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
6568 | sched_smt_power_savings_store); | ||
6569 | #endif | ||
6570 | |||
6064 | 6571 | ||
6065 | #ifdef CONFIG_HOTPLUG_CPU | 6572 | #ifdef CONFIG_HOTPLUG_CPU |
6066 | /* | 6573 | /* |
@@ -6143,7 +6650,6 @@ void __init sched_init(void) | |||
6143 | rq->push_cpu = 0; | 6650 | rq->push_cpu = 0; |
6144 | rq->migration_thread = NULL; | 6651 | rq->migration_thread = NULL; |
6145 | INIT_LIST_HEAD(&rq->migration_queue); | 6652 | INIT_LIST_HEAD(&rq->migration_queue); |
6146 | rq->cpu = i; | ||
6147 | #endif | 6653 | #endif |
6148 | atomic_set(&rq->nr_iowait, 0); | 6654 | atomic_set(&rq->nr_iowait, 0); |
6149 | 6655 | ||
@@ -6158,6 +6664,7 @@ void __init sched_init(void) | |||
6158 | } | 6664 | } |
6159 | } | 6665 | } |
6160 | 6666 | ||
6667 | set_load_weight(&init_task); | ||
6161 | /* | 6668 | /* |
6162 | * The boot idle thread does lazy MMU switching as well: | 6669 | * The boot idle thread does lazy MMU switching as well: |
6163 | */ | 6670 | */ |
@@ -6204,11 +6711,12 @@ void normalize_rt_tasks(void) | |||
6204 | runqueue_t *rq; | 6711 | runqueue_t *rq; |
6205 | 6712 | ||
6206 | read_lock_irq(&tasklist_lock); | 6713 | read_lock_irq(&tasklist_lock); |
6207 | for_each_process (p) { | 6714 | for_each_process(p) { |
6208 | if (!rt_task(p)) | 6715 | if (!rt_task(p)) |
6209 | continue; | 6716 | continue; |
6210 | 6717 | ||
6211 | rq = task_rq_lock(p, &flags); | 6718 | spin_lock_irqsave(&p->pi_lock, flags); |
6719 | rq = __task_rq_lock(p); | ||
6212 | 6720 | ||
6213 | array = p->array; | 6721 | array = p->array; |
6214 | if (array) | 6722 | if (array) |
@@ -6219,7 +6727,8 @@ void normalize_rt_tasks(void) | |||
6219 | resched_task(rq->curr); | 6727 | resched_task(rq->curr); |
6220 | } | 6728 | } |
6221 | 6729 | ||
6222 | task_rq_unlock(rq, &flags); | 6730 | __task_rq_unlock(rq); |
6731 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
6223 | } | 6732 | } |
6224 | read_unlock_irq(&tasklist_lock); | 6733 | read_unlock_irq(&tasklist_lock); |
6225 | } | 6734 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 9e2f1c6e73d7..8f03e3b89b55 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
446 | } | 446 | } |
447 | #endif /* CONFIG_HOTPLUG_CPU */ | 447 | #endif /* CONFIG_HOTPLUG_CPU */ |
448 | 448 | ||
449 | static int cpu_callback(struct notifier_block *nfb, | 449 | static int __devinit cpu_callback(struct notifier_block *nfb, |
450 | unsigned long action, | 450 | unsigned long action, |
451 | void *hcpu) | 451 | void *hcpu) |
452 | { | 452 | { |
@@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb, | |||
486 | return NOTIFY_OK; | 486 | return NOTIFY_OK; |
487 | } | 487 | } |
488 | 488 | ||
489 | static struct notifier_block cpu_nfb = { | 489 | static struct notifier_block __devinitdata cpu_nfb = { |
490 | .notifier_call = cpu_callback | 490 | .notifier_call = cpu_callback |
491 | }; | 491 | }; |
492 | 492 | ||
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b5c3b94e01ce..6b76caa22981 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) | |||
104 | /* | 104 | /* |
105 | * Create/destroy watchdog threads as CPUs come and go: | 105 | * Create/destroy watchdog threads as CPUs come and go: |
106 | */ | 106 | */ |
107 | static int | 107 | static int __devinit |
108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
109 | { | 109 | { |
110 | int hotcpu = (unsigned long)hcpu; | 110 | int hotcpu = (unsigned long)hcpu; |
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
142 | return NOTIFY_OK; | 142 | return NOTIFY_OK; |
143 | } | 143 | } |
144 | 144 | ||
145 | static struct notifier_block cpu_nfb = { | 145 | static struct notifier_block __devinitdata cpu_nfb = { |
146 | .notifier_call = cpu_callback | 146 | .notifier_call = cpu_callback |
147 | }; | 147 | }; |
148 | 148 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f1a4eb1a655e..93a2c5398648 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -133,6 +133,10 @@ extern int acct_parm[]; | |||
133 | extern int no_unaligned_warning; | 133 | extern int no_unaligned_warning; |
134 | #endif | 134 | #endif |
135 | 135 | ||
136 | #ifdef CONFIG_RT_MUTEXES | ||
137 | extern int max_lock_depth; | ||
138 | #endif | ||
139 | |||
136 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | 140 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, |
137 | ctl_table *, void **); | 141 | ctl_table *, void **); |
138 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | 142 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, |
@@ -688,6 +692,17 @@ static ctl_table kern_table[] = { | |||
688 | .proc_handler = &proc_dointvec, | 692 | .proc_handler = &proc_dointvec, |
689 | }, | 693 | }, |
690 | #endif | 694 | #endif |
695 | #ifdef CONFIG_RT_MUTEXES | ||
696 | { | ||
697 | .ctl_name = KERN_MAX_LOCK_DEPTH, | ||
698 | .procname = "max_lock_depth", | ||
699 | .data = &max_lock_depth, | ||
700 | .maxlen = sizeof(int), | ||
701 | .mode = 0644, | ||
702 | .proc_handler = &proc_dointvec, | ||
703 | }, | ||
704 | #endif | ||
705 | |||
691 | { .ctl_name = 0 } | 706 | { .ctl_name = 0 } |
692 | }; | 707 | }; |
693 | 708 | ||
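The new kernel.max_lock_depth entry exposes the rt_mutex limit on how long a priority-inheritance chain will be walked before the kernel gives up, a guard against unbounded or maliciously deep lock chains. A depth-limited chain walk has this general shape (an illustrative sketch, not the rt_mutex implementation):

#include <stdio.h>

static int max_lock_depth = 1024;       /* tunable via /proc/sys/kernel/max_lock_depth; default shown is illustrative */

struct demo_lock {
        struct demo_lock *blocked_on;   /* next lock in the blocking chain, if any */
};

/* follow the chain of blocked locks, but refuse to walk it forever */
static int demo_walk_chain(struct demo_lock *lock)
{
        int depth = 0;

        while (lock) {
                if (++depth > max_lock_depth)
                        return -1;      /* stands in for a deadlock/overflow error */
                lock = lock->blocked_on;
        }
        return 0;
}

int main(void)
{
        struct demo_lock a = { NULL }, b = { &a }, c = { &b };

        printf("walk: %d\n", demo_walk_chain(&c));
        return 0;
}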
@@ -928,6 +943,18 @@ static ctl_table vm_table[] = { | |||
928 | .strategy = &sysctl_jiffies, | 943 | .strategy = &sysctl_jiffies, |
929 | }, | 944 | }, |
930 | #endif | 945 | #endif |
946 | #ifdef CONFIG_X86_32 | ||
947 | { | ||
948 | .ctl_name = VM_VDSO_ENABLED, | ||
949 | .procname = "vdso_enabled", | ||
950 | .data = &vdso_enabled, | ||
951 | .maxlen = sizeof(vdso_enabled), | ||
952 | .mode = 0644, | ||
953 | .proc_handler = &proc_dointvec, | ||
954 | .strategy = &sysctl_intvec, | ||
955 | .extra1 = &zero, | ||
956 | }, | ||
957 | #endif | ||
931 | { .ctl_name = 0 } | 958 | { .ctl_name = 0 } |
932 | }; | 959 | }; |
933 | 960 | ||
diff --git a/kernel/timer.c b/kernel/timer.c index 5bb6b7976eec..5a8960253063 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -1652,7 +1652,7 @@ static void __devinit migrate_timers(int cpu) | |||
1652 | } | 1652 | } |
1653 | #endif /* CONFIG_HOTPLUG_CPU */ | 1653 | #endif /* CONFIG_HOTPLUG_CPU */ |
1654 | 1654 | ||
1655 | static int timer_cpu_notify(struct notifier_block *self, | 1655 | static int __devinit timer_cpu_notify(struct notifier_block *self, |
1656 | unsigned long action, void *hcpu) | 1656 | unsigned long action, void *hcpu) |
1657 | { | 1657 | { |
1658 | long cpu = (long)hcpu; | 1658 | long cpu = (long)hcpu; |
@@ -1672,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self, | |||
1672 | return NOTIFY_OK; | 1672 | return NOTIFY_OK; |
1673 | } | 1673 | } |
1674 | 1674 | ||
1675 | static struct notifier_block timers_nb = { | 1675 | static struct notifier_block __devinitdata timers_nb = { |
1676 | .notifier_call = timer_cpu_notify, | 1676 | .notifier_call = timer_cpu_notify, |
1677 | }; | 1677 | }; |
1678 | 1678 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 565cf7a1febd..59f0b42bd89e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
559 | } | 559 | } |
560 | 560 | ||
561 | /* We're holding the cpucontrol mutex here */ | 561 | /* We're holding the cpucontrol mutex here */ |
562 | static int workqueue_cpu_callback(struct notifier_block *nfb, | 562 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
563 | unsigned long action, | 563 | unsigned long action, |
564 | void *hcpu) | 564 | void *hcpu) |
565 | { | 565 | { |