aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2007-05-09 05:35:04 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-05-09 15:30:55 -0400
commit34f01cc1f512fa783302982776895c73714ebbc2 (patch)
tree776b50ee9592803853b3b4c1845f8ba527b868b9
parentd0aa7a70bf03b9de9e995ab272293be1f7937822 (diff)
FUTEX: new PRIVATE futexes
Analysis of current linux futex code : -------------------------------------- A central hash table futex_queues[] holds all contexts (futex_q) of waiting threads. Each futex_wait()/futex_wake() has to obtain a spinlock on a hash slot to perform lookups or insertion/deletion of a futex_q. When a futex_wait() is done, calling thread has to : 1) - Obtain a read lock on mmap_sem to be able to validate the user pointer (calling find_vma()). This validation tells us if the futex uses an inode based store (mapped file), or mm based store (anonymous mem) 2) - compute a hash key 3) - Atomic increment of reference counter on an inode or a mm_struct 4) - lock part of futex_queues[] hash table 5) - perform the test on value of futex. (rollback if value != expected_value, returns EWOULDBLOCK) (various loops if test triggers mm faults) 6) queue the context into hash table, release the lock taken in 4) 7) - release the read_lock on mmap_sem <block> 8) Eventually unqueue the context (but rarely, as this part may be done by the futex_wake()) Futexes were designed to improve scalability but current implementation has various problems : - Central hashtable : This means scalability problems if many processes/threads want to use futexes at the same time. This means NUMA imbalance because this hashtable is located on one node. - Using mmap_sem on every futex() syscall : Even if mmap_sem is a rw_semaphore, up_read()/down_read() are doing atomic ops on mmap_sem, dirtying cache line : - lots of cache line ping pongs on SMP configurations. mmap_sem is also extensively used by mm code (page faults, mmap()/munmap()) Highly threaded processes might suffer from mmap_sem contention. mmap_sem is also used by oprofile code. Enabling oprofile hurts threaded programs because of contention on the mmap_sem cache line. - Using an atomic_inc()/atomic_dec() on inode ref counter or mm ref counter: It's also a cache line ping pong on SMP. It also increases mmap_sem hold time because of cache misses. 
Most of these scalability problems come from the fact that futexes are in one global namespace. As we use a central hash table, we must make sure they are all using the same reference (given by the mm subsystem). We chose to force all futexes to be 'shared'. This has a cost. But fact is POSIX defined PRIVATE and SHARED, allowing clear separation, and optimal performance if carefully implemented. Time has come for linux to have better threading performance. The goal is to permit new futex commands to avoid : - Taking the mmap_sem semaphore, conflicting with other subsystems. - Modifying a ref_count on mm or an inode, still conflicting with mm or fs. This is possible because, for one process using PTHREAD_PROCESS_PRIVATE futexes, we only need to distinguish futexes by their virtual address, no matter what the underlying mm storage is. If glibc wants to exploit this new infrastructure, it should use new _PRIVATE futex subcommands for PTHREAD_PROCESS_PRIVATE futexes. And be prepared to fall back on old subcommands for old kernels. Using one global variable with the FUTEX_PRIVATE_FLAG or 0 value should be OK. PTHREAD_PROCESS_SHARED futexes should still use the old subcommands. Compatibility with old applications is preserved, they still hit the scalability problems, but new applications can fly :) Note : the same SHARED futex (mapped on a file) can be used by old binaries *and* new binaries, because both binaries will use the old subcommands. Note : Vast majority of futexes should be using PROCESS_PRIVATE semantics, as this is the default semantics. 
Almost all applications should benefit from these changes (new kernel and updated libc) Some bench results on a Pentium M 1.6 GHz (SMP kernel on a UP machine) /* calling futex_wait(addr, value) with value != *addr */ 433 cycles per futex(FUTEX_WAIT) call (mixing 2 futexes) 424 cycles per futex(FUTEX_WAIT) call (using one futex) 334 cycles per futex(FUTEX_WAIT_PRIVATE) call (mixing 2 futexes) 334 cycles per futex(FUTEX_WAIT_PRIVATE) call (using one futex) For reference : 187 cycles per getppid() call 188 cycles per umask() call 181 cycles per ni_syscall() call Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Pierre Peiffer <pierre.peiffer@bull.net> Cc: "Ulrich Drepper" <drepper@gmail.com> Cc: "Nick Piggin" <nickpiggin@yahoo.com.au> Cc: "Ingo Molnar" <mingo@elte.hu> Cc: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/futex.h29
-rw-r--r--kernel/futex.c324
2 files changed, 236 insertions, 117 deletions
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1bd8dfcb037b..899fc7f20edd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -19,6 +19,18 @@ union ktime;
19#define FUTEX_TRYLOCK_PI 8 19#define FUTEX_TRYLOCK_PI 8
20#define FUTEX_CMP_REQUEUE_PI 9 20#define FUTEX_CMP_REQUEUE_PI 9
21 21
22#define FUTEX_PRIVATE_FLAG 128
23#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG
24
25#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
26#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
27#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
28#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
29#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
30#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
31#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
32#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
33
22/* 34/*
23 * Support for robust futexes: the kernel cleans up held futexes at 35 * Support for robust futexes: the kernel cleans up held futexes at
24 * thread exit time. 36 * thread exit time.
@@ -114,8 +126,18 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
114 * Don't rearrange members without looking at hash_futex(). 126 * Don't rearrange members without looking at hash_futex().
115 * 127 *
116 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. 128 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
117 * We set bit 0 to indicate if it's an inode-based key. 129 * We use the two low order bits of offset to tell what is the kind of key :
118 */ 130 * 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
131 * (no reference on an inode or mm)
132 * 01 : Shared futex (PTHREAD_PROCESS_SHARED)
133 * mapped on a file (reference on the underlying inode)
134 * 10 : Shared futex (PTHREAD_PROCESS_SHARED)
135 * (but private mapping on an mm, and reference taken on it)
136*/
137
138#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
139#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
140
119union futex_key { 141union futex_key {
120 u32 __user *uaddr; 142 u32 __user *uaddr;
121 struct { 143 struct {
@@ -134,7 +156,8 @@ union futex_key {
134 int offset; 156 int offset;
135 } both; 157 } both;
136}; 158};
137int get_futex_key(u32 __user *uaddr, union futex_key *key); 159int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared,
160 union futex_key *key);
138void get_futex_key_refs(union futex_key *key); 161void get_futex_key_refs(union futex_key *key);
139void drop_futex_key_refs(union futex_key *key); 162void drop_futex_key_refs(union futex_key *key);
140 163
diff --git a/kernel/futex.c b/kernel/futex.c
index 4a60ef55dab4..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
20 * enough at me, Linus for the original (flawed) idea, Matthew 23 * enough at me, Linus for the original (flawed) idea, Matthew
21 * Kirkwood for proof-of-concept implementation. 24 * Kirkwood for proof-of-concept implementation.
@@ -150,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
150 && key1->both.offset == key2->both.offset); 153 && key1->both.offset == key2->both.offset);
151} 154}
152 155
153/* 156/**
154 * Get parameters which are the keys for a futex. 157 * get_futex_key - Get parameters which are the keys for a futex.
158 * @uaddr: virtual address of the futex
159 * @shared: NULL for a PROCESS_PRIVATE futex,
160 * &current->mm->mmap_sem for a PROCESS_SHARED futex
161 * @key: address where result is stored.
162 *
163 * Returns a negative error code or 0
164 * The key words are stored in *key on success.
155 * 165 *
156 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 166 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
157 * offset_within_page). For private mappings, it's (uaddr, current->mm). 167 * offset_within_page). For private mappings, it's (uaddr, current->mm).
158 * We can usually work out the index without swapping in the page. 168 * We can usually work out the index without swapping in the page.
159 * 169 *
160 * Returns: 0, or negative error code. 170 * fshared is NULL for PROCESS_PRIVATE futexes
161 * The key words are stored in *key on success. 171 * For other futexes, it points to &current->mm->mmap_sem and
162 * 172 * caller must have taken the reader lock. but NOT any spinlocks.
163 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
164 */ 173 */
165int get_futex_key(u32 __user *uaddr, union futex_key *key) 174int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
175 union futex_key *key)
166{ 176{
167 unsigned long address = (unsigned long)uaddr; 177 unsigned long address = (unsigned long)uaddr;
168 struct mm_struct *mm = current->mm; 178 struct mm_struct *mm = current->mm;
@@ -174,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
174 * The futex address must be "naturally" aligned. 184 * The futex address must be "naturally" aligned.
175 */ 185 */
176 key->both.offset = address % PAGE_SIZE; 186 key->both.offset = address % PAGE_SIZE;
177 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 187 if (unlikely((address % sizeof(u32)) != 0))
178 return -EINVAL; 188 return -EINVAL;
179 address -= key->both.offset; 189 address -= key->both.offset;
180 190
181 /* 191 /*
192 * PROCESS_PRIVATE futexes are fast.
193 * As the mm cannot disappear under us and the 'key' only needs
194 * virtual address, we dont even have to find the underlying vma.
195 * Note : We do have to check 'uaddr' is a valid user address,
196 * but access_ok() should be faster than find_vma()
197 */
198 if (!fshared) {
199 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
200 return -EFAULT;
201 key->private.mm = mm;
202 key->private.address = address;
203 return 0;
204 }
205 /*
182 * The futex is hashed differently depending on whether 206 * The futex is hashed differently depending on whether
183 * it's in a shared or private mapping. So check vma first. 207 * it's in a shared or private mapping. So check vma first.
184 */ 208 */
@@ -205,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
205 * mappings of _writable_ handles. 229 * mappings of _writable_ handles.
206 */ 230 */
207 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 231 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
232 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
208 key->private.mm = mm; 233 key->private.mm = mm;
209 key->private.address = address; 234 key->private.address = address;
210 return 0; 235 return 0;
@@ -214,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
214 * Linear file mappings are also simple. 239 * Linear file mappings are also simple.
215 */ 240 */
216 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 241 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
217 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 242 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
218 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 243 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
219 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 244 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
220 + vma->vm_pgoff); 245 + vma->vm_pgoff);
@@ -242,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
242 * Take a reference to the resource addressed by a key. 267 * Take a reference to the resource addressed by a key.
243 * Can be called while holding spinlocks. 268 * Can be called while holding spinlocks.
244 * 269 *
245 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
246 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
247 */ 270 */
248inline void get_futex_key_refs(union futex_key *key) 271inline void get_futex_key_refs(union futex_key *key)
249{ 272{
250 if (key->both.ptr != 0) { 273 if (key->both.ptr == 0)
251 if (key->both.offset & 1) 274 return;
275 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
276 case FUT_OFF_INODE:
252 atomic_inc(&key->shared.inode->i_count); 277 atomic_inc(&key->shared.inode->i_count);
253 else 278 break;
279 case FUT_OFF_MMSHARED:
254 atomic_inc(&key->private.mm->mm_count); 280 atomic_inc(&key->private.mm->mm_count);
281 break;
255 } 282 }
256} 283}
257EXPORT_SYMBOL_GPL(get_futex_key_refs); 284EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -262,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
262 */ 289 */
263void drop_futex_key_refs(union futex_key *key) 290void drop_futex_key_refs(union futex_key *key)
264{ 291{
265 if (key->both.ptr != 0) { 292 if (key->both.ptr == 0)
266 if (key->both.offset & 1) 293 return;
294 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
295 case FUT_OFF_INODE:
267 iput(key->shared.inode); 296 iput(key->shared.inode);
268 else 297 break;
298 case FUT_OFF_MMSHARED:
269 mmdrop(key->private.mm); 299 mmdrop(key->private.mm);
300 break;
270 } 301 }
271} 302}
272EXPORT_SYMBOL_GPL(drop_futex_key_refs); 303EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -283,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
283} 314}
284 315
285/* 316/*
286 * Fault handling. Called with current->mm->mmap_sem held. 317 * Fault handling.
318 * if fshared is non NULL, current->mm->mmap_sem is already held
287 */ 319 */
288static int futex_handle_fault(unsigned long address, int attempt) 320static int futex_handle_fault(unsigned long address,
321 struct rw_semaphore *fshared, int attempt)
289{ 322{
290 struct vm_area_struct * vma; 323 struct vm_area_struct * vma;
291 struct mm_struct *mm = current->mm; 324 struct mm_struct *mm = current->mm;
325 int ret = -EFAULT;
292 326
293 if (attempt > 2 || !(vma = find_vma(mm, address)) || 327 if (attempt > 2)
294 vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) 328 return ret;
295 return -EFAULT;
296 329
297 switch (handle_mm_fault(mm, vma, address, 1)) { 330 if (!fshared)
298 case VM_FAULT_MINOR: 331 down_read(&mm->mmap_sem);
299 current->min_flt++; 332 vma = find_vma(mm, address);
300 break; 333 if (vma && address >= vma->vm_start &&
301 case VM_FAULT_MAJOR: 334 (vma->vm_flags & VM_WRITE)) {
302 current->maj_flt++; 335 switch (handle_mm_fault(mm, vma, address, 1)) {
303 break; 336 case VM_FAULT_MINOR:
304 default: 337 ret = 0;
305 return -EFAULT; 338 current->min_flt++;
339 break;
340 case VM_FAULT_MAJOR:
341 ret = 0;
342 current->maj_flt++;
343 break;
344 }
306 } 345 }
307 return 0; 346 if (!fshared)
347 up_read(&mm->mmap_sem);
348 return ret;
308} 349}
309 350
310/* 351/*
@@ -647,7 +688,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
647 * Wake up all waiters hashed on the physical page that is mapped 688 * Wake up all waiters hashed on the physical page that is mapped
648 * to this virtual address: 689 * to this virtual address:
649 */ 690 */
650static int futex_wake(u32 __user *uaddr, int nr_wake) 691static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
692 int nr_wake)
651{ 693{
652 struct futex_hash_bucket *hb; 694 struct futex_hash_bucket *hb;
653 struct futex_q *this, *next; 695 struct futex_q *this, *next;
@@ -655,9 +697,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
655 union futex_key key; 697 union futex_key key;
656 int ret; 698 int ret;
657 699
658 down_read(&current->mm->mmap_sem); 700 if (fshared)
701 down_read(fshared);
659 702
660 ret = get_futex_key(uaddr, &key); 703 ret = get_futex_key(uaddr, fshared, &key);
661 if (unlikely(ret != 0)) 704 if (unlikely(ret != 0))
662 goto out; 705 goto out;
663 706
@@ -679,7 +722,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
679 722
680 spin_unlock(&hb->lock); 723 spin_unlock(&hb->lock);
681out: 724out:
682 up_read(&current->mm->mmap_sem); 725 if (fshared)
726 up_read(fshared);
683 return ret; 727 return ret;
684} 728}
685 729
@@ -746,7 +790,9 @@ retry:
746 * and requeue the next nr_requeue waiters following hashed on 790 * and requeue the next nr_requeue waiters following hashed on
747 * one physical page to another physical page (PI-futex uaddr2) 791 * one physical page to another physical page (PI-futex uaddr2)
748 */ 792 */
749static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2, 793static int futex_requeue_pi(u32 __user *uaddr1,
794 struct rw_semaphore *fshared,
795 u32 __user *uaddr2,
750 int nr_wake, int nr_requeue, u32 *cmpval) 796 int nr_wake, int nr_requeue, u32 *cmpval)
751{ 797{
752 union futex_key key1, key2; 798 union futex_key key1, key2;
@@ -765,12 +811,13 @@ retry:
765 /* 811 /*
766 * First take all the futex related locks: 812 * First take all the futex related locks:
767 */ 813 */
768 down_read(&current->mm->mmap_sem); 814 if (fshared)
815 down_read(fshared);
769 816
770 ret = get_futex_key(uaddr1, &key1); 817 ret = get_futex_key(uaddr1, fshared, &key1);
771 if (unlikely(ret != 0)) 818 if (unlikely(ret != 0))
772 goto out; 819 goto out;
773 ret = get_futex_key(uaddr2, &key2); 820 ret = get_futex_key(uaddr2, fshared, &key2);
774 if (unlikely(ret != 0)) 821 if (unlikely(ret != 0))
775 goto out; 822 goto out;
776 823
@@ -793,7 +840,8 @@ retry:
793 * If we would have faulted, release mmap_sem, fault 840 * If we would have faulted, release mmap_sem, fault
794 * it in and start all over again. 841 * it in and start all over again.
795 */ 842 */
796 up_read(&current->mm->mmap_sem); 843 if (fshared)
844 up_read(fshared);
797 845
798 ret = get_user(curval, uaddr1); 846 ret = get_user(curval, uaddr1);
799 847
@@ -927,7 +975,8 @@ out_unlock:
927 drop_futex_key_refs(&key1); 975 drop_futex_key_refs(&key1);
928 976
929out: 977out:
930 up_read(&current->mm->mmap_sem); 978 if (fshared)
979 up_read(fshared);
931 return ret; 980 return ret;
932} 981}
933 982
@@ -936,7 +985,8 @@ out:
936 * to this virtual address: 985 * to this virtual address:
937 */ 986 */
938static int 987static int
939futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, 988futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
989 u32 __user *uaddr2,
940 int nr_wake, int nr_wake2, int op) 990 int nr_wake, int nr_wake2, int op)
941{ 991{
942 union futex_key key1, key2; 992 union futex_key key1, key2;
@@ -946,12 +996,13 @@ futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
946 int ret, op_ret, attempt = 0; 996 int ret, op_ret, attempt = 0;
947 997
948retryfull: 998retryfull:
949 down_read(&current->mm->mmap_sem); 999 if (fshared)
1000 down_read(fshared);
950 1001
951 ret = get_futex_key(uaddr1, &key1); 1002 ret = get_futex_key(uaddr1, fshared, &key1);
952 if (unlikely(ret != 0)) 1003 if (unlikely(ret != 0))
953 goto out; 1004 goto out;
954 ret = get_futex_key(uaddr2, &key2); 1005 ret = get_futex_key(uaddr2, fshared, &key2);
955 if (unlikely(ret != 0)) 1006 if (unlikely(ret != 0))
956 goto out; 1007 goto out;
957 1008
@@ -991,11 +1042,10 @@ retry:
991 * still holding the mmap_sem. 1042 * still holding the mmap_sem.
992 */ 1043 */
993 if (attempt++) { 1044 if (attempt++) {
994 if (futex_handle_fault((unsigned long)uaddr2, 1045 ret = futex_handle_fault((unsigned long)uaddr2,
995 attempt)) { 1046 fshared, attempt);
996 ret = -EFAULT; 1047 if (ret)
997 goto out; 1048 goto out;
998 }
999 goto retry; 1049 goto retry;
1000 } 1050 }
1001 1051
@@ -1003,7 +1053,8 @@ retry:
1003 * If we would have faulted, release mmap_sem, 1053 * If we would have faulted, release mmap_sem,
1004 * fault it in and start all over again. 1054 * fault it in and start all over again.
1005 */ 1055 */
1006 up_read(&current->mm->mmap_sem); 1056 if (fshared)
1057 up_read(fshared);
1007 1058
1008 ret = get_user(dummy, uaddr2); 1059 ret = get_user(dummy, uaddr2);
1009 if (ret) 1060 if (ret)
@@ -1040,7 +1091,8 @@ retry:
1040 if (hb1 != hb2) 1091 if (hb1 != hb2)
1041 spin_unlock(&hb2->lock); 1092 spin_unlock(&hb2->lock);
1042out: 1093out:
1043 up_read(&current->mm->mmap_sem); 1094 if (fshared)
1095 up_read(fshared);
1044 return ret; 1096 return ret;
1045} 1097}
1046 1098
@@ -1048,7 +1100,8 @@ out:
1048 * Requeue all waiters hashed on one physical page to another 1100 * Requeue all waiters hashed on one physical page to another
1049 * physical page. 1101 * physical page.
1050 */ 1102 */
1051static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, 1103static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
1104 u32 __user *uaddr2,
1052 int nr_wake, int nr_requeue, u32 *cmpval) 1105 int nr_wake, int nr_requeue, u32 *cmpval)
1053{ 1106{
1054 union futex_key key1, key2; 1107 union futex_key key1, key2;
@@ -1058,12 +1111,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
1058 int ret, drop_count = 0; 1111 int ret, drop_count = 0;
1059 1112
1060 retry: 1113 retry:
1061 down_read(&current->mm->mmap_sem); 1114 if (fshared)
1115 down_read(fshared);
1062 1116
1063 ret = get_futex_key(uaddr1, &key1); 1117 ret = get_futex_key(uaddr1, fshared, &key1);
1064 if (unlikely(ret != 0)) 1118 if (unlikely(ret != 0))
1065 goto out; 1119 goto out;
1066 ret = get_futex_key(uaddr2, &key2); 1120 ret = get_futex_key(uaddr2, fshared, &key2);
1067 if (unlikely(ret != 0)) 1121 if (unlikely(ret != 0))
1068 goto out; 1122 goto out;
1069 1123
@@ -1086,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
1086 * If we would have faulted, release mmap_sem, fault 1140 * If we would have faulted, release mmap_sem, fault
1087 * it in and start all over again. 1141 * it in and start all over again.
1088 */ 1142 */
1089 up_read(&current->mm->mmap_sem); 1143 if (fshared)
1144 up_read(fshared);
1090 1145
1091 ret = get_user(curval, uaddr1); 1146 ret = get_user(curval, uaddr1);
1092 1147
@@ -1139,7 +1194,8 @@ out_unlock:
1139 drop_futex_key_refs(&key1); 1194 drop_futex_key_refs(&key1);
1140 1195
1141out: 1196out:
1142 up_read(&current->mm->mmap_sem); 1197 if (fshared)
1198 up_read(fshared);
1143 return ret; 1199 return ret;
1144} 1200}
1145 1201
@@ -1273,7 +1329,8 @@ static void unqueue_me_pi(struct futex_q *q)
1273 * The cur->mm semaphore must be held, it is released at return of this 1329 * The cur->mm semaphore must be held, it is released at return of this
1274 * function. 1330 * function.
1275 */ 1331 */
1276static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1332static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 struct futex_q *q,
1277 struct futex_hash_bucket *hb, 1334 struct futex_hash_bucket *hb,
1278 struct task_struct *curr) 1335 struct task_struct *curr)
1279{ 1336{
@@ -1300,7 +1357,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1300 1357
1301 /* Unqueue and drop the lock */ 1358 /* Unqueue and drop the lock */
1302 unqueue_me_pi(q); 1359 unqueue_me_pi(q);
1303 up_read(&curr->mm->mmap_sem); 1360 if (fshared)
1361 up_read(fshared);
1304 /* 1362 /*
1305 * We own it, so we have to replace the pending owner 1363 * We own it, so we have to replace the pending owner
1306 * TID. This must be atomic as we have preserve the 1364 * TID. This must be atomic as we have preserve the
@@ -1321,8 +1379,15 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1321 return ret; 1379 return ret;
1322} 1380}
1323 1381
1382/*
1383 * In case we must use restart_block to restart a futex_wait,
1384 * we encode in the 'arg3' shared capability
1385 */
1386#define ARG3_SHARED 1
1387
1324static long futex_wait_restart(struct restart_block *restart); 1388static long futex_wait_restart(struct restart_block *restart);
1325static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) 1389static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1390 u32 val, ktime_t *abs_time)
1326{ 1391{
1327 struct task_struct *curr = current; 1392 struct task_struct *curr = current;
1328 DECLARE_WAITQUEUE(wait, curr); 1393 DECLARE_WAITQUEUE(wait, curr);
@@ -1335,9 +1400,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1335 1400
1336 q.pi_state = NULL; 1401 q.pi_state = NULL;
1337 retry: 1402 retry:
1338 down_read(&curr->mm->mmap_sem); 1403 if (fshared)
1404 down_read(fshared);
1339 1405
1340 ret = get_futex_key(uaddr, &q.key); 1406 ret = get_futex_key(uaddr, fshared, &q.key);
1341 if (unlikely(ret != 0)) 1407 if (unlikely(ret != 0))
1342 goto out_release_sem; 1408 goto out_release_sem;
1343 1409
@@ -1360,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1360 * a wakeup when *uaddr != val on entry to the syscall. This is 1426 * a wakeup when *uaddr != val on entry to the syscall. This is
1361 * rare, but normal. 1427 * rare, but normal.
1362 * 1428 *
1363 * We hold the mmap semaphore, so the mapping cannot have changed 1429 * for shared futexes, we hold the mmap semaphore, so the mapping
1364 * since we looked it up in get_futex_key. 1430 * cannot have changed since we looked it up in get_futex_key.
1365 */ 1431 */
1366 ret = get_futex_value_locked(&uval, uaddr); 1432 ret = get_futex_value_locked(&uval, uaddr);
1367 1433
@@ -1372,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1372 * If we would have faulted, release mmap_sem, fault it in and 1438 * If we would have faulted, release mmap_sem, fault it in and
1373 * start all over again. 1439 * start all over again.
1374 */ 1440 */
1375 up_read(&curr->mm->mmap_sem); 1441 if (fshared)
1442 up_read(fshared);
1376 1443
1377 ret = get_user(uval, uaddr); 1444 ret = get_user(uval, uaddr);
1378 1445
@@ -1399,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1399 * Now the futex is queued and we have checked the data, we 1466 * Now the futex is queued and we have checked the data, we
1400 * don't want to hold mmap_sem while we sleep. 1467 * don't want to hold mmap_sem while we sleep.
1401 */ 1468 */
1402 up_read(&curr->mm->mmap_sem); 1469 if (fshared)
1470 up_read(fshared);
1403 1471
1404 /* 1472 /*
1405 * There might have been scheduling since the queue_me(), as we 1473 * There might have been scheduling since the queue_me(), as we
@@ -1469,7 +1537,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1469 else 1537 else
1470 ret = rt_mutex_timed_lock(lock, to, 1); 1538 ret = rt_mutex_timed_lock(lock, to, 1);
1471 1539
1472 down_read(&curr->mm->mmap_sem); 1540 if (fshared)
1541 down_read(fshared);
1473 spin_lock(q.lock_ptr); 1542 spin_lock(q.lock_ptr);
1474 1543
1475 /* 1544 /*
@@ -1486,7 +1555,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1486 1555
1487 /* mmap_sem and hash_bucket lock are unlocked at 1556 /* mmap_sem and hash_bucket lock are unlocked at
1488 return of this function */ 1557 return of this function */
1489 ret = fixup_pi_state_owner(uaddr, &q, hb, curr); 1558 ret = fixup_pi_state_owner(uaddr, fshared,
1559 &q, hb, curr);
1490 } else { 1560 } else {
1491 /* 1561 /*
1492 * Catch the rare case, where the lock was released 1562 * Catch the rare case, where the lock was released
@@ -1499,7 +1569,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1499 } 1569 }
1500 /* Unqueue and drop the lock */ 1570 /* Unqueue and drop the lock */
1501 unqueue_me_pi(&q); 1571 unqueue_me_pi(&q);
1502 up_read(&curr->mm->mmap_sem); 1572 if (fshared)
1573 up_read(fshared);
1503 } 1574 }
1504 1575
1505 debug_rt_mutex_free_waiter(&q.waiter); 1576 debug_rt_mutex_free_waiter(&q.waiter);
@@ -1528,6 +1599,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1528 restart->arg0 = (unsigned long)uaddr; 1599 restart->arg0 = (unsigned long)uaddr;
1529 restart->arg1 = (unsigned long)val; 1600 restart->arg1 = (unsigned long)val;
1530 restart->arg2 = (unsigned long)abs_time; 1601 restart->arg2 = (unsigned long)abs_time;
1602 restart->arg3 = 0;
1603 if (fshared)
1604 restart->arg3 |= ARG3_SHARED;
1531 return -ERESTART_RESTARTBLOCK; 1605 return -ERESTART_RESTARTBLOCK;
1532 } 1606 }
1533 1607
@@ -1535,7 +1609,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1535 queue_unlock(&q, hb); 1609 queue_unlock(&q, hb);
1536 1610
1537 out_release_sem: 1611 out_release_sem:
1538 up_read(&curr->mm->mmap_sem); 1612 if (fshared)
1613 up_read(fshared);
1539 return ret; 1614 return ret;
1540} 1615}
1541 1616
@@ -1545,9 +1620,12 @@ static long futex_wait_restart(struct restart_block *restart)
1545 u32 __user *uaddr = (u32 __user *)restart->arg0; 1620 u32 __user *uaddr = (u32 __user *)restart->arg0;
1546 u32 val = (u32)restart->arg1; 1621 u32 val = (u32)restart->arg1;
1547 ktime_t *abs_time = (ktime_t *)restart->arg2; 1622 ktime_t *abs_time = (ktime_t *)restart->arg2;
1623 struct rw_semaphore *fshared = NULL;
1548 1624
1549 restart->fn = do_no_restart_syscall; 1625 restart->fn = do_no_restart_syscall;
1550 return (long)futex_wait(uaddr, val, abs_time); 1626 if (restart->arg3 & ARG3_SHARED)
1627 fshared = &current->mm->mmap_sem;
1628 return (long)futex_wait(uaddr, fshared, val, abs_time);
1551} 1629}
1552 1630
1553 1631
@@ -1602,8 +1680,8 @@ static void set_pi_futex_owner(struct futex_hash_bucket *hb,
1602 * if there are waiters then it will block, it does PI, etc. (Due to 1680 * if there are waiters then it will block, it does PI, etc. (Due to
1603 * races the kernel might see a 0 value of the futex too.) 1681 * races the kernel might see a 0 value of the futex too.)
1604 */ 1682 */
1605static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, 1683static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1606 int trylock) 1684 int detect, ktime_t *time, int trylock)
1607{ 1685{
1608 struct hrtimer_sleeper timeout, *to = NULL; 1686 struct hrtimer_sleeper timeout, *to = NULL;
1609 struct task_struct *curr = current; 1687 struct task_struct *curr = current;
@@ -1624,9 +1702,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1624 1702
1625 q.pi_state = NULL; 1703 q.pi_state = NULL;
1626 retry: 1704 retry:
1627 down_read(&curr->mm->mmap_sem); 1705 if (fshared)
1706 down_read(fshared);
1628 1707
1629 ret = get_futex_key(uaddr, &q.key); 1708 ret = get_futex_key(uaddr, fshared, &q.key);
1630 if (unlikely(ret != 0)) 1709 if (unlikely(ret != 0))
1631 goto out_release_sem; 1710 goto out_release_sem;
1632 1711
@@ -1747,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1747 * Now the futex is queued and we have checked the data, we 1826 * Now the futex is queued and we have checked the data, we
1748 * don't want to hold mmap_sem while we sleep. 1827 * don't want to hold mmap_sem while we sleep.
1749 */ 1828 */
1750 up_read(&curr->mm->mmap_sem); 1829 if (fshared)
1830 up_read(fshared);
1751 1831
1752 WARN_ON(!q.pi_state); 1832 WARN_ON(!q.pi_state);
1753 /* 1833 /*
@@ -1761,7 +1841,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1761 ret = ret ? 0 : -EWOULDBLOCK; 1841 ret = ret ? 0 : -EWOULDBLOCK;
1762 } 1842 }
1763 1843
1764 down_read(&curr->mm->mmap_sem); 1844 if (fshared)
1845 down_read(fshared);
1765 spin_lock(q.lock_ptr); 1846 spin_lock(q.lock_ptr);
1766 1847
1767 /* 1848 /*
@@ -1770,7 +1851,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1770 */ 1851 */
1771 if (!ret && q.pi_state->owner != curr) 1852 if (!ret && q.pi_state->owner != curr)
1772 /* mmap_sem is unlocked at return of this function */ 1853 /* mmap_sem is unlocked at return of this function */
1773 ret = fixup_pi_state_owner(uaddr, &q, hb, curr); 1854 ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
1774 else { 1855 else {
1775 /* 1856 /*
1776 * Catch the rare case, where the lock was released 1857 * Catch the rare case, where the lock was released
@@ -1783,7 +1864,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1783 } 1864 }
1784 /* Unqueue and drop the lock */ 1865 /* Unqueue and drop the lock */
1785 unqueue_me_pi(&q); 1866 unqueue_me_pi(&q);
1786 up_read(&curr->mm->mmap_sem); 1867 if (fshared)
1868 up_read(fshared);
1787 } 1869 }
1788 1870
1789 if (!detect && ret == -EDEADLK && 0) 1871 if (!detect && ret == -EDEADLK && 0)
@@ -1795,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1795 queue_unlock(&q, hb); 1877 queue_unlock(&q, hb);
1796 1878
1797 out_release_sem: 1879 out_release_sem:
1798 up_read(&curr->mm->mmap_sem); 1880 if (fshared)
1881 up_read(fshared);
1799 return ret; 1882 return ret;
1800 1883
1801 uaddr_faulted: 1884 uaddr_faulted:
@@ -1806,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1806 * still holding the mmap_sem. 1889 * still holding the mmap_sem.
1807 */ 1890 */
1808 if (attempt++) { 1891 if (attempt++) {
1809 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 1892 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1810 ret = -EFAULT; 1893 attempt);
1894 if (ret)
1811 goto out_unlock_release_sem; 1895 goto out_unlock_release_sem;
1812 }
1813 goto retry_locked; 1896 goto retry_locked;
1814 } 1897 }
1815 1898
1816 queue_unlock(&q, hb); 1899 queue_unlock(&q, hb);
1817 up_read(&curr->mm->mmap_sem); 1900 if (fshared)
1901 up_read(fshared);
1818 1902
1819 ret = get_user(uval, uaddr); 1903 ret = get_user(uval, uaddr);
1820 if (!ret && (uval != -EFAULT)) 1904 if (!ret && (uval != -EFAULT))
@@ -1828,7 +1912,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1828 * This is the in-kernel slowpath: we look up the PI state (if any), 1912 * This is the in-kernel slowpath: we look up the PI state (if any),
1829 * and do the rt-mutex unlock. 1913 * and do the rt-mutex unlock.
1830 */ 1914 */
1831static int futex_unlock_pi(u32 __user *uaddr) 1915static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1832{ 1916{
1833 struct futex_hash_bucket *hb; 1917 struct futex_hash_bucket *hb;
1834 struct futex_q *this, *next; 1918 struct futex_q *this, *next;
@@ -1848,9 +1932,10 @@ retry:
1848 /* 1932 /*
1849 * First take all the futex related locks: 1933 * First take all the futex related locks:
1850 */ 1934 */
1851 down_read(&current->mm->mmap_sem); 1935 if (fshared)
1936 down_read(fshared);
1852 1937
1853 ret = get_futex_key(uaddr, &key); 1938 ret = get_futex_key(uaddr, fshared, &key);
1854 if (unlikely(ret != 0)) 1939 if (unlikely(ret != 0))
1855 goto out; 1940 goto out;
1856 1941
@@ -1909,7 +1994,8 @@ retry_locked:
1909out_unlock: 1994out_unlock:
1910 spin_unlock(&hb->lock); 1995 spin_unlock(&hb->lock);
1911out: 1996out:
1912 up_read(&current->mm->mmap_sem); 1997 if (fshared)
1998 up_read(fshared);
1913 1999
1914 return ret; 2000 return ret;
1915 2001
@@ -1921,15 +2007,16 @@ pi_faulted:
1921 * still holding the mmap_sem. 2007 * still holding the mmap_sem.
1922 */ 2008 */
1923 if (attempt++) { 2009 if (attempt++) {
1924 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 2010 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1925 ret = -EFAULT; 2011 attempt);
2012 if (ret)
1926 goto out_unlock; 2013 goto out_unlock;
1927 }
1928 goto retry_locked; 2014 goto retry_locked;
1929 } 2015 }
1930 2016
1931 spin_unlock(&hb->lock); 2017 spin_unlock(&hb->lock);
1932 up_read(&current->mm->mmap_sem); 2018 if (fshared)
2019 up_read(fshared);
1933 2020
1934 ret = get_user(uval, uaddr); 2021 ret = get_user(uval, uaddr);
1935 if (!ret && (uval != -EFAULT)) 2022 if (!ret && (uval != -EFAULT))
@@ -1981,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1981 struct futex_q *q; 2068 struct futex_q *q;
1982 struct file *filp; 2069 struct file *filp;
1983 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1984 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1985 2073
1986 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -2022,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
2022 } 2110 }
2023 q->pi_state = NULL; 2111 q->pi_state = NULL;
2024 2112
2025 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
2026 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
2027 2116
2028 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
2029 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
2030 kfree(q); 2119 kfree(q);
2031 goto error; 2120 goto error;
2032 } 2121 }
@@ -2038,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
2038 filp->private_data = q; 2127 filp->private_data = q;
2039 2128
2040 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
2041 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
2042 2131
2043 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
2044 fd_install(ret, filp); 2133 fd_install(ret, filp);
@@ -2167,7 +2256,7 @@ retry:
2167 */ 2256 */
2168 if (!pi) { 2257 if (!pi) {
2169 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
2170 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
2171 } 2260 }
2172 } 2261 }
2173 return 0; 2262 return 0;
@@ -2223,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
2223 return; 2312 return;
2224 2313
2225 if (pending) 2314 if (pending)
2226 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
2227 2317
2228 while (entry != &head->list) { 2318 while (entry != &head->list) {
2229 /* 2319 /*
@@ -2253,38 +2343,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2253 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
2254{ 2344{
2255 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
2256 2351
2257 switch (op) { 2352 switch (cmd) {
2258 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
2259 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
2260 break; 2355 break;
2261 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
2262 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
2263 break; 2358 break;
2264 case FUTEX_FD: 2359 case FUTEX_FD:
2265 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
2266 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
2267 break; 2362 break;
2268 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
2269 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2270 break; 2365 break;
2271 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
2272 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
2273 break; 2368 break;
2274 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
2275 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
2276 break; 2371 break;
2277 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
2278 ret = futex_lock_pi(uaddr, val, timeout, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
2279 break; 2374 break;
2280 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
2281 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
2282 break; 2377 break;
2283 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
2284 ret = futex_lock_pi(uaddr, 0, timeout, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2285 break; 2380 break;
2286 case FUTEX_CMP_REQUEUE_PI: 2381 case FUTEX_CMP_REQUEUE_PI:
2287 ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3); 2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
2288 break; 2383 break;
2289 default: 2384 default:
2290 ret = -ENOSYS; 2385 ret = -ENOSYS;
@@ -2300,23 +2395,24 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2300 struct timespec ts; 2395 struct timespec ts;
2301 ktime_t t, *tp = NULL; 2396 ktime_t t, *tp = NULL;
2302 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
2303 2399
2304 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
2305 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2306 return -EFAULT; 2402 return -EFAULT;
2307 if (!timespec_valid(&ts)) 2403 if (!timespec_valid(&ts))
2308 return -EINVAL; 2404 return -EINVAL;
2309 2405
2310 t = timespec_to_ktime(ts); 2406 t = timespec_to_ktime(ts);
2311 if (op == FUTEX_WAIT) 2407 if (cmd == FUTEX_WAIT)
2312 t = ktime_add(ktime_get(), t); 2408 t = ktime_add(ktime_get(), t);
2313 tp = &t; 2409 tp = &t;
2314 } 2410 }
2315 /* 2411 /*
2316 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
2317 */ 2413 */
2318 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2319 || op == FUTEX_CMP_REQUEUE_PI) 2415 || cmd == FUTEX_CMP_REQUEUE_PI)
2320 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
2321 2417
2322 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);