aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- include/linux/futex.h 29
-rw-r--r-- kernel/futex.c 324
2 files changed, 236 insertions, 117 deletions
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1bd8dfcb037b..899fc7f20edd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -19,6 +19,18 @@ union ktime;
19#define FUTEX_TRYLOCK_PI 8 19#define FUTEX_TRYLOCK_PI 8
20#define FUTEX_CMP_REQUEUE_PI 9 20#define FUTEX_CMP_REQUEUE_PI 9
21 21
22#define FUTEX_PRIVATE_FLAG 128
23#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG
24
25#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG)
26#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG)
27#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG)
28#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG)
29#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG)
30#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
31#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
32#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
33
22/* 34/*
23 * Support for robust futexes: the kernel cleans up held futexes at 35 * Support for robust futexes: the kernel cleans up held futexes at
24 * thread exit time. 36 * thread exit time.
@@ -114,8 +126,18 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
114 * Don't rearrange members without looking at hash_futex(). 126 * Don't rearrange members without looking at hash_futex().
115 * 127 *
116 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. 128 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
117 * We set bit 0 to indicate if it's an inode-based key. 129 * We use the two low order bits of offset to tell what kind of key it is:
118 */ 130 * 00 : Private process futex (PTHREAD_PROCESS_PRIVATE)
131 * (no reference on an inode or mm)
132 * 01 : Shared futex (PTHREAD_PROCESS_SHARED)
133 * mapped on a file (reference on the underlying inode)
134 * 10 : Shared futex (PTHREAD_PROCESS_SHARED)
135 * (but private mapping on an mm, and reference taken on it)
136*/
137
138#define FUT_OFF_INODE 1 /* We set bit 0 if key has a reference on inode */
139#define FUT_OFF_MMSHARED 2 /* We set bit 1 if key has a reference on mm */
140
119union futex_key { 141union futex_key {
120 u32 __user *uaddr; 142 u32 __user *uaddr;
121 struct { 143 struct {
@@ -134,7 +156,8 @@ union futex_key {
134 int offset; 156 int offset;
135 } both; 157 } both;
136}; 158};
137int get_futex_key(u32 __user *uaddr, union futex_key *key); 159int get_futex_key(u32 __user *uaddr, struct rw_semaphore *shared,
160 union futex_key *key);
138void get_futex_key_refs(union futex_key *key); 161void get_futex_key_refs(union futex_key *key);
139void drop_futex_key_refs(union futex_key *key); 162void drop_futex_key_refs(union futex_key *key);
140 163
diff --git a/kernel/futex.c b/kernel/futex.c
index 4a60ef55dab4..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
20 * enough at me, Linus for the original (flawed) idea, Matthew 23 * enough at me, Linus for the original (flawed) idea, Matthew
21 * Kirkwood for proof-of-concept implementation. 24 * Kirkwood for proof-of-concept implementation.
@@ -150,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
150 && key1->both.offset == key2->both.offset); 153 && key1->both.offset == key2->both.offset);
151} 154}
152 155
153/* 156/**
154 * Get parameters which are the keys for a futex. 157 * get_futex_key - Get parameters which are the keys for a futex.
158 * @uaddr: virtual address of the futex
159 * @fshared: NULL for a PROCESS_PRIVATE futex,
160 * &current->mm->mmap_sem for a PROCESS_SHARED futex
161 * @key: address where result is stored.
162 *
163 * Returns a negative error code or 0
164 * The key words are stored in *key on success.
155 * 165 *
156 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 166 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
157 * offset_within_page). For private mappings, it's (uaddr, current->mm). 167 * offset_within_page). For private mappings, it's (uaddr, current->mm).
158 * We can usually work out the index without swapping in the page. 168 * We can usually work out the index without swapping in the page.
159 * 169 *
160 * Returns: 0, or negative error code. 170 * fshared is NULL for PROCESS_PRIVATE futexes
161 * The key words are stored in *key on success. 171 * For other futexes, it points to &current->mm->mmap_sem and
162 * 172 * caller must have taken the reader lock, but NOT any spinlocks.
163 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
164 */ 173 */
165int get_futex_key(u32 __user *uaddr, union futex_key *key) 174int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
175 union futex_key *key)
166{ 176{
167 unsigned long address = (unsigned long)uaddr; 177 unsigned long address = (unsigned long)uaddr;
168 struct mm_struct *mm = current->mm; 178 struct mm_struct *mm = current->mm;
@@ -174,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
174 * The futex address must be "naturally" aligned. 184 * The futex address must be "naturally" aligned.
175 */ 185 */
176 key->both.offset = address % PAGE_SIZE; 186 key->both.offset = address % PAGE_SIZE;
177 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 187 if (unlikely((address % sizeof(u32)) != 0))
178 return -EINVAL; 188 return -EINVAL;
179 address -= key->both.offset; 189 address -= key->both.offset;
180 190
181 /* 191 /*
192 * PROCESS_PRIVATE futexes are fast.
193 * As the mm cannot disappear under us and the 'key' only needs
194 * virtual address, we don't even have to find the underlying vma.
195 * Note : We do have to check 'uaddr' is a valid user address,
196 * but access_ok() should be faster than find_vma()
197 */
198 if (!fshared) {
199 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
200 return -EFAULT;
201 key->private.mm = mm;
202 key->private.address = address;
203 return 0;
204 }
205 /*
182 * The futex is hashed differently depending on whether 206 * The futex is hashed differently depending on whether
183 * it's in a shared or private mapping. So check vma first. 207 * it's in a shared or private mapping. So check vma first.
184 */ 208 */
@@ -205,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
205 * mappings of _writable_ handles. 229 * mappings of _writable_ handles.
206 */ 230 */
207 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 231 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
232 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
208 key->private.mm = mm; 233 key->private.mm = mm;
209 key->private.address = address; 234 key->private.address = address;
210 return 0; 235 return 0;
@@ -214,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
214 * Linear file mappings are also simple. 239 * Linear file mappings are also simple.
215 */ 240 */
216 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 241 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
217 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 242 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
218 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 243 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
219 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 244 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
220 + vma->vm_pgoff); 245 + vma->vm_pgoff);
@@ -242,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
242 * Take a reference to the resource addressed by a key. 267 * Take a reference to the resource addressed by a key.
243 * Can be called while holding spinlocks. 268 * Can be called while holding spinlocks.
244 * 269 *
245 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
246 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
247 */ 270 */
248inline void get_futex_key_refs(union futex_key *key) 271inline void get_futex_key_refs(union futex_key *key)
249{ 272{
250 if (key->both.ptr != 0) { 273 if (key->both.ptr == 0)
251 if (key->both.offset & 1) 274 return;
275 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
276 case FUT_OFF_INODE:
252 atomic_inc(&key->shared.inode->i_count); 277 atomic_inc(&key->shared.inode->i_count);
253 else 278 break;
279 case FUT_OFF_MMSHARED:
254 atomic_inc(&key->private.mm->mm_count); 280 atomic_inc(&key->private.mm->mm_count);
281 break;
255 } 282 }
256} 283}
257EXPORT_SYMBOL_GPL(get_futex_key_refs); 284EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -262,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
262 */ 289 */
263void drop_futex_key_refs(union futex_key *key) 290void drop_futex_key_refs(union futex_key *key)
264{ 291{
265 if (key->both.ptr != 0) { 292 if (key->both.ptr == 0)
266 if (key->both.offset & 1) 293 return;
294 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
295 case FUT_OFF_INODE:
267 iput(key->shared.inode); 296 iput(key->shared.inode);
268 else 297 break;
298 case FUT_OFF_MMSHARED:
269 mmdrop(key->private.mm); 299 mmdrop(key->private.mm);
300 break;
270 } 301 }
271} 302}
272EXPORT_SYMBOL_GPL(drop_futex_key_refs); 303EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -283,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
283} 314}
284 315
285/* 316/*
286 * Fault handling. Called with current->mm->mmap_sem held. 317 * Fault handling.
318 * If fshared is non-NULL, current->mm->mmap_sem is already held.
287 */ 319 */
288static int futex_handle_fault(unsigned long address, int attempt) 320static int futex_handle_fault(unsigned long address,
321 struct rw_semaphore *fshared, int attempt)
289{ 322{
290 struct vm_area_struct * vma; 323 struct vm_area_struct * vma;
291 struct mm_struct *mm = current->mm; 324 struct mm_struct *mm = current->mm;
325 int ret = -EFAULT;
292 326
293 if (attempt > 2 || !(vma = find_vma(mm, address)) || 327 if (attempt > 2)
294 vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) 328 return ret;
295 return -EFAULT;
296 329
297 switch (handle_mm_fault(mm, vma, address, 1)) { 330 if (!fshared)
298 case VM_FAULT_MINOR: 331 down_read(&mm->mmap_sem);
299 current->min_flt++; 332 vma = find_vma(mm, address);
300 break; 333 if (vma && address >= vma->vm_start &&
301 case VM_FAULT_MAJOR: 334 (vma->vm_flags & VM_WRITE)) {
302 current->maj_flt++; 335 switch (handle_mm_fault(mm, vma, address, 1)) {
303 break; 336 case VM_FAULT_MINOR:
304 default: 337 ret = 0;
305 return -EFAULT; 338 current->min_flt++;
339 break;
340 case VM_FAULT_MAJOR:
341 ret = 0;
342 current->maj_flt++;
343 break;
344 }
306 } 345 }
307 return 0; 346 if (!fshared)
347 up_read(&mm->mmap_sem);
348 return ret;
308} 349}
309 350
310/* 351/*
@@ -647,7 +688,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
647 * Wake up all waiters hashed on the physical page that is mapped 688 * Wake up all waiters hashed on the physical page that is mapped
648 * to this virtual address: 689 * to this virtual address:
649 */ 690 */
650static int futex_wake(u32 __user *uaddr, int nr_wake) 691static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
692 int nr_wake)
651{ 693{
652 struct futex_hash_bucket *hb; 694 struct futex_hash_bucket *hb;
653 struct futex_q *this, *next; 695 struct futex_q *this, *next;
@@ -655,9 +697,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
655 union futex_key key; 697 union futex_key key;
656 int ret; 698 int ret;
657 699
658 down_read(&current->mm->mmap_sem); 700 if (fshared)
701 down_read(fshared);
659 702
660 ret = get_futex_key(uaddr, &key); 703 ret = get_futex_key(uaddr, fshared, &key);
661 if (unlikely(ret != 0)) 704 if (unlikely(ret != 0))
662 goto out; 705 goto out;
663 706
@@ -679,7 +722,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
679 722
680 spin_unlock(&hb->lock); 723 spin_unlock(&hb->lock);
681out: 724out:
682 up_read(&current->mm->mmap_sem); 725 if (fshared)
726 up_read(fshared);
683 return ret; 727 return ret;
684} 728}
685 729
@@ -746,7 +790,9 @@ retry:
746 * and requeue the next nr_requeue waiters following hashed on 790 * and requeue the next nr_requeue waiters following hashed on
747 * one physical page to another physical page (PI-futex uaddr2) 791 * one physical page to another physical page (PI-futex uaddr2)
748 */ 792 */
749static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2, 793static int futex_requeue_pi(u32 __user *uaddr1,
794 struct rw_semaphore *fshared,
795 u32 __user *uaddr2,
750 int nr_wake, int nr_requeue, u32 *cmpval) 796 int nr_wake, int nr_requeue, u32 *cmpval)
751{ 797{
752 union futex_key key1, key2; 798 union futex_key key1, key2;
@@ -765,12 +811,13 @@ retry:
765 /* 811 /*
766 * First take all the futex related locks: 812 * First take all the futex related locks:
767 */ 813 */
768 down_read(&current->mm->mmap_sem); 814 if (fshared)
815 down_read(fshared);
769 816
770 ret = get_futex_key(uaddr1, &key1); 817 ret = get_futex_key(uaddr1, fshared, &key1);
771 if (unlikely(ret != 0)) 818 if (unlikely(ret != 0))
772 goto out; 819 goto out;
773 ret = get_futex_key(uaddr2, &key2); 820 ret = get_futex_key(uaddr2, fshared, &key2);
774 if (unlikely(ret != 0)) 821 if (unlikely(ret != 0))
775 goto out; 822 goto out;
776 823
@@ -793,7 +840,8 @@ retry:
793 * If we would have faulted, release mmap_sem, fault 840 * If we would have faulted, release mmap_sem, fault
794 * it in and start all over again. 841 * it in and start all over again.
795 */ 842 */
796 up_read(&current->mm->mmap_sem); 843 if (fshared)
844 up_read(fshared);
797 845
798 ret = get_user(curval, uaddr1); 846 ret = get_user(curval, uaddr1);
799 847
@@ -927,7 +975,8 @@ out_unlock:
927 drop_futex_key_refs(&key1); 975 drop_futex_key_refs(&key1);
928 976
929out: 977out:
930 up_read(&current->mm->mmap_sem); 978 if (fshared)
979 up_read(fshared);
931 return ret; 980 return ret;
932} 981}
933 982
@@ -936,7 +985,8 @@ out:
936 * to this virtual address: 985 * to this virtual address:
937 */ 986 */
938static int 987static int
939futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, 988futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
989 u32 __user *uaddr2,
940 int nr_wake, int nr_wake2, int op) 990 int nr_wake, int nr_wake2, int op)
941{ 991{
942 union futex_key key1, key2; 992 union futex_key key1, key2;
@@ -946,12 +996,13 @@ futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
946 int ret, op_ret, attempt = 0; 996 int ret, op_ret, attempt = 0;
947 997
948retryfull: 998retryfull:
949 down_read(&current->mm->mmap_sem); 999 if (fshared)
1000 down_read(fshared);
950 1001
951 ret = get_futex_key(uaddr1, &key1); 1002 ret = get_futex_key(uaddr1, fshared, &key1);
952 if (unlikely(ret != 0)) 1003 if (unlikely(ret != 0))
953 goto out; 1004 goto out;
954 ret = get_futex_key(uaddr2, &key2); 1005 ret = get_futex_key(uaddr2, fshared, &key2);
955 if (unlikely(ret != 0)) 1006 if (unlikely(ret != 0))
956 goto out; 1007 goto out;
957 1008
@@ -991,11 +1042,10 @@ retry:
991 * still holding the mmap_sem. 1042 * still holding the mmap_sem.
992 */ 1043 */
993 if (attempt++) { 1044 if (attempt++) {
994 if (futex_handle_fault((unsigned long)uaddr2, 1045 ret = futex_handle_fault((unsigned long)uaddr2,
995 attempt)) { 1046 fshared, attempt);
996 ret = -EFAULT; 1047 if (ret)
997 goto out; 1048 goto out;
998 }
999 goto retry; 1049 goto retry;
1000 } 1050 }
1001 1051
@@ -1003,7 +1053,8 @@ retry:
1003 * If we would have faulted, release mmap_sem, 1053 * If we would have faulted, release mmap_sem,
1004 * fault it in and start all over again. 1054 * fault it in and start all over again.
1005 */ 1055 */
1006 up_read(&current->mm->mmap_sem); 1056 if (fshared)
1057 up_read(fshared);
1007 1058
1008 ret = get_user(dummy, uaddr2); 1059 ret = get_user(dummy, uaddr2);
1009 if (ret) 1060 if (ret)
@@ -1040,7 +1091,8 @@ retry:
1040 if (hb1 != hb2) 1091 if (hb1 != hb2)
1041 spin_unlock(&hb2->lock); 1092 spin_unlock(&hb2->lock);
1042out: 1093out:
1043 up_read(&current->mm->mmap_sem); 1094 if (fshared)
1095 up_read(fshared);
1044 return ret; 1096 return ret;
1045} 1097}
1046 1098
@@ -1048,7 +1100,8 @@ out:
1048 * Requeue all waiters hashed on one physical page to another 1100 * Requeue all waiters hashed on one physical page to another
1049 * physical page. 1101 * physical page.
1050 */ 1102 */
1051static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, 1103static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
1104 u32 __user *uaddr2,
1052 int nr_wake, int nr_requeue, u32 *cmpval) 1105 int nr_wake, int nr_requeue, u32 *cmpval)
1053{ 1106{
1054 union futex_key key1, key2; 1107 union futex_key key1, key2;
@@ -1058,12 +1111,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
1058 int ret, drop_count = 0; 1111 int ret, drop_count = 0;
1059 1112
1060 retry: 1113 retry:
1061 down_read(&current->mm->mmap_sem); 1114 if (fshared)
1115 down_read(fshared);
1062 1116
1063 ret = get_futex_key(uaddr1, &key1); 1117 ret = get_futex_key(uaddr1, fshared, &key1);
1064 if (unlikely(ret != 0)) 1118 if (unlikely(ret != 0))
1065 goto out; 1119 goto out;
1066 ret = get_futex_key(uaddr2, &key2); 1120 ret = get_futex_key(uaddr2, fshared, &key2);
1067 if (unlikely(ret != 0)) 1121 if (unlikely(ret != 0))
1068 goto out; 1122 goto out;
1069 1123
@@ -1086,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
1086 * If we would have faulted, release mmap_sem, fault 1140 * If we would have faulted, release mmap_sem, fault
1087 * it in and start all over again. 1141 * it in and start all over again.
1088 */ 1142 */
1089 up_read(&current->mm->mmap_sem); 1143 if (fshared)
1144 up_read(fshared);
1090 1145
1091 ret = get_user(curval, uaddr1); 1146 ret = get_user(curval, uaddr1);
1092 1147
@@ -1139,7 +1194,8 @@ out_unlock:
1139 drop_futex_key_refs(&key1); 1194 drop_futex_key_refs(&key1);
1140 1195
1141out: 1196out:
1142 up_read(&current->mm->mmap_sem); 1197 if (fshared)
1198 up_read(fshared);
1143 return ret; 1199 return ret;
1144} 1200}
1145 1201
@@ -1273,7 +1329,8 @@ static void unqueue_me_pi(struct futex_q *q)
1273 * The cur->mm semaphore must be held, it is released at return of this 1329 * The cur->mm semaphore must be held, it is released at return of this
1274 * function. 1330 * function.
1275 */ 1331 */
1276static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1332static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 struct futex_q *q,
1277 struct futex_hash_bucket *hb, 1334 struct futex_hash_bucket *hb,
1278 struct task_struct *curr) 1335 struct task_struct *curr)
1279{ 1336{
@@ -1300,7 +1357,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1300 1357
1301 /* Unqueue and drop the lock */ 1358 /* Unqueue and drop the lock */
1302 unqueue_me_pi(q); 1359 unqueue_me_pi(q);
1303 up_read(&curr->mm->mmap_sem); 1360 if (fshared)
1361 up_read(fshared);
1304 /* 1362 /*
1305 * We own it, so we have to replace the pending owner 1363 * We own it, so we have to replace the pending owner
1306 * TID. This must be atomic as we have to preserve the 1364 * TID. This must be atomic as we have to preserve the
@@ -1321,8 +1379,15 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1321 return ret; 1379 return ret;
1322} 1380}
1323 1381
1382/*
1383 * In case we must use restart_block to restart a futex_wait,
1384 * we encode in the 'arg3' shared capability
1385 */
1386#define ARG3_SHARED 1
1387
1324static long futex_wait_restart(struct restart_block *restart); 1388static long futex_wait_restart(struct restart_block *restart);
1325static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time) 1389static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1390 u32 val, ktime_t *abs_time)
1326{ 1391{
1327 struct task_struct *curr = current; 1392 struct task_struct *curr = current;
1328 DECLARE_WAITQUEUE(wait, curr); 1393 DECLARE_WAITQUEUE(wait, curr);
@@ -1335,9 +1400,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1335 1400
1336 q.pi_state = NULL; 1401 q.pi_state = NULL;
1337 retry: 1402 retry:
1338 down_read(&curr->mm->mmap_sem); 1403 if (fshared)
1404 down_read(fshared);
1339 1405
1340 ret = get_futex_key(uaddr, &q.key); 1406 ret = get_futex_key(uaddr, fshared, &q.key);
1341 if (unlikely(ret != 0)) 1407 if (unlikely(ret != 0))
1342 goto out_release_sem; 1408 goto out_release_sem;
1343 1409
@@ -1360,8 +1426,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1360 * a wakeup when *uaddr != val on entry to the syscall. This is 1426 * a wakeup when *uaddr != val on entry to the syscall. This is
1361 * rare, but normal. 1427 * rare, but normal.
1362 * 1428 *
1363 * We hold the mmap semaphore, so the mapping cannot have changed 1429 * for shared futexes, we hold the mmap semaphore, so the mapping
1364 * since we looked it up in get_futex_key. 1430 * cannot have changed since we looked it up in get_futex_key.
1365 */ 1431 */
1366 ret = get_futex_value_locked(&uval, uaddr); 1432 ret = get_futex_value_locked(&uval, uaddr);
1367 1433
@@ -1372,7 +1438,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1372 * If we would have faulted, release mmap_sem, fault it in and 1438 * If we would have faulted, release mmap_sem, fault it in and
1373 * start all over again. 1439 * start all over again.
1374 */ 1440 */
1375 up_read(&curr->mm->mmap_sem); 1441 if (fshared)
1442 up_read(fshared);
1376 1443
1377 ret = get_user(uval, uaddr); 1444 ret = get_user(uval, uaddr);
1378 1445
@@ -1399,7 +1466,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1399 * Now the futex is queued and we have checked the data, we 1466 * Now the futex is queued and we have checked the data, we
1400 * don't want to hold mmap_sem while we sleep. 1467 * don't want to hold mmap_sem while we sleep.
1401 */ 1468 */
1402 up_read(&curr->mm->mmap_sem); 1469 if (fshared)
1470 up_read(fshared);
1403 1471
1404 /* 1472 /*
1405 * There might have been scheduling since the queue_me(), as we 1473 * There might have been scheduling since the queue_me(), as we
@@ -1469,7 +1537,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1469 else 1537 else
1470 ret = rt_mutex_timed_lock(lock, to, 1); 1538 ret = rt_mutex_timed_lock(lock, to, 1);
1471 1539
1472 down_read(&curr->mm->mmap_sem); 1540 if (fshared)
1541 down_read(fshared);
1473 spin_lock(q.lock_ptr); 1542 spin_lock(q.lock_ptr);
1474 1543
1475 /* 1544 /*
@@ -1486,7 +1555,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1486 1555
1487 /* mmap_sem and hash_bucket lock are unlocked at 1556 /* mmap_sem and hash_bucket lock are unlocked at
1488 return of this function */ 1557 return of this function */
1489 ret = fixup_pi_state_owner(uaddr, &q, hb, curr); 1558 ret = fixup_pi_state_owner(uaddr, fshared,
1559 &q, hb, curr);
1490 } else { 1560 } else {
1491 /* 1561 /*
1492 * Catch the rare case, where the lock was released 1562 * Catch the rare case, where the lock was released
@@ -1499,7 +1569,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1499 } 1569 }
1500 /* Unqueue and drop the lock */ 1570 /* Unqueue and drop the lock */
1501 unqueue_me_pi(&q); 1571 unqueue_me_pi(&q);
1502 up_read(&curr->mm->mmap_sem); 1572 if (fshared)
1573 up_read(fshared);
1503 } 1574 }
1504 1575
1505 debug_rt_mutex_free_waiter(&q.waiter); 1576 debug_rt_mutex_free_waiter(&q.waiter);
@@ -1528,6 +1599,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1528 restart->arg0 = (unsigned long)uaddr; 1599 restart->arg0 = (unsigned long)uaddr;
1529 restart->arg1 = (unsigned long)val; 1600 restart->arg1 = (unsigned long)val;
1530 restart->arg2 = (unsigned long)abs_time; 1601 restart->arg2 = (unsigned long)abs_time;
1602 restart->arg3 = 0;
1603 if (fshared)
1604 restart->arg3 |= ARG3_SHARED;
1531 return -ERESTART_RESTARTBLOCK; 1605 return -ERESTART_RESTARTBLOCK;
1532 } 1606 }
1533 1607
@@ -1535,7 +1609,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
1535 queue_unlock(&q, hb); 1609 queue_unlock(&q, hb);
1536 1610
1537 out_release_sem: 1611 out_release_sem:
1538 up_read(&curr->mm->mmap_sem); 1612 if (fshared)
1613 up_read(fshared);
1539 return ret; 1614 return ret;
1540} 1615}
1541 1616
@@ -1545,9 +1620,12 @@ static long futex_wait_restart(struct restart_block *restart)
1545 u32 __user *uaddr = (u32 __user *)restart->arg0; 1620 u32 __user *uaddr = (u32 __user *)restart->arg0;
1546 u32 val = (u32)restart->arg1; 1621 u32 val = (u32)restart->arg1;
1547 ktime_t *abs_time = (ktime_t *)restart->arg2; 1622 ktime_t *abs_time = (ktime_t *)restart->arg2;
1623 struct rw_semaphore *fshared = NULL;
1548 1624
1549 restart->fn = do_no_restart_syscall; 1625 restart->fn = do_no_restart_syscall;
1550 return (long)futex_wait(uaddr, val, abs_time); 1626 if (restart->arg3 & ARG3_SHARED)
1627 fshared = &current->mm->mmap_sem;
1628 return (long)futex_wait(uaddr, fshared, val, abs_time);
1551} 1629}
1552 1630
1553 1631
@@ -1602,8 +1680,8 @@ static void set_pi_futex_owner(struct futex_hash_bucket *hb,
1602 * if there are waiters then it will block, it does PI, etc. (Due to 1680 * if there are waiters then it will block, it does PI, etc. (Due to
1603 * races the kernel might see a 0 value of the futex too.) 1681 * races the kernel might see a 0 value of the futex too.)
1604 */ 1682 */
1605static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time, 1683static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1606 int trylock) 1684 int detect, ktime_t *time, int trylock)
1607{ 1685{
1608 struct hrtimer_sleeper timeout, *to = NULL; 1686 struct hrtimer_sleeper timeout, *to = NULL;
1609 struct task_struct *curr = current; 1687 struct task_struct *curr = current;
@@ -1624,9 +1702,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1624 1702
1625 q.pi_state = NULL; 1703 q.pi_state = NULL;
1626 retry: 1704 retry:
1627 down_read(&curr->mm->mmap_sem); 1705 if (fshared)
1706 down_read(fshared);
1628 1707
1629 ret = get_futex_key(uaddr, &q.key); 1708 ret = get_futex_key(uaddr, fshared, &q.key);
1630 if (unlikely(ret != 0)) 1709 if (unlikely(ret != 0))
1631 goto out_release_sem; 1710 goto out_release_sem;
1632 1711
@@ -1747,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1747 * Now the futex is queued and we have checked the data, we 1826 * Now the futex is queued and we have checked the data, we
1748 * don't want to hold mmap_sem while we sleep. 1827 * don't want to hold mmap_sem while we sleep.
1749 */ 1828 */
1750 up_read(&curr->mm->mmap_sem); 1829 if (fshared)
1830 up_read(fshared);
1751 1831
1752 WARN_ON(!q.pi_state); 1832 WARN_ON(!q.pi_state);
1753 /* 1833 /*
@@ -1761,7 +1841,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1761 ret = ret ? 0 : -EWOULDBLOCK; 1841 ret = ret ? 0 : -EWOULDBLOCK;
1762 } 1842 }
1763 1843
1764 down_read(&curr->mm->mmap_sem); 1844 if (fshared)
1845 down_read(fshared);
1765 spin_lock(q.lock_ptr); 1846 spin_lock(q.lock_ptr);
1766 1847
1767 /* 1848 /*
@@ -1770,7 +1851,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1770 */ 1851 */
1771 if (!ret && q.pi_state->owner != curr) 1852 if (!ret && q.pi_state->owner != curr)
1772 /* mmap_sem is unlocked at return of this function */ 1853 /* mmap_sem is unlocked at return of this function */
1773 ret = fixup_pi_state_owner(uaddr, &q, hb, curr); 1854 ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
1774 else { 1855 else {
1775 /* 1856 /*
1776 * Catch the rare case, where the lock was released 1857 * Catch the rare case, where the lock was released
@@ -1783,7 +1864,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1783 } 1864 }
1784 /* Unqueue and drop the lock */ 1865 /* Unqueue and drop the lock */
1785 unqueue_me_pi(&q); 1866 unqueue_me_pi(&q);
1786 up_read(&curr->mm->mmap_sem); 1867 if (fshared)
1868 up_read(fshared);
1787 } 1869 }
1788 1870
1789 if (!detect && ret == -EDEADLK && 0) 1871 if (!detect && ret == -EDEADLK && 0)
@@ -1795,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1795 queue_unlock(&q, hb); 1877 queue_unlock(&q, hb);
1796 1878
1797 out_release_sem: 1879 out_release_sem:
1798 up_read(&curr->mm->mmap_sem); 1880 if (fshared)
1881 up_read(fshared);
1799 return ret; 1882 return ret;
1800 1883
1801 uaddr_faulted: 1884 uaddr_faulted:
@@ -1806,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1806 * still holding the mmap_sem. 1889 * still holding the mmap_sem.
1807 */ 1890 */
1808 if (attempt++) { 1891 if (attempt++) {
1809 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 1892 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1810 ret = -EFAULT; 1893 attempt);
1894 if (ret)
1811 goto out_unlock_release_sem; 1895 goto out_unlock_release_sem;
1812 }
1813 goto retry_locked; 1896 goto retry_locked;
1814 } 1897 }
1815 1898
1816 queue_unlock(&q, hb); 1899 queue_unlock(&q, hb);
1817 up_read(&curr->mm->mmap_sem); 1900 if (fshared)
1901 up_read(fshared);
1818 1902
1819 ret = get_user(uval, uaddr); 1903 ret = get_user(uval, uaddr);
1820 if (!ret && (uval != -EFAULT)) 1904 if (!ret && (uval != -EFAULT))
@@ -1828,7 +1912,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
1828 * This is the in-kernel slowpath: we look up the PI state (if any), 1912 * This is the in-kernel slowpath: we look up the PI state (if any),
1829 * and do the rt-mutex unlock. 1913 * and do the rt-mutex unlock.
1830 */ 1914 */
1831static int futex_unlock_pi(u32 __user *uaddr) 1915static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1832{ 1916{
1833 struct futex_hash_bucket *hb; 1917 struct futex_hash_bucket *hb;
1834 struct futex_q *this, *next; 1918 struct futex_q *this, *next;
@@ -1848,9 +1932,10 @@ retry:
1848 /* 1932 /*
1849 * First take all the futex related locks: 1933 * First take all the futex related locks:
1850 */ 1934 */
1851 down_read(&current->mm->mmap_sem); 1935 if (fshared)
1936 down_read(fshared);
1852 1937
1853 ret = get_futex_key(uaddr, &key); 1938 ret = get_futex_key(uaddr, fshared, &key);
1854 if (unlikely(ret != 0)) 1939 if (unlikely(ret != 0))
1855 goto out; 1940 goto out;
1856 1941
@@ -1909,7 +1994,8 @@ retry_locked:
1909out_unlock: 1994out_unlock:
1910 spin_unlock(&hb->lock); 1995 spin_unlock(&hb->lock);
1911out: 1996out:
1912 up_read(&current->mm->mmap_sem); 1997 if (fshared)
1998 up_read(fshared);
1913 1999
1914 return ret; 2000 return ret;
1915 2001
@@ -1921,15 +2007,16 @@ pi_faulted:
1921 * still holding the mmap_sem. 2007 * still holding the mmap_sem.
1922 */ 2008 */
1923 if (attempt++) { 2009 if (attempt++) {
1924 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 2010 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1925 ret = -EFAULT; 2011 attempt);
2012 if (ret)
1926 goto out_unlock; 2013 goto out_unlock;
1927 }
1928 goto retry_locked; 2014 goto retry_locked;
1929 } 2015 }
1930 2016
1931 spin_unlock(&hb->lock); 2017 spin_unlock(&hb->lock);
1932 up_read(&current->mm->mmap_sem); 2018 if (fshared)
2019 up_read(fshared);
1933 2020
1934 ret = get_user(uval, uaddr); 2021 ret = get_user(uval, uaddr);
1935 if (!ret && (uval != -EFAULT)) 2022 if (!ret && (uval != -EFAULT))
@@ -1981,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1981 struct futex_q *q; 2068 struct futex_q *q;
1982 struct file *filp; 2069 struct file *filp;
1983 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1984 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1985 2073
1986 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -2022,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
2022 } 2110 }
2023 q->pi_state = NULL; 2111 q->pi_state = NULL;
2024 2112
2025 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
2026 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
2027 2116
2028 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
2029 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
2030 kfree(q); 2119 kfree(q);
2031 goto error; 2120 goto error;
2032 } 2121 }
@@ -2038,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
2038 filp->private_data = q; 2127 filp->private_data = q;
2039 2128
2040 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
2041 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
2042 2131
2043 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
2044 fd_install(ret, filp); 2133 fd_install(ret, filp);
@@ -2167,7 +2256,7 @@ retry:
2167 */ 2256 */
2168 if (!pi) { 2257 if (!pi) {
2169 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
2170 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
2171 } 2260 }
2172 } 2261 }
2173 return 0; 2262 return 0;
@@ -2223,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
2223 return; 2312 return;
2224 2313
2225 if (pending) 2314 if (pending)
2226 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
2227 2317
2228 while (entry != &head->list) { 2318 while (entry != &head->list) {
2229 /* 2319 /*
@@ -2253,38 +2343,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2253 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
2254{ 2344{
2255 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
2256 2351
2257 switch (op) { 2352 switch (cmd) {
2258 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
2259 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
2260 break; 2355 break;
2261 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
2262 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
2263 break; 2358 break;
2264 case FUTEX_FD: 2359 case FUTEX_FD:
2265 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
2266 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
2267 break; 2362 break;
2268 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
2269 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
2270 break; 2365 break;
2271 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
2272 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
2273 break; 2368 break;
2274 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
2275 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
2276 break; 2371 break;
2277 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
2278 ret = futex_lock_pi(uaddr, val, timeout, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
2279 break; 2374 break;
2280 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
2281 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
2282 break; 2377 break;
2283 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
2284 ret = futex_lock_pi(uaddr, 0, timeout, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2285 break; 2380 break;
2286 case FUTEX_CMP_REQUEUE_PI: 2381 case FUTEX_CMP_REQUEUE_PI:
2287 ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3); 2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
2288 break; 2383 break;
2289 default: 2384 default:
2290 ret = -ENOSYS; 2385 ret = -ENOSYS;
@@ -2300,23 +2395,24 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2300 struct timespec ts; 2395 struct timespec ts;
2301 ktime_t t, *tp = NULL; 2396 ktime_t t, *tp = NULL;
2302 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
2303 2399
2304 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
2305 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2306 return -EFAULT; 2402 return -EFAULT;
2307 if (!timespec_valid(&ts)) 2403 if (!timespec_valid(&ts))
2308 return -EINVAL; 2404 return -EINVAL;
2309 2405
2310 t = timespec_to_ktime(ts); 2406 t = timespec_to_ktime(ts);
2311 if (op == FUTEX_WAIT) 2407 if (cmd == FUTEX_WAIT)
2312 t = ktime_add(ktime_get(), t); 2408 t = ktime_add(ktime_get(), t);
2313 tp = &t; 2409 tp = &t;
2314 } 2410 }
2315 /* 2411 /*
2316 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
2317 */ 2413 */
2318 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2319 || op == FUTEX_CMP_REQUEUE_PI) 2415 || cmd == FUTEX_CMP_REQUEUE_PI)
2320 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
2321 2417
2322 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);