Diffstat (limited to 'kernel/futex.c')
| -rw-r--r-- | kernel/futex.c | 1095 | 
1 file changed, 937 insertions, 158 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
| @@ -12,6 +12,10 @@ | |||
| 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 
| 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 
| 14 | * | 14 | * | 
| 15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | ||
| 16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
| 17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
| 18 | * | ||
| 15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 19 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 
| 16 | * enough at me, Linus for the original (flawed) idea, Matthew | 20 | * enough at me, Linus for the original (flawed) idea, Matthew | 
| 17 | * Kirkwood for proof-of-concept implementation. | 21 | * Kirkwood for proof-of-concept implementation. | 
| @@ -46,6 +50,8 @@ | |||
| 46 | #include <linux/signal.h> | 50 | #include <linux/signal.h> | 
| 47 | #include <asm/futex.h> | 51 | #include <asm/futex.h> | 
| 48 | 52 | ||
| 53 | #include "rtmutex_common.h" | ||
| 54 | |||
| 49 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 
| 50 | 56 | ||
| 51 | /* | 57 | /* | 
| @@ -63,7 +69,7 @@ union futex_key { | |||
| 63 | int offset; | 69 | int offset; | 
| 64 | } shared; | 70 | } shared; | 
| 65 | struct { | 71 | struct { | 
| 66 | unsigned long uaddr; | 72 | unsigned long address; | 
| 67 | struct mm_struct *mm; | 73 | struct mm_struct *mm; | 
| 68 | int offset; | 74 | int offset; | 
| 69 | } private; | 75 | } private; | 
| @@ -75,6 +81,27 @@ union futex_key { | |||
| 75 | }; | 81 | }; | 
| 76 | 82 | ||
| 77 | /* | 83 | /* | 
| 84 | * Priority Inheritance state: | ||
| 85 | */ | ||
| 86 | struct futex_pi_state { | ||
| 87 | /* | ||
| 88 | * list of 'owned' pi_state instances - these have to be | ||
| 89 | * cleaned up in do_exit() if the task exits prematurely: | ||
| 90 | */ | ||
| 91 | struct list_head list; | ||
| 92 | |||
| 93 | /* | ||
| 94 | * The PI object: | ||
| 95 | */ | ||
| 96 | struct rt_mutex pi_mutex; | ||
| 97 | |||
| 98 | struct task_struct *owner; | ||
| 99 | atomic_t refcount; | ||
| 100 | |||
| 101 | union futex_key key; | ||
| 102 | }; | ||
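On the user side, a PI futex word holds the owner's TID in the low bits plus the FUTEX_WAITERS and FUTEX_OWNER_DIED status bits; the futex_pi_state above is the kernel-side mirror of that word. A small decoding helper, purely illustrative (describe_futex_word() is a made-up name; the masks come from <linux/futex.h>):

```c
#include <linux/futex.h>
#include <stdio.h>

/* Decode a PI futex word: owner TID plus the WAITERS / OWNER_DIED bits. */
static void describe_futex_word(unsigned int uval)
{
	printf("owner tid   : %u\n", uval & FUTEX_TID_MASK);
	printf("has waiters : %s\n", (uval & FUTEX_WAITERS) ? "yes" : "no");
	printf("owner died  : %s\n", (uval & FUTEX_OWNER_DIED) ? "yes" : "no");
}
```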
| 103 | |||
| 104 | /* | ||
| 78 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 105 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 
| 79 | * we can wake only the relevant ones (hashed queues may be shared). | 106 | * we can wake only the relevant ones (hashed queues may be shared). | 
| 80 | * | 107 | * | 
| @@ -87,15 +114,19 @@ struct futex_q { | |||
| 87 | struct list_head list; | 114 | struct list_head list; | 
| 88 | wait_queue_head_t waiters; | 115 | wait_queue_head_t waiters; | 
| 89 | 116 | ||
| 90 | /* Which hash list lock to use. */ | 117 | /* Which hash list lock to use: */ | 
| 91 | spinlock_t *lock_ptr; | 118 | spinlock_t *lock_ptr; | 
| 92 | 119 | ||
| 93 | /* Key which the futex is hashed on. */ | 120 | /* Key which the futex is hashed on: */ | 
| 94 | union futex_key key; | 121 | union futex_key key; | 
| 95 | 122 | ||
| 96 | /* For fd, sigio sent using these. */ | 123 | /* For fd, sigio sent using these: */ | 
| 97 | int fd; | 124 | int fd; | 
| 98 | struct file *filp; | 125 | struct file *filp; | 
| 126 | |||
| 127 | /* Optional priority inheritance state: */ | ||
| 128 | struct futex_pi_state *pi_state; | ||
| 129 | struct task_struct *task; | ||
| 99 | }; | 130 | }; | 
| 100 | 131 | ||
| 101 | /* | 132 | /* | 
| @@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
| 144 | * | 175 | * | 
| 145 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. | 176 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. |
| 146 | */ | 177 | */ | 
| 147 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | 178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) | 
| 148 | { | 179 | { | 
| 180 | unsigned long address = (unsigned long)uaddr; | ||
| 149 | struct mm_struct *mm = current->mm; | 181 | struct mm_struct *mm = current->mm; | 
| 150 | struct vm_area_struct *vma; | 182 | struct vm_area_struct *vma; | 
| 151 | struct page *page; | 183 | struct page *page; | 
| @@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 154 | /* | 186 | /* | 
| 155 | * The futex address must be "naturally" aligned. | 187 | * The futex address must be "naturally" aligned. | 
| 156 | */ | 188 | */ | 
| 157 | key->both.offset = uaddr % PAGE_SIZE; | 189 | key->both.offset = address % PAGE_SIZE; | 
| 158 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 190 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 
| 159 | return -EINVAL; | 191 | return -EINVAL; | 
| 160 | uaddr -= key->both.offset; | 192 | address -= key->both.offset; | 
| 161 | 193 | ||
| 162 | /* | 194 | /* | 
| 163 | * The futex is hashed differently depending on whether | 195 | * The futex is hashed differently depending on whether | 
| 164 | * it's in a shared or private mapping. So check vma first. | 196 | * it's in a shared or private mapping. So check vma first. | 
| 165 | */ | 197 | */ | 
| 166 | vma = find_extend_vma(mm, uaddr); | 198 | vma = find_extend_vma(mm, address); | 
| 167 | if (unlikely(!vma)) | 199 | if (unlikely(!vma)) | 
| 168 | return -EFAULT; | 200 | return -EFAULT; | 
| 169 | 201 | ||
| @@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 184 | */ | 216 | */ | 
| 185 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 217 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 
| 186 | key->private.mm = mm; | 218 | key->private.mm = mm; | 
| 187 | key->private.uaddr = uaddr; | 219 | key->private.address = address; | 
| 188 | return 0; | 220 | return 0; | 
| 189 | } | 221 | } | 
| 190 | 222 | ||
| @@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 194 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 
| 195 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 
| 196 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 
| 197 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) | 
| 198 | + vma->vm_pgoff); | 230 | + vma->vm_pgoff); | 
| 199 | return 0; | 231 | return 0; | 
| 200 | } | 232 | } | 
| @@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
| 205 | * from swap. But that's a lot of code to duplicate here | 237 | * from swap. But that's a lot of code to duplicate here | 
| 206 | * for a rare case, so we simply fetch the page. | 238 | * for a rare case, so we simply fetch the page. | 
| 207 | */ | 239 | */ | 
| 208 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 240 | err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); | 
| 209 | if (err >= 0) { | 241 | if (err >= 0) { | 
| 210 | key->shared.pgoff = | 242 | key->shared.pgoff = | 
| 211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 243 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 
| @@ -246,18 +278,259 @@ static void drop_key_refs(union futex_key *key) | |||
| 246 | } | 278 | } | 
| 247 | } | 279 | } | 
| 248 | 280 | ||
| 249 | static inline int get_futex_value_locked(int *dest, int __user *from) | 281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) | 
| 250 | { | 282 | { | 
| 251 | int ret; | 283 | int ret; | 
| 252 | 284 | ||
| 253 | inc_preempt_count(); | 285 | inc_preempt_count(); | 
| 254 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); | 
| 255 | dec_preempt_count(); | 287 | dec_preempt_count(); | 
| 256 | 288 | ||
| 257 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; | 
| 258 | } | 290 | } | 
| 259 | 291 | ||
| 260 | /* | 292 | /* | 
| 293 | * Fault handling. Called with current->mm->mmap_sem held. | ||
| 294 | */ | ||
| 295 | static int futex_handle_fault(unsigned long address, int attempt) | ||
| 296 | { | ||
| 297 | struct vm_area_struct * vma; | ||
| 298 | struct mm_struct *mm = current->mm; | ||
| 299 | |||
| 300 | if (attempt > 2 || !(vma = find_vma(mm, address)) || | ||
| 301 | vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) | ||
| 302 | return -EFAULT; | ||
| 303 | |||
| 304 | switch (handle_mm_fault(mm, vma, address, 1)) { | ||
| 305 | case VM_FAULT_MINOR: | ||
| 306 | current->min_flt++; | ||
| 307 | break; | ||
| 308 | case VM_FAULT_MAJOR: | ||
| 309 | current->maj_flt++; | ||
| 310 | break; | ||
| 311 | default: | ||
| 312 | return -EFAULT; | ||
| 313 | } | ||
| 314 | return 0; | ||
| 315 | } | ||
| 316 | |||
| 317 | /* | ||
| 318 | * PI code: | ||
| 319 | */ | ||
| 320 | static int refill_pi_state_cache(void) | ||
| 321 | { | ||
| 322 | struct futex_pi_state *pi_state; | ||
| 323 | |||
| 324 | if (likely(current->pi_state_cache)) | ||
| 325 | return 0; | ||
| 326 | |||
| 327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | ||
| 328 | |||
| 329 | if (!pi_state) | ||
| 330 | return -ENOMEM; | ||
| 331 | |||
| 332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
| 333 | INIT_LIST_HEAD(&pi_state->list); | ||
| 334 | /* pi_mutex gets initialized later */ | ||
| 335 | pi_state->owner = NULL; | ||
| 336 | atomic_set(&pi_state->refcount, 1); | ||
| 337 | |||
| 338 | current->pi_state_cache = pi_state; | ||
| 339 | |||
| 340 | return 0; | ||
| 341 | } | ||
| 342 | |||
| 343 | static struct futex_pi_state * alloc_pi_state(void) | ||
| 344 | { | ||
| 345 | struct futex_pi_state *pi_state = current->pi_state_cache; | ||
| 346 | |||
| 347 | WARN_ON(!pi_state); | ||
| 348 | current->pi_state_cache = NULL; | ||
| 349 | |||
| 350 | return pi_state; | ||
| 351 | } | ||
| 352 | |||
| 353 | static void free_pi_state(struct futex_pi_state *pi_state) | ||
| 354 | { | ||
| 355 | if (!atomic_dec_and_test(&pi_state->refcount)) | ||
| 356 | return; | ||
| 357 | |||
| 358 | /* | ||
| 359 | * If pi_state->owner is NULL, the owner is most probably dying | ||
| 360 | * and has cleaned up the pi_state already | ||
| 361 | */ | ||
| 362 | if (pi_state->owner) { | ||
| 363 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
| 364 | list_del_init(&pi_state->list); | ||
| 365 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
| 366 | |||
| 367 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | ||
| 368 | } | ||
| 369 | |||
| 370 | if (current->pi_state_cache) | ||
| 371 | kfree(pi_state); | ||
| 372 | else { | ||
| 373 | /* | ||
| 374 | * pi_state->list is already empty. | ||
| 375 | * clear pi_state->owner. | ||
| 376 | * refcount is at 0 - put it back to 1. | ||
| 377 | */ | ||
| 378 | pi_state->owner = NULL; | ||
| 379 | atomic_set(&pi_state->refcount, 1); | ||
| 380 | current->pi_state_cache = pi_state; | ||
| 381 | } | ||
| 382 | } | ||
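refill_pi_state_cache() and alloc_pi_state() above form a one-deep per-task cache: the kmalloc() that may sleep happens before any locks are taken, so the later "allocation" under the hash-bucket spinlock can never sleep or fail. A self-contained userspace analogue of that pattern (illustrative only; pi_state_like, refill_cache and alloc_from_cache are made-up names, not kernel APIs):

```c
#include <stdlib.h>

struct pi_state_like { int dummy; };

/* One-deep per-thread cache, mirroring current->pi_state_cache. */
static __thread struct pi_state_like *cache;

/* May allocate (i.e. "sleep"): call before entering the critical path. */
static int refill_cache(void)
{
	if (cache)
		return 0;
	cache = calloc(1, sizeof(*cache));
	return cache ? 0 : -1;
}

/* Never allocates and never fails once the cache is filled:
 * safe to call while holding a lock that forbids sleeping. */
static struct pi_state_like *alloc_from_cache(void)
{
	struct pi_state_like *p = cache;

	cache = NULL;
	return p;
}
```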
| 383 | |||
| 384 | /* | ||
| 385 | * Look up the task based on what TID userspace gave us. | ||
| 386 | * We dont trust it. | ||
| 387 | */ | ||
| 388 | static struct task_struct * futex_find_get_task(pid_t pid) | ||
| 389 | { | ||
| 390 | struct task_struct *p; | ||
| 391 | |||
| 392 | rcu_read_lock(); | ||
| 393 | p = find_task_by_pid(pid); | ||
| 394 | if (!p) | ||
| 395 | goto out_unlock; | ||
| 396 | if ((current->euid != p->euid) && (current->euid != p->uid)) { | ||
| 397 | p = NULL; | ||
| 398 | goto out_unlock; | ||
| 399 | } | ||
| 400 | if (p->exit_state != 0) { | ||
| 401 | p = NULL; | ||
| 402 | goto out_unlock; | ||
| 403 | } | ||
| 404 | get_task_struct(p); | ||
| 405 | out_unlock: | ||
| 406 | rcu_read_unlock(); | ||
| 407 | |||
| 408 | return p; | ||
| 409 | } | ||
| 410 | |||
| 411 | /* | ||
| 412 | * This task is holding PI mutexes at exit time => bad. | ||
| 413 | * Kernel cleans up PI-state, but userspace is likely hosed. | ||
| 414 | * (Robust-futex cleanup is separate and might save the day for userspace.) | ||
| 415 | */ | ||
| 416 | void exit_pi_state_list(struct task_struct *curr) | ||
| 417 | { | ||
| 418 | struct list_head *next, *head = &curr->pi_state_list; | ||
| 419 | struct futex_pi_state *pi_state; | ||
| 420 | struct futex_hash_bucket *hb; | ||
| 421 | union futex_key key; | ||
| 422 | |||
| 423 | /* | ||
| 424 | * We are a ZOMBIE and nobody can enqueue itself on | ||
| 425 | * pi_state_list anymore, but we have to be careful | ||
| 426 | * versus waiters unqueueing themselves: | ||
| 427 | */ | ||
| 428 | spin_lock_irq(&curr->pi_lock); | ||
| 429 | while (!list_empty(head)) { | ||
| 430 | |||
| 431 | next = head->next; | ||
| 432 | pi_state = list_entry(next, struct futex_pi_state, list); | ||
| 433 | key = pi_state->key; | ||
| 434 | hb = hash_futex(&key); | ||
| 435 | spin_unlock_irq(&curr->pi_lock); | ||
| 436 | |||
| 437 | spin_lock(&hb->lock); | ||
| 438 | |||
| 439 | spin_lock_irq(&curr->pi_lock); | ||
| 440 | /* | ||
| 441 | * We dropped the pi-lock, so re-check whether this | ||
| 442 | * task still owns the PI-state: | ||
| 443 | */ | ||
| 444 | if (head->next != next) { | ||
| 445 | spin_unlock(&hb->lock); | ||
| 446 | continue; | ||
| 447 | } | ||
| 448 | |||
| 449 | WARN_ON(pi_state->owner != curr); | ||
| 450 | WARN_ON(list_empty(&pi_state->list)); | ||
| 451 | list_del_init(&pi_state->list); | ||
| 452 | pi_state->owner = NULL; | ||
| 453 | spin_unlock_irq(&curr->pi_lock); | ||
| 454 | |||
| 455 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
| 456 | |||
| 457 | spin_unlock(&hb->lock); | ||
| 458 | |||
| 459 | spin_lock_irq(&curr->pi_lock); | ||
| 460 | } | ||
| 461 | spin_unlock_irq(&curr->pi_lock); | ||
| 462 | } | ||
| 463 | |||
| 464 | static int | ||
| 465 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) | ||
| 466 | { | ||
| 467 | struct futex_pi_state *pi_state = NULL; | ||
| 468 | struct futex_q *this, *next; | ||
| 469 | struct list_head *head; | ||
| 470 | struct task_struct *p; | ||
| 471 | pid_t pid; | ||
| 472 | |||
| 473 | head = &hb->chain; | ||
| 474 | |||
| 475 | list_for_each_entry_safe(this, next, head, list) { | ||
| 476 | if (match_futex(&this->key, &me->key)) { | ||
| 477 | /* | ||
| 478 | * Another waiter already exists - bump up | ||
| 479 | * the refcount and return its pi_state: | ||
| 480 | */ | ||
| 481 | pi_state = this->pi_state; | ||
| 482 | /* | ||
| 483 | * Userspace might have messed up non PI and PI futexes | ||
| 484 | */ | ||
| 485 | if (unlikely(!pi_state)) | ||
| 486 | return -EINVAL; | ||
| 487 | |||
| 488 | WARN_ON(!atomic_read(&pi_state->refcount)); | ||
| 489 | |||
| 490 | atomic_inc(&pi_state->refcount); | ||
| 491 | me->pi_state = pi_state; | ||
| 492 | |||
| 493 | return 0; | ||
| 494 | } | ||
| 495 | } | ||
| 496 | |||
| 497 | /* | ||
| 498 | * We are the first waiter - try to look up the real owner and attach | ||
| 499 | * the new pi_state to it, but bail out when the owner died bit is set | ||
| 500 | * and TID = 0: | ||
| 501 | */ | ||
| 502 | pid = uval & FUTEX_TID_MASK; | ||
| 503 | if (!pid && (uval & FUTEX_OWNER_DIED)) | ||
| 504 | return -ESRCH; | ||
| 505 | p = futex_find_get_task(pid); | ||
| 506 | if (!p) | ||
| 507 | return -ESRCH; | ||
| 508 | |||
| 509 | pi_state = alloc_pi_state(); | ||
| 510 | |||
| 511 | /* | ||
| 512 | * Initialize the pi_mutex in locked state and make 'p' | ||
| 513 | * the owner of it: | ||
| 514 | */ | ||
| 515 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | ||
| 516 | |||
| 517 | /* Store the key for possible exit cleanups: */ | ||
| 518 | pi_state->key = me->key; | ||
| 519 | |||
| 520 | spin_lock_irq(&p->pi_lock); | ||
| 521 | WARN_ON(!list_empty(&pi_state->list)); | ||
| 522 | list_add(&pi_state->list, &p->pi_state_list); | ||
| 523 | pi_state->owner = p; | ||
| 524 | spin_unlock_irq(&p->pi_lock); | ||
| 525 | |||
| 526 | put_task_struct(p); | ||
| 527 | |||
| 528 | me->pi_state = pi_state; | ||
| 529 | |||
| 530 | return 0; | ||
| 531 | } | ||
| 532 | |||
| 533 | /* | ||
| 261 | * The hash bucket lock must be held when this is called. | 534 | * The hash bucket lock must be held when this is called. | 
| 262 | * Afterwards, the futex_q must not be accessed. | 535 | * Afterwards, the futex_q must not be accessed. | 
| 263 | */ | 536 | */ | 
| @@ -284,16 +557,105 @@ static void wake_futex(struct futex_q *q) | |||
| 284 | q->lock_ptr = NULL; | 557 | q->lock_ptr = NULL; | 
| 285 | } | 558 | } | 
| 286 | 559 | ||
| 560 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | ||
| 561 | { | ||
| 562 | struct task_struct *new_owner; | ||
| 563 | struct futex_pi_state *pi_state = this->pi_state; | ||
| 564 | u32 curval, newval; | ||
| 565 | |||
| 566 | if (!pi_state) | ||
| 567 | return -EINVAL; | ||
| 568 | |||
| 569 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | ||
| 570 | |||
| 571 | /* | ||
| 572 | * This happens when we have stolen the lock and the original | ||
| 573 | * pending owner did not enqueue itself back on the rt_mutex. | ||
| 574 | * Thats not a tragedy. We know that way, that a lock waiter | ||
| 575 | * is on the fly. We make the futex_q waiter the pending owner. | ||
| 576 | */ | ||
| 577 | if (!new_owner) | ||
| 578 | new_owner = this->task; | ||
| 579 | |||
| 580 | /* | ||
| 581 | * We pass it to the next owner. (The WAITERS bit is always | ||
| 582 | * kept enabled while there is PI state around. We must also | ||
| 583 | * preserve the owner died bit.) | ||
| 584 | */ | ||
| 585 | if (!(uval & FUTEX_OWNER_DIED)) { | ||
| 586 | newval = FUTEX_WAITERS | new_owner->pid; | ||
| 587 | |||
| 588 | inc_preempt_count(); | ||
| 589 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
| 590 | dec_preempt_count(); | ||
| 591 | if (curval == -EFAULT) | ||
| 592 | return -EFAULT; | ||
| 593 | if (curval != uval) | ||
| 594 | return -EINVAL; | ||
| 595 | } | ||
| 596 | |||
| 597 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
| 598 | WARN_ON(list_empty(&pi_state->list)); | ||
| 599 | list_del_init(&pi_state->list); | ||
| 600 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
| 601 | |||
| 602 | spin_lock_irq(&new_owner->pi_lock); | ||
| 603 | WARN_ON(!list_empty(&pi_state->list)); | ||
| 604 | list_add(&pi_state->list, &new_owner->pi_state_list); | ||
| 605 | pi_state->owner = new_owner; | ||
| 606 | spin_unlock_irq(&new_owner->pi_lock); | ||
| 607 | |||
| 608 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
| 609 | |||
| 610 | return 0; | ||
| 611 | } | ||
| 612 | |||
| 613 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
| 614 | { | ||
| 615 | u32 oldval; | ||
| 616 | |||
| 617 | /* | ||
| 618 | * There is no waiter, so we unlock the futex. The owner died | ||
| 619 | * bit has not to be preserved here. We are the owner: | ||
| 620 | */ | ||
| 621 | inc_preempt_count(); | ||
| 622 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
| 623 | dec_preempt_count(); | ||
| 624 | |||
| 625 | if (oldval == -EFAULT) | ||
| 626 | return oldval; | ||
| 627 | if (oldval != uval) | ||
| 628 | return -EAGAIN; | ||
| 629 | |||
| 630 | return 0; | ||
| 631 | } | ||
| 632 | |||
| 633 | /* | ||
| 634 | * Express the locking dependencies for lockdep: | ||
| 635 | */ | ||
| 636 | static inline void | ||
| 637 | double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | ||
| 638 | { | ||
| 639 | if (hb1 <= hb2) { | ||
| 640 | spin_lock(&hb1->lock); | ||
| 641 | if (hb1 < hb2) | ||
| 642 | spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); | ||
| 643 | } else { /* hb1 > hb2 */ | ||
| 644 | spin_lock(&hb2->lock); | ||
| 645 | spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); | ||
| 646 | } | ||
| 647 | } | ||
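double_lock_hb() always takes the lower-addressed bucket lock first (and only one lock when both keys hash to the same bucket), so two callers locking the same pair of buckets in opposite argument order cannot deadlock; spin_lock_nested() just tells lockdep that the second, nested acquisition is deliberate. The same address-ordering rule in a self-contained userspace sketch (illustrative, using pthread mutexes instead of the kernel's spinlocks):

```c
#include <pthread.h>

/* Lock two mutexes in a globally consistent (address) order.
 * If every caller follows this rule, ABBA deadlocks are impossible. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);	/* same bucket: take one lock only */
	} else if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}
```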
| 648 | |||
| 287 | /* | 649 | /* | 
| 288 | * Wake up all waiters hashed on the physical page that is mapped | 650 | * Wake up all waiters hashed on the physical page that is mapped | 
| 289 | * to this virtual address: | 651 | * to this virtual address: | 
| 290 | */ | 652 | */ | 
| 291 | static int futex_wake(unsigned long uaddr, int nr_wake) | 653 | static int futex_wake(u32 __user *uaddr, int nr_wake) | 
| 292 | { | 654 | { | 
| 293 | union futex_key key; | 655 | struct futex_hash_bucket *hb; | 
| 294 | struct futex_hash_bucket *bh; | ||
| 295 | struct list_head *head; | ||
| 296 | struct futex_q *this, *next; | 656 | struct futex_q *this, *next; | 
| 657 | struct list_head *head; | ||
| 658 | union futex_key key; | ||
| 297 | int ret; | 659 | int ret; | 
| 298 | 660 | ||
| 299 | down_read(&current->mm->mmap_sem); | 661 | down_read(&current->mm->mmap_sem); |
| @@ -302,19 +664,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake) | |||
| 302 | if (unlikely(ret != 0)) | 664 | if (unlikely(ret != 0)) | 
| 303 | goto out; | 665 | goto out; | 
| 304 | 666 | ||
| 305 | bh = hash_futex(&key); | 667 | hb = hash_futex(&key); | 
| 306 | spin_lock(&bh->lock); | 668 | spin_lock(&hb->lock); | 
| 307 | head = &bh->chain; | 669 | head = &hb->chain; | 
| 308 | 670 | ||
| 309 | list_for_each_entry_safe(this, next, head, list) { | 671 | list_for_each_entry_safe(this, next, head, list) { | 
| 310 | if (match_futex (&this->key, &key)) { | 672 | if (match_futex (&this->key, &key)) { | 
| 673 | if (this->pi_state) { | ||
| 674 | ret = -EINVAL; | ||
| 675 | break; | ||
| 676 | } | ||
| 311 | wake_futex(this); | 677 | wake_futex(this); | 
| 312 | if (++ret >= nr_wake) | 678 | if (++ret >= nr_wake) | 
| 313 | break; | 679 | break; | 
| 314 | } | 680 | } | 
| 315 | } | 681 | } | 
| 316 | 682 | ||
| 317 | spin_unlock(&bh->lock); | 683 | spin_unlock(&hb->lock); | 
| 318 | out: | 684 | out: | 
| 319 | up_read(&current->mm->mmap_sem); | 685 | up_read(&current->mm->mmap_sem); |
| 320 | return ret; | 686 | return ret; | 
| @@ -324,10 +690,12 @@ out: | |||
| 324 | * Wake up all waiters hashed on the physical page that is mapped | 690 | * Wake up all waiters hashed on the physical page that is mapped | 
| 325 | * to this virtual address: | 691 | * to this virtual address: | 
| 326 | */ | 692 | */ | 
| 327 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | 693 | static int | 
| 694 | futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, | ||
| 695 | int nr_wake, int nr_wake2, int op) | ||
| 328 | { | 696 | { | 
| 329 | union futex_key key1, key2; | 697 | union futex_key key1, key2; | 
| 330 | struct futex_hash_bucket *bh1, *bh2; | 698 | struct futex_hash_bucket *hb1, *hb2; | 
| 331 | struct list_head *head; | 699 | struct list_head *head; | 
| 332 | struct futex_q *this, *next; | 700 | struct futex_q *this, *next; | 
| 333 | int ret, op_ret, attempt = 0; | 701 | int ret, op_ret, attempt = 0; | 
| @@ -342,27 +710,25 @@ retryfull: | |||
| 342 | if (unlikely(ret != 0)) | 710 | if (unlikely(ret != 0)) | 
| 343 | goto out; | 711 | goto out; | 
| 344 | 712 | ||
| 345 | bh1 = hash_futex(&key1); | 713 | hb1 = hash_futex(&key1); | 
| 346 | bh2 = hash_futex(&key2); | 714 | hb2 = hash_futex(&key2); | 
| 347 | 715 | ||
| 348 | retry: | 716 | retry: | 
| 349 | if (bh1 < bh2) | 717 | double_lock_hb(hb1, hb2); | 
| 350 | spin_lock(&bh1->lock); | ||
| 351 | spin_lock(&bh2->lock); | ||
| 352 | if (bh1 > bh2) | ||
| 353 | spin_lock(&bh1->lock); | ||
| 354 | 718 | ||
| 355 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | 719 | op_ret = futex_atomic_op_inuser(op, uaddr2); | 
| 356 | if (unlikely(op_ret < 0)) { | 720 | if (unlikely(op_ret < 0)) { | 
| 357 | int dummy; | 721 | u32 dummy; | 
| 358 | 722 | ||
| 359 | spin_unlock(&bh1->lock); | 723 | spin_unlock(&hb1->lock); | 
| 360 | if (bh1 != bh2) | 724 | if (hb1 != hb2) | 
| 361 | spin_unlock(&bh2->lock); | 725 | spin_unlock(&hb2->lock); | 
| 362 | 726 | ||
| 363 | #ifndef CONFIG_MMU | 727 | #ifndef CONFIG_MMU | 
| 364 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | 728 | /* | 
| 365 | * but we might get them from range checking */ | 729 | * we don't get EFAULT from MMU faults if we don't have an MMU, | 
| 730 | * but we might get them from range checking | ||
| 731 | */ | ||
| 366 | ret = op_ret; | 732 | ret = op_ret; | 
| 367 | goto out; | 733 | goto out; | 
| 368 | #endif | 734 | #endif | 
| @@ -372,47 +738,36 @@ retry: | |||
| 372 | goto out; | 738 | goto out; | 
| 373 | } | 739 | } | 
| 374 | 740 | ||
| 375 | /* futex_atomic_op_inuser needs to both read and write | 741 | /* | 
| 742 | * futex_atomic_op_inuser needs to both read and write | ||
| 376 | * *(int __user *)uaddr2, but we can't modify it | 743 | * *(int __user *)uaddr2, but we can't modify it | 
| 377 | * non-atomically. Therefore, if get_user below is not | 744 | * non-atomically. Therefore, if get_user below is not | 
| 378 | * enough, we need to handle the fault ourselves, while | 745 | * enough, we need to handle the fault ourselves, while | 
| 379 | * still holding the mmap_sem. */ | 746 | * still holding the mmap_sem. | 
| 747 | */ | ||
| 380 | if (attempt++) { | 748 | if (attempt++) { | 
| 381 | struct vm_area_struct * vma; | 749 | if (futex_handle_fault((unsigned long)uaddr2, | 
| 382 | struct mm_struct *mm = current->mm; | 750 | attempt)) { | 
| 383 | 751 | ret = -EFAULT; | |
| 384 | ret = -EFAULT; | ||
| 385 | if (attempt >= 2 || | ||
| 386 | !(vma = find_vma(mm, uaddr2)) || | ||
| 387 | vma->vm_start > uaddr2 || | ||
| 388 | !(vma->vm_flags & VM_WRITE)) | ||
| 389 | goto out; | ||
| 390 | |||
| 391 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
| 392 | case VM_FAULT_MINOR: | ||
| 393 | current->min_flt++; | ||
| 394 | break; | ||
| 395 | case VM_FAULT_MAJOR: | ||
| 396 | current->maj_flt++; | ||
| 397 | break; | ||
| 398 | default: | ||
| 399 | goto out; | 752 | goto out; | 
| 400 | } | 753 | } | 
| 401 | goto retry; | 754 | goto retry; | 
| 402 | } | 755 | } | 
| 403 | 756 | ||
| 404 | /* If we would have faulted, release mmap_sem, | 757 | /* | 
| 405 | * fault it in and start all over again. */ | 758 | * If we would have faulted, release mmap_sem, | 
| 759 | * fault it in and start all over again. | ||
| 760 | */ | ||
| 406 | up_read(&current->mm->mmap_sem); | 761 | up_read(&current->mm->mmap_sem); |
| 407 | 762 | ||
| 408 | ret = get_user(dummy, (int __user *)uaddr2); | 763 | ret = get_user(dummy, uaddr2); | 
| 409 | if (ret) | 764 | if (ret) | 
| 410 | return ret; | 765 | return ret; | 
| 411 | 766 | ||
| 412 | goto retryfull; | 767 | goto retryfull; | 
| 413 | } | 768 | } | 
| 414 | 769 | ||
| 415 | head = &bh1->chain; | 770 | head = &hb1->chain; | 
| 416 | 771 | ||
| 417 | list_for_each_entry_safe(this, next, head, list) { | 772 | list_for_each_entry_safe(this, next, head, list) { | 
| 418 | if (match_futex (&this->key, &key1)) { | 773 | if (match_futex (&this->key, &key1)) { | 
| @@ -423,7 +778,7 @@ retry: | |||
| 423 | } | 778 | } | 
| 424 | 779 | ||
| 425 | if (op_ret > 0) { | 780 | if (op_ret > 0) { | 
| 426 | head = &bh2->chain; | 781 | head = &hb2->chain; | 
| 427 | 782 | ||
| 428 | op_ret = 0; | 783 | op_ret = 0; | 
| 429 | list_for_each_entry_safe(this, next, head, list) { | 784 | list_for_each_entry_safe(this, next, head, list) { | 
| @@ -436,9 +791,9 @@ retry: | |||
| 436 | ret += op_ret; | 791 | ret += op_ret; | 
| 437 | } | 792 | } | 
| 438 | 793 | ||
| 439 | spin_unlock(&bh1->lock); | 794 | spin_unlock(&hb1->lock); | 
| 440 | if (bh1 != bh2) | 795 | if (hb1 != hb2) | 
| 441 | spin_unlock(&bh2->lock); | 796 | spin_unlock(&hb2->lock); | 
| 442 | out: | 797 | out: | 
| 443 | up_read(&current->mm->mmap_sem); | 798 | up_read(&current->mm->mmap_sem); |
| 444 | return ret; | 799 | return ret; | 
| @@ -448,11 +803,11 @@ out: | |||
| 448 | * Requeue all waiters hashed on one physical page to another | 803 | * Requeue all waiters hashed on one physical page to another | 
| 449 | * physical page. | 804 | * physical page. | 
| 450 | */ | 805 | */ | 
| 451 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | 806 | static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, | 
| 452 | int nr_wake, int nr_requeue, int *valp) | 807 | int nr_wake, int nr_requeue, u32 *cmpval) | 
| 453 | { | 808 | { | 
| 454 | union futex_key key1, key2; | 809 | union futex_key key1, key2; | 
| 455 | struct futex_hash_bucket *bh1, *bh2; | 810 | struct futex_hash_bucket *hb1, *hb2; | 
| 456 | struct list_head *head1; | 811 | struct list_head *head1; | 
| 457 | struct futex_q *this, *next; | 812 | struct futex_q *this, *next; | 
| 458 | int ret, drop_count = 0; | 813 | int ret, drop_count = 0; | 
| @@ -467,68 +822,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | |||
| 467 | if (unlikely(ret != 0)) | 822 | if (unlikely(ret != 0)) | 
| 468 | goto out; | 823 | goto out; | 
| 469 | 824 | ||
| 470 | bh1 = hash_futex(&key1); | 825 | hb1 = hash_futex(&key1); | 
| 471 | bh2 = hash_futex(&key2); | 826 | hb2 = hash_futex(&key2); | 
| 472 | 827 | ||
| 473 | if (bh1 < bh2) | 828 | double_lock_hb(hb1, hb2); | 
| 474 | spin_lock(&bh1->lock); | ||
| 475 | spin_lock(&bh2->lock); | ||
| 476 | if (bh1 > bh2) | ||
| 477 | spin_lock(&bh1->lock); | ||
| 478 | 829 | ||
| 479 | if (likely(valp != NULL)) { | 830 | if (likely(cmpval != NULL)) { | 
| 480 | int curval; | 831 | u32 curval; | 
| 481 | 832 | ||
| 482 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | 833 | ret = get_futex_value_locked(&curval, uaddr1); | 
| 483 | 834 | ||
| 484 | if (unlikely(ret)) { | 835 | if (unlikely(ret)) { | 
| 485 | spin_unlock(&bh1->lock); | 836 | spin_unlock(&hb1->lock); | 
| 486 | if (bh1 != bh2) | 837 | if (hb1 != hb2) | 
| 487 | spin_unlock(&bh2->lock); | 838 | spin_unlock(&hb2->lock); | 
| 488 | 839 | ||
| 489 | /* If we would have faulted, release mmap_sem, fault | 840 | /* | 
| 841 | * If we would have faulted, release mmap_sem, fault | ||
| 490 | * it in and start all over again. | 842 | * it in and start all over again. | 
| 491 | */ | 843 | */ | 
| 492 | up_read(&current->mm->mmap_sem); | 844 | up_read(&current->mm->mmap_sem); |
| 493 | 845 | ||
| 494 | ret = get_user(curval, (int __user *)uaddr1); | 846 | ret = get_user(curval, uaddr1); | 
| 495 | 847 | ||
| 496 | if (!ret) | 848 | if (!ret) | 
| 497 | goto retry; | 849 | goto retry; | 
| 498 | 850 | ||
| 499 | return ret; | 851 | return ret; | 
| 500 | } | 852 | } | 
| 501 | if (curval != *valp) { | 853 | if (curval != *cmpval) { | 
| 502 | ret = -EAGAIN; | 854 | ret = -EAGAIN; | 
| 503 | goto out_unlock; | 855 | goto out_unlock; | 
| 504 | } | 856 | } | 
| 505 | } | 857 | } | 
| 506 | 858 | ||
| 507 | head1 = &bh1->chain; | 859 | head1 = &hb1->chain; | 
| 508 | list_for_each_entry_safe(this, next, head1, list) { | 860 | list_for_each_entry_safe(this, next, head1, list) { | 
| 509 | if (!match_futex (&this->key, &key1)) | 861 | if (!match_futex (&this->key, &key1)) | 
| 510 | continue; | 862 | continue; | 
| 511 | if (++ret <= nr_wake) { | 863 | if (++ret <= nr_wake) { | 
| 512 | wake_futex(this); | 864 | wake_futex(this); | 
| 513 | } else { | 865 | } else { | 
| 514 | list_move_tail(&this->list, &bh2->chain); | 866 | /* | 
| 515 | this->lock_ptr = &bh2->lock; | 867 | * If key1 and key2 hash to the same bucket, no need to | 
| 868 | * requeue. | ||
| 869 | */ | ||
| 870 | if (likely(head1 != &hb2->chain)) { | ||
| 871 | list_move_tail(&this->list, &hb2->chain); | ||
| 872 | this->lock_ptr = &hb2->lock; | ||
| 873 | } | ||
| 516 | this->key = key2; | 874 | this->key = key2; | 
| 517 | get_key_refs(&key2); | 875 | get_key_refs(&key2); | 
| 518 | drop_count++; | 876 | drop_count++; | 
| 519 | 877 | ||
| 520 | if (ret - nr_wake >= nr_requeue) | 878 | if (ret - nr_wake >= nr_requeue) | 
| 521 | break; | 879 | break; | 
| 522 | /* Make sure to stop if key1 == key2 */ | ||
| 523 | if (head1 == &bh2->chain && head1 != &next->list) | ||
| 524 | head1 = &this->list; | ||
| 525 | } | 880 | } | 
| 526 | } | 881 | } | 
| 527 | 882 | ||
| 528 | out_unlock: | 883 | out_unlock: | 
| 529 | spin_unlock(&bh1->lock); | 884 | spin_unlock(&hb1->lock); | 
| 530 | if (bh1 != bh2) | 885 | if (hb1 != hb2) | 
| 531 | spin_unlock(&bh2->lock); | 886 | spin_unlock(&hb2->lock); | 
| 532 | 887 | ||
| 533 | /* drop_key_refs() must be called outside the spinlocks. */ | 888 | /* drop_key_refs() must be called outside the spinlocks. */ | 
| 534 | while (--drop_count >= 0) | 889 | while (--drop_count >= 0) | 
| @@ -543,7 +898,7 @@ out: | |||
| 543 | static inline struct futex_hash_bucket * | 898 | static inline struct futex_hash_bucket * | 
| 544 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 899 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 
| 545 | { | 900 | { | 
| 546 | struct futex_hash_bucket *bh; | 901 | struct futex_hash_bucket *hb; | 
| 547 | 902 | ||
| 548 | q->fd = fd; | 903 | q->fd = fd; | 
| 549 | q->filp = filp; | 904 | q->filp = filp; | 
| @@ -551,23 +906,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
| 551 | init_waitqueue_head(&q->waiters); | 906 | init_waitqueue_head(&q->waiters); | 
| 552 | 907 | ||
| 553 | get_key_refs(&q->key); | 908 | get_key_refs(&q->key); | 
| 554 | bh = hash_futex(&q->key); | 909 | hb = hash_futex(&q->key); | 
| 555 | q->lock_ptr = &bh->lock; | 910 | q->lock_ptr = &hb->lock; | 
| 556 | 911 | ||
| 557 | spin_lock(&bh->lock); | 912 | spin_lock(&hb->lock); | 
| 558 | return bh; | 913 | return hb; | 
| 559 | } | 914 | } | 
| 560 | 915 | ||
| 561 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | 916 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) | 
| 562 | { | 917 | { | 
| 563 | list_add_tail(&q->list, &bh->chain); | 918 | list_add_tail(&q->list, &hb->chain); | 
| 564 | spin_unlock(&bh->lock); | 919 | q->task = current; | 
| 920 | spin_unlock(&hb->lock); | ||
| 565 | } | 921 | } | 
| 566 | 922 | ||
| 567 | static inline void | 923 | static inline void | 
| 568 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | 924 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) | 
| 569 | { | 925 | { | 
| 570 | spin_unlock(&bh->lock); | 926 | spin_unlock(&hb->lock); | 
| 571 | drop_key_refs(&q->key); | 927 | drop_key_refs(&q->key); | 
| 572 | } | 928 | } | 
| 573 | 929 | ||
| @@ -579,20 +935,22 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | |||
| 579 | /* The key must be already stored in q->key. */ | 935 | /* The key must be already stored in q->key. */ | 
| 580 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 936 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 
| 581 | { | 937 | { | 
| 582 | struct futex_hash_bucket *bh; | 938 | struct futex_hash_bucket *hb; | 
| 583 | bh = queue_lock(q, fd, filp); | 939 | |
| 584 | __queue_me(q, bh); | 940 | hb = queue_lock(q, fd, filp); | 
| 941 | __queue_me(q, hb); | ||
| 585 | } | 942 | } | 
| 586 | 943 | ||
| 587 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 944 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 
| 588 | static int unqueue_me(struct futex_q *q) | 945 | static int unqueue_me(struct futex_q *q) | 
| 589 | { | 946 | { | 
| 590 | int ret = 0; | ||
| 591 | spinlock_t *lock_ptr; | 947 | spinlock_t *lock_ptr; | 
| 948 | int ret = 0; | ||
| 592 | 949 | ||
| 593 | /* In the common case we don't take the spinlock, which is nice. */ | 950 | /* In the common case we don't take the spinlock, which is nice. */ | 
| 594 | retry: | 951 | retry: | 
| 595 | lock_ptr = q->lock_ptr; | 952 | lock_ptr = q->lock_ptr; | 
| 953 | barrier(); | ||
| 596 | if (lock_ptr != 0) { | 954 | if (lock_ptr != 0) { | 
| 597 | spin_lock(lock_ptr); | 955 | spin_lock(lock_ptr); | 
| 598 | /* | 956 | /* | 
| @@ -614,6 +972,9 @@ static int unqueue_me(struct futex_q *q) | |||
| 614 | } | 972 | } | 
| 615 | WARN_ON(list_empty(&q->list)); | 973 | WARN_ON(list_empty(&q->list)); | 
| 616 | list_del(&q->list); | 974 | list_del(&q->list); | 
| 975 | |||
| 976 | BUG_ON(q->pi_state); | ||
| 977 | |||
| 617 | spin_unlock(lock_ptr); | 978 | spin_unlock(lock_ptr); | 
| 618 | ret = 1; | 979 | ret = 1; | 
| 619 | } | 980 | } | 
| @@ -622,21 +983,42 @@ static int unqueue_me(struct futex_q *q) | |||
| 622 | return ret; | 983 | return ret; | 
| 623 | } | 984 | } | 
| 624 | 985 | ||
| 625 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | 986 | /* | 
| 987 | * PI futexes can not be requeued and must remove themself from the | ||
| 988 | * hash bucket. The hash bucket lock is held on entry and dropped here. | ||
| 989 | */ | ||
| 990 | static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | ||
| 991 | { | ||
| 992 | WARN_ON(list_empty(&q->list)); | ||
| 993 | list_del(&q->list); | ||
| 994 | |||
| 995 | BUG_ON(!q->pi_state); | ||
| 996 | free_pi_state(q->pi_state); | ||
| 997 | q->pi_state = NULL; | ||
| 998 | |||
| 999 | spin_unlock(&hb->lock); | ||
| 1000 | |||
| 1001 | drop_key_refs(&q->key); | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | ||
| 626 | { | 1005 | { | 
| 627 | DECLARE_WAITQUEUE(wait, current); | 1006 | struct task_struct *curr = current; | 
| 628 | int ret, curval; | 1007 | DECLARE_WAITQUEUE(wait, curr); | 
| 1008 | struct futex_hash_bucket *hb; | ||
| 629 | struct futex_q q; | 1009 | struct futex_q q; | 
| 630 | struct futex_hash_bucket *bh; | 1010 | u32 uval; | 
| 1011 | int ret; | ||
| 631 | 1012 | ||
| 1013 | q.pi_state = NULL; | ||
| 632 | retry: | 1014 | retry: | 
| 633 | down_read(&current->mm->mmap_sem); | 1015 | down_read(&curr->mm->mmap_sem); |
| 634 | 1016 | ||
| 635 | ret = get_futex_key(uaddr, &q.key); | 1017 | ret = get_futex_key(uaddr, &q.key); | 
| 636 | if (unlikely(ret != 0)) | 1018 | if (unlikely(ret != 0)) | 
| 637 | goto out_release_sem; | 1019 | goto out_release_sem; | 
| 638 | 1020 | ||
| 639 | bh = queue_lock(&q, -1, NULL); | 1021 | hb = queue_lock(&q, -1, NULL); | 
| 640 | 1022 | ||
| 641 | /* | 1023 | /* | 
| 642 | * Access the page AFTER the futex is queued. | 1024 | * Access the page AFTER the futex is queued. | 
| @@ -658,37 +1040,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
| 658 | * We hold the mmap semaphore, so the mapping cannot have changed | 1040 | * We hold the mmap semaphore, so the mapping cannot have changed | 
| 659 | * since we looked it up in get_futex_key. | 1041 | * since we looked it up in get_futex_key. | 
| 660 | */ | 1042 | */ | 
| 661 | 1043 | ret = get_futex_value_locked(&uval, uaddr); | |
| 662 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
| 663 | 1044 | ||
| 664 | if (unlikely(ret)) { | 1045 | if (unlikely(ret)) { | 
| 665 | queue_unlock(&q, bh); | 1046 | queue_unlock(&q, hb); | 
| 666 | 1047 | ||
| 667 | /* If we would have faulted, release mmap_sem, fault it in and | 1048 | /* | 
| 1049 | * If we would have faulted, release mmap_sem, fault it in and | ||
| 668 | * start all over again. | 1050 | * start all over again. | 
| 669 | */ | 1051 | */ | 
| 670 | up_read(&current->mm->mmap_sem); | 1052 | up_read(&curr->mm->mmap_sem); |
| 671 | 1053 | ||
| 672 | ret = get_user(curval, (int __user *)uaddr); | 1054 | ret = get_user(uval, uaddr); | 
| 673 | 1055 | ||
| 674 | if (!ret) | 1056 | if (!ret) | 
| 675 | goto retry; | 1057 | goto retry; | 
| 676 | return ret; | 1058 | return ret; | 
| 677 | } | 1059 | } | 
| 678 | if (curval != val) { | 1060 | ret = -EWOULDBLOCK; | 
| 679 | ret = -EWOULDBLOCK; | 1061 | if (uval != val) | 
| 680 | queue_unlock(&q, bh); | 1062 | goto out_unlock_release_sem; | 
| 681 | goto out_release_sem; | ||
| 682 | } | ||
| 683 | 1063 | ||
| 684 | /* Only actually queue if *uaddr contained val. */ | 1064 | /* Only actually queue if *uaddr contained val. */ | 
| 685 | __queue_me(&q, bh); | 1065 | __queue_me(&q, hb); | 
| 686 | 1066 | ||
| 687 | /* | 1067 | /* | 
| 688 | * Now the futex is queued and we have checked the data, we | 1068 | * Now the futex is queued and we have checked the data, we | 
| 689 | * don't want to hold mmap_sem while we sleep. | 1069 | * don't want to hold mmap_sem while we sleep. | 
| 690 | */ | 1070 | */ | 
| 691 | up_read(&current->mm->mmap_sem); | 1071 | up_read(&curr->mm->mmap_sem); |
| 692 | 1072 | ||
| 693 | /* | 1073 | /* | 
| 694 | * There might have been scheduling since the queue_me(), as we | 1074 | * There might have been scheduling since the queue_me(), as we | 
| @@ -720,12 +1100,367 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
| 720 | return 0; | 1100 | return 0; | 
| 721 | if (time == 0) | 1101 | if (time == 0) | 
| 722 | return -ETIMEDOUT; | 1102 | return -ETIMEDOUT; | 
| 723 | /* We expect signal_pending(current), but another thread may | 1103 | /* | 
| 724 | * have handled it for us already. */ | 1104 | * We expect signal_pending(current), but another thread may | 
| 1105 | * have handled it for us already. | ||
| 1106 | */ | ||
| 725 | return -EINTR; | 1107 | return -EINTR; | 
| 726 | 1108 | ||
| 1109 | out_unlock_release_sem: | ||
| 1110 | queue_unlock(&q, hb); | ||
| 1111 | |||
| 727 | out_release_sem: | 1112 | out_release_sem: | 
| 1113 | up_read(&curr->mm->mmap_sem); | ||
| 1114 | return ret; | ||
| 1115 | } | ||
| 1116 | |||
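futex_wait() only queues and sleeps if *uaddr still contains val at queue time, returning -EWOULDBLOCK otherwise; that check is what makes the classic userspace wait/wake pattern race-free. A minimal, self-contained illustration of that contract using the plain (non-PI) FUTEX_WAIT/FUTEX_WAKE operations (wait_for_change() and signal_change() are made-up helper names):

```c
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

static int futex_word;

/* Sleep until futex_word moves away from 'expected'. EWOULDBLOCK means it
 * had already changed before we were queued, which counts as success. */
static int wait_for_change(int expected)
{
	long err = syscall(SYS_futex, &futex_word, FUTEX_WAIT,
			   expected, NULL, NULL, 0);

	if (err == -1 && errno != EWOULDBLOCK && errno != EINTR)
		return -1;
	return 0;
}

/* Change the word, then wake at most one waiter. */
static void signal_change(void)
{
	__sync_fetch_and_add(&futex_word, 1);
	syscall(SYS_futex, &futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
}
```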
| 1117 | /* | ||
| 1118 | * Userspace tried a 0 -> TID atomic transition of the futex value | ||
| 1119 | * and failed. The kernel side here does the whole locking operation: | ||
| 1120 | * if there are waiters then it will block, it does PI, etc. (Due to | ||
| 1121 | * races the kernel might see a 0 value of the futex too.) | ||
| 1122 | */ | ||
| 1123 | static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | ||
| 1124 | long nsec, int trylock) | ||
| 1125 | { | ||
| 1126 | struct hrtimer_sleeper timeout, *to = NULL; | ||
| 1127 | struct task_struct *curr = current; | ||
| 1128 | struct futex_hash_bucket *hb; | ||
| 1129 | u32 uval, newval, curval; | ||
| 1130 | struct futex_q q; | ||
| 1131 | int ret, attempt = 0; | ||
| 1132 | |||
| 1133 | if (refill_pi_state_cache()) | ||
| 1134 | return -ENOMEM; | ||
| 1135 | |||
| 1136 | if (sec != MAX_SCHEDULE_TIMEOUT) { | ||
| 1137 | to = &timeout; | ||
| 1138 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
| 1139 | hrtimer_init_sleeper(to, current); | ||
| 1140 | to->timer.expires = ktime_set(sec, nsec); | ||
| 1141 | } | ||
| 1142 | |||
| 1143 | q.pi_state = NULL; | ||
| 1144 | retry: | ||
| 1145 | down_read(&curr->mm->mmap_sem); | ||
| 1146 | |||
| 1147 | ret = get_futex_key(uaddr, &q.key); | ||
| 1148 | if (unlikely(ret != 0)) | ||
| 1149 | goto out_release_sem; | ||
| 1150 | |||
| 1151 | hb = queue_lock(&q, -1, NULL); | ||
| 1152 | |||
| 1153 | retry_locked: | ||
| 1154 | /* | ||
| 1155 | * To avoid races, we attempt to take the lock here again | ||
| 1156 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
| 1157 | * the locks. It will most likely not succeed. | ||
| 1158 | */ | ||
| 1159 | newval = current->pid; | ||
| 1160 | |||
| 1161 | inc_preempt_count(); | ||
| 1162 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
| 1163 | dec_preempt_count(); | ||
| 1164 | |||
| 1165 | if (unlikely(curval == -EFAULT)) | ||
| 1166 | goto uaddr_faulted; | ||
| 1167 | |||
| 1168 | /* We own the lock already */ | ||
| 1169 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | ||
| 1170 | if (!detect && 0) | ||
| 1171 | force_sig(SIGKILL, current); | ||
| 1172 | ret = -EDEADLK; | ||
| 1173 | goto out_unlock_release_sem; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | /* | ||
| 1177 | * Surprise - we got the lock. Just return | ||
| 1178 | * to userspace: | ||
| 1179 | */ | ||
| 1180 | if (unlikely(!curval)) | ||
| 1181 | goto out_unlock_release_sem; | ||
| 1182 | |||
| 1183 | uval = curval; | ||
| 1184 | newval = uval | FUTEX_WAITERS; | ||
| 1185 | |||
| 1186 | inc_preempt_count(); | ||
| 1187 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
| 1188 | dec_preempt_count(); | ||
| 1189 | |||
| 1190 | if (unlikely(curval == -EFAULT)) | ||
| 1191 | goto uaddr_faulted; | ||
| 1192 | if (unlikely(curval != uval)) | ||
| 1193 | goto retry_locked; | ||
| 1194 | |||
| 1195 | /* | ||
| 1196 | * We dont have the lock. Look up the PI state (or create it if | ||
| 1197 | * we are the first waiter): | ||
| 1198 | */ | ||
| 1199 | ret = lookup_pi_state(uval, hb, &q); | ||
| 1200 | |||
| 1201 | if (unlikely(ret)) { | ||
| 1202 | /* | ||
| 1203 | * There were no waiters and the owner task lookup | ||
| 1204 | * failed. When the OWNER_DIED bit is set, then we | ||
| 1205 | * know that this is a robust futex and we actually | ||
| 1206 | * take the lock. This is safe as we are protected by | ||
| 1207 | * the hash bucket lock. We also set the waiters bit | ||
| 1208 | * unconditionally here, to simplify glibc handling of | ||
| 1209 | * multiple tasks racing to acquire the lock and | ||
| 1210 | * cleanup the problems which were left by the dead | ||
| 1211 | * owner. | ||
| 1212 | */ | ||
| 1213 | if (curval & FUTEX_OWNER_DIED) { | ||
| 1214 | uval = newval; | ||
| 1215 | newval = current->pid | | ||
| 1216 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
| 1217 | |||
| 1218 | inc_preempt_count(); | ||
| 1219 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
| 1220 | uval, newval); | ||
| 1221 | dec_preempt_count(); | ||
| 1222 | |||
| 1223 | if (unlikely(curval == -EFAULT)) | ||
| 1224 | goto uaddr_faulted; | ||
| 1225 | if (unlikely(curval != uval)) | ||
| 1226 | goto retry_locked; | ||
| 1227 | ret = 0; | ||
| 1228 | } | ||
| 1229 | goto out_unlock_release_sem; | ||
| 1230 | } | ||
| 1231 | |||
| 1232 | /* | ||
| 1233 | * Only actually queue now that the atomic ops are done: | ||
| 1234 | */ | ||
| 1235 | __queue_me(&q, hb); | ||
| 1236 | |||
| 1237 | /* | ||
| 1238 | * Now the futex is queued and we have checked the data, we | ||
| 1239 | * don't want to hold mmap_sem while we sleep. | ||
| 1240 | */ | ||
| 1241 | up_read(&curr->mm->mmap_sem); | ||
| 1242 | |||
| 1243 | WARN_ON(!q.pi_state); | ||
| 1244 | /* | ||
| 1245 | * Block on the PI mutex: | ||
| 1246 | */ | ||
| 1247 | if (!trylock) | ||
| 1248 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | ||
| 1249 | else { | ||
| 1250 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
| 1251 | /* Fixup the trylock return value: */ | ||
| 1252 | ret = ret ? 0 : -EWOULDBLOCK; | ||
| 1253 | } | ||
| 1254 | |||
| 1255 | down_read(&curr->mm->mmap_sem); | ||
| 1256 | spin_lock(q.lock_ptr); | ||
| 1257 | |||
| 1258 | /* | ||
| 1259 | * Got the lock. We might not be the anticipated owner if we | ||
| 1260 | * did a lock-steal - fix up the PI-state in that case. | ||
| 1261 | */ | ||
| 1262 | if (!ret && q.pi_state->owner != curr) { | ||
| 1263 | u32 newtid = current->pid | FUTEX_WAITERS; | ||
| 1264 | |||
| 1265 | /* Owner died? */ | ||
| 1266 | if (q.pi_state->owner != NULL) { | ||
| 1267 | spin_lock_irq(&q.pi_state->owner->pi_lock); | ||
| 1268 | WARN_ON(list_empty(&q.pi_state->list)); | ||
| 1269 | list_del_init(&q.pi_state->list); | ||
| 1270 | spin_unlock_irq(&q.pi_state->owner->pi_lock); | ||
| 1271 | } else | ||
| 1272 | newtid |= FUTEX_OWNER_DIED; | ||
| 1273 | |||
| 1274 | q.pi_state->owner = current; | ||
| 1275 | |||
| 1276 | spin_lock_irq(&current->pi_lock); | ||
| 1277 | WARN_ON(!list_empty(&q.pi_state->list)); | ||
| 1278 | list_add(&q.pi_state->list, ¤t->pi_state_list); | ||
| 1279 | spin_unlock_irq(&current->pi_lock); | ||
| 1280 | |||
| 1281 | /* Unqueue and drop the lock */ | ||
| 1282 | unqueue_me_pi(&q, hb); | ||
| 1283 | up_read(&curr->mm->mmap_sem); | ||
| 1284 | /* | ||
| 1285 | * We own it, so we have to replace the pending owner | ||
| 1286 | * TID. This must be atomic as we have preserve the | ||
| 1287 | * owner died bit here. | ||
| 1288 | */ | ||
| 1289 | ret = get_user(uval, uaddr); | ||
| 1290 | while (!ret) { | ||
| 1291 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
| 1292 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
| 1293 | uval, newval); | ||
| 1294 | if (curval == -EFAULT) | ||
| 1295 | ret = -EFAULT; | ||
| 1296 | if (curval == uval) | ||
| 1297 | break; | ||
| 1298 | uval = curval; | ||
| 1299 | } | ||
| 1300 | } else { | ||
| 1301 | /* | ||
| 1302 | * Catch the rare case, where the lock was released | ||
| 1303 | * when we were on the way back before we locked | ||
| 1304 | * the hash bucket. | ||
| 1305 | */ | ||
| 1306 | if (ret && q.pi_state->owner == curr) { | ||
| 1307 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
| 1308 | ret = 0; | ||
| 1309 | } | ||
| 1310 | /* Unqueue and drop the lock */ | ||
| 1311 | unqueue_me_pi(&q, hb); | ||
| 1312 | up_read(&curr->mm->mmap_sem); | ||
| 1313 | } | ||
| 1314 | |||
| 1315 | if (!detect && ret == -EDEADLK && 0) | ||
| 1316 | force_sig(SIGKILL, current); | ||
| 1317 | |||
| 1318 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | ||
| 1319 | |||
| 1320 | out_unlock_release_sem: | ||
| 1321 | queue_unlock(&q, hb); | ||
| 1322 | |||
| 1323 | out_release_sem: | ||
| 1324 | up_read(&curr->mm->mmap_sem); | ||
| 1325 | return ret; | ||
| 1326 | |||
| 1327 | uaddr_faulted: | ||
| 1328 | /* | ||
| 1329 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
| 1330 | * non-atomically. Therefore, if get_user below is not | ||
| 1331 | * enough, we need to handle the fault ourselves, while | ||
| 1332 | * still holding the mmap_sem. | ||
| 1333 | */ | ||
| 1334 | if (attempt++) { | ||
| 1335 | if (futex_handle_fault((unsigned long)uaddr, attempt)) { | ||
| 1336 | ret = -EFAULT; | ||
| 1337 | goto out_unlock_release_sem; | ||
| 1338 | } | ||
| 1339 | goto retry_locked; | ||
| 1340 | } | ||
| 1341 | |||
| 1342 | queue_unlock(&q, hb); | ||
| 1343 | up_read(&curr->mm->mmap_sem); | ||
| 1344 | |||
| 1345 | ret = get_user(uval, uaddr); | ||
| 1346 | if (!ret && (uval != -EFAULT)) | ||
| 1347 | goto retry; | ||
| 1348 | |||
| 1349 | return ret; | ||
| 1350 | } | ||
| 1351 | |||
| 1352 | /* | ||
| 1353 | * Userspace attempted a TID -> 0 atomic transition, and failed. | ||
| 1354 | * This is the in-kernel slowpath: we look up the PI state (if any), | ||
| 1355 | * and do the rt-mutex unlock. | ||
| 1356 | */ | ||
| 1357 | static int futex_unlock_pi(u32 __user *uaddr) | ||
| 1358 | { | ||
| 1359 | struct futex_hash_bucket *hb; | ||
| 1360 | struct futex_q *this, *next; | ||
| 1361 | u32 uval; | ||
| 1362 | struct list_head *head; | ||
| 1363 | union futex_key key; | ||
| 1364 | int ret, attempt = 0; | ||
| 1365 | |||
| 1366 | retry: | ||
| 1367 | if (get_user(uval, uaddr)) | ||
| 1368 | return -EFAULT; | ||
| 1369 | /* | ||
| 1370 | * We release only a lock we actually own: | ||
| 1371 | */ | ||
| 1372 | if ((uval & FUTEX_TID_MASK) != current->pid) | ||
| 1373 | return -EPERM; | ||
| 1374 | /* | ||
| 1375 | * First take all the futex related locks: | ||
| 1376 | */ | ||
| 1377 | down_read(&current->mm->mmap_sem); | ||
| 1378 | |||
| 1379 | ret = get_futex_key(uaddr, &key); | ||
| 1380 | if (unlikely(ret != 0)) | ||
| 1381 | goto out; | ||
| 1382 | |||
| 1383 | hb = hash_futex(&key); | ||
| 1384 | spin_lock(&hb->lock); | ||
| 1385 | |||
| 1386 | retry_locked: | ||
| 1387 | /* | ||
| 1388 | * To avoid races, try to do the TID -> 0 atomic transition | ||
| 1389 | * again. If it succeeds then we can return without waking | ||
| 1390 | * anyone else up: | ||
| 1391 | */ | ||
| 1392 | if (!(uval & FUTEX_OWNER_DIED)) { | ||
| 1393 | inc_preempt_count(); | ||
| 1394 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | ||
| 1395 | dec_preempt_count(); | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | if (unlikely(uval == -EFAULT)) | ||
| 1399 | goto pi_faulted; | ||
| 1400 | /* | ||
| 1401 | * Rare case: we managed to release the lock atomically, | ||
| 1402 | * no need to wake anyone else up: | ||
| 1403 | */ | ||
| 1404 | if (unlikely(uval == current->pid)) | ||
| 1405 | goto out_unlock; | ||
| 1406 | |||
| 1407 | /* | ||
| 1408 | * Ok, other tasks may need to be woken up - check waiters | ||
| 1409 | * and do the wakeup if necessary: | ||
| 1410 | */ | ||
| 1411 | head = &hb->chain; | ||
| 1412 | |||
| 1413 | list_for_each_entry_safe(this, next, head, list) { | ||
| 1414 | if (!match_futex (&this->key, &key)) | ||
| 1415 | continue; | ||
| 1416 | ret = wake_futex_pi(uaddr, uval, this); | ||
| 1417 | /* | ||
| 1418 | * The atomic access to the futex value | ||
| 1419 | * generated a pagefault, so retry the | ||
| 1420 | * user-access and the wakeup: | ||
| 1421 | */ | ||
| 1422 | if (ret == -EFAULT) | ||
| 1423 | goto pi_faulted; | ||
| 1424 | goto out_unlock; | ||
| 1425 | } | ||
| 1426 | /* | ||
| 1427 | * No waiters - kernel unlocks the futex: | ||
| 1428 | */ | ||
| 1429 | if (!(uval & FUTEX_OWNER_DIED)) { | ||
| 1430 | ret = unlock_futex_pi(uaddr, uval); | ||
| 1431 | if (ret == -EFAULT) | ||
| 1432 | goto pi_faulted; | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | out_unlock: | ||
| 1436 | spin_unlock(&hb->lock); | ||
| 1437 | out: | ||
| 728 | up_read(&current->mm->mmap_sem); | 1438 | up_read(&current->mm->mmap_sem); |
| 1439 | |||
| 1440 | return ret; | ||
| 1441 | |||
| 1442 | pi_faulted: | ||
| 1443 | /* | ||
| 1444 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
| 1445 | * non-atomically. Therefore, if get_user below is not | ||
| 1446 | * enough, we need to handle the fault ourselves, while | ||
| 1447 | * still holding the mmap_sem. | ||
| 1448 | */ | ||
| 1449 | if (attempt++) { | ||
| 1450 | if (futex_handle_fault((unsigned long)uaddr, attempt)) { | ||
| 1451 | ret = -EFAULT; | ||
| 1452 | goto out_unlock; | ||
| 1453 | } | ||
| 1454 | goto retry_locked; | ||
| 1455 | } | ||
| 1456 | |||
| 1457 | spin_unlock(&hb->lock); | ||
| 1458 | up_read(&current->mm->mmap_sem); | ||
| 1459 | |||
| 1460 | ret = get_user(uval, uaddr); | ||
| 1461 | if (!ret && (uval != -EFAULT)) | ||
| 1462 | goto retry; | ||
| 1463 | |||
| 729 | return ret; | 1464 | return ret; | 
| 730 | } | 1465 | } | 
| 731 | 1466 | ||
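Taken together, futex_lock_pi() and futex_unlock_pi() are only the slow paths: as their comments note, userspace first attempts the 0 -> TID and TID -> 0 transitions itself and enters the kernel only on contention or when FUTEX_WAITERS is set. A rough userspace-side sketch of that fast path, assuming the FUTEX_LOCK_PI/FUTEX_UNLOCK_PI opcodes introduced alongside this kernel code (pi_lock()/pi_unlock() are made-up helpers; error handling omitted):

```c
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static unsigned int self_tid(void)
{
	return (unsigned int)syscall(SYS_gettid);
}

/* Fast path: an uncontended 0 -> TID transition stays in userspace. */
static void pi_lock(unsigned int *uaddr)
{
	if (__sync_val_compare_and_swap(uaddr, 0, self_tid()) != 0)
		/* Contended: the kernel queues us, does PI boosting, etc. */
		syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

/* Fast path: uncontended TID -> 0; with waiters the kernel must wake one. */
static void pi_unlock(unsigned int *uaddr)
{
	unsigned int tid = self_tid();

	if (__sync_val_compare_and_swap(uaddr, tid, 0) != tid)
		syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
```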
| @@ -735,6 +1470,7 @@ static int futex_close(struct inode *inode, struct file *filp) | |||
| 735 | 1470 | ||
| 736 | unqueue_me(q); | 1471 | unqueue_me(q); | 
| 737 | kfree(q); | 1472 | kfree(q); | 
| 1473 | |||
| 738 | return 0; | 1474 | return 0; | 
| 739 | } | 1475 | } | 
| 740 | 1476 | ||
| @@ -766,7 +1502,7 @@ static struct file_operations futex_fops = { | |||
| 766 | * Signal allows caller to avoid the race which would occur if they | 1502 | * Signal allows caller to avoid the race which would occur if they | 
| 767 | * set the sigio stuff up afterwards. | 1503 | * set the sigio stuff up afterwards. | 
| 768 | */ | 1504 | */ | 
| 769 | static int futex_fd(unsigned long uaddr, int signal) | 1505 | static int futex_fd(u32 __user *uaddr, int signal) | 
| 770 | { | 1506 | { | 
| 771 | struct futex_q *q; | 1507 | struct futex_q *q; | 
| 772 | struct file *filp; | 1508 | struct file *filp; | 
| @@ -803,6 +1539,7 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
| 803 | err = -ENOMEM; | 1539 | err = -ENOMEM; | 
| 804 | goto error; | 1540 | goto error; | 
| 805 | } | 1541 | } | 
| 1542 | q->pi_state = NULL; | ||
| 806 | 1543 | ||
| 807 | down_read(&current->mm->mmap_sem); | 1544 | down_read(&current->mm->mmap_sem); |
| 808 | err = get_futex_key(uaddr, &q->key); | 1545 | err = get_futex_key(uaddr, &q->key); | 
| @@ -840,7 +1577,7 @@ error: | |||
| 840 | * Implementation: user-space maintains a per-thread list of locks it | 1577 | * Implementation: user-space maintains a per-thread list of locks it | 
| 841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 1578 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 
| 842 | * and marks all locks that are owned by this thread with the | 1579 | * and marks all locks that are owned by this thread with the | 
| 843 | * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is | 1580 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is | 
| 844 | * always manipulated with the lock held, so the list is private and | 1581 | * always manipulated with the lock held, so the list is private and | 
| 845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 1582 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 
| 846 | * field, to allow the kernel to clean up if the thread dies after | 1583 | * field, to allow the kernel to clean up if the thread dies after | 
| @@ -887,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | |||
| 887 | struct task_struct *p; | 1624 | struct task_struct *p; | 
| 888 | 1625 | ||
| 889 | ret = -ESRCH; | 1626 | ret = -ESRCH; | 
| 890 | read_lock(&tasklist_lock); | 1627 | rcu_read_lock(); | 
| 891 | p = find_task_by_pid(pid); | 1628 | p = find_task_by_pid(pid); | 
| 892 | if (!p) | 1629 | if (!p) | 
| 893 | goto err_unlock; | 1630 | goto err_unlock; | 
| @@ -896,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | |||
| 896 | !capable(CAP_SYS_PTRACE)) | 1633 | !capable(CAP_SYS_PTRACE)) | 
| 897 | goto err_unlock; | 1634 | goto err_unlock; | 
| 898 | head = p->robust_list; | 1635 | head = p->robust_list; | 
| 899 | read_unlock(&tasklist_lock); | 1636 | rcu_read_unlock(); | 
| 900 | } | 1637 | } | 
| 901 | 1638 | ||
| 902 | if (put_user(sizeof(*head), len_ptr)) | 1639 | if (put_user(sizeof(*head), len_ptr)) | 
| @@ -904,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | |||
| 904 | return put_user(head, head_ptr); | 1641 | return put_user(head, head_ptr); | 
| 905 | 1642 | ||
| 906 | err_unlock: | 1643 | err_unlock: | 
| 907 | read_unlock(&tasklist_lock); | 1644 | rcu_read_unlock(); | 
| 908 | 1645 | ||
| 909 | return ret; | 1646 | return ret; | 
| 910 | } | 1647 | } | 
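
sys_get_robust_list() is the read-only counterpart: given a pid (or 0 for the caller) it copies out the registered head pointer and its size, which is why the rcu_read_lock() conversion above is sufficient -- only the find_task_by_pid() lookup and the single pointer read need protection. A small sketch querying the calling thread:

#include <linux/futex.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct robust_list_head *head = NULL;
	size_t len = 0;

	/* pid 0 means "the calling thread"; no ptrace capability needed. */
	if (syscall(SYS_get_robust_list, 0, &head, &len)) {
		perror("get_robust_list");
		return 1;
	}
	printf("robust list head=%p len=%zu\n", (void *)head, len);
	return 0;
}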
| @@ -913,9 +1650,9 @@ err_unlock: | |||
| 913 | * Process a futex-list entry, check whether it's owned by the | 1650 | * Process a futex-list entry, check whether it's owned by the | 
| 914 | * dying task, and do notification if so: | 1651 | * dying task, and do notification if so: | 
| 915 | */ | 1652 | */ | 
| 916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | 1653 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) | 
| 917 | { | 1654 | { | 
| 918 | u32 uval; | 1655 | u32 uval, nval, mval; | 
| 919 | 1656 | ||
| 920 | retry: | 1657 | retry: | 
| 921 | if (get_user(uval, uaddr)) | 1658 | if (get_user(uval, uaddr)) | 
| @@ -932,17 +1669,45 @@ retry: | |||
| 932 | * thread-death.) The rest of the cleanup is done in | 1669 | * thread-death.) The rest of the cleanup is done in | 
| 933 | * userspace. | 1670 | * userspace. | 
| 934 | */ | 1671 | */ | 
| 935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | 1672 | mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; | 
| 936 | uval | FUTEX_OWNER_DIED) != uval) | 1673 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); | 
| 1674 | |||
| 1675 | if (nval == -EFAULT) | ||
| 1676 | return -1; | ||
| 1677 | |||
| 1678 | if (nval != uval) | ||
| 937 | goto retry; | 1679 | goto retry; | 
| 938 | 1680 | ||
| 939 | if (uval & FUTEX_WAITERS) | 1681 | /* | 
| 940 | futex_wake((unsigned long)uaddr, 1); | 1682 | * Wake robust non-PI futexes here. The wakeup of | 
| 1683 | * PI futexes happens in exit_pi_state(): | ||
| 1684 | */ | ||
| 1685 | if (!pi) { | ||
| 1686 | if (uval & FUTEX_WAITERS) | ||
| 1687 | futex_wake(uaddr, 1); | ||
| 1688 | } | ||
| 941 | } | 1689 | } | 
| 942 | return 0; | 1690 | return 0; | 
| 943 | } | 1691 | } | 
| 944 | 1692 | ||
| 945 | /* | 1693 | /* | 
| 1694 | * Fetch a robust-list pointer. Bit 0 signals PI futexes: | ||
| 1695 | */ | ||
| 1696 | static inline int fetch_robust_entry(struct robust_list __user **entry, | ||
| 1697 | struct robust_list __user **head, int *pi) | ||
| 1698 | { | ||
| 1699 | unsigned long uentry; | ||
| 1700 | |||
| 1701 | if (get_user(uentry, (unsigned long *)head)) | ||
| 1702 | return -EFAULT; | ||
| 1703 | |||
| 1704 | *entry = (void *)(uentry & ~1UL); | ||
| 1705 | *pi = uentry & 1; | ||
| 1706 | |||
| 1707 | return 0; | ||
| 1708 | } | ||
| 1709 | |||
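
fetch_robust_entry() decodes what userspace encodes: the low bit of each list pointer says whether the entry belongs to a PI futex, which works because the list nodes are at least word-aligned, so bit 0 carries no address information. A tiny sketch of the encode/decode pair (the lock structure is illustrative):

#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>

struct demo_lock {
	struct robust_list list;
	int futex;
};

int main(void)
{
	struct demo_lock lock;
	int is_pi = 1;

	/* Userspace side: tag the pointer before linking it into the list. */
	uintptr_t uentry = (uintptr_t)&lock.list | (is_pi ? 1UL : 0UL);

	/* Kernel side (fetch_robust_entry): strip and report the tag.      */
	struct robust_list *entry = (struct robust_list *)(uentry & ~1UL);
	int pi = uentry & 1;

	printf("entry=%p pi=%d\n", (void *)entry, pi);
	return 0;
}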
| 1710 | /* | ||
| 946 | * Walk curr->robust_list (very carefully, it's a userspace list!) | 1711 | * Walk curr->robust_list (very carefully, it's a userspace list!) | 
| 947 | * and mark any locks found there dead, and notify any waiters. | 1712 | * and mark any locks found there dead, and notify any waiters. | 
| 948 | * | 1713 | * | 
| @@ -952,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr) | |||
| 952 | { | 1717 | { | 
| 953 | struct robust_list_head __user *head = curr->robust_list; | 1718 | struct robust_list_head __user *head = curr->robust_list; | 
| 954 | struct robust_list __user *entry, *pending; | 1719 | struct robust_list __user *entry, *pending; | 
| 955 | unsigned int limit = ROBUST_LIST_LIMIT; | 1720 | unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; | 
| 956 | unsigned long futex_offset; | 1721 | unsigned long futex_offset; | 
| 957 | 1722 | ||
| 958 | /* | 1723 | /* | 
| 959 | * Fetch the list head (which was registered earlier, via | 1724 | * Fetch the list head (which was registered earlier, via | 
| 960 | * sys_set_robust_list()): | 1725 | * sys_set_robust_list()): | 
| 961 | */ | 1726 | */ | 
| 962 | if (get_user(entry, &head->list.next)) | 1727 | if (fetch_robust_entry(&entry, &head->list.next, &pi)) | 
| 963 | return; | 1728 | return; | 
| 964 | /* | 1729 | /* | 
| 965 | * Fetch the relative futex offset: | 1730 | * Fetch the relative futex offset: | 
| @@ -970,24 +1735,25 @@ void exit_robust_list(struct task_struct *curr) | |||
| 970 | * Fetch any possibly pending lock-add first, and handle it | 1735 | * Fetch any possibly pending lock-add first, and handle it | 
| 971 | * if it exists: | 1736 | * if it exists: | 
| 972 | */ | 1737 | */ | 
| 973 | if (get_user(pending, &head->list_op_pending)) | 1738 | if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) | 
| 974 | return; | 1739 | return; | 
| 1740 | |||
| 975 | if (pending) | 1741 | if (pending) | 
| 976 | handle_futex_death((void *)pending + futex_offset, curr); | 1742 | handle_futex_death((void *)pending + futex_offset, curr, pip); | 
| 977 | 1743 | ||
| 978 | while (entry != &head->list) { | 1744 | while (entry != &head->list) { | 
| 979 | /* | 1745 | /* | 
| 980 | * A pending lock might already be on the list, so | 1746 | * A pending lock might already be on the list, so | 
| 981 | * dont process it twice: | 1747 | * don't process it twice: | 
| 982 | */ | 1748 | */ | 
| 983 | if (entry != pending) | 1749 | if (entry != pending) | 
| 984 | if (handle_futex_death((void *)entry + futex_offset, | 1750 | if (handle_futex_death((void *)entry + futex_offset, | 
| 985 | curr)) | 1751 | curr, pi)) | 
| 986 | return; | 1752 | return; | 
| 987 | /* | 1753 | /* | 
| 988 | * Fetch the next entry in the list: | 1754 | * Fetch the next entry in the list: | 
| 989 | */ | 1755 | */ | 
| 990 | if (get_user(entry, &entry->next)) | 1756 | if (fetch_robust_entry(&entry, &entry->next, &pi)) | 
| 991 | return; | 1757 | return; | 
| 992 | /* | 1758 | /* | 
| 993 | * Avoid excessively long or circular lists: | 1759 | * Avoid excessively long or circular lists: | 
| @@ -999,8 +1765,8 @@ void exit_robust_list(struct task_struct *curr) | |||
| 999 | } | 1765 | } | 
| 1000 | } | 1766 | } | 
| 1001 | 1767 | ||
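
exit_robust_list() walks a list that lives entirely in user memory, so every pointer fetch goes through fetch_robust_entry()/get_user() and the traversal is capped by ROBUST_LIST_LIMIT to survive corrupted or circular lists. Below is a userspace model of the same traversal, with ordinary loads instead of get_user() and a made-up handle_death() instead of handle_futex_death():

#include <linux/futex.h>
#include <stddef.h>
#include <stdio.h>

#define LIST_LIMIT 2048		/* stands in for ROBUST_LIST_LIMIT */

struct demo_lock {
	struct robust_list list;
	int futex;
};

static void handle_death(int *uaddr)	/* stands in for handle_futex_death() */
{
	printf("dead owner held futex at %p (value 0x%x)\n",
	       (void *)uaddr, (unsigned)*uaddr);
}

static void walk_robust_list(struct robust_list_head *head)
{
	struct robust_list *entry = head->list.next;
	struct robust_list *pending = head->list_op_pending;
	unsigned int limit = LIST_LIMIT;

	if (pending)	/* a lock being taken/dropped when the thread died    */
		handle_death((int *)((char *)pending + head->futex_offset));

	while (entry != &head->list) {
		if (entry != pending)	/* don't process a pending entry twice */
			handle_death((int *)((char *)entry + head->futex_offset));
		entry = entry->next;
		if (!--limit)		/* cap the walk: the list is untrusted  */
			break;
	}
}

int main(void)
{
	struct robust_list_head head;
	struct demo_lock a = { .futex = 0x40000000 }, b = { .futex = 0x1 };

	head.futex_offset = offsetof(struct demo_lock, futex);
	head.list_op_pending = NULL;
	head.list.next = &a.list;
	a.list.next = &b.list;
	b.list.next = &head.list;	/* pointing back at the head ends the walk */

	walk_robust_list(&head);
	return 0;
}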
| 1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1768 | long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, | 
| 1003 | unsigned long uaddr2, int val2, int val3) | 1769 | u32 __user *uaddr2, u32 val2, u32 val3) | 
| 1004 | { | 1770 | { | 
| 1005 | int ret; | 1771 | int ret; | 
| 1006 | 1772 | ||
| @@ -1024,6 +1790,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
| 1024 | case FUTEX_WAKE_OP: | 1790 | case FUTEX_WAKE_OP: | 
| 1025 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 1791 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 
| 1026 | break; | 1792 | break; | 
| 1793 | case FUTEX_LOCK_PI: | ||
| 1794 | ret = futex_lock_pi(uaddr, val, timeout, val2, 0); | ||
| 1795 | break; | ||
| 1796 | case FUTEX_UNLOCK_PI: | ||
| 1797 | ret = futex_unlock_pi(uaddr); | ||
| 1798 | break; | ||
| 1799 | case FUTEX_TRYLOCK_PI: | ||
| 1800 | ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); | ||
| 1801 | break; | ||
| 1027 | default: | 1802 | default: | 
| 1028 | ret = -ENOSYS; | 1803 | ret = -ENOSYS; | 
| 1029 | } | 1804 | } | 
| @@ -1031,29 +1806,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
| 1031 | } | 1806 | } | 
| 1032 | 1807 | ||
| 1033 | 1808 | ||
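
The three new cases wire up the PI-futex protocol: userspace tries to take the lock with a single cmpxchg of its TID into the futex word and only calls FUTEX_LOCK_PI / FUTEX_UNLOCK_PI when that fails or when FUTEX_WAITERS signals kernel-side waiters. A hedged sketch of the uncontended/contended split (error handling trimmed):

#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t futex_word;		/* 0 = unlocked, else owner TID + flag bits */

static void pi_lock(uint32_t *f)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: 0 -> TID means we own it without entering the kernel. */
	if (__sync_bool_compare_and_swap(f, 0, tid))
		return;
	/* Contended: the kernel queues us on the rt_mutex, boosts the
	 * owner, and sets FUTEX_WAITERS in the word.                       */
	syscall(SYS_futex, f, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(uint32_t *f)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: TID -> 0 only succeeds while there are no waiters.    */
	if (__sync_bool_compare_and_swap(f, tid, 0))
		return;
	/* FUTEX_WAITERS is set: let the kernel hand the lock over.         */
	syscall(SYS_futex, f, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}

int main(void)
{
	pi_lock(&futex_word);
	printf("locked, word=0x%x\n", futex_word);
	pi_unlock(&futex_word);
	printf("unlocked, word=0x%x\n", futex_word);
	return 0;
}

FUTEX_TRYLOCK_PI follows the same slow path but with the trylock flag set, so it returns instead of blocking when the rt_mutex cannot be taken.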
| 1034 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | 1809 | asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, | 
| 1035 | struct timespec __user *utime, u32 __user *uaddr2, | 1810 | struct timespec __user *utime, u32 __user *uaddr2, | 
| 1036 | int val3) | 1811 | u32 val3) | 
| 1037 | { | 1812 | { | 
| 1038 | struct timespec t; | 1813 | struct timespec t; | 
| 1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1814 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 
| 1040 | int val2 = 0; | 1815 | u32 val2 = 0; | 
| 1041 | 1816 | ||
| 1042 | if (utime && (op == FUTEX_WAIT)) { | 1817 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { | 
| 1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1818 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 
| 1044 | return -EFAULT; | 1819 | return -EFAULT; | 
| 1045 | if (!timespec_valid(&t)) | 1820 | if (!timespec_valid(&t)) | 
| 1046 | return -EINVAL; | 1821 | return -EINVAL; | 
| 1047 | timeout = timespec_to_jiffies(&t) + 1; | 1822 | if (op == FUTEX_WAIT) | 
| 1823 | timeout = timespec_to_jiffies(&t) + 1; | ||
| 1824 | else { | ||
| 1825 | timeout = t.tv_sec; | ||
| 1826 | val2 = t.tv_nsec; | ||
| 1827 | } | ||
| 1048 | } | 1828 | } | 
| 1049 | /* | 1829 | /* | 
| 1050 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 1830 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 
| 1051 | */ | 1831 | */ | 
| 1052 | if (op >= FUTEX_REQUEUE) | 1832 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) | 
| 1053 | val2 = (int) (unsigned long) utime; | 1833 | val2 = (u32) (unsigned long) utime; | 
| 1054 | 1834 | ||
| 1055 | return do_futex((unsigned long)uaddr, op, val, timeout, | 1835 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); | 
| 1056 | (unsigned long)uaddr2, val2, val3); | ||
| 1057 | } | 1836 | } | 
| 1058 | 1837 | ||
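
The tightened test above matters because the fourth syscall argument is overloaded: for the wait-style operations it points to a struct timespec, while for FUTEX_REQUEUE / FUTEX_CMP_REQUEUE the same slot carries the integer number of waiters to requeue (val2), and the old op >= FUTEX_REQUEUE check would also have matched the new PI opcodes and clobbered the nanoseconds value just packed into val2. A brief sketch of how a caller passes that integer through the pointer slot, using two hypothetical futex words:

#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t key1, key2;	/* two hypothetical futex words */

int main(void)
{
	int nr_wake = 1, nr_requeue = 5;

	/* Wake one waiter on key1 and move up to five more onto key2,
	 * provided key1 still contains 0 (the val3 comparison value).
	 * nr_requeue travels in the timespec-pointer argument slot.   */
	long moved = syscall(SYS_futex, &key1, FUTEX_CMP_REQUEUE, nr_wake,
			     (void *)(uintptr_t)nr_requeue, &key2, 0);
	if (moved < 0)
		perror("FUTEX_CMP_REQUEUE");
	else
		printf("woken+requeued: %ld\n", moved);
	return 0;
}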
| 1059 | static int futexfs_get_sb(struct file_system_type *fs_type, | 1838 | static int futexfs_get_sb(struct file_system_type *fs_type, | 
