| commit | c87e2837be82df479a6bae9f155c43516d2feebc | |
|---|---|---|
| author | Ingo Molnar <mingo@elte.hu> | 2006-06-27 05:54:58 -0400 |
| committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-27 20:32:47 -0400 |
| tree | ad6ab35f0b78f71abaa7b05185e9e3f97809c6de /kernel | |
| parent | 0cdbee9920fb37eb2dc49b860c2b28862d647adc | |
[PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support
This adds the actual pi-futex implementation, based on rt-mutexes.
[dino@in.ibm.com: fix an oops-causing race]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')

| mode | file | lines |
|---|---|---|
| -rw-r--r-- | kernel/exit.c | 8 |
| -rw-r--r-- | kernel/fork.c | 3 |
| -rw-r--r-- | kernel/futex.c | 829 |
| -rw-r--r-- | kernel/futex_compat.c | 11 |
| -rw-r--r-- | kernel/rtmutex_common.h | 8 |

5 files changed, 818 insertions(+), 41 deletions(-)
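For orientation before the diffs: the protocol this patch implements is the one named in the futex.c comments below. Userspace takes the lock with an atomic 0 -> TID transition on the futex word and releases it with TID -> 0, entering the kernel only on contention. A minimal userspace sketch of that split (not part of the patch; the helper names are mine, and it assumes a header that exposes SYS_futex plus the new FUTEX_LOCK_PI/FUTEX_UNLOCK_PI opcodes):

```c
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Hypothetical fast paths for a PI futex. The futex word holds the
 * owner's kernel TID; flag bits live in the high bits (see the
 * word-layout sketch further down). */
static void pi_lock(uint32_t *futex, uint32_t my_tid)
{
	uint32_t expected = 0;

	/* Uncontended: 0 -> TID makes us the owner with no syscall. */
	if (__atomic_compare_exchange_n(futex, &expected, my_tid, 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;
	/* Contended: the kernel queues us on the rt-mutex, boosts the
	 * owner, and rewrites the futex word once we win the lock. */
	syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(uint32_t *futex, uint32_t my_tid)
{
	uint32_t expected = my_tid;

	/* Uncontended: TID -> 0 succeeds only while no waiter bit is set. */
	if (__atomic_compare_exchange_n(futex, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;
	/* Contended: futex_unlock_pi() hands the lock to the top waiter. */
	syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}
```

Real implementations (for example glibc's PTHREAD_PRIO_INHERIT mutexes) add error handling and retries; the point here is only the fast-path/slow-path split that the kernel code below assumes.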
```diff
diff --git a/kernel/exit.c b/kernel/exit.c
index 3e8a0282e9a5..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,6 +926,14 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 	/*
+	 * This must happen late, after the PID is not
+	 * hashed anymore:
+	 */
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
+	if (unlikely(current->pi_state_cache))
+		kfree(current->pi_state_cache);
+	/*
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
 	mutex_debug_check_no_locks_held(tsk);
```
```diff
diff --git a/kernel/fork.c b/kernel/fork.c
index b664a081fffa..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_COMPAT
 	p->compat_robust_list = NULL;
 #endif
+	INIT_LIST_HEAD(&p->pi_state_list);
+	p->pi_state_cache = NULL;
+
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
```
```diff
diff --git a/kernel/futex.c b/kernel/futex.c
index 50356fb5d726..b305b7f8dad5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
  * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
 #include <linux/signal.h>
 #include <asm/futex.h>
 
+#include "rtmutex_common.h"
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -75,6 +81,27 @@ union futex_key {
 };
 
 /*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+	/*
+	 * list of 'owned' pi_state instances - these have to be
+	 * cleaned up in do_exit() if the task exits prematurely:
+	 */
+	struct list_head list;
+
+	/*
+	 * The PI object:
+	 */
+	struct rt_mutex pi_mutex;
+
+	struct task_struct *owner;
+	atomic_t refcount;
+
+	union futex_key key;
+};
+
+/*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
  *
@@ -96,6 +123,10 @@ struct futex_q {
 	/* For fd, sigio sent using these: */
 	int fd;
 	struct file *filp;
+
+	/* Optional priority inheritance state: */
+	struct futex_pi_state *pi_state;
+	struct task_struct *task;
 };
 
 /*
@@ -259,6 +290,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 }
 
 /*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+
+	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+		return -EFAULT;
+
+	switch (handle_mm_fault(mm, vma, address, 1)) {
+	case VM_FAULT_MINOR:
+		current->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		current->maj_flt++;
+		break;
+	default:
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	memset(pi_state, 0, sizeof(*pi_state));
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	atomic_set(&pi_state->refcount, 1);
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!atomic_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already
+	 */
+	if (pi_state->owner) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+	}
+
+	if (current->pi_state_cache)
+		kfree(pi_state);
+	else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		atomic_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+	if ((current->euid != p->euid) && (current->euid != p->uid)) {
+		p = NULL;
+		goto out_unlock;
+	}
+	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+		p = NULL;
+		goto out_unlock;
+	}
+	get_task_struct(p);
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+	struct futex_hash_bucket *hb;
+	struct list_head *next, *head = &curr->pi_state_list;
+	struct futex_pi_state *pi_state;
+	union futex_key key;
+
+	/*
+	 * We are a ZOMBIE and nobody can enqueue itself on
+	 * pi_state_list anymore, but we have to be careful
+	 * versus waiters unqueueing themselfs
+	 */
+	spin_lock_irq(&curr->pi_lock);
+	while (!list_empty(head)) {
+
+		next = head->next;
+		pi_state = list_entry(next, struct futex_pi_state, list);
+		key = pi_state->key;
+		spin_unlock_irq(&curr->pi_lock);
+
+		hb = hash_futex(&key);
+		spin_lock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+		if (head->next != next) {
+			spin_unlock(&hb->lock);
+			continue;
+		}
+
+		list_del_init(&pi_state->list);
+
+		WARN_ON(pi_state->owner != curr);
+
+		pi_state->owner = NULL;
+		spin_unlock_irq(&curr->pi_lock);
+
+		rt_mutex_unlock(&pi_state->pi_mutex);
+
+		spin_unlock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+	}
+	spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_q *this, *next;
+	struct list_head *head;
+	struct task_struct *p;
+	pid_t pid;
+
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &me->key)) {
+			/*
+			 * Another waiter already exists - bump up
+			 * the refcount and return its pi_state:
+			 */
+			pi_state = this->pi_state;
+			atomic_inc(&pi_state->refcount);
+			me->pi_state = pi_state;
+
+			return 0;
+		}
+	}
+
+	/*
+	 * We are the first waiter - try to look up the real owner and
+	 * attach the new pi_state to it:
+	 */
+	pid = uval & FUTEX_TID_MASK;
+	p = futex_find_get_task(pid);
+	if (!p)
+		return -ESRCH;
+
+	pi_state = alloc_pi_state();
+
+	/*
+	 * Initialize the pi_mutex in locked state and make 'p'
+	 * the owner of it:
+	 */
+	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+	/* Store the key for possible exit cleanups: */
+	pi_state->key = me->key;
+
+	spin_lock_irq(&p->pi_lock);
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+	spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	me->pi_state = pi_state;
+
+	return 0;
+}
+
+/*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
  */
```
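A detail worth noting in lookup_pi_state(): the only link between a futex word and its owner is the TID in the low bits, which futex_find_get_task() resolves with find_task_by_pid() and then sanity-checks (euid match, not a zombie). The owner must therefore publish its kernel TID, not its process PID or a pthread handle. A hedged userspace illustration (the helper is mine, not the patch's):

```c
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* The value a PI-futex owner stores in the futex word: the thread's
 * kernel TID (current->pid on the kernel side), which lookup_pi_state()
 * extracts via "uval & FUTEX_TID_MASK". */
static uint32_t pi_owner_tid(void)
{
	return (uint32_t)syscall(SYS_gettid);
}
```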
```diff
@@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q)
 	q->lock_ptr = NULL;
 }
 
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+	struct task_struct *new_owner;
+	struct futex_pi_state *pi_state = this->pi_state;
+	u32 curval, newval;
+
+	if (!pi_state)
+		return -EINVAL;
+
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+	/*
+	 * This happens when we have stolen the lock and the original
+	 * pending owner did not enqueue itself back on the rt_mutex.
+	 * Thats not a tragedy. We know that way, that a lock waiter
+	 * is on the fly. We make the futex_q waiter the pending owner.
+	 */
+	if (!new_owner)
+		new_owner = this->task;
+
+	/*
+	 * We pass it to the next owner. (The WAITERS bit is always
+	 * kept enabled while there is PI state around. We must also
+	 * preserve the owner died bit.)
+	 */
+	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (curval == -EFAULT)
+		return -EFAULT;
+	if (curval != uval)
+		return -EINVAL;
+
+	list_del_init(&pi_state->owner->pi_state_list);
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+	u32 oldval;
+
+	/*
+	 * There is no waiter, so we unlock the futex. The owner died
+	 * bit has not to be preserved here. We are the owner:
+	 */
+	inc_preempt_count();
+	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+	dec_preempt_count();
+
+	if (oldval == -EFAULT)
+		return oldval;
+	if (oldval != uval)
+		return -EAGAIN;
+
+	return 0;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
```
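wake_futex_pi() and unlock_futex_pi() both juggle the three fields packed into the 32-bit futex word. For reference, a sketch of that layout and of the handoff value computed above (constant values as in the mainline futex.h of this period; the helper is illustrative, not from the patch):

```c
#include <stdint.h>

#define FUTEX_WAITERS		0x80000000	/* somebody is blocked in the kernel */
#define FUTEX_OWNER_DIED	0x40000000	/* robust-futex owner exited */
#define FUTEX_TID_MASK		0x3fffffff	/* kernel TID of the owner */

/* The value wake_futex_pi() installs when passing the lock on: keep
 * OWNER_DIED if it was set, keep WAITERS (PI state still exists), and
 * swap in the new owner's TID. */
static uint32_t pi_handoff_value(uint32_t uval, uint32_t new_owner_tid)
{
	return (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS |
	       (new_owner_tid & FUTEX_TID_MASK);
}
```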
```diff
@@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
+			if (this->pi_state)
+				return -EINVAL;
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -385,27 +708,9 @@ retry:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		struct vm_area_struct * vma;
-		struct mm_struct *mm = current->mm;
-		unsigned long address = (unsigned long)uaddr2;
-
-		ret = -EFAULT;
-		if (attempt >= 2 ||
-		    !(vma = find_vma(mm, address)) ||
-		    vma->vm_start > address ||
-		    !(vma->vm_flags & VM_WRITE))
+		if (futex_handle_fault((unsigned long)uaddr2,
+				       attempt))
 			goto out;
-
-		switch (handle_mm_fault(mm, vma, address, 1)) {
-		case VM_FAULT_MINOR:
-			current->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			current->maj_flt++;
-			break;
-		default:
-			goto out;
-		}
 		goto retry;
 	}
 
@@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
 	list_add_tail(&q->list, &hb->chain);
+	q->task = current;
 	spin_unlock(&hb->lock);
 }
 
@@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q)
 		}
 		WARN_ON(list_empty(&q->list));
 		list_del(&q->list);
+
+		BUG_ON(q->pi_state);
+
 		spin_unlock(lock_ptr);
 		ret = 1;
 	}
@@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q)
 	return ret;
 }
 
+/*
+ * PI futexes can not be requeued and must remove themself from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+	WARN_ON(list_empty(&q->list));
+	list_del(&q->list);
+
+	BUG_ON(!q->pi_state);
+	free_pi_state(q->pi_state);
+	q->pi_state = NULL;
+
+	spin_unlock(&hb->lock);
+
+	drop_key_refs(&q->key);
+}
+
 static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 {
-	DECLARE_WAITQUEUE(wait, current);
+	struct task_struct *curr = current;
+	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
 	u32 uval;
 	int ret;
 
+	q.pi_state = NULL;
  retry:
-	down_read(&current->mm->mmap_sem);
+	down_read(&curr->mm->mmap_sem);
 
 	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
@@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	 * If we would have faulted, release mmap_sem, fault it in and
 	 * start all over again.
 	 */
-	up_read(&current->mm->mmap_sem);
+	up_read(&curr->mm->mmap_sem);
 
 	ret = get_user(uval, uaddr);
 
@@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 			goto retry;
 		return ret;
 	}
-	if (uval != val) {
-		ret = -EWOULDBLOCK;
-		queue_unlock(&q, hb);
-		goto out_release_sem;
-	}
+	ret = -EWOULDBLOCK;
+	if (uval != val)
+		goto out_unlock_release_sem;
 
 	/* Only actually queue if *uaddr contained val. */
 	__queue_me(&q, hb);
@@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&current->mm->mmap_sem);
+	up_read(&curr->mm->mmap_sem);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	 */
 	return -EINTR;
 
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
  out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+			    struct hrtimer_sleeper *to)
+{
+	struct task_struct *curr = current;
+	struct futex_hash_bucket *hb;
+	u32 uval, newval, curval;
+	struct futex_q q;
+	int ret, attempt = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+	q.pi_state = NULL;
+ retry:
+	down_read(&curr->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &q.key);
+	if (unlikely(ret != 0))
+		goto out_release_sem;
+
+	hb = queue_lock(&q, -1, NULL);
+
+ retry_locked:
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = current->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+
+	/* We own the lock already */
+	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+		if (!detect && 0)
+			force_sig(SIGKILL, current);
+		ret = -EDEADLK;
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Surprise - we got the lock. Just return
+	 * to userspace:
+	 */
+	if (unlikely(!curval))
+		goto out_unlock_release_sem;
+
+	uval = curval;
+	newval = uval | FUTEX_WAITERS;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+	if (unlikely(curval != uval))
+		goto retry_locked;
+
+	/*
+	 * We dont have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, &q);
+
+	if (unlikely(ret)) {
+		/*
+		 * There were no waiters and the owner task lookup
+		 * failed. When the OWNER_DIED bit is set, then we
+		 * know that this is a robust futex and we actually
+		 * take the lock. This is safe as we are protected by
+		 * the hash bucket lock. We also set the waiters bit
+		 * unconditionally here, to simplify glibc handling of
+		 * multiple tasks racing to acquire the lock and
+		 * cleanup the problems which were left by the dead
+		 * owner.
+		 */
+		if (curval & FUTEX_OWNER_DIED) {
+			uval = newval;
+			newval = current->pid |
+				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+			inc_preempt_count();
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			dec_preempt_count();
+
+			if (unlikely(curval == -EFAULT))
+				goto uaddr_faulted;
+			if (unlikely(curval != uval))
+				goto retry_locked;
+			ret = 0;
+		}
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Only actually queue now that the atomic ops are done:
+	 */
+	__queue_me(&q, hb);
+
+	/*
+	 * Now the futex is queued and we have checked the data, we
+	 * don't want to hold mmap_sem while we sleep.
+	 */
+	up_read(&curr->mm->mmap_sem);
+
+	WARN_ON(!q.pi_state);
+	/*
+	 * Block on the PI mutex:
+	 */
+	if (!trylock)
+		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+	else {
+		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+		/* Fixup the trylock return value: */
+		ret = ret ? 0 : -EWOULDBLOCK;
+	}
+
+	down_read(&curr->mm->mmap_sem);
+	hb = queue_lock(&q, -1, NULL);
+
+	/*
+	 * Got the lock. We might not be the anticipated owner if we
+	 * did a lock-steal - fix up the PI-state in that case.
+	 */
+	if (!ret && q.pi_state->owner != curr) {
+		u32 newtid = current->pid | FUTEX_WAITERS;
+
+		/* Owner died? */
+		if (q.pi_state->owner != NULL) {
+			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			list_del_init(&q.pi_state->list);
+			spin_unlock_irq(&q.pi_state->owner->pi_lock);
+		} else
+			newtid |= FUTEX_OWNER_DIED;
+
+		q.pi_state->owner = current;
+
+		spin_lock_irq(&current->pi_lock);
+		list_add(&q.pi_state->list, &current->pi_state_list);
+		spin_unlock_irq(&current->pi_lock);
+
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+		/*
+		 * We own it, so we have to replace the pending owner
+		 * TID. This must be atomic as we have preserve the
+		 * owner died bit here.
+		 */
+		ret = get_user(uval, uaddr);
+		while (!ret) {
+			newval = (uval & FUTEX_OWNER_DIED) | newtid;
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			if (curval == -EFAULT)
+				ret = -EFAULT;
+			if (curval == uval)
+				break;
+			uval = curval;
+		}
+	} else {
+		/*
+		 * Catch the rare case, where the lock was released
+		 * when we were on the way back before we locked
+		 * the hash bucket.
+		 */
+		if (ret && q.pi_state->owner == curr) {
+			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+				ret = 0;
+		}
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+	}
+
+	if (!detect && ret == -EDEADLK && 0)
+		force_sig(SIGKILL, current);
+
+	return ret;
+
+ out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
+ out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+
+ uaddr_faulted:
+	/*
+	 * We have to r/w *(int __user *)uaddr, but we can't modify it
+	 * non-atomically. Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock_release_sem;
+
+		goto retry_locked;
+	}
+
+	queue_unlock(&q, hb);
+	up_read(&curr->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	int ret;
+
+	restart->fn = do_no_restart_syscall;
+
+	if (restart->arg2 || restart->arg3) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
+			(u64) restart->arg0;
+	}
+
+	pr_debug("lock_pi restart: %p, %d (%d)\n",
+		 (u32 __user *)restart->arg0, current->pid);
+
+	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+			       0, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	restart->fn = futex_lock_pi_restart;
+
+	/* The other values are filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+			 long nsec, int trylock)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct restart_block *restart;
+	int ret;
+
+	if (sec != MAX_SCHEDULE_TIMEOUT) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires = ktime_set(sec, nsec);
+	}
+
+	ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = futex_lock_pi_restart;
+	restart->arg0 = (unsigned long) uaddr;
+	restart->arg1 = detect;
+	if (to) {
+		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+		restart->arg3 = to->timer.expires.tv64 >> 32;
+	} else
+		restart->arg2 = restart->arg3 = 0;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+	struct futex_hash_bucket *hb;
+	struct futex_q *this, *next;
+	u32 uval;
+	struct list_head *head;
+	union futex_key key;
+	int ret, attempt = 0;
+
+ retry:
+	if (get_user(uval, uaddr))
+		return -EFAULT;
+	/*
+	 * We release only a lock we actually own:
+	 */
+	if ((uval & FUTEX_TID_MASK) != current->pid)
+		return -EPERM;
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+
+ retry_locked:
+	/*
+	 * To avoid races, try to do the TID -> 0 atomic transition
+	 * again. If it succeeds then we can return without waking
+	 * anyone else up:
+	 */
+	inc_preempt_count();
+	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+	dec_preempt_count();
+
+	if (unlikely(uval == -EFAULT))
+		goto pi_faulted;
+	/*
+	 * Rare case: we managed to release the lock atomically,
+	 * no need to wake anyone else up:
+	 */
+	if (unlikely(uval == current->pid))
+		goto out_unlock;
+
+	/*
+	 * Ok, other tasks may need to be woken up - check waiters
+	 * and do the wakeup if necessary:
+	 */
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (!match_futex (&this->key, &key))
+			continue;
+		ret = wake_futex_pi(uaddr, uval, this);
+		/*
+		 * The atomic access to the futex value
+		 * generated a pagefault, so retry the
+		 * user-access and the wakeup:
+		 */
+		if (ret == -EFAULT)
+			goto pi_faulted;
+		goto out_unlock;
+	}
+	/*
+	 * No waiters - kernel unlocks the futex:
+	 */
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;
+
+ out_unlock:
+	spin_unlock(&hb->lock);
+ out:
+	up_read(&current->mm->mmap_sem);
+
+	return ret;
+
+ pi_faulted:
+	/*
+	 * We have to r/w *(int __user *)uaddr, but we can't modify it
+	 * non-atomically. Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock;
+
+		goto retry_locked;
+	}
+
+	spin_unlock(&hb->lock);
 	up_read(&current->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
 	return ret;
 }
 
```
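The restart path has to survive a signal using only the unsigned-long args of struct restart_block, so futex_lock_pi() splits the 64-bit absolute expiry (the ktime tv64 value) into two 32-bit halves in arg2/arg3 (note that the handler as committed reassembles from arg1/arg0 rather than arg3/arg2; the hunk above is reproduced verbatim). A sketch of the intended round trip, with hypothetical names:

```c
#include <stdint.h>

/* Split a 64-bit ktime into two unsigned longs, as futex_lock_pi()
 * does with restart->arg2 (low half) and restart->arg3 (high half)... */
static void pack_expiry(uint64_t tv64, unsigned long *lo, unsigned long *hi)
{
	*lo = tv64 & 0xFFFFFFFF;
	*hi = tv64 >> 32;
}

/* ...and rebuild it on restart, mirroring the "(hi << 32) | lo" shape
 * the restart handler uses: */
static uint64_t unpack_expiry(unsigned long lo, unsigned long hi)
{
	return ((uint64_t)hi << 32) | (uint32_t)lo;
}
```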
```diff
@@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 		err = -ENOMEM;
 		goto error;
 	}
+	q->pi_state = NULL;
 
 	down_read(&current->mm->mmap_sem);
 	err = get_futex_key(uaddr, &q->key);
@@ -856,7 +1591,7 @@ error:
  * Implementation: user-space maintains a per-thread list of locks it
  * is holding. Upon do_exit(), the kernel carefully walks this list,
  * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  * always manipulated with the lock held, so the list is private and
  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  * field, to allow the kernel to clean up if the thread dies after
@@ -931,7 +1666,7 @@ err_unlock:
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	u32 uval;
+	u32 uval, nval;
 
  retry:
 	if (get_user(uval, uaddr))
@@ -948,8 +1683,12 @@ retry:
 	 * thread-death.) The rest of the cleanup is done in
 	 * userspace.
 	 */
-	if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
-					  uval | FUTEX_OWNER_DIED) != uval)
+	nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+					     uval | FUTEX_OWNER_DIED);
+	if (nval == -EFAULT)
+		return -1;
+
+	if (nval != uval)
 		goto retry;
 
 	if (uval & FUTEX_WAITERS)
@@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr)
 	while (entry != &head->list) {
 		/*
 		 * A pending lock might already be on the list, so
-		 * dont process it twice:
+		 * don't process it twice:
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
@@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
+	case FUTEX_LOCK_PI:
+		ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+		break;
+	case FUTEX_UNLOCK_PI:
+		ret = futex_unlock_pi(uaddr);
+		break;
+	case FUTEX_TRYLOCK_PI:
+		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	u32 val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
```
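One user-visible subtlety in the sys_futex() hunk above: for FUTEX_WAIT the timespec is still converted to a relative jiffies timeout, but for FUTEX_LOCK_PI the seconds land in `timeout` and the nanoseconds in `val2`, from which futex_lock_pi() arms an HRTIMER_ABS timer on CLOCK_REALTIME. In other words, the argument is an absolute deadline, not a relative wait. A hedged usage sketch (the function name is mine):

```c
#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Try to take a PI futex, giving up roughly one second from now.
 * The timespec is an absolute CLOCK_REALTIME expiry, per the decode
 * in sys_futex() above. */
static long pi_lock_with_deadline(uint32_t *futex_word)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 1;

	return syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0,
		       &deadline, NULL, 0);
}
```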
```diff
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 7e57c31670a3..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (get_compat_timespec(&t, utime))
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
```
```diff
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e068024eeffc..9c75856e791e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
 	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
 }
 
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				       struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+				  struct task_struct *proxy_owner);
 #endif
```
