aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ingo Molnar <mingo@elte.hu> 2006-06-27 05:54:58 -0400
committer Linus Torvalds <torvalds@g5.osdl.org> 2006-06-27 20:32:47 -0400
commit c87e2837be82df479a6bae9f155c43516d2feebc (patch)
tree ad6ab35f0b78f71abaa7b05185e9e3f97809c6de
parent 0cdbee9920fb37eb2dc49b860c2b28862d647adc (diff)
[PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support
This adds the actual pi-futex implementation, based on rt-mutexes.

[dino@in.ibm.com: fix an oops-causing race]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- include/linux/futex.h 7
-rw-r--r-- include/linux/sched.h 3
-rw-r--r-- kernel/exit.c 8
-rw-r--r-- kernel/fork.c 3
-rw-r--r-- kernel/futex.c 829
-rw-r--r-- kernel/futex_compat.c 11
-rw-r--r-- kernel/rtmutex_common.h 8
7 files changed, 828 insertions, 41 deletions
diff --git a/include/linux/futex.h b/include/linux/futex.h
index f05a3f469322..34c3a215f2cd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -12,6 +12,9 @@
12#define FUTEX_REQUEUE 3 12#define FUTEX_REQUEUE 3
13#define FUTEX_CMP_REQUEUE 4 13#define FUTEX_CMP_REQUEUE 4
14#define FUTEX_WAKE_OP 5 14#define FUTEX_WAKE_OP 5
15#define FUTEX_LOCK_PI 6
16#define FUTEX_UNLOCK_PI 7
17#define FUTEX_TRYLOCK_PI 8
15 18
16/* 19/*
17 * Support for robust futexes: the kernel cleans up held futexes at 20 * Support for robust futexes: the kernel cleans up held futexes at
@@ -97,10 +100,14 @@ extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
97 100
98#ifdef CONFIG_FUTEX 101#ifdef CONFIG_FUTEX
99extern void exit_robust_list(struct task_struct *curr); 102extern void exit_robust_list(struct task_struct *curr);
103extern void exit_pi_state_list(struct task_struct *curr);
100#else 104#else
101static inline void exit_robust_list(struct task_struct *curr) 105static inline void exit_robust_list(struct task_struct *curr)
102{ 106{
103} 107}
108static inline void exit_pi_state_list(struct task_struct *curr)
109{
110}
104#endif 111#endif
105 112
106#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ 113#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edadd13cf53f..b4e6be7de5ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -84,6 +84,7 @@ struct sched_param {
84#include <asm/processor.h> 84#include <asm/processor.h>
85 85
86struct exec_domain; 86struct exec_domain;
87struct futex_pi_state;
87 88
88/* 89/*
89 * List of flags we want to share for kernel threads, 90 * List of flags we want to share for kernel threads,
@@ -915,6 +916,8 @@ struct task_struct {
915#ifdef CONFIG_COMPAT 916#ifdef CONFIG_COMPAT
916 struct compat_robust_list_head __user *compat_robust_list; 917 struct compat_robust_list_head __user *compat_robust_list;
917#endif 918#endif
919 struct list_head pi_state_list;
920 struct futex_pi_state *pi_state_cache;
918 921
919 atomic_t fs_excl; /* holding fs exclusive resources */ 922 atomic_t fs_excl; /* holding fs exclusive resources */
920 struct rcu_head rcu; 923 struct rcu_head rcu;
diff --git a/kernel/exit.c b/kernel/exit.c
index 3e8a0282e9a5..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,6 +926,14 @@ fastcall NORET_TYPE void do_exit(long code)
926 tsk->mempolicy = NULL; 926 tsk->mempolicy = NULL;
927#endif 927#endif
928 /* 928 /*
929 * This must happen late, after the PID is not
930 * hashed anymore:
931 */
932 if (unlikely(!list_empty(&tsk->pi_state_list)))
933 exit_pi_state_list(tsk);
934 if (unlikely(current->pi_state_cache))
935 kfree(current->pi_state_cache);
936 /*
929 * If DEBUG_MUTEXES is on, make sure we are holding no locks: 937 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
930 */ 938 */
931 mutex_debug_check_no_locks_held(tsk); 939 mutex_debug_check_no_locks_held(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b664a081fffa..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
1092#ifdef CONFIG_COMPAT 1092#ifdef CONFIG_COMPAT
1093 p->compat_robust_list = NULL; 1093 p->compat_robust_list = NULL;
1094#endif 1094#endif
1095 INIT_LIST_HEAD(&p->pi_state_list);
1096 p->pi_state_cache = NULL;
1097
1095 /* 1098 /*
1096 * sigaltstack should be cleared when sharing the same VM 1099 * sigaltstack should be cleared when sharing the same VM
1097 */ 1100 */
diff --git a/kernel/futex.c b/kernel/futex.c
index 50356fb5d726..b305b7f8dad5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
@@ -96,6 +123,10 @@ struct futex_q {
96 /* For fd, sigio sent using these: */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -259,6 +290,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
259} 290}
260 291
261/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
386 * We dont trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 read_lock(&tasklist_lock);
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 read_unlock(&tasklist_lock);
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselfs
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 spin_unlock_irq(&curr->pi_lock);
435
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 if (head->next != next) {
441 spin_unlock(&hb->lock);
442 continue;
443 }
444
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr);
448
449 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock);
451
452 rt_mutex_unlock(&pi_state->pi_mutex);
453
454 spin_unlock(&hb->lock);
455
456 spin_lock_irq(&curr->pi_lock);
457 }
458 spin_unlock_irq(&curr->pi_lock);
459}
460
461static int
462lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
463{
464 struct futex_pi_state *pi_state = NULL;
465 struct futex_q *this, *next;
466 struct list_head *head;
467 struct task_struct *p;
468 pid_t pid;
469
470 head = &hb->chain;
471
472 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) {
474 /*
475 * Another waiter already exists - bump up
476 * the refcount and return its pi_state:
477 */
478 pi_state = this->pi_state;
479 atomic_inc(&pi_state->refcount);
480 me->pi_state = pi_state;
481
482 return 0;
483 }
484 }
485
486 /*
487 * We are the first waiter - try to look up the real owner and
488 * attach the new pi_state to it:
489 */
490 pid = uval & FUTEX_TID_MASK;
491 p = futex_find_get_task(pid);
492 if (!p)
493 return -ESRCH;
494
495 pi_state = alloc_pi_state();
496
497 /*
498 * Initialize the pi_mutex in locked state and make 'p'
499 * the owner of it:
500 */
501 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
502
503 /* Store the key for possible exit cleanups: */
504 pi_state->key = me->key;
505
506 spin_lock_irq(&p->pi_lock);
507 list_add(&pi_state->list, &p->pi_state_list);
508 pi_state->owner = p;
509 spin_unlock_irq(&p->pi_lock);
510
511 put_task_struct(p);
512
513 me->pi_state = pi_state;
514
515 return 0;
516}
517
518/*
262 * The hash bucket lock must be held when this is called. 519 * The hash bucket lock must be held when this is called.
263 * Afterwards, the futex_q must not be accessed. 520 * Afterwards, the futex_q must not be accessed.
264 */ 521 */
@@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q)
285 q->lock_ptr = NULL; 542 q->lock_ptr = NULL;
286} 543}
287 544
545static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
546{
547 struct task_struct *new_owner;
548 struct futex_pi_state *pi_state = this->pi_state;
549 u32 curval, newval;
550
551 if (!pi_state)
552 return -EINVAL;
553
554 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
555
556 /*
557 * This happens when we have stolen the lock and the original
558 * pending owner did not enqueue itself back on the rt_mutex.
559 * Thats not a tragedy. We know that way, that a lock waiter
560 * is on the fly. We make the futex_q waiter the pending owner.
561 */
562 if (!new_owner)
563 new_owner = this->task;
564
565 /*
566 * We pass it to the next owner. (The WAITERS bit is always
567 * kept enabled while there is PI state around. We must also
568 * preserve the owner died bit.)
569 */
570 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
571
572 inc_preempt_count();
573 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
574 dec_preempt_count();
575
576 if (curval == -EFAULT)
577 return -EFAULT;
578 if (curval != uval)
579 return -EINVAL;
580
581 list_del_init(&pi_state->owner->pi_state_list);
582 list_add(&pi_state->list, &new_owner->pi_state_list);
583 pi_state->owner = new_owner;
584 rt_mutex_unlock(&pi_state->pi_mutex);
585
586 return 0;
587}
588
589static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
590{
591 u32 oldval;
592
593 /*
594 * There is no waiter, so we unlock the futex. The owner died
595 * bit has not to be preserved here. We are the owner:
596 */
597 inc_preempt_count();
598 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
599 dec_preempt_count();
600
601 if (oldval == -EFAULT)
602 return oldval;
603 if (oldval != uval)
604 return -EAGAIN;
605
606 return 0;
607}
608
288/* 609/*
289 * Wake up all waiters hashed on the physical page that is mapped 610 * Wake up all waiters hashed on the physical page that is mapped
290 * to this virtual address: 611 * to this virtual address:
@@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
309 630
310 list_for_each_entry_safe(this, next, head, list) { 631 list_for_each_entry_safe(this, next, head, list) {
311 if (match_futex (&this->key, &key)) { 632 if (match_futex (&this->key, &key)) {
633 if (this->pi_state)
634 return -EINVAL;
312 wake_futex(this); 635 wake_futex(this);
313 if (++ret >= nr_wake) 636 if (++ret >= nr_wake)
314 break; 637 break;
@@ -385,27 +708,9 @@ retry:
385 * still holding the mmap_sem. 708 * still holding the mmap_sem.
386 */ 709 */
387 if (attempt++) { 710 if (attempt++) {
388 struct vm_area_struct * vma; 711 if (futex_handle_fault((unsigned long)uaddr2,
389 struct mm_struct *mm = current->mm; 712 attempt))
390 unsigned long address = (unsigned long)uaddr2;
391
392 ret = -EFAULT;
393 if (attempt >= 2 ||
394 !(vma = find_vma(mm, address)) ||
395 vma->vm_start > address ||
396 !(vma->vm_flags & VM_WRITE))
397 goto out; 713 goto out;
398
399 switch (handle_mm_fault(mm, vma, address, 1)) {
400 case VM_FAULT_MINOR:
401 current->min_flt++;
402 break;
403 case VM_FAULT_MAJOR:
404 current->maj_flt++;
405 break;
406 default:
407 goto out;
408 }
409 goto retry; 714 goto retry;
410 } 715 }
411 716
@@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
572static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 877static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
573{ 878{
574 list_add_tail(&q->list, &hb->chain); 879 list_add_tail(&q->list, &hb->chain);
880 q->task = current;
575 spin_unlock(&hb->lock); 881 spin_unlock(&hb->lock);
576} 882}
577 883
@@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q)
626 } 932 }
627 WARN_ON(list_empty(&q->list)); 933 WARN_ON(list_empty(&q->list));
628 list_del(&q->list); 934 list_del(&q->list);
935
936 BUG_ON(q->pi_state);
937
629 spin_unlock(lock_ptr); 938 spin_unlock(lock_ptr);
630 ret = 1; 939 ret = 1;
631 } 940 }
@@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q)
634 return ret; 943 return ret;
635} 944}
636 945
946/*
947 * PI futexes can not be requeued and must remove themself from the
948 * hash bucket. The hash bucket lock is held on entry and dropped here.
949 */
950static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
951{
952 WARN_ON(list_empty(&q->list));
953 list_del(&q->list);
954
955 BUG_ON(!q->pi_state);
956 free_pi_state(q->pi_state);
957 q->pi_state = NULL;
958
959 spin_unlock(&hb->lock);
960
961 drop_key_refs(&q->key);
962}
963
637static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) 964static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
638{ 965{
639 DECLARE_WAITQUEUE(wait, current); 966 struct task_struct *curr = current;
967 DECLARE_WAITQUEUE(wait, curr);
640 struct futex_hash_bucket *hb; 968 struct futex_hash_bucket *hb;
641 struct futex_q q; 969 struct futex_q q;
642 u32 uval; 970 u32 uval;
643 int ret; 971 int ret;
644 972
973 q.pi_state = NULL;
645 retry: 974 retry:
646 down_read(&current->mm->mmap_sem); 975 down_read(&curr->mm->mmap_sem);
647 976
648 ret = get_futex_key(uaddr, &q.key); 977 ret = get_futex_key(uaddr, &q.key);
649 if (unlikely(ret != 0)) 978 if (unlikely(ret != 0))
@@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
680 * If we would have faulted, release mmap_sem, fault it in and 1009 * If we would have faulted, release mmap_sem, fault it in and
681 * start all over again. 1010 * start all over again.
682 */ 1011 */
683 up_read(&current->mm->mmap_sem); 1012 up_read(&curr->mm->mmap_sem);
684 1013
685 ret = get_user(uval, uaddr); 1014 ret = get_user(uval, uaddr);
686 1015
@@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
688 goto retry; 1017 goto retry;
689 return ret; 1018 return ret;
690 } 1019 }
691 if (uval != val) { 1020 ret = -EWOULDBLOCK;
692 ret = -EWOULDBLOCK; 1021 if (uval != val)
693 queue_unlock(&q, hb); 1022 goto out_unlock_release_sem;
694 goto out_release_sem;
695 }
696 1023
697 /* Only actually queue if *uaddr contained val. */ 1024 /* Only actually queue if *uaddr contained val. */
698 __queue_me(&q, hb); 1025 __queue_me(&q, hb);
@@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
700 /* 1027 /*
701 * Now the futex is queued and we have checked the data, we 1028 * Now the futex is queued and we have checked the data, we
702 * don't want to hold mmap_sem while we sleep. 1029 * don't want to hold mmap_sem while we sleep.
703 */ 1030 */
704 up_read(&current->mm->mmap_sem); 1031 up_read(&curr->mm->mmap_sem);
705 1032
706 /* 1033 /*
707 * There might have been scheduling since the queue_me(), as we 1034 * There might have been scheduling since the queue_me(), as we
@@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
739 */ 1066 */
740 return -EINTR; 1067 return -EINTR;
741 1068
1069 out_unlock_release_sem:
1070 queue_unlock(&q, hb);
1071
742 out_release_sem: 1072 out_release_sem:
1073 up_read(&curr->mm->mmap_sem);
1074 return ret;
1075}
1076
1077/*
1078 * Userspace tried a 0 -> TID atomic transition of the futex value
1079 * and failed. The kernel side here does the whole locking operation:
1080 * if there are waiters then it will block, it does PI, etc. (Due to
1081 * races the kernel might see a 0 value of the futex too.)
1082 */
1083static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1084 struct hrtimer_sleeper *to)
1085{
1086 struct task_struct *curr = current;
1087 struct futex_hash_bucket *hb;
1088 u32 uval, newval, curval;
1089 struct futex_q q;
1090 int ret, attempt = 0;
1091
1092 if (refill_pi_state_cache())
1093 return -ENOMEM;
1094
1095 q.pi_state = NULL;
1096 retry:
1097 down_read(&curr->mm->mmap_sem);
1098
1099 ret = get_futex_key(uaddr, &q.key);
1100 if (unlikely(ret != 0))
1101 goto out_release_sem;
1102
1103 hb = queue_lock(&q, -1, NULL);
1104
1105 retry_locked:
1106 /*
1107 * To avoid races, we attempt to take the lock here again
1108 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1109 * the locks. It will most likely not succeed.
1110 */
1111 newval = current->pid;
1112
1113 inc_preempt_count();
1114 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1115 dec_preempt_count();
1116
1117 if (unlikely(curval == -EFAULT))
1118 goto uaddr_faulted;
1119
1120 /* We own the lock already */
1121 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1122 if (!detect && 0)
1123 force_sig(SIGKILL, current);
1124 ret = -EDEADLK;
1125 goto out_unlock_release_sem;
1126 }
1127
1128 /*
1129 * Surprise - we got the lock. Just return
1130 * to userspace:
1131 */
1132 if (unlikely(!curval))
1133 goto out_unlock_release_sem;
1134
1135 uval = curval;
1136 newval = uval | FUTEX_WAITERS;
1137
1138 inc_preempt_count();
1139 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1140 dec_preempt_count();
1141
1142 if (unlikely(curval == -EFAULT))
1143 goto uaddr_faulted;
1144 if (unlikely(curval != uval))
1145 goto retry_locked;
1146
1147 /*
1148 * We dont have the lock. Look up the PI state (or create it if
1149 * we are the first waiter):
1150 */
1151 ret = lookup_pi_state(uval, hb, &q);
1152
1153 if (unlikely(ret)) {
1154 /*
1155 * There were no waiters and the owner task lookup
1156 * failed. When the OWNER_DIED bit is set, then we
1157 * know that this is a robust futex and we actually
1158 * take the lock. This is safe as we are protected by
1159 * the hash bucket lock. We also set the waiters bit
1160 * unconditionally here, to simplify glibc handling of
1161 * multiple tasks racing to acquire the lock and
1162 * cleanup the problems which were left by the dead
1163 * owner.
1164 */
1165 if (curval & FUTEX_OWNER_DIED) {
1166 uval = newval;
1167 newval = current->pid |
1168 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1169
1170 inc_preempt_count();
1171 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1172 uval, newval);
1173 dec_preempt_count();
1174
1175 if (unlikely(curval == -EFAULT))
1176 goto uaddr_faulted;
1177 if (unlikely(curval != uval))
1178 goto retry_locked;
1179 ret = 0;
1180 }
1181 goto out_unlock_release_sem;
1182 }
1183
1184 /*
1185 * Only actually queue now that the atomic ops are done:
1186 */
1187 __queue_me(&q, hb);
1188
1189 /*
1190 * Now the futex is queued and we have checked the data, we
1191 * don't want to hold mmap_sem while we sleep.
1192 */
1193 up_read(&curr->mm->mmap_sem);
1194
1195 WARN_ON(!q.pi_state);
1196 /*
1197 * Block on the PI mutex:
1198 */
1199 if (!trylock)
1200 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1201 else {
1202 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1203 /* Fixup the trylock return value: */
1204 ret = ret ? 0 : -EWOULDBLOCK;
1205 }
1206
1207 down_read(&curr->mm->mmap_sem);
1208 hb = queue_lock(&q, -1, NULL);
1209
1210 /*
1211 * Got the lock. We might not be the anticipated owner if we
1212 * did a lock-steal - fix up the PI-state in that case.
1213 */
1214 if (!ret && q.pi_state->owner != curr) {
1215 u32 newtid = current->pid | FUTEX_WAITERS;
1216
1217 /* Owner died? */
1218 if (q.pi_state->owner != NULL) {
1219 spin_lock_irq(&q.pi_state->owner->pi_lock);
1220 list_del_init(&q.pi_state->list);
1221 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1222 } else
1223 newtid |= FUTEX_OWNER_DIED;
1224
1225 q.pi_state->owner = current;
1226
1227 spin_lock_irq(&current->pi_lock);
1228 list_add(&q.pi_state->list, &current->pi_state_list);
1229 spin_unlock_irq(&current->pi_lock);
1230
1231 /* Unqueue and drop the lock */
1232 unqueue_me_pi(&q, hb);
1233 up_read(&curr->mm->mmap_sem);
1234 /*
1235 * We own it, so we have to replace the pending owner
1236 * TID. This must be atomic as we have preserve the
1237 * owner died bit here.
1238 */
1239 ret = get_user(uval, uaddr);
1240 while (!ret) {
1241 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1242 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1243 uval, newval);
1244 if (curval == -EFAULT)
1245 ret = -EFAULT;
1246 if (curval == uval)
1247 break;
1248 uval = curval;
1249 }
1250 } else {
1251 /*
1252 * Catch the rare case, where the lock was released
1253 * when we were on the way back before we locked
1254 * the hash bucket.
1255 */
1256 if (ret && q.pi_state->owner == curr) {
1257 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1258 ret = 0;
1259 }
1260 /* Unqueue and drop the lock */
1261 unqueue_me_pi(&q, hb);
1262 up_read(&curr->mm->mmap_sem);
1263 }
1264
1265 if (!detect && ret == -EDEADLK && 0)
1266 force_sig(SIGKILL, current);
1267
1268 return ret;
1269
1270 out_unlock_release_sem:
1271 queue_unlock(&q, hb);
1272
1273 out_release_sem:
1274 up_read(&curr->mm->mmap_sem);
1275 return ret;
1276
1277 uaddr_faulted:
1278 /*
1279 * We have to r/w *(int __user *)uaddr, but we can't modify it
1280 * non-atomically. Therefore, if get_user below is not
1281 * enough, we need to handle the fault ourselves, while
1282 * still holding the mmap_sem.
1283 */
1284 if (attempt++) {
1285 if (futex_handle_fault((unsigned long)uaddr, attempt))
1286 goto out_unlock_release_sem;
1287
1288 goto retry_locked;
1289 }
1290
1291 queue_unlock(&q, hb);
1292 up_read(&curr->mm->mmap_sem);
1293
1294 ret = get_user(uval, uaddr);
1295 if (!ret && (uval != -EFAULT))
1296 goto retry;
1297
1298 return ret;
1299}
1300
1301/*
1302 * Restart handler
1303 */
1304static long futex_lock_pi_restart(struct restart_block *restart)
1305{
1306 struct hrtimer_sleeper timeout, *to = NULL;
1307 int ret;
1308
1309 restart->fn = do_no_restart_syscall;
1310
1311 if (restart->arg2 || restart->arg3) {
1312 to = &timeout;
1313 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1314 hrtimer_init_sleeper(to, current);
1315 to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
1316 (u64) restart->arg0;
1317 }
1318
1319 pr_debug("lock_pi restart: %p, %d (%d)\n",
1320 (u32 __user *)restart->arg0, current->pid);
1321
1322 ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
1323 0, to);
1324
1325 if (ret != -EINTR)
1326 return ret;
1327
1328 restart->fn = futex_lock_pi_restart;
1329
1330 /* The other values are filled in */
1331 return -ERESTART_RESTARTBLOCK;
1332}
1333
1334/*
1335 * Called from the syscall entry below.
1336 */
1337static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1338 long nsec, int trylock)
1339{
1340 struct hrtimer_sleeper timeout, *to = NULL;
1341 struct restart_block *restart;
1342 int ret;
1343
1344 if (sec != MAX_SCHEDULE_TIMEOUT) {
1345 to = &timeout;
1346 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1347 hrtimer_init_sleeper(to, current);
1348 to->timer.expires = ktime_set(sec, nsec);
1349 }
1350
1351 ret = do_futex_lock_pi(uaddr, detect, trylock, to);
1352
1353 if (ret != -EINTR)
1354 return ret;
1355
1356 pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
1357
1358 restart = &current_thread_info()->restart_block;
1359 restart->fn = futex_lock_pi_restart;
1360 restart->arg0 = (unsigned long) uaddr;
1361 restart->arg1 = detect;
1362 if (to) {
1363 restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
1364 restart->arg3 = to->timer.expires.tv64 >> 32;
1365 } else
1366 restart->arg2 = restart->arg3 = 0;
1367
1368 return -ERESTART_RESTARTBLOCK;
1369}
1370
1371/*
1372 * Userspace attempted a TID -> 0 atomic transition, and failed.
1373 * This is the in-kernel slowpath: we look up the PI state (if any),
1374 * and do the rt-mutex unlock.
1375 */
1376static int futex_unlock_pi(u32 __user *uaddr)
1377{
1378 struct futex_hash_bucket *hb;
1379 struct futex_q *this, *next;
1380 u32 uval;
1381 struct list_head *head;
1382 union futex_key key;
1383 int ret, attempt = 0;
1384
1385retry:
1386 if (get_user(uval, uaddr))
1387 return -EFAULT;
1388 /*
1389 * We release only a lock we actually own:
1390 */
1391 if ((uval & FUTEX_TID_MASK) != current->pid)
1392 return -EPERM;
1393 /*
1394 * First take all the futex related locks:
1395 */
1396 down_read(&current->mm->mmap_sem);
1397
1398 ret = get_futex_key(uaddr, &key);
1399 if (unlikely(ret != 0))
1400 goto out;
1401
1402 hb = hash_futex(&key);
1403 spin_lock(&hb->lock);
1404
1405retry_locked:
1406 /*
1407 * To avoid races, try to do the TID -> 0 atomic transition
1408 * again. If it succeeds then we can return without waking
1409 * anyone else up:
1410 */
1411 inc_preempt_count();
1412 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1413 dec_preempt_count();
1414
1415 if (unlikely(uval == -EFAULT))
1416 goto pi_faulted;
1417 /*
1418 * Rare case: we managed to release the lock atomically,
1419 * no need to wake anyone else up:
1420 */
1421 if (unlikely(uval == current->pid))
1422 goto out_unlock;
1423
1424 /*
1425 * Ok, other tasks may need to be woken up - check waiters
1426 * and do the wakeup if necessary:
1427 */
1428 head = &hb->chain;
1429
1430 list_for_each_entry_safe(this, next, head, list) {
1431 if (!match_futex (&this->key, &key))
1432 continue;
1433 ret = wake_futex_pi(uaddr, uval, this);
1434 /*
1435 * The atomic access to the futex value
1436 * generated a pagefault, so retry the
1437 * user-access and the wakeup:
1438 */
1439 if (ret == -EFAULT)
1440 goto pi_faulted;
1441 goto out_unlock;
1442 }
1443 /*
1444 * No waiters - kernel unlocks the futex:
1445 */
1446 ret = unlock_futex_pi(uaddr, uval);
1447 if (ret == -EFAULT)
1448 goto pi_faulted;
1449
1450out_unlock:
1451 spin_unlock(&hb->lock);
1452out:
1453 up_read(&current->mm->mmap_sem);
1454
1455 return ret;
1456
1457pi_faulted:
1458 /*
1459 * We have to r/w *(int __user *)uaddr, but we can't modify it
1460 * non-atomically. Therefore, if get_user below is not
1461 * enough, we need to handle the fault ourselves, while
1462 * still holding the mmap_sem.
1463 */
1464 if (attempt++) {
1465 if (futex_handle_fault((unsigned long)uaddr, attempt))
1466 goto out_unlock;
1467
1468 goto retry_locked;
1469 }
1470
1471 spin_unlock(&hb->lock);
743 up_read(&current->mm->mmap_sem); 1472 up_read(&current->mm->mmap_sem);
1473
1474 ret = get_user(uval, uaddr);
1475 if (!ret && (uval != -EFAULT))
1476 goto retry;
1477
744 return ret; 1478 return ret;
745} 1479}
746 1480
@@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
819 err = -ENOMEM; 1553 err = -ENOMEM;
820 goto error; 1554 goto error;
821 } 1555 }
1556 q->pi_state = NULL;
822 1557
823 down_read(&current->mm->mmap_sem); 1558 down_read(&current->mm->mmap_sem);
824 err = get_futex_key(uaddr, &q->key); 1559 err = get_futex_key(uaddr, &q->key);
@@ -856,7 +1591,7 @@ error:
856 * Implementation: user-space maintains a per-thread list of locks it 1591 * Implementation: user-space maintains a per-thread list of locks it
857 * is holding. Upon do_exit(), the kernel carefully walks this list, 1592 * is holding. Upon do_exit(), the kernel carefully walks this list,
858 * and marks all locks that are owned by this thread with the 1593 * and marks all locks that are owned by this thread with the
859 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1594 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
860 * always manipulated with the lock held, so the list is private and 1595 * always manipulated with the lock held, so the list is private and
861 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1596 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
862 * field, to allow the kernel to clean up if the thread dies after 1597 * field, to allow the kernel to clean up if the thread dies after
@@ -931,7 +1666,7 @@ err_unlock:
931 */ 1666 */
932int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1667int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
933{ 1668{
934 u32 uval; 1669 u32 uval, nval;
935 1670
936retry: 1671retry:
937 if (get_user(uval, uaddr)) 1672 if (get_user(uval, uaddr))
@@ -948,8 +1683,12 @@ retry:
948 * thread-death.) The rest of the cleanup is done in 1683 * thread-death.) The rest of the cleanup is done in
949 * userspace. 1684 * userspace.
950 */ 1685 */
951 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1686 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
952 uval | FUTEX_OWNER_DIED) != uval) 1687 uval | FUTEX_OWNER_DIED);
1688 if (nval == -EFAULT)
1689 return -1;
1690
1691 if (nval != uval)
953 goto retry; 1692 goto retry;
954 1693
955 if (uval & FUTEX_WAITERS) 1694 if (uval & FUTEX_WAITERS)
@@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr)
994 while (entry != &head->list) { 1733 while (entry != &head->list) {
995 /* 1734 /*
996 * A pending lock might already be on the list, so 1735 * A pending lock might already be on the list, so
997 * dont process it twice: 1736 * don't process it twice:
998 */ 1737 */
999 if (entry != pending) 1738 if (entry != pending)
1000 if (handle_futex_death((void *)entry + futex_offset, 1739 if (handle_futex_death((void *)entry + futex_offset,
@@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1040 case FUTEX_WAKE_OP: 1779 case FUTEX_WAKE_OP:
1041 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1780 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1042 break; 1781 break;
1782 case FUTEX_LOCK_PI:
1783 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1784 break;
1785 case FUTEX_UNLOCK_PI:
1786 ret = futex_unlock_pi(uaddr);
1787 break;
1788 case FUTEX_TRYLOCK_PI:
1789 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1790 break;
1043 default: 1791 default:
1044 ret = -ENOSYS; 1792 ret = -ENOSYS;
1045 } 1793 }
@@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1055 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1803 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1056 u32 val2 = 0; 1804 u32 val2 = 0;
1057 1805
1058 if (utime && (op == FUTEX_WAIT)) { 1806 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1059 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1807 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1060 return -EFAULT; 1808 return -EFAULT;
1061 if (!timespec_valid(&t)) 1809 if (!timespec_valid(&t))
1062 return -EINVAL; 1810 return -EINVAL;
1063 timeout = timespec_to_jiffies(&t) + 1; 1811 if (op == FUTEX_WAIT)
1812 timeout = timespec_to_jiffies(&t) + 1;
1813 else {
1814 timeout = t.tv_sec;
1815 val2 = t.tv_nsec;
1816 }
1064 } 1817 }
1065 /* 1818 /*
1066 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1819 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1067 */ 1820 */
1068 if (op >= FUTEX_REQUEUE) 1821 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1069 val2 = (u32) (unsigned long) utime; 1822 val2 = (u32) (unsigned long) utime;
1070 1823
1071 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 1824 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 7e57c31670a3..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if (utime && (op == FUTEX_WAIT)) { 132 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t)) 135 if (!timespec_valid(&t))
136 return -EINVAL; 136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 137 if (op == FUTEX_WAIT)
138 timeout = timespec_to_jiffies(&t) + 1;
139 else {
140 timeout = t.tv_sec;
141 val2 = t.tv_nsec;
142 }
138 } 143 }
139 if (op >= FUTEX_REQUEUE) 144 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 145 val2 = (int) (unsigned long) utime;
141 146
142 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e068024eeffc..9c75856e791e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; 112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113} 113}
114 114
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
115#endif 123#endif