author     Ingo Molnar <mingo@elte.hu>              2006-06-27 05:54:58 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>    2006-06-27 20:32:47 -0400
commit     c87e2837be82df479a6bae9f155c43516d2feebc
tree       ad6ab35f0b78f71abaa7b05185e9e3f97809c6de
parent     0cdbee9920fb37eb2dc49b860c2b28862d647adc
[PATCH] pi-futex: futex_lock_pi/futex_unlock_pi support
This adds the actual pi-futex implementation, based on rt-mutexes.
[dino@in.ibm.com: fix an oops-causing race]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--  include/linux/futex.h    |   7
-rw-r--r--  include/linux/sched.h    |   3
-rw-r--r--  kernel/exit.c            |   8
-rw-r--r--  kernel/fork.c            |   3
-rw-r--r--  kernel/futex.c           | 829
-rw-r--r--  kernel/futex_compat.c    |  11
-rw-r--r--  kernel/rtmutex_common.h  |   8
7 files changed, 828 insertions, 41 deletions
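
For context only (not part of the commit): the userspace side of the locking protocol these new operations assume. The uncontended fast path is a 0 -> TID compare-and-swap done entirely in userspace; a thread enters the kernel via FUTEX_LOCK_PI / FUTEX_UNLOCK_PI only on contention, where it is queued on the rt-mutex and priority inheritance is applied. The snippet below is an illustrative sketch under those assumptions - the raw-syscall wrapper is hypothetical (glibc exposes no futex() call), the constant values simply mirror the ones added by this patch, and tid must be the caller's kernel TID (gettid()), since the kernel verifies it on unlock.

    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define FUTEX_LOCK_PI   6   /* values as added to include/linux/futex.h below */
    #define FUTEX_UNLOCK_PI 7

    /* hypothetical raw wrapper; only the first three arguments matter here */
    static long sys_futex(uint32_t *uaddr, int op, uint32_t val)
    {
            return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
    }

    static void pi_lock(uint32_t *futex, uint32_t tid)
    {
            /* fast path: 0 -> TID, no kernel entry when uncontended */
            if (__sync_val_compare_and_swap(futex, 0, tid) != 0)
                    sys_futex(futex, FUTEX_LOCK_PI, 0);  /* contended: block on the rt-mutex */
    }

    static void pi_unlock(uint32_t *futex, uint32_t tid)
    {
            /* fast path: TID -> 0; if FUTEX_WAITERS got set, let the kernel hand the lock over */
            if (__sync_val_compare_and_swap(futex, tid, 0) != tid)
                    sys_futex(futex, FUTEX_UNLOCK_PI, 0);
    }
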
diff --git a/include/linux/futex.h b/include/linux/futex.h
index f05a3f469322..34c3a215f2cd 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -12,6 +12,9 @@
 #define FUTEX_REQUEUE		3
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
+#define FUTEX_LOCK_PI		6
+#define FUTEX_UNLOCK_PI		7
+#define FUTEX_TRYLOCK_PI	8
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -97,10 +100,14 @@ extern int handle_futex_death(u32 __user *uaddr, struct task_struct *curr);
 
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
+extern void exit_pi_state_list(struct task_struct *curr);
 #else
 static inline void exit_robust_list(struct task_struct *curr)
 {
 }
+static inline void exit_pi_state_list(struct task_struct *curr)
+{
+}
 #endif
 
 #define FUTEX_OP_SET	0	/* *(int *)UADDR2 = OPARG; */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edadd13cf53f..b4e6be7de5ad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -84,6 +84,7 @@ struct sched_param {
 #include <asm/processor.h>
 
 struct exec_domain;
+struct futex_pi_state;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -915,6 +916,8 @@ struct task_struct {
 #ifdef CONFIG_COMPAT
 	struct compat_robust_list_head __user *compat_robust_list;
 #endif
+	struct list_head pi_state_list;
+	struct futex_pi_state *pi_state_cache;
 
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
diff --git a/kernel/exit.c b/kernel/exit.c
index 3e8a0282e9a5..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -926,6 +926,14 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 	/*
+	 * This must happen late, after the PID is not
+	 * hashed anymore:
+	 */
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
+	if (unlikely(current->pi_state_cache))
+		kfree(current->pi_state_cache);
+	/*
 	 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
 	 */
 	mutex_debug_check_no_locks_held(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b664a081fffa..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1092,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_COMPAT
 	p->compat_robust_list = NULL;
 #endif
+	INIT_LIST_HEAD(&p->pi_state_list);
+	p->pi_state_cache = NULL;
+
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
diff --git a/kernel/futex.c b/kernel/futex.c
index 50356fb5d726..b305b7f8dad5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
  * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
  * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
  *
+ * PI-futex support started by Ingo Molnar and Thomas Gleixner
+ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
 #include <linux/signal.h>
 #include <asm/futex.h>
 
+#include "rtmutex_common.h"
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -75,6 +81,27 @@ union futex_key {
 };
 
 /*
+ * Priority Inheritance state:
+ */
+struct futex_pi_state {
+	/*
+	 * list of 'owned' pi_state instances - these have to be
+	 * cleaned up in do_exit() if the task exits prematurely:
+	 */
+	struct list_head list;
+
+	/*
+	 * The PI object:
+	 */
+	struct rt_mutex pi_mutex;
+
+	struct task_struct *owner;
+	atomic_t refcount;
+
+	union futex_key key;
+};
+
+/*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
  *
@@ -96,6 +123,10 @@ struct futex_q {
 	/* For fd, sigio sent using these: */
 	int fd;
 	struct file *filp;
+
+	/* Optional priority inheritance state: */
+	struct futex_pi_state *pi_state;
+	struct task_struct *task;
 };
 
 /*
@@ -259,6 +290,232 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 }
 
 /*
+ * Fault handling. Called with current->mm->mmap_sem held.
+ */
+static int futex_handle_fault(unsigned long address, int attempt)
+{
+	struct vm_area_struct * vma;
+	struct mm_struct *mm = current->mm;
+
+	if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
+		return -EFAULT;
+
+	switch (handle_mm_fault(mm, vma, address, 1)) {
+	case VM_FAULT_MINOR:
+		current->min_flt++;
+		break;
+	case VM_FAULT_MAJOR:
+		current->maj_flt++;
+		break;
+	default:
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * PI code:
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	memset(pi_state, 0, sizeof(*pi_state));
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	atomic_set(&pi_state->refcount, 1);
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+static struct futex_pi_state * alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!atomic_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already
+	 */
+	if (pi_state->owner) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+	}
+
+	if (current->pi_state_cache)
+		kfree(pi_state);
+	else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		atomic_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+	if ((current->euid != p->euid) && (current->euid != p->uid)) {
+		p = NULL;
+		goto out_unlock;
+	}
+	if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+		p = NULL;
+		goto out_unlock;
+	}
+	get_task_struct(p);
+out_unlock:
+	read_unlock(&tasklist_lock);
+
+	return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+	struct futex_hash_bucket *hb;
+	struct list_head *next, *head = &curr->pi_state_list;
+	struct futex_pi_state *pi_state;
+	union futex_key key;
+
+	/*
+	 * We are a ZOMBIE and nobody can enqueue itself on
+	 * pi_state_list anymore, but we have to be careful
+	 * versus waiters unqueueing themselfs
+	 */
+	spin_lock_irq(&curr->pi_lock);
+	while (!list_empty(head)) {
+
+		next = head->next;
+		pi_state = list_entry(next, struct futex_pi_state, list);
+		key = pi_state->key;
+		spin_unlock_irq(&curr->pi_lock);
+
+		hb = hash_futex(&key);
+		spin_lock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+		if (head->next != next) {
+			spin_unlock(&hb->lock);
+			continue;
+		}
+
+		list_del_init(&pi_state->list);
+
+		WARN_ON(pi_state->owner != curr);
+
+		pi_state->owner = NULL;
+		spin_unlock_irq(&curr->pi_lock);
+
+		rt_mutex_unlock(&pi_state->pi_mutex);
+
+		spin_unlock(&hb->lock);
+
+		spin_lock_irq(&curr->pi_lock);
+	}
+	spin_unlock_irq(&curr->pi_lock);
+}
+
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+{
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_q *this, *next;
+	struct list_head *head;
+	struct task_struct *p;
+	pid_t pid;
+
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &me->key)) {
+			/*
+			 * Another waiter already exists - bump up
+			 * the refcount and return its pi_state:
+			 */
+			pi_state = this->pi_state;
+			atomic_inc(&pi_state->refcount);
+			me->pi_state = pi_state;
+
+			return 0;
+		}
+	}
+
+	/*
+	 * We are the first waiter - try to look up the real owner and
+	 * attach the new pi_state to it:
+	 */
+	pid = uval & FUTEX_TID_MASK;
+	p = futex_find_get_task(pid);
+	if (!p)
+		return -ESRCH;
+
+	pi_state = alloc_pi_state();
+
+	/*
+	 * Initialize the pi_mutex in locked state and make 'p'
+	 * the owner of it:
+	 */
+	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+	/* Store the key for possible exit cleanups: */
+	pi_state->key = me->key;
+
+	spin_lock_irq(&p->pi_lock);
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+	spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	me->pi_state = pi_state;
+
+	return 0;
+}
+
+/*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
  */
@@ -285,6 +542,70 @@ static void wake_futex(struct futex_q *q)
 	q->lock_ptr = NULL;
 }
 
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+	struct task_struct *new_owner;
+	struct futex_pi_state *pi_state = this->pi_state;
+	u32 curval, newval;
+
+	if (!pi_state)
+		return -EINVAL;
+
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+	/*
+	 * This happens when we have stolen the lock and the original
+	 * pending owner did not enqueue itself back on the rt_mutex.
+	 * Thats not a tragedy. We know that way, that a lock waiter
+	 * is on the fly. We make the futex_q waiter the pending owner.
+	 */
+	if (!new_owner)
+		new_owner = this->task;
+
+	/*
+	 * We pass it to the next owner. (The WAITERS bit is always
+	 * kept enabled while there is PI state around. We must also
+	 * preserve the owner died bit.)
+	 */
+	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (curval == -EFAULT)
+		return -EFAULT;
+	if (curval != uval)
+		return -EINVAL;
+
+	list_del_init(&pi_state->owner->pi_state_list);
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+	u32 oldval;
+
+	/*
+	 * There is no waiter, so we unlock the futex. The owner died
+	 * bit has not to be preserved here. We are the owner:
+	 */
+	inc_preempt_count();
+	oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
+	dec_preempt_count();
+
+	if (oldval == -EFAULT)
+		return oldval;
+	if (oldval != uval)
+		return -EAGAIN;
+
+	return 0;
+}
+
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
@@ -309,6 +630,8 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 
 	list_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
+			if (this->pi_state)
+				return -EINVAL;
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -385,27 +708,9 @@ retry:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		struct vm_area_struct * vma;
-		struct mm_struct *mm = current->mm;
-		unsigned long address = (unsigned long)uaddr2;
-
-		ret = -EFAULT;
-		if (attempt >= 2 ||
-		    !(vma = find_vma(mm, address)) ||
-		    vma->vm_start > address ||
-		    !(vma->vm_flags & VM_WRITE))
+		if (futex_handle_fault((unsigned long)uaddr2,
+				       attempt))
 			goto out;
-
-		switch (handle_mm_fault(mm, vma, address, 1)) {
-		case VM_FAULT_MINOR:
-			current->min_flt++;
-			break;
-		case VM_FAULT_MAJOR:
-			current->maj_flt++;
-			break;
-		default:
-			goto out;
-		}
 		goto retry;
 	}
 
@@ -572,6 +877,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
 	list_add_tail(&q->list, &hb->chain);
+	q->task = current;
 	spin_unlock(&hb->lock);
 }
 
@@ -626,6 +932,9 @@ static int unqueue_me(struct futex_q *q)
 		}
 		WARN_ON(list_empty(&q->list));
 		list_del(&q->list);
+
+		BUG_ON(q->pi_state);
+
 		spin_unlock(lock_ptr);
 		ret = 1;
 	}
@@ -634,16 +943,36 @@ static int unqueue_me(struct futex_q *q)
 	return ret;
 }
 
+/*
+ * PI futexes can not be requeued and must remove themself from the
+ * hash bucket. The hash bucket lock is held on entry and dropped here.
+ */
+static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+	WARN_ON(list_empty(&q->list));
+	list_del(&q->list);
+
+	BUG_ON(!q->pi_state);
+	free_pi_state(q->pi_state);
+	q->pi_state = NULL;
+
+	spin_unlock(&hb->lock);
+
+	drop_key_refs(&q->key);
+}
+
 static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 {
-	DECLARE_WAITQUEUE(wait, current);
+	struct task_struct *curr = current;
+	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
 	u32 uval;
 	int ret;
 
+	q.pi_state = NULL;
 retry:
-	down_read(&current->mm->mmap_sem);
+	down_read(&curr->mm->mmap_sem);
 
 	ret = get_futex_key(uaddr, &q.key);
 	if (unlikely(ret != 0))
@@ -680,7 +1009,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	 * If we would have faulted, release mmap_sem, fault it in and
 	 * start all over again.
 	 */
-	up_read(&current->mm->mmap_sem);
+	up_read(&curr->mm->mmap_sem);
 
 	ret = get_user(uval, uaddr);
 
@@ -688,11 +1017,9 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 			goto retry;
 		return ret;
 	}
-	if (uval != val) {
-		ret = -EWOULDBLOCK;
-		queue_unlock(&q, hb);
-		goto out_release_sem;
-	}
+	ret = -EWOULDBLOCK;
+	if (uval != val)
+		goto out_unlock_release_sem;
 
 	/* Only actually queue if *uaddr contained val. */
 	__queue_me(&q, hb);
@@ -700,8 +1027,8 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	/*
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&current->mm->mmap_sem);
+	up_read(&curr->mm->mmap_sem);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -739,8 +1066,415 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
 	 */
 	return -EINTR;
 
+out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
 out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation:
+ * if there are waiters then it will block, it does PI, etc. (Due to
+ * races the kernel might see a 0 value of the futex too.)
+ */
+static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
+			    struct hrtimer_sleeper *to)
+{
+	struct task_struct *curr = current;
+	struct futex_hash_bucket *hb;
+	u32 uval, newval, curval;
+	struct futex_q q;
+	int ret, attempt = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+	q.pi_state = NULL;
+retry:
+	down_read(&curr->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &q.key);
+	if (unlikely(ret != 0))
+		goto out_release_sem;
+
+	hb = queue_lock(&q, -1, NULL);
+
+retry_locked:
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = current->pid;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+
+	/* We own the lock already */
+	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
+		if (!detect && 0)
+			force_sig(SIGKILL, current);
+		ret = -EDEADLK;
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Surprise - we got the lock. Just return
+	 * to userspace:
+	 */
+	if (unlikely(!curval))
+		goto out_unlock_release_sem;
+
+	uval = curval;
+	newval = uval | FUTEX_WAITERS;
+
+	inc_preempt_count();
+	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	dec_preempt_count();
+
+	if (unlikely(curval == -EFAULT))
+		goto uaddr_faulted;
+	if (unlikely(curval != uval))
+		goto retry_locked;
+
+	/*
+	 * We dont have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, &q);
+
+	if (unlikely(ret)) {
+		/*
+		 * There were no waiters and the owner task lookup
+		 * failed. When the OWNER_DIED bit is set, then we
+		 * know that this is a robust futex and we actually
+		 * take the lock. This is safe as we are protected by
+		 * the hash bucket lock. We also set the waiters bit
+		 * unconditionally here, to simplify glibc handling of
+		 * multiple tasks racing to acquire the lock and
+		 * cleanup the problems which were left by the dead
+		 * owner.
+		 */
+		if (curval & FUTEX_OWNER_DIED) {
+			uval = newval;
+			newval = current->pid |
+				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+			inc_preempt_count();
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			dec_preempt_count();
+
+			if (unlikely(curval == -EFAULT))
+				goto uaddr_faulted;
+			if (unlikely(curval != uval))
+				goto retry_locked;
+			ret = 0;
+		}
+		goto out_unlock_release_sem;
+	}
+
+	/*
+	 * Only actually queue now that the atomic ops are done:
+	 */
+	__queue_me(&q, hb);
+
+	/*
+	 * Now the futex is queued and we have checked the data, we
+	 * don't want to hold mmap_sem while we sleep.
+	 */
+	up_read(&curr->mm->mmap_sem);
+
+	WARN_ON(!q.pi_state);
+	/*
+	 * Block on the PI mutex:
+	 */
+	if (!trylock)
+		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
+	else {
+		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+		/* Fixup the trylock return value: */
+		ret = ret ? 0 : -EWOULDBLOCK;
+	}
+
+	down_read(&curr->mm->mmap_sem);
+	hb = queue_lock(&q, -1, NULL);
+
+	/*
+	 * Got the lock. We might not be the anticipated owner if we
+	 * did a lock-steal - fix up the PI-state in that case.
+	 */
+	if (!ret && q.pi_state->owner != curr) {
+		u32 newtid = current->pid | FUTEX_WAITERS;
+
+		/* Owner died? */
+		if (q.pi_state->owner != NULL) {
+			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			list_del_init(&q.pi_state->list);
+			spin_unlock_irq(&q.pi_state->owner->pi_lock);
+		} else
+			newtid |= FUTEX_OWNER_DIED;
+
+		q.pi_state->owner = current;
+
+		spin_lock_irq(&current->pi_lock);
+		list_add(&q.pi_state->list, &current->pi_state_list);
+		spin_unlock_irq(&current->pi_lock);
+
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+		/*
+		 * We own it, so we have to replace the pending owner
+		 * TID. This must be atomic as we have preserve the
+		 * owner died bit here.
+		 */
+		ret = get_user(uval, uaddr);
+		while (!ret) {
+			newval = (uval & FUTEX_OWNER_DIED) | newtid;
+			curval = futex_atomic_cmpxchg_inatomic(uaddr,
+							       uval, newval);
+			if (curval == -EFAULT)
+				ret = -EFAULT;
+			if (curval == uval)
+				break;
+			uval = curval;
+		}
+	} else {
+		/*
+		 * Catch the rare case, where the lock was released
+		 * when we were on the way back before we locked
+		 * the hash bucket.
+		 */
+		if (ret && q.pi_state->owner == curr) {
+			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+				ret = 0;
+		}
+		/* Unqueue and drop the lock */
+		unqueue_me_pi(&q, hb);
+		up_read(&curr->mm->mmap_sem);
+	}
+
+	if (!detect && ret == -EDEADLK && 0)
+		force_sig(SIGKILL, current);
+
+	return ret;
+
+out_unlock_release_sem:
+	queue_unlock(&q, hb);
+
+out_release_sem:
+	up_read(&curr->mm->mmap_sem);
+	return ret;
+
+uaddr_faulted:
+	/*
+	 * We have to r/w *(int __user *)uaddr, but we can't modify it
+	 * non-atomically. Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock_release_sem;
+
+		goto retry_locked;
+	}
+
+	queue_unlock(&q, hb);
+	up_read(&curr->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * Restart handler
+ */
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	int ret;
+
+	restart->fn = do_no_restart_syscall;
+
+	if (restart->arg2 || restart->arg3) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
+			(u64) restart->arg0;
+	}
+
+	pr_debug("lock_pi restart: %p, %d (%d)\n",
+		 (u32 __user *)restart->arg0, current->pid);
+
+	ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
+			       0, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	restart->fn = futex_lock_pi_restart;
+
+	/* The other values are filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Called from the syscall entry below.
+ */
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+			 long nsec, int trylock)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct restart_block *restart;
+	int ret;
+
+	if (sec != MAX_SCHEDULE_TIMEOUT) {
+		to = &timeout;
+		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
+		hrtimer_init_sleeper(to, current);
+		to->timer.expires = ktime_set(sec, nsec);
+	}
+
+	ret = do_futex_lock_pi(uaddr, detect, trylock, to);
+
+	if (ret != -EINTR)
+		return ret;
+
+	pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = futex_lock_pi_restart;
+	restart->arg0 = (unsigned long) uaddr;
+	restart->arg1 = detect;
+	if (to) {
+		restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
+		restart->arg3 = to->timer.expires.tv64 >> 32;
+	} else
+		restart->arg2 = restart->arg3 = 0;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition, and failed.
+ * This is the in-kernel slowpath: we look up the PI state (if any),
+ * and do the rt-mutex unlock.
+ */
+static int futex_unlock_pi(u32 __user *uaddr)
+{
+	struct futex_hash_bucket *hb;
+	struct futex_q *this, *next;
+	u32 uval;
+	struct list_head *head;
+	union futex_key key;
+	int ret, attempt = 0;
+
+retry:
+	if (get_user(uval, uaddr))
+		return -EFAULT;
+	/*
+	 * We release only a lock we actually own:
+	 */
+	if ((uval & FUTEX_TID_MASK) != current->pid)
+		return -EPERM;
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+
+retry_locked:
+	/*
+	 * To avoid races, try to do the TID -> 0 atomic transition
+	 * again. If it succeeds then we can return without waking
+	 * anyone else up:
+	 */
+	inc_preempt_count();
+	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+	dec_preempt_count();
+
+	if (unlikely(uval == -EFAULT))
+		goto pi_faulted;
+	/*
+	 * Rare case: we managed to release the lock atomically,
+	 * no need to wake anyone else up:
+	 */
+	if (unlikely(uval == current->pid))
+		goto out_unlock;
+
+	/*
+	 * Ok, other tasks may need to be woken up - check waiters
+	 * and do the wakeup if necessary:
+	 */
+	head = &hb->chain;
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (!match_futex (&this->key, &key))
+			continue;
+		ret = wake_futex_pi(uaddr, uval, this);
+		/*
+		 * The atomic access to the futex value
+		 * generated a pagefault, so retry the
+		 * user-access and the wakeup:
+		 */
+		if (ret == -EFAULT)
+			goto pi_faulted;
+		goto out_unlock;
+	}
+	/*
+	 * No waiters - kernel unlocks the futex:
+	 */
+	ret = unlock_futex_pi(uaddr, uval);
+	if (ret == -EFAULT)
+		goto pi_faulted;
+
+out_unlock:
+	spin_unlock(&hb->lock);
+out:
+	up_read(&current->mm->mmap_sem);
+
+	return ret;
+
+pi_faulted:
+	/*
+	 * We have to r/w *(int __user *)uaddr, but we can't modify it
+	 * non-atomically. Therefore, if get_user below is not
+	 * enough, we need to handle the fault ourselves, while
+	 * still holding the mmap_sem.
+	 */
+	if (attempt++) {
+		if (futex_handle_fault((unsigned long)uaddr, attempt))
+			goto out_unlock;
+
+		goto retry_locked;
+	}
+
+	spin_unlock(&hb->lock);
 	up_read(&current->mm->mmap_sem);
+
+	ret = get_user(uval, uaddr);
+	if (!ret && (uval != -EFAULT))
+		goto retry;
+
 	return ret;
 }
 
@@ -819,6 +1553,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
 		err = -ENOMEM;
 		goto error;
 	}
+	q->pi_state = NULL;
 
 	down_read(&current->mm->mmap_sem);
 	err = get_futex_key(uaddr, &q->key);
@@ -856,7 +1591,7 @@ error:
  * Implementation: user-space maintains a per-thread list of locks it
  * is holding. Upon do_exit(), the kernel carefully walks this list,
  * and marks all locks that are owned by this thread with the
- * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
  * always manipulated with the lock held, so the list is private and
  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
  * field, to allow the kernel to clean up if the thread dies after
@@ -931,7 +1666,7 @@ err_unlock:
  */
 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
 {
-	u32 uval;
+	u32 uval, nval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -948,8 +1683,12 @@ retry:
 	 * thread-death.) The rest of the cleanup is done in
 	 * userspace.
 	 */
-	if (futex_atomic_cmpxchg_inatomic(uaddr, uval,
-					  uval | FUTEX_OWNER_DIED) != uval)
+	nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
+					     uval | FUTEX_OWNER_DIED);
+	if (nval == -EFAULT)
+		return -1;
+
+	if (nval != uval)
 		goto retry;
 
 	if (uval & FUTEX_WAITERS)
@@ -994,7 +1733,7 @@ void exit_robust_list(struct task_struct *curr)
 	while (entry != &head->list) {
 		/*
 		 * A pending lock might already be on the list, so
-		 * dont process it twice:
+		 * don't process it twice:
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
@@ -1040,6 +1779,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
+	case FUTEX_LOCK_PI:
+		ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
+		break;
+	case FUTEX_UNLOCK_PI:
+		ret = futex_unlock_pi(uaddr);
+		break;
+	case FUTEX_TRYLOCK_PI:
+		ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1055,17 +1803,22 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	u32 val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 7e57c31670a3..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,14 +129,19 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if (utime && (op == FUTEX_WAIT)) {
+	if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
 		if (get_compat_timespec(&t, utime))
 			return -EFAULT;
 		if (!timespec_valid(&t))
 			return -EINVAL;
-		timeout = timespec_to_jiffies(&t) + 1;
+		if (op == FUTEX_WAIT)
+			timeout = timespec_to_jiffies(&t) + 1;
+		else {
+			timeout = t.tv_sec;
+			val2 = t.tv_nsec;
+		}
 	}
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e068024eeffc..9c75856e791e 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -112,4 +112,12 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
 	return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
 }
 
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				       struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+				  struct task_struct *proxy_owner);
 #endif