diff options
author | Ingo Molnar <mingo@elte.hu> | 2006-03-27 04:16:22 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-03-27 11:44:49 -0500 |
commit | 0771dfefc9e538f077d0b43b6dec19a5a67d0e70 (patch) | |
tree | 696267e69228b7406b337f9651dedc75055a589e /kernel/futex.c | |
parent | e9056f13bfcdd054a0c3d730e4e096748d8a363a (diff) |
[PATCH] lightweight robust futexes: core
Add the core infrastructure for robust futexes: structure definitions, the new
syscalls and the do_exit() based cleanup mechanism.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel/futex.c')
-rw-r--r-- | kernel/futex.c | 172 |
1 files changed, 172 insertions, 0 deletions
diff --git a/kernel/futex.c b/kernel/futex.c index 5efa2f978032..feb724b2554e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -8,6 +8,10 @@ | |||
8 | * Removed page pinning, fix privately mapped COW pages and other cleanups | 8 | * Removed page pinning, fix privately mapped COW pages and other cleanups |
9 | * (C) Copyright 2003, 2004 Jamie Lokier | 9 | * (C) Copyright 2003, 2004 Jamie Lokier |
10 | * | 10 | * |
11 | * Robust futex support started by Ingo Molnar | ||
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | ||
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | ||
14 | * | ||
11 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
12 | * enough at me, Linus for the original (flawed) idea, Matthew | 16 | * enough at me, Linus for the original (flawed) idea, Matthew |
13 | * Kirkwood for proof-of-concept implementation. | 17 | * Kirkwood for proof-of-concept implementation. |
@@ -829,6 +833,174 @@ error: | |||
829 | goto out; | 833 | goto out; |
830 | } | 834 | } |
831 | 835 | ||
836 | /* | ||
837 | * Support for robust futexes: the kernel cleans up held futexes at | ||
838 | * thread exit time. | ||
839 | * | ||
840 | * Implementation: user-space maintains a per-thread list of locks it | ||
841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | ||
842 | * and marks all locks that are owned by this thread with the | ||
843 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is | |
844 | * always manipulated with the lock held, so the list is private and | ||
845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | ||
846 | * field, to allow the kernel to clean up if the thread dies after | ||
847 | * acquiring the lock, but just before it could have added itself to | ||
848 | * the list. There can only be one such pending lock. | ||
849 | */ | ||
850 | |||
851 | /** | ||
852 | * sys_set_robust_list - set the robust-futex list head of a task | ||
853 | * @head: pointer to the list-head | ||
854 | * @len: length of the list-head, as userspace expects | ||
855 | */ | ||
856 | asmlinkage long | ||
857 | sys_set_robust_list(struct robust_list_head __user *head, | ||
858 | size_t len) | ||
859 | { | ||
860 | /* | ||
861 | * The kernel knows only one size for now: | ||
862 | */ | ||
863 | if (unlikely(len != sizeof(*head))) | ||
864 | return -EINVAL; | ||
865 | |||
866 | current->robust_list = head; | ||
867 | |||
868 | return 0; | ||
869 | } | ||
870 | |||
871 | /** | ||
872 | * sys_get_robust_list - get the robust-futex list head of a task | ||
873 | * @pid: pid of the process [zero for current task] | ||
874 | * @head_ptr: pointer to a list-head pointer, the kernel fills it in | ||
875 | * @len_ptr: pointer to a length field, the kernel fills in the header size | ||
876 | */ | ||
877 | asmlinkage long | ||
878 | sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr, | ||
879 | size_t __user *len_ptr) | ||
880 | { | ||
881 | struct robust_list_head *head; | ||
882 | unsigned long ret; | ||
883 | |||
884 | if (!pid) | ||
885 | head = current->robust_list; | ||
886 | else { | ||
887 | struct task_struct *p; | ||
888 | |||
889 | ret = -ESRCH; | ||
890 | read_lock(&tasklist_lock); | ||
891 | p = find_task_by_pid(pid); | ||
892 | if (!p) | ||
893 | goto err_unlock; | ||
894 | ret = -EPERM; | ||
895 | if ((current->euid != p->euid) && (current->euid != p->uid) && | ||
896 | !capable(CAP_SYS_PTRACE)) | ||
897 | goto err_unlock; | ||
898 | head = p->robust_list; | ||
899 | read_unlock(&tasklist_lock); | ||
900 | } | ||
901 | |||
902 | if (put_user(sizeof(*head), len_ptr)) | ||
903 | return -EFAULT; | ||
904 | return put_user(head, head_ptr); | ||
905 | |||
906 | err_unlock: | ||
907 | read_unlock(&tasklist_lock); | ||
908 | |||
909 | return ret; | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Process a futex-list entry, check whether it's owned by the | ||
914 | * dying task, and do notification if so: | ||
915 | */ | ||
916 | int handle_futex_death(unsigned int *uaddr, struct task_struct *curr) | ||
917 | { | ||
918 | unsigned int futex_val; | ||
919 | |||
920 | repeat: | ||
921 | if (get_user(futex_val, uaddr)) | ||
922 | return -1; | ||
923 | |||
924 | if ((futex_val & FUTEX_TID_MASK) == curr->pid) { | ||
925 | /* | ||
926 | * Ok, this dying thread is truly holding a futex | ||
927 | * of interest. Set the OWNER_DIED bit atomically | ||
928 | * via cmpxchg, and if the value had FUTEX_WAITERS | ||
929 | * set, wake up a waiter (if any). (We have to do a | ||
930 | * futex_wake() even if OWNER_DIED is already set - | ||
931 | * to handle the rare but possible case of recursive | ||
932 | * thread-death.) The rest of the cleanup is done in | ||
933 | * userspace. | ||
934 | */ | ||
935 | if (futex_atomic_cmpxchg_inuser(uaddr, futex_val, | ||
936 | futex_val | FUTEX_OWNER_DIED) != | ||
937 | futex_val) | ||
938 | goto repeat; | ||
939 | |||
940 | if (futex_val & FUTEX_WAITERS) | ||
941 | futex_wake((unsigned long)uaddr, 1); | ||
942 | } | ||
943 | return 0; | ||
944 | } | ||
945 | |||
946 | /* | ||
947 | * Walk curr->robust_list (very carefully, it's a userspace list!) | ||
948 | * and mark any locks found there dead, and notify any waiters. | ||
949 | * | ||
950 | * We silently return on any sign of list-walking problem. | ||
951 | */ | ||
952 | void exit_robust_list(struct task_struct *curr) | ||
953 | { | ||
954 | struct robust_list_head __user *head = curr->robust_list; | ||
955 | struct robust_list __user *entry, *pending; | ||
956 | unsigned int limit = ROBUST_LIST_LIMIT; | ||
957 | unsigned long futex_offset; | ||
958 | |||
959 | /* | ||
960 | * Fetch the list head (which was registered earlier, via | ||
961 | * sys_set_robust_list()): | ||
962 | */ | ||
963 | if (get_user(entry, &head->list.next)) | ||
964 | return; | ||
965 | /* | ||
966 | * Fetch the relative futex offset: | ||
967 | */ | ||
968 | if (get_user(futex_offset, &head->futex_offset)) | ||
969 | return; | ||
970 | /* | ||
971 | * Fetch any possibly pending lock-add first, and handle it | ||
972 | * if it exists: | ||
973 | */ | ||
974 | if (get_user(pending, &head->list_op_pending)) | ||
975 | return; | ||
976 | if (pending) | ||
977 | handle_futex_death((void *)pending + futex_offset, curr); | ||
978 | |||
979 | while (entry != &head->list) { | ||
980 | /* | ||
981 | * A pending lock might already be on the list, so | ||
982 | * don't process it twice: | |
983 | */ | ||
984 | if (entry != pending) | ||
985 | if (handle_futex_death((void *)entry + futex_offset, | ||
986 | curr)) | ||
987 | return; | ||
988 | |||
989 | /* | ||
990 | * Fetch the next entry in the list: | ||
991 | */ | ||
992 | if (get_user(entry, &entry->next)) | ||
993 | return; | ||
994 | /* | ||
995 | * Avoid excessively long or circular lists: | ||
996 | */ | ||
997 | if (!--limit) | ||
998 | break; | ||
999 | |||
1000 | cond_resched(); | ||
1001 | } | ||
1002 | } | ||
1003 | |||
832 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1004 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, |
833 | unsigned long uaddr2, int val2, int val3) | 1005 | unsigned long uaddr2, int val2, int val3) |
834 | { | 1006 | { |