author     Ingo Molnar <mingo@elte.hu>              2006-03-27 04:16:22 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>    2006-03-27 11:44:49 -0500
commit     0771dfefc9e538f077d0b43b6dec19a5a67d0e70 (patch)
tree       696267e69228b7406b337f9651dedc75055a589e
parent     e9056f13bfcdd054a0c3d730e4e096748d8a363a (diff)
[PATCH] lightweight robust futexes: core
Add the core infrastructure for robust futexes: structure definitions, the new
syscalls, and the do_exit()-based cleanup mechanism.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@infradead.org>
Acked-by: Ulrich Drepper <drepper@redhat.com>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
 include/linux/futex.h   |  95 +
 include/linux/sched.h   |   3 +
 include/linux/threads.h |   3 +-
 kernel/exit.c           |   3 +
 kernel/futex.c          | 172 +
 kernel/sys_ni.c         |   4 +
 6 files changed, 279 insertions(+), 1 deletion(-)
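
For orientation before the diff: a minimal sketch of how user-space would
register its per-thread list head with the new syscall. This is not part of
the patch - the syscall-number fallback and the syscall(2) invocation are
assumptions (the per-arch wiring lands in follow-up patches); only
struct robust_list_head and the registration semantics come from this commit.

#include <linux/futex.h>	/* struct robust_list_head */
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_set_robust_list
#define SYS_set_robust_list 273	/* ASSUMPTION: x86-64 number, from later wiring */
#endif

static __thread struct robust_list_head robust_head;

/* Called once per thread, at thread start: */
static long register_robust_list(void)
{
	robust_head.list.next = &robust_head.list;	/* empty list points to itself */
	robust_head.futex_offset = 0;	/* set per the lock layout, see below */
	robust_head.list_op_pending = NULL;

	/* The kernel accepts exactly sizeof(*head) for now: */
	return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}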
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 10f96c31971e..20face6b798d 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
+#include <linux/sched.h>
+
 /* Second argument to futex syscall */
 
 
@@ -11,10 +13,103 @@
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ */
+
+/*
+ * Per-lock list entry - embedded in user-space locks, somewhere close
+ * to the futex field. (Note: user-space uses a double-linked list to
+ * achieve O(1) list add and remove, but the kernel only needs to know
+ * about the forward link)
+ *
+ * NOTE: this structure is part of the syscall ABI, and must not be
+ * changed.
+ */
+struct robust_list {
+	struct robust_list __user *next;
+};
+
+/*
+ * Per-thread list head:
+ *
+ * NOTE: this structure is part of the syscall ABI, and must only be
+ * changed if the change is first communicated with the glibc folks.
+ * (When an incompatible change is done, we'll increase the structure
+ * size, which glibc will detect)
+ */
+struct robust_list_head {
+	/*
+	 * The head of the list. Points back to itself if empty:
+	 */
+	struct robust_list list;
+
+	/*
+	 * This relative offset is set by user-space, it gives the kernel
+	 * the relative position of the futex field to examine. This way
+	 * we keep userspace flexible, to freely shape its data-structure,
+	 * without hardcoding any particular offset into the kernel:
+	 */
+	long futex_offset;
+
+	/*
+	 * The death of the thread may race with userspace setting
+	 * up a lock's links. So to handle this race, userspace first
+	 * sets this field to the address of the to-be-taken lock,
+	 * then does the lock acquire, and then adds itself to the
+	 * list, and then clears this field. Hence the kernel will
+	 * always have full knowledge of all locks that the thread
+	 * _might_ have taken. We check the owner TID in any case,
+	 * so only truly owned locks will be handled.
+	 */
+	struct robust_list __user *list_op_pending;
+};
+
+/*
+ * Are there any waiters for this robust futex:
+ */
+#define FUTEX_WAITERS		0x80000000
+
+/*
+ * The kernel signals via this bit that a thread holding a futex
+ * has exited without unlocking the futex. The kernel also does
+ * a FUTEX_WAKE on such futexes, after setting the bit, to wake
+ * up any possible waiters:
+ */
+#define FUTEX_OWNER_DIED	0x40000000
+
+/*
+ * Reserved bit:
+ */
+#define FUTEX_OWNER_PENDING	0x20000000
+
+/*
+ * The rest of the robust-futex field is for the TID:
+ */
+#define FUTEX_TID_MASK		0x1fffffff
+
+/*
+ * A limit of one million locks held per thread (!) ought to be enough
+ * for some time. This also protects against a deliberately circular
+ * list. Not worth introducing an rlimit for this:
+ */
+#define ROBUST_LIST_LIMIT	1048576
+
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
 
+extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
+
+#ifdef CONFIG_FUTEX
+extern void exit_robust_list(struct task_struct *curr);
+#else
+static inline void exit_robust_list(struct task_struct *curr)
+{
+}
+#endif
+
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
 #define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
 #define FUTEX_OP_OR		2	/* *(int *)UADDR2 |= OPARG; */
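
The list_op_pending ordering described in the header above is worth seeing
end-to-end. Below is a hedged user-space sketch of the acquire path under
those rules; struct robust_lock, robust_lock_acquire() and the __sync
atomics are illustrative assumptions (glibc's real implementation differs),
only the four-step ordering and the futex_offset contract come from the
header.

struct robust_lock {
	struct robust_list list;	/* link in the per-thread robust list */
	unsigned int futex;		/* futex word: owner TID plus flag bits */
};

/* Registered head; robust_head.futex_offset must equal
 * offsetof(struct robust_lock, futex) - offsetof(struct robust_lock, list),
 * so the kernel can find the futex word from each list entry. */
extern __thread struct robust_list_head robust_head;

static void robust_lock_acquire(struct robust_lock *lock, unsigned int tid)
{
	/* 1. Announce the to-be-taken lock, so the kernel can still
	 *    clean up if this thread dies between steps 2 and 3: */
	robust_head.list_op_pending = &lock->list;

	/* 2. Acquire: install our TID in the futex word. (Placeholder
	 *    spin; a real slow path uses FUTEX_WAIT and FUTEX_WAITERS.) */
	while (!__sync_bool_compare_and_swap(&lock->futex, 0, tid))
		;

	/* 3. Only now link the held lock into the robust list: */
	lock->list.next = robust_head.list.next;
	robust_head.list.next = &lock->list;

	/* 4. Clear the pending marker: */
	robust_head.list_op_pending = NULL;
}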
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 036d14d2bf90..fd4848f2d750 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@
 #include <linux/topology.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/futex.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
@@ -872,6 +873,8 @@ struct task_struct {
 	int cpuset_mems_generation;
 	int cpuset_mem_spread_rotor;
 #endif
+	struct robust_list_head __user *robust_list;
+
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
 };
diff --git a/include/linux/threads.h b/include/linux/threads.h
index b59738ac6197..e646bcdf2614 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -28,7 +28,8 @@
 #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
 
 /*
- * A maximum of 4 million PIDs should be enough for a while:
+ * A maximum of 4 million PIDs should be enough for a while.
+ * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
 */
 #define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
 	(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e136e..aecb48ca7370 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
 #include <linux/signal.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
+#include <linux/futex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -852,6 +853,8 @@ fastcall NORET_TYPE void do_exit(long code)
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
+	if (unlikely(tsk->robust_list))
+		exit_robust_list(tsk);
 	exit_mm(tsk);
 
 	exit_sem(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f978032..feb724b2554e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
  * Removed page pinning, fix privately mapped COW pages and other cleanups
  * (C) Copyright 2003, 2004 Jamie Lokier
  *
+ * Robust futex support started by Ingo Molnar
+ * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
+ * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,174 @@ error:
 	goto out;
 }
 
+/*
+ * Support for robust futexes: the kernel cleans up held futexes at
+ * thread exit time.
+ *
+ * Implementation: user-space maintains a per-thread list of locks it
+ * is holding. Upon do_exit(), the kernel carefully walks this list,
+ * and marks all locks that are owned by this thread with the
+ * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
+ * always manipulated with the lock held, so the list is private and
+ * per-thread. Userspace also maintains a per-thread 'list_op_pending'
+ * field, to allow the kernel to clean up if the thread dies after
+ * acquiring the lock, but just before it could have added itself to
+ * the list. There can only be one such pending lock.
+ */
+
+/**
+ * sys_set_robust_list - set the robust-futex list head of a task
+ * @head: pointer to the list-head
+ * @len: length of the list-head, as userspace expects
+ */
+asmlinkage long
+sys_set_robust_list(struct robust_list_head __user *head,
+		    size_t len)
+{
+	/*
+	 * The kernel knows only one size for now:
+	 */
+	if (unlikely(len != sizeof(*head)))
+		return -EINVAL;
+
+	current->robust_list = head;
+
+	return 0;
+}
+
+/**
+ * sys_get_robust_list - get the robust-futex list head of a task
+ * @pid: pid of the process [zero for current task]
+ * @head_ptr: pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr: pointer to a length field, the kernel fills in the header size
+ */
+asmlinkage long
+sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+		    size_t __user *len_ptr)
+{
+	struct robust_list_head *head;
+	unsigned long ret;
+
+	if (!pid)
+		head = current->robust_list;
+	else {
+		struct task_struct *p;
+
+		ret = -ESRCH;
+		read_lock(&tasklist_lock);
+		p = find_task_by_pid(pid);
+		if (!p)
+			goto err_unlock;
+		ret = -EPERM;
+		if ((current->euid != p->euid) && (current->euid != p->uid) &&
+		    !capable(CAP_SYS_PTRACE))
+			goto err_unlock;
+		head = p->robust_list;
+		read_unlock(&tasklist_lock);
+	}
+
+	if (put_user(sizeof(*head), len_ptr))
+		return -EFAULT;
+	return put_user(head, head_ptr);
+
+err_unlock:
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * Process a futex-list entry, check whether it's owned by the
+ * dying task, and do notification if so:
+ */
+int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
+{
+	unsigned int futex_val;
+
+repeat:
+	if (get_user(futex_val, uaddr))
+		return -1;
+
+	if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
+		/*
+		 * Ok, this dying thread is truly holding a futex
+		 * of interest. Set the OWNER_DIED bit atomically
+		 * via cmpxchg, and if the value had FUTEX_WAITERS
+		 * set, wake up a waiter (if any). (We have to do a
+		 * futex_wake() even if OWNER_DIED is already set -
+		 * to handle the rare but possible case of recursive
+		 * thread-death.) The rest of the cleanup is done in
+		 * userspace.
+		 */
+		if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
+						futex_val | FUTEX_OWNER_DIED) !=
+						futex_val)
+			goto repeat;
+
+		if (futex_val & FUTEX_WAITERS)
+			futex_wake((unsigned long)uaddr, 1);
+	}
+	return 0;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void exit_robust_list(struct task_struct *curr)
+{
+	struct robust_list_head __user *head = curr->robust_list;
+	struct robust_list __user *entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned long futex_offset;
+
+	/*
+	 * Fetch the list head (which was registered earlier, via
+	 * sys_set_robust_list()):
+	 */
+	if (get_user(entry, &head->list.next))
+		return;
+	/*
+	 * Fetch the relative futex offset:
+	 */
+	if (get_user(futex_offset, &head->futex_offset))
+		return;
+	/*
+	 * Fetch any possibly pending lock-add first, and handle it
+	 * if it exists:
+	 */
+	if (get_user(pending, &head->list_op_pending))
+		return;
+	if (pending)
+		handle_futex_death((void *)pending + futex_offset, curr);
+
+	while (entry != &head->list) {
+		/*
+		 * A pending lock might already be on the list, so
+		 * don't process it twice:
+		 */
+		if (entry != pending)
+			if (handle_futex_death((void *)entry + futex_offset,
+					       curr))
+				return;
+
+		/*
+		 * Fetch the next entry in the list:
+		 */
+		if (get_user(entry, &entry->next))
+			return;
+		/*
+		 * Avoid excessively long or circular lists:
+		 */
+		if (!--limit)
+			break;
+
+		cond_resched();
+	}
+}
+
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
 	      unsigned long uaddr2, int val2, int val3)
 {
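
handle_futex_death() deliberately leaves "the rest of the cleanup" to
user-space: the woken waiter finds FUTEX_OWNER_DIED set in the futex word
and must treat the protected data as suspect. A hedged sketch of that
waiter-side claim - the helper name and the __sync atomic are assumptions,
only the FUTEX_OWNER_DIED / FUTEX_TID_MASK semantics come from the patch:

/* Returns 1 if we claimed a dead owner's lock (data may be inconsistent),
 * 0 if the normal acquisition/retry path applies. */
static int robust_lock_claim_dead(unsigned int *futex, unsigned int tid)
{
	unsigned int val = *futex;

	if (!(val & FUTEX_OWNER_DIED))
		return 0;

	/* Replace the dead owner's TID with ours, preserving the flag
	 * bits so the caller still sees OWNER_DIED and can recover: */
	if (__sync_bool_compare_and_swap(futex, val,
					 (val & ~FUTEX_TID_MASK) | tid))
		return 1;

	return 0;	/* lost a race with another waiter; caller retries */
}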
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6b1..d82864c4a617 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
 cond_syscall(sys_socketcall);
 cond_syscall(sys_futex);
 cond_syscall(compat_sys_futex);
+cond_syscall(sys_set_robust_list);
+cond_syscall(compat_sys_set_robust_list);
+cond_syscall(sys_get_robust_list);
+cond_syscall(compat_sys_get_robust_list);
 cond_syscall(sys_epoll_create);
 cond_syscall(sys_epoll_ctl);
 cond_syscall(sys_epoll_wait);
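
For completeness: cond_syscall() is what makes the four new entry points
optional. With CONFIG_FUTEX=n (or no compat layer), calls resolve to
sys_ni_syscall() and return -ENOSYS instead of failing the link. A sketch
of the era's weak-symbol mechanism - the exact macro body is an assumption
recalled from include/linux/linkage.h of this period:

/* Any cond_syscall()'d symbol that is not otherwise defined becomes a
 * weak alias of the "not implemented" stub: */
asmlinkage long sys_ni_syscall(void)
{
	return -ENOSYS;
}

#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")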