aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/futex.h95
-rw-r--r--include/linux/sched.h3
-rw-r--r--include/linux/threads.h3
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/futex.c172
-rw-r--r--kernel/sys_ni.c4
6 files changed, 279 insertions, 1 deletions
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 10f96c31971e..20face6b798d 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -1,6 +1,8 @@
1#ifndef _LINUX_FUTEX_H 1#ifndef _LINUX_FUTEX_H
2#define _LINUX_FUTEX_H 2#define _LINUX_FUTEX_H
3 3
4#include <linux/sched.h>
5
4/* Second argument to futex syscall */ 6/* Second argument to futex syscall */
5 7
6 8
@@ -11,10 +13,103 @@
11#define FUTEX_CMP_REQUEUE 4 13#define FUTEX_CMP_REQUEUE 4
12#define FUTEX_WAKE_OP 5 14#define FUTEX_WAKE_OP 5
13 15
16/*
17 * Support for robust futexes: the kernel cleans up held futexes at
18 * thread exit time.
19 */
20
21/*
22 * Per-lock list entry - embedded in user-space locks, somewhere close
23 * to the futex field. (Note: user-space uses a double-linked list to
24 * achieve O(1) list add and remove, but the kernel only needs to know
25 * about the forward link)
26 *
27 * NOTE: this structure is part of the syscall ABI, and must not be
28 * changed.
29 */
30struct robust_list {
31 struct robust_list __user *next;
32};
33
34/*
35 * Per-thread list head:
36 *
37 * NOTE: this structure is part of the syscall ABI, and must only be
38 * changed if the change is first communicated with the glibc folks.
39 * (When an incompatible change is done, we'll increase the structure
40 * size, which glibc will detect)
41 */
42struct robust_list_head {
43 /*
44 * The head of the list. Points back to itself if empty:
45 */
46 struct robust_list list;
47
48 /*
49 * This relative offset is set by user-space, it gives the kernel
50 * the relative position of the futex field to examine. This way
51 * we keep userspace flexible, to freely shape its data-structure,
52 * without hardcoding any particular offset into the kernel:
53 */
54 long futex_offset;
55
56 /*
57 * The death of the thread may race with userspace setting
58 * up a lock's links. So to handle this race, userspace first
59 * sets this field to the address of the to-be-taken lock,
60 * then does the lock acquire, and then adds itself to the
61 * list, and then clears this field. Hence the kernel will
62 * always have full knowledge of all locks that the thread
63 * _might_ have taken. We check the owner TID in any case,
64 * so only truly owned locks will be handled.
65 */
66 struct robust_list __user *list_op_pending;
67};
68
69/*
70 * Are there any waiters for this robust futex:
71 */
72#define FUTEX_WAITERS 0x80000000
73
74/*
75 * The kernel signals via this bit that a thread holding a futex
76 * has exited without unlocking the futex. The kernel also does
77 * a FUTEX_WAKE on such futexes, after setting the bit, to wake
78 * up any possible waiters:
79 */
80#define FUTEX_OWNER_DIED 0x40000000
81
82/*
83 * Reserved bit:
84 */
85#define FUTEX_OWNER_PENDING 0x20000000
86
87/*
88 * The rest of the robust-futex field is for the TID:
89 */
90#define FUTEX_TID_MASK 0x1fffffff
91
92/*
93 * A limit of one million locks held per thread (!) ought to be enough
94 * for some time. This also protects against a deliberately circular
95 * list. Not worth introducing an rlimit for this:
96 */
97#define ROBUST_LIST_LIMIT 1048576
98
14long do_futex(unsigned long uaddr, int op, int val, 99long do_futex(unsigned long uaddr, int op, int val,
15 unsigned long timeout, unsigned long uaddr2, int val2, 100 unsigned long timeout, unsigned long uaddr2, int val2,
16 int val3); 101 int val3);
17 102
103extern int handle_futex_death(unsigned int *uaddr, struct task_struct *curr);
104
105#ifdef CONFIG_FUTEX
106extern void exit_robust_list(struct task_struct *curr);
107#else
108static inline void exit_robust_list(struct task_struct *curr)
109{
110}
111#endif
112
18#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ 113#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */
19#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ 114#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */
20#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ 115#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 036d14d2bf90..fd4848f2d750 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@
35#include <linux/topology.h> 35#include <linux/topology.h>
36#include <linux/seccomp.h> 36#include <linux/seccomp.h>
37#include <linux/rcupdate.h> 37#include <linux/rcupdate.h>
38#include <linux/futex.h>
38 39
39#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ 40#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
40 41
@@ -872,6 +873,8 @@ struct task_struct {
872 int cpuset_mems_generation; 873 int cpuset_mems_generation;
873 int cpuset_mem_spread_rotor; 874 int cpuset_mem_spread_rotor;
874#endif 875#endif
876 struct robust_list_head __user *robust_list;
877
875 atomic_t fs_excl; /* holding fs exclusive resources */ 878 atomic_t fs_excl; /* holding fs exclusive resources */
876 struct rcu_head rcu; 879 struct rcu_head rcu;
877}; 880};
diff --git a/include/linux/threads.h b/include/linux/threads.h
index b59738ac6197..e646bcdf2614 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -28,7 +28,8 @@
28#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000) 28#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
29 29
30/* 30/*
31 * A maximum of 4 million PIDs should be enough for a while: 31 * A maximum of 4 million PIDs should be enough for a while.
32 * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
32 */ 33 */
33#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \ 34#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
34 (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT)) 35 (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
diff --git a/kernel/exit.c b/kernel/exit.c
index 8037405e136e..aecb48ca7370 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -31,6 +31,7 @@
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/cn_proc.h> 32#include <linux/cn_proc.h>
33#include <linux/mutex.h> 33#include <linux/mutex.h>
34#include <linux/futex.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/unistd.h> 37#include <asm/unistd.h>
@@ -852,6 +853,8 @@ fastcall NORET_TYPE void do_exit(long code)
852 exit_itimers(tsk->signal); 853 exit_itimers(tsk->signal);
853 acct_process(code); 854 acct_process(code);
854 } 855 }
856 if (unlikely(tsk->robust_list))
857 exit_robust_list(tsk);
855 exit_mm(tsk); 858 exit_mm(tsk);
856 859
857 exit_sem(tsk); 860 exit_sem(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 5efa2f978032..feb724b2554e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -8,6 +8,10 @@
8 * Removed page pinning, fix privately mapped COW pages and other cleanups 8 * Removed page pinning, fix privately mapped COW pages and other cleanups
9 * (C) Copyright 2003, 2004 Jamie Lokier 9 * (C) Copyright 2003, 2004 Jamie Lokier
10 * 10 *
11 * Robust futex support started by Ingo Molnar
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 *
11 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
12 * enough at me, Linus for the original (flawed) idea, Matthew 16 * enough at me, Linus for the original (flawed) idea, Matthew
13 * Kirkwood for proof-of-concept implementation. 17 * Kirkwood for proof-of-concept implementation.
@@ -829,6 +833,174 @@ error:
829 goto out; 833 goto out;
830} 834}
831 835
836/*
837 * Support for robust futexes: the kernel cleans up held futexes at
838 * thread exit time.
839 *
840 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after
847 * acquiring the lock, but just before it could have added itself to
848 * the list. There can only be one such pending lock.
849 */
850
851/**
852 * sys_set_robust_list - set the robust-futex list head of a task
853 * @head: pointer to the list-head
854 * @len: length of the list-head, as userspace expects
855 */
856asmlinkage long
857sys_set_robust_list(struct robust_list_head __user *head,
858 size_t len)
859{
860 /*
861 * The kernel knows only one size for now:
862 */
863 if (unlikely(len != sizeof(*head)))
864 return -EINVAL;
865
866 current->robust_list = head;
867
868 return 0;
869}
870
871/**
872 * sys_get_robust_list - get the robust-futex list head of a task
873 * @pid: pid of the process [zero for current task]
874 * @head_ptr: pointer to a list-head pointer, the kernel fills it in
875 * @len_ptr: pointer to a length field, the kernel fills in the header size
876 */
877asmlinkage long
878sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
879 size_t __user *len_ptr)
880{
881 struct robust_list_head *head;
882 unsigned long ret;
883
884 if (!pid)
885 head = current->robust_list;
886 else {
887 struct task_struct *p;
888
889 ret = -ESRCH;
890 read_lock(&tasklist_lock);
891 p = find_task_by_pid(pid);
892 if (!p)
893 goto err_unlock;
894 ret = -EPERM;
895 if ((current->euid != p->euid) && (current->euid != p->uid) &&
896 !capable(CAP_SYS_PTRACE))
897 goto err_unlock;
898 head = p->robust_list;
899 read_unlock(&tasklist_lock);
900 }
901
902 if (put_user(sizeof(*head), len_ptr))
903 return -EFAULT;
904 return put_user(head, head_ptr);
905
906err_unlock:
907 read_unlock(&tasklist_lock);
908
909 return ret;
910}
911
912/*
913 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so:
915 */
916int handle_futex_death(unsigned int *uaddr, struct task_struct *curr)
917{
918 unsigned int futex_val;
919
920repeat:
921 if (get_user(futex_val, uaddr))
922 return -1;
923
924 if ((futex_val & FUTEX_TID_MASK) == curr->pid) {
925 /*
926 * Ok, this dying thread is truly holding a futex
927 * of interest. Set the OWNER_DIED bit atomically
928 * via cmpxchg, and if the value had FUTEX_WAITERS
929 * set, wake up a waiter (if any). (We have to do a
930 * futex_wake() even if OWNER_DIED is already set -
931 * to handle the rare but possible case of recursive
932 * thread-death.) The rest of the cleanup is done in
933 * userspace.
934 */
935 if (futex_atomic_cmpxchg_inuser(uaddr, futex_val,
936 futex_val | FUTEX_OWNER_DIED) !=
937 futex_val)
938 goto repeat;
939
940 if (futex_val & FUTEX_WAITERS)
941 futex_wake((unsigned long)uaddr, 1);
942 }
943 return 0;
944}
945
946/*
947 * Walk curr->robust_list (very carefully, it's a userspace list!)
948 * and mark any locks found there dead, and notify any waiters.
949 *
950 * We silently return on any sign of list-walking problem.
951 */
952void exit_robust_list(struct task_struct *curr)
953{
954 struct robust_list_head __user *head = curr->robust_list;
955 struct robust_list __user *entry, *pending;
956 unsigned int limit = ROBUST_LIST_LIMIT;
957 unsigned long futex_offset;
958
959 /*
960 * Fetch the list head (which was registered earlier, via
961 * sys_set_robust_list()):
962 */
963 if (get_user(entry, &head->list.next))
964 return;
965 /*
966 * Fetch the relative futex offset:
967 */
968 if (get_user(futex_offset, &head->futex_offset))
969 return;
970 /*
971 * Fetch any possibly pending lock-add first, and handle it
972 * if it exists:
973 */
974 if (get_user(pending, &head->list_op_pending))
975 return;
976 if (pending)
977 handle_futex_death((void *)pending + futex_offset, curr);
978
979 while (entry != &head->list) {
980 /*
981 * A pending lock might already be on the list, so
982 * dont process it twice:
983 */
984 if (entry != pending)
985 if (handle_futex_death((void *)entry + futex_offset,
986 curr))
987 return;
988
989 /*
990 * Fetch the next entry in the list:
991 */
992 if (get_user(entry, &entry->next))
993 return;
994 /*
995 * Avoid excessively long or circular lists:
996 */
997 if (!--limit)
998 break;
999
1000 cond_resched();
1001 }
1002}
1003
832long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1004long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
833 unsigned long uaddr2, int val2, int val3) 1005 unsigned long uaddr2, int val2, int val3)
834{ 1006{
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1067090db6b1..d82864c4a617 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -42,6 +42,10 @@ cond_syscall(sys_recvmsg);
42cond_syscall(sys_socketcall); 42cond_syscall(sys_socketcall);
43cond_syscall(sys_futex); 43cond_syscall(sys_futex);
44cond_syscall(compat_sys_futex); 44cond_syscall(compat_sys_futex);
45cond_syscall(sys_set_robust_list);
46cond_syscall(compat_sys_set_robust_list);
47cond_syscall(sys_get_robust_list);
48cond_syscall(compat_sys_get_robust_list);
45cond_syscall(sys_epoll_create); 49cond_syscall(sys_epoll_create);
46cond_syscall(sys_epoll_ctl); 50cond_syscall(sys_epoll_ctl);
47cond_syscall(sys_epoll_wait); 51cond_syscall(sys_epoll_wait);