From a57de0b4336e48db2811a2030bb68dba8dd09d88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Jul 2009 12:09:13 +0000 Subject: net: adding memory barrier to the poll and receive callbacks Adding memory barrier after the poll_wait function, paired with receive callbacks. Adding fuctions sock_poll_wait and sk_has_sleeper to wrap the memory barrier. Without the memory barrier, following race can happen. The race fires, when following code paths meet, and the tp->rcv_nxt and __add_wait_queue updates stay in CPU caches. CPU1 CPU2 sys_select receive packet ... ... __add_wait_queue update tp->rcv_nxt ... ... tp->rcv_nxt check sock_def_readable ... { schedule ... if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep) ... } If there was no cache the code would work ok, since the wait_queue and rcv_nxt are opposit to each other. Meaning that once tp->rcv_nxt is updated by CPU2, the CPU1 either already passed the tp->rcv_nxt check and sleeps, or will get the new value for tp->rcv_nxt and will return with new data mask. In both cases the process (CPU1) is being added to the wait queue, so the waitqueue_active (CPU2) call cannot miss and will wake up CPU1. The bad case is when the __add_wait_queue changes done by CPU1 stay in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 will then endup calling schedule and sleep forever if there are no more data on the socket. Calls to poll_wait in following modules were ommited: net/bluetooth/af_bluetooth.c net/irda/af_irda.c net/irda/irnet/irnet_ppp.c net/mac80211/rc80211_pid_debugfs.c net/phonet/socket.c net/rds/af_rds.c net/rfkill/core.c net/sunrpc/cache.c net/sunrpc/rpc_pipe.c net/tipc/socket.c Signed-off-by: Jiri Olsa Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 36d4e44d6233..fc3ebb906911 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -315,7 +315,7 @@ static void unix_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } @@ -1985,7 +1985,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table struct sock *sk = sock->sk; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ @@ -2022,7 +2022,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int mask, writable; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ @@ -2053,7 +2053,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer_get(sk); if (other) { if (unix_peer(other) != sk) { - poll_wait(file, &unix_sk(other)->peer_wait, + sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); if (unix_recvq_full(other)) writable = 0; -- cgit v1.2.2 From 8ba69ba6a324b13e1190fc31e41954d190fd4f1d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 11 Sep 2009 11:31:45 -0700 Subject: net: unix: fix sending fds in multiple buffers Kalle Olavi Niemitalo reported that: "..., when one process calls sendmsg once to send 43804 bytes of data and one file descriptor, and another process then calls recvmsg three times to receive the 16032+16032+11740 bytes, each of those recvmsg calls returns the file descriptor in the ancillary data. I confirmed this with strace. The behaviour differs from Linux 2.6.26, where reportedly only one of those recvmsg calls (I think the first one) returned the file descriptor." This bug was introduced by a patch from me titled "net: unix: fix inflight counting bug in garbage collector", commit 6209344f5. And the reason is, quoting Kalle: "Before your patch, unix_attach_fds() would set scm->fp = NULL, so that if the loop in unix_stream_sendmsg() ran multiple iterations, it could not call unix_attach_fds() again. But now, unix_attach_fds() leaves scm->fp unchanged, and I think this causes it to be called multiple times and duplicate the same file descriptors to each struct sk_buff." Fix this by introducing a flag that is cleared at the start and set when the fds attached to the first buffer. The resulting code should work equivalently to the one on 2.6.26. Reported-by: Kalle Olavi Niemitalo Signed-off-by: Miklos Szeredi Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fc3ebb906911..51ab497115eb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1501,6 +1501,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; int sent = 0; struct scm_cookie tmp_scm; + bool fds_sent = false; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1562,12 +1563,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, size = min_t(int, size, skb_tailroom(skb)); memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) { + /* Only send the fds in the first buffer */ + if (siocb->scm->fp && !fds_sent) { err = unix_attach_fds(siocb->scm, skb); if (err) { kfree_skb(skb); goto out_err; } + fds_sent = true; } err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); -- cgit v1.2.2 From ec1b4cf74c81bfd0fbe5bf62bafc86c45917e72f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Oct 2009 05:58:39 +0000 Subject: net: mark net_proto_ops as const All usages of structure net_proto_ops should be declared const. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 51ab497115eb..0f133c5a8d3c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2214,7 +2214,7 @@ static const struct file_operations unix_seq_fops = { #endif -static struct net_proto_family unix_family_ops = { +static const struct net_proto_family unix_family_ops = { .family = PF_UNIX, .create = unix_create, .owner = THIS_MODULE, -- cgit v1.2.2 From 77238f2b942b38ab4e7f3aced44084493e4a8675 Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Sun, 18 Oct 2009 23:17:37 -0700 Subject: AF_UNIX: Fix deadlock on connecting to shutdown socket I found a deadlock bug in UNIX domain socket, which makes able to DoS attack against the local machine by non-root users. How to reproduce: 1. Make a listening AF_UNIX/SOCK_STREAM socket with an abstruct namespace(*), and shutdown(2) it. 2. Repeat connect(2)ing to the listening socket from the other sockets until the connection backlog is full-filled. 3. connect(2) takes the CPU forever. If every core is taken, the system hangs. PoC code: (Run as many times as cores on SMP machines.) int main(void) { int ret; int csd; int lsd; struct sockaddr_un sun; /* make an abstruct name address (*) */ memset(&sun, 0, sizeof(sun)); sun.sun_family = PF_UNIX; sprintf(&sun.sun_path[1], "%d", getpid()); /* create the listening socket and shutdown */ lsd = socket(AF_UNIX, SOCK_STREAM, 0); bind(lsd, (struct sockaddr *)&sun, sizeof(sun)); listen(lsd, 1); shutdown(lsd, SHUT_RDWR); /* connect loop */ alarm(15); /* forcely exit the loop after 15 sec */ for (;;) { csd = socket(AF_UNIX, SOCK_STREAM, 0); ret = connect(csd, (struct sockaddr *)&sun, sizeof(sun)); if (-1 == ret) { perror("connect()"); break; } puts("Connection OK"); } return 0; } (*) Make sun_path[0] = 0 to use the abstruct namespace. If a file-based socket is used, the system doesn't deadlock because of context switches in the file system layer. Why this happens: Error checks between unix_socket_connect() and unix_wait_for_peer() are inconsistent. The former calls the latter to wait until the backlog is processed. Despite the latter returns without doing anything when the socket is shutdown, the former doesn't check the shutdown state and just retries calling the latter forever. Patch: The patch below adds shutdown check into unix_socket_connect(), so connect(2) to the shutdown socket will return -ECONREFUSED. Signed-off-by: Tomoki Sekiyama Signed-off-by: Masanori Yoshida Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 51ab497115eb..fc820cd75453 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1074,6 +1074,8 @@ restart: err = -ECONNREFUSED; if (other->sk_state != TCP_LISTEN) goto out_unlock; + if (other->sk_shutdown & RCV_SHUTDOWN) + goto out_unlock; if (unix_recvq_full(other)) { err = -EAGAIN; -- cgit v1.2.2 From 3f378b684453f2a028eda463ce383370545d9cc9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 5 Nov 2009 22:18:14 -0800 Subject: net: pass kern to net_proto_family create function The generic __sock_create function has a kern argument which allows the security system to make decisions based on if a socket is being created by the kernel or by userspace. This patch passes that flag to the net_proto_family specific create function, so it can do the same thing. Signed-off-by: Eric Paris Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3291902f0b88..178d3af2a605 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -621,7 +621,8 @@ out: return sk; } -static int unix_create(struct net *net, struct socket *sock, int protocol) +static int unix_create(struct net *net, struct socket *sock, int protocol, + int kern) { if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; -- cgit v1.2.2 From 13cfa97bef0f1172879f98307ac716acf3e9cea9 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 8 Nov 2009 05:51:19 +0000 Subject: net: netlink_getname, packet_getname -- use DECLARE_SOCKADDR guard Use guard DECLARE_SOCKADDR in a few more places which allow us to catch if the structure copied back is too big. Signed-off-by: Cyrill Gorcunov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 178d3af2a605..7553ea6edd8f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1259,7 +1259,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ { struct sock *sk = sock->sk; struct unix_sock *u; - struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; if (peer) { -- cgit v1.2.2 From f64f9e719261a87818dd192a3a2352e5b20fbd0f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 29 Nov 2009 16:55:45 -0800 Subject: net: Move && and || to end of previous line Not including net/atm/ Compiled tested x86 allyesconfig only Added a > 80 column line or two, which I ignored. Existing checkpatch plaints willfully, cheerfully ignored. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/unix/af_unix.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7553ea6edd8f..f25511903115 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1033,8 +1033,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; addr_len = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) - && !u->addr && (err = unix_autobind(sock)) != 0) + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && + (err = unix_autobind(sock)) != 0) goto out; timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); @@ -1378,8 +1378,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) - && !u->addr && (err = unix_autobind(sock)) != 0) + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr + && (err = unix_autobind(sock)) != 0) goto out; err = -EMSGSIZE; -- cgit v1.2.2 From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 17 Jan 2010 03:35:32 +0000 Subject: net: spread __net_init, __net_exit __net_init/__net_exit are apparently not going away, so use them to full extent. In some cases __net_init was removed, because it was called from __net_exit code. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f25511903115..9bc9b92bc099 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2224,7 +2224,7 @@ static const struct net_proto_family unix_family_ops = { }; -static int unix_net_init(struct net *net) +static int __net_init unix_net_init(struct net *net) { int error = -ENOMEM; @@ -2243,7 +2243,7 @@ out: return error; } -static void unix_net_exit(struct net *net) +static void __net_exit unix_net_exit(struct net *net) { unix_sysctl_unregister(net); proc_net_remove(net, "unix"); -- cgit v1.2.2 From 663717f65c075eb4c6da7a123041295bd5295cc0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 18 Feb 2010 14:12:06 -0800 Subject: AF_UNIX: update locking comment The lock used in unix_state_lock() is a spin_lock not reader-writer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9bc9b92bc099..3d9122e78f41 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -144,7 +144,7 @@ static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) /* * SMP locking strategy: * hash table is protected with spinlock unix_table_lock - * each socket state is protected by separate rwlock. + * each socket state is protected by separate spin lock. */ static inline unsigned unix_hash_fold(__wsum n) -- cgit v1.2.2 From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3d9122e78f41..87c0360eaa25 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -316,7 +316,7 @@ static void unix_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk_has_sleeper(sk)) - wake_up_interruptible_sync(sk->sk_sleep); + wake_up_interruptible_sync(sk_sleep(sk)); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); @@ -1736,7 +1736,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo) unix_state_lock(sk); for (;;) { - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || @@ -1752,7 +1752,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo) clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); unix_state_unlock(sk); return timeo; } @@ -1991,7 +1991,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table struct sock *sk = sock->sk; unsigned int mask; - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* exceptional events? */ @@ -2028,7 +2028,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int mask, writable; - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* exceptional events? */ -- cgit v1.2.2 From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Apr 2010 11:01:49 +0000 Subject: net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 87c0360eaa25..fef2cc5e9d2b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk) static void unix_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); if (unix_writable(sk)) { - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync(sk_sleep(sk)); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive @@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion) skpair->sk_err = ECONNRESET; unix_state_unlock(skpair); skpair->sk_state_change(skpair); - read_lock(&skpair->sk_callback_lock); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); - read_unlock(&skpair->sk_callback_lock); } sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; @@ -1142,7 +1143,7 @@ restart: newsk->sk_peercred.pid = task_tgid_vnr(current); current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid); newu = unix_sk(newsk); - newsk->sk_sleep = &newu->peer_wait; + newsk->sk_wq = &newu->peer_wq; otheru = unix_sk(other); /* copy address information from listening to new sock*/ @@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - read_lock(&other->sk_callback_lock); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - read_unlock(&other->sk_callback_lock); } if (other) sock_put(other); -- cgit v1.2.2 From 109f6e39fa07c48f580125f531f46cb7c245b528 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Jun 2010 03:30:14 +0000 Subject: af_unix: Allow SO_PEERCRED to work across namespaces. Use struct pid and struct cred to store the peer credentials on struct sock. This gives enough information to convert the peer credential information to a value relative to whatever namespace the socket is in at the time. This removes nasty surprises when using SO_PEERCRED on socket connetions where the processes on either side are in different pid and user namespaces. Signed-off-by: Eric W. Biederman Acked-by: Daniel Lezcano Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fef2cc5e9d2b..e1f1349fae86 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -450,11 +450,31 @@ static int unix_release_sock(struct sock *sk, int embrion) return 0; } +static void init_peercred(struct sock *sk) +{ + put_pid(sk->sk_peer_pid); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + sk->sk_peer_pid = get_pid(task_tgid(current)); + sk->sk_peer_cred = get_current_cred(); +} + +static void copy_peercred(struct sock *sk, struct sock *peersk) +{ + put_pid(sk->sk_peer_pid); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); +} + static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); + struct pid *old_pid = NULL; + const struct cred *old_cred = NULL; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -470,12 +490,14 @@ static int unix_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; /* set credentials so connect can copy them */ - sk->sk_peercred.pid = task_tgid_vnr(current); - current_euid_egid(&sk->sk_peercred.uid, &sk->sk_peercred.gid); + init_peercred(sk); err = 0; out_unlock: unix_state_unlock(sk); + put_pid(old_pid); + if (old_cred) + put_cred(old_cred); out: return err; } @@ -1140,8 +1162,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - newsk->sk_peercred.pid = task_tgid_vnr(current); - current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid); + init_peercred(newsk); newu = unix_sk(newsk); newsk->sk_wq = &newu->peer_wq; otheru = unix_sk(other); @@ -1157,7 +1178,7 @@ restart: } /* Set credentials */ - sk->sk_peercred = other->sk_peercred; + copy_peercred(sk, other); sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; @@ -1199,10 +1220,8 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; - ska->sk_peercred.pid = skb->sk_peercred.pid = task_tgid_vnr(current); - current_euid_egid(&skb->sk_peercred.uid, &skb->sk_peercred.gid); - ska->sk_peercred.uid = skb->sk_peercred.uid; - ska->sk_peercred.gid = skb->sk_peercred.gid; + init_peercred(ska); + init_peercred(skb); if (ska->sk_type != SOCK_DGRAM) { ska->sk_state = TCP_ESTABLISHED; -- cgit v1.2.2 From 7361c36c5224519b258219fe3d0e8abc865d8134 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Jun 2010 03:34:33 +0000 Subject: af_unix: Allow credentials to work across user and pid namespaces. In unix_skb_parms store pointers to struct pid and struct cred instead of raw uid, gid, and pid values, then translate the credentials on reception into values that are meaningful in the receiving processes namespaces. Signed-off-by: Eric W. Biederman Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e1f1349fae86..5fe9d6fe08b8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1316,18 +1316,20 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) int i; scm->fp = UNIXCB(skb).fp; - skb->destructor = sock_wfree; UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) unix_notinflight(scm->fp->fp[i]); } -static void unix_destruct_fds(struct sk_buff *skb) +static void unix_destruct_scm(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - unix_detach_fds(&scm, skb); + scm.pid = UNIXCB(skb).pid; + scm.cred = UNIXCB(skb).cred; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); /* Alas, it calls VFS */ /* So fscking what? fput() had been SMP-safe since the last Summer */ @@ -1350,10 +1352,22 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count-1; i >= 0; i--) unix_inflight(scm->fp->fp[i]); - skb->destructor = unix_destruct_fds; return 0; } +static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) +{ + int err = 0; + UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).cred = get_cred(scm->cred); + UNIXCB(skb).fp = NULL; + if (scm->fp && send_fds) + err = unix_attach_fds(scm, skb); + + skb->destructor = unix_destruct_scm; + return err; +} + /* * Send AF_UNIX data. */ @@ -1410,12 +1424,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (skb == NULL) goto out; - memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) { - err = unix_attach_fds(siocb->scm, skb); - if (err) - goto out_free; - } + err = unix_scm_to_skb(siocb->scm, skb, true); + if (err) + goto out_free; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1585,16 +1596,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, */ size = min_t(int, size, skb_tailroom(skb)); - memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + /* Only send the fds in the first buffer */ - if (siocb->scm->fp && !fds_sent) { - err = unix_attach_fds(siocb->scm, skb); - if (err) { - kfree_skb(skb); - goto out_err; - } - fds_sent = true; + err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); + if (err) { + kfree_skb(skb); + goto out_err; } + fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err) { @@ -1711,7 +1720,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); } - siocb->scm->creds = *UNIXCREDS(skb); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); unix_set_secdata(siocb->scm, skb); if (!(flags & MSG_PEEK)) { @@ -1860,14 +1869,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (check_creds) { /* Never glue messages from different writers */ - if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, - sizeof(siocb->scm->creds)) != 0) { + if ((UNIXCB(skb).pid != siocb->scm->pid) || + (UNIXCB(skb).cred != siocb->scm->cred)) { skb_queue_head(&sk->sk_receive_queue, skb); break; } } else { /* Copy credentials */ - siocb->scm->creds = *UNIXCREDS(skb); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); check_creds = 1; } -- cgit v1.2.2 From 6616f7888c6088324727363276f6de05a1dca6fc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Jun 2010 03:35:48 +0000 Subject: af_unix: Allow connecting to sockets in other network namespaces. Remove the restriction that only allows connecting to a unix domain socket identified by unix path that is in the same network namespace. Crossing network namespaces is always tricky and we did not support this at first, because of a strict policy of don't mix the namespaces. Later after Pavel proposed this we did not support this because no one had performed the audit to make certain using unix domain sockets across namespaces is safe. What fundamentally makes connecting to af_unix sockets in other namespaces is safe is that you have to have the proper permissions on the unix domain socket inode that lives in the filesystem. If you want strict isolation you just don't create inodes where unfriendlys can get at them, or with permissions that allow unfriendlys to open them. All nicely handled for us by the mount namespace and other standard file system facilities. I looked through unix domain sockets and they are a very controlled environment so none of the work that goes on in dev_forward_skb to make crossing namespaces safe appears needed, we are not loosing controll of the skb and so do not need to set up the skb to look like it is comming in fresh from the outside world. Further the fields in struct unix_skb_parms should not have any problems crossing network namespaces. Now that we handle SCM_CREDENTIALS in a way that gives useable values across namespaces. There does not appear to be any operational problems with encouraging the use of unix domain sockets across containers either. Signed-off-by: Eric W. Biederman Acked-by: Daniel Lezcano Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5fe9d6fe08b8..75ba48b0d12a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -282,7 +282,7 @@ static inline struct sock *unix_find_socket_byname(struct net *net, return s; } -static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) +static struct sock *unix_find_socket_byinode(struct inode *i) { struct sock *s; struct hlist_node *node; @@ -292,9 +292,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; - if (!net_eq(sock_net(s), net)) - continue; - if (dentry && dentry->d_inode == i) { sock_hold(s); goto found; @@ -758,7 +755,7 @@ static struct sock *unix_find_other(struct net *net, err = -ECONNREFUSED; if (!S_ISSOCK(inode->i_mode)) goto put_fail; - u = unix_find_socket_byinode(net, inode); + u = unix_find_socket_byinode(inode); if (!u) goto put_fail; -- cgit v1.2.2 From 70d4bf6d467a330ccc947df9b2608e329d9e7708 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 20 Jul 2010 06:45:56 +0000 Subject: drop_monitor: convert some kfree_skb call sites to consume_skb Convert a few calls from kfree_skb to consume_skb Noticed while I was working on dropwatch that I was detecting lots of internal skb drops in several places. While some are legitimate, several were not, freeing skbs that were at the end of their life, rather than being discarded due to an error. This patch converts those calls sites from using kfree_skb to consume_skb, which quiets the in-kernel drop_monitor code from detecting them as drops. Tested successfully by myself Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 75ba48b0d12a..4414a18c63b4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1906,7 +1906,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, break; } - kfree_skb(skb); + consume_skb(skb); if (siocb->scm->fp) break; -- cgit v1.2.2 From db40980fcdb560d7992b0511df16cdd3f7e381f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Sep 2010 11:13:50 +0000 Subject: net: poll() optimizations No need to test twice sk->sk_shutdown & RCV_SHUTDOWN Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18c63b4..9077b4ea00c5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2024,11 +2024,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ -- cgit v1.2.2 From 8df73ff90f00f14d2c7ff7156f7ef153f7e9d3b7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 4 Sep 2010 01:34:28 +0000 Subject: UNIX: Do not loop forever at unix_autobind(). We assumed that unix_autobind() never fails if kzalloc() succeeded. But unix_autobind() allows only 1048576 names. If /proc/sys/fs/file-max is larger than 1048576 (e.g. systems with more than 10GB of RAM), a local user can consume all names using fork()/socket()/bind(). If all names are in use, those who call bind() with addr_len == sizeof(short) or connect()/sendmsg() with setsockopt(SO_PASSCRED) will continue while (1) yield(); loop at unix_autobind() till a name becomes available. This patch adds a loop counter in order to give up after 1048576 attempts. Calling yield() for once per 256 attempts may not be sufficient when many names are already in use, for __unix_find_socket_byname() can take long time under such circumstance. Therefore, this patch also adds cond_resched() call. Note that currently a local user can consume 2GB of kernel memory if the user is allowed to create and autobind 1048576 UNIX domain sockets. We should consider adding some restriction for autobind operation. Signed-off-by: Tetsuo Handa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18c63b4..0b39b2451ea5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock) static u32 ordernum = 1; struct unix_address *addr; int err; + unsigned int retries = 0; mutex_lock(&u->readlock); @@ -717,9 +718,17 @@ retry: if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { spin_unlock(&unix_table_lock); - /* Sanity yield. It is unusual case, but yet... */ - if (!(ordernum&0xFF)) - yield(); + /* + * __unix_find_socket_byname() may take long time if many names + * are already in use. + */ + cond_resched(); + /* Give up if all names seems to be in use. */ + if (retries++ == 0xFFFFF) { + err = -ENOSPC; + kfree(addr); + goto out; + } goto retry; } addr->hash ^= sk->sk_type; -- cgit v1.2.2 From 3f66116e89521ef71ab0d63dc07a639def88a577 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 4 Oct 2010 08:48:28 +0000 Subject: AF_UNIX: Implement SO_TIMESTAMP and SO_TIMETAMPNS on Unix sockets Userspace applications can already request to receive timestamps with: setsockopt(sockfd, SOL_SOCKET, SO_TIMESTAMP, ...) Although setsockopt() returns zero (success), timestamps are not added to the ancillary data. This patch fixes that on SOCK_DGRAM and SOCK_SEQPACKET Unix sockets. Signed-off-by: Alban Crequy Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c586da3f4f18..0ebc777a6660 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1511,6 +1511,8 @@ restart: goto restart; } + if (sock_flag(other, SOCK_RCVTSTAMP)) + __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other, len); @@ -1722,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + if (!siocb->scm) { siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); -- cgit v1.2.2 From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/unix/af_unix.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..3c95304a0817 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); -- cgit v1.2.2 From 67426b756c4d52c511c4b22b269accea171692a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Oct 2010 20:44:44 +0000 Subject: af_unix: use keyed wakeups Instead of wakeup all sleepers, use wake_up_interruptible_sync_poll() to wakeup only ones interested into writing the socket. This patch is a specialization of commit 37e5540b3c9d (epoll keyed wakeups: make sockets use keyed wakeups). On a test program provided by Alan Crequy : Before: real 0m3.101s user 0m0.000s sys 0m6.104s After: real 0m0.211s user 0m0.000s sys 0m0.208s Reported-by: Alban Crequy Signed-off-by: Eric Dumazet Cc: Davide Libenzi Signed-off-by: David S. Miller --- net/unix/af_unix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3c95304a0817..f33c5958dbb2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -316,7 +316,8 @@ static void unix_write_space(struct sock *sk) if (unix_writable(sk)) { wq = rcu_dereference(sk->sk_wq); if (wq_has_sleeper(wq)) - wake_up_interruptible_sync(&wq->wait); + wake_up_interruptible_sync_poll(&wq->wait, + POLLOUT | POLLWRNORM | POLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -1710,7 +1711,8 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, goto out_unlock; } - wake_up_interruptible_sync(&u->peer_wait); + wake_up_interruptible_sync_poll(&u->peer_wait, + POLLOUT | POLLWRNORM | POLLWRBAND); if (msg->msg_name) unix_copy_addr(msg, skb->sk); -- cgit v1.2.2 From 5456f09aaf88731e16dbcea7522cb330b6846415 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 31 Oct 2010 05:36:23 +0000 Subject: af_unix: fix unix_dgram_poll() behavior for EPOLLOUT event Alban Crequy reported a problem with connected dgram af_unix sockets and provided a test program. epoll() would miss to send an EPOLLOUT event when a thread unqueues a packet from the other peer, making its receive queue not full. This is because unix_dgram_poll() fails to call sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); if the socket is not writeable at the time epoll_ctl(ADD) is called. We must call sock_poll_wait(), regardless of 'writable' status, so that epoll can be notified later of states changes. Misc: avoids testing twice (sk->sk_shutdown & RCV_SHUTDOWN) Reported-by: Alban Crequy Cc: Davide Libenzi Signed-off-by: Eric Dumazet Acked-by: Davide Libenzi Signed-off-by: David S. Miller --- net/unix/af_unix.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f33c5958dbb2..e8898758dd31 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2074,13 +2074,12 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -2092,20 +2091,15 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; } - /* writable? */ writable = unix_writable(sk); - if (writable) { - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, - wait); - if (unix_recvq_full(other)) - writable = 0; - } - - sock_put(other); + other = unix_peer_get(sk); + if (other) { + if (unix_peer(other) != sk) { + sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); + if (unix_recvq_full(other)) + writable = 0; } + sock_put(other); } if (writable) -- cgit v1.2.2 From 973a34aa8593dbfe84386343c694f5beecb51d8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 31 Oct 2010 05:38:25 +0000 Subject: af_unix: optimize unix_dgram_poll() unix_dgram_poll() is pretty expensive to check POLLOUT status, because it has to lock the socket to get its peer, take a reference on the peer to check its receive queue status, and queue another poll_wait on peer_wait. This all can be avoided if the process calling unix_dgram_poll() is not interested in POLLOUT status. It makes unix_dgram_recvmsg() faster by not queueing irrelevant pollers in peer_wait. On a test program provided by Alan Crequy : Before: real 0m0.211s user 0m0.000s sys 0m0.208s After: real 0m0.044s user 0m0.000s sys 0m0.040s Suggested-by: Davide Libenzi Reported-by: Alban Crequy Acked-by: Davide Libenzi Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e8898758dd31..7ff31c60186a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2091,6 +2091,10 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; } + /* No write status requested, avoid expensive OUT tests. */ + if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT))) + return mask; + writable = unix_writable(sk); other = unix_peer_get(sk); if (other) { -- cgit v1.2.2 From 25888e30319f8896fc656fc68643e6a078263060 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Nov 2010 04:11:39 +0000 Subject: af_unix: limit recursion level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its easy to eat all kernel memory and trigger NMI watchdog, using an exploit program that queues unix sockets on top of others. lkml ref : http://lkml.org/lkml/2010/11/25/8 This mechanism is used in applications, one choice we have is to have a recursion limit. Other limits might be needed as well (if we queue other types of files), since the passfd mechanism is currently limited by socket receive queue sizes only. Add a recursion_level to unix socket, allowing up to 4 levels. Each time we send an unix socket through sendfd mechanism, we copy its recursion level (plus one) to receiver. This recursion level is cleared when socket receive queue is emptied. Reported-by: Марк Коренберг Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3c95304a0817..2268e6798124 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +#define MAX_RECURSION_LEVEL 4 + static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + unsigned char max_level = 0; + int unix_sock_count = 0; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) { + unix_sock_count++; + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; /* * Need to duplicate file references for the sake of garbage @@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count-1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - return 0; + if (unix_sock_count) { + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); + } + return max_level; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; long timeo; struct scm_cookie tmp_scm; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; err = unix_scm_to_skb(siocb->scm, skb, true); - if (err) + if (err < 0) goto out_free; + max_level = err + 1; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1514,6 +1534,8 @@ restart: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); @@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); - if (err) { + if (err < 0) { kfree_skb(skb); goto out_err; } + max_level = err + 1; fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err_free; skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, size); sent += size; @@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, unix_state_lock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { + unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; -- cgit v1.2.2 From 3610cda53f247e176bcbb7a7cca64bc53b12acdb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 5 Jan 2011 15:38:53 -0800 Subject: af_unix: Avoid socket->sk NULL OOPS in stream connect security hooks. unix_release() can asynchornously set socket->sk to NULL, and it does so without holding the unix_state_lock() on "other" during stream connects. However, the reverse mapping, sk->sk_socket, is only transitioned to NULL under the unix_state_lock(). Therefore make the security hooks follow the reverse mapping instead of the forward mapping. Reported-by: Jeremy Fitzhardinge Reported-by: Linus Torvalds Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 417d7a6c36cf..dd419d286204 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1157,7 +1157,7 @@ restart: goto restart; } - err = security_unix_stream_connect(sock, other->sk_socket, newsk); + err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; -- cgit v1.2.2