From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 17 Jan 2010 03:35:32 +0000 Subject: net: spread __net_init, __net_exit __net_init/__net_exit are apparently not going away, so use them to full extent. In some cases __net_init was removed, because it was called from __net_exit code. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f25511903115..9bc9b92bc099 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2224,7 +2224,7 @@ static const struct net_proto_family unix_family_ops = { }; -static int unix_net_init(struct net *net) +static int __net_init unix_net_init(struct net *net) { int error = -ENOMEM; @@ -2243,7 +2243,7 @@ out: return error; } -static void unix_net_exit(struct net *net) +static void __net_exit unix_net_exit(struct net *net) { unix_sysctl_unregister(net); proc_net_remove(net, "unix"); -- cgit v1.2.2 From 663717f65c075eb4c6da7a123041295bd5295cc0 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 18 Feb 2010 14:12:06 -0800 Subject: AF_UNIX: update locking comment The lock used in unix_state_lock() is a spin_lock not reader-writer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9bc9b92bc099..3d9122e78f41 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -144,7 +144,7 @@ static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb) /* * SMP locking strategy: * hash table is protected with spinlock unix_table_lock - * each socket state is protected by separate rwlock. + * each socket state is protected by separate spin lock. */ static inline unsigned unix_hash_fold(__wsum n) -- cgit v1.2.2 From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 20 Apr 2010 13:03:51 +0000 Subject: net: sk_sleep() helper Define a new function to return the waitqueue of a "struct sock". static inline wait_queue_head_t *sk_sleep(struct sock *sk) { return sk->sk_sleep; } Change all read occurrences of sk_sleep by a call to this function. Needed for a future RCU conversion. sk_sleep wont be a field directly available. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3d9122e78f41..87c0360eaa25 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -316,7 +316,7 @@ static void unix_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk_has_sleeper(sk)) - wake_up_interruptible_sync(sk->sk_sleep); + wake_up_interruptible_sync(sk_sleep(sk)); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); @@ -1736,7 +1736,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo) unix_state_lock(sk); for (;;) { - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (!skb_queue_empty(&sk->sk_receive_queue) || sk->sk_err || @@ -1752,7 +1752,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo) clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } - finish_wait(sk->sk_sleep, &wait); + finish_wait(sk_sleep(sk), &wait); unix_state_unlock(sk); return timeo; } @@ -1991,7 +1991,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table struct sock *sk = sock->sk; unsigned int mask; - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* exceptional events? */ @@ -2028,7 +2028,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int mask, writable; - sock_poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk_sleep(sk), wait); mask = 0; /* exceptional events? */ -- cgit v1.2.2 From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Apr 2010 11:01:49 +0000 Subject: net: sock_def_readable() and friends RCU conversion sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we need two atomic operations (and associated dirtying) per incoming packet. RCU conversion is pretty much needed : 1) Add a new structure, called "struct socket_wq" to hold all fields that will need rcu_read_lock() protection (currently: a wait_queue_head_t and a struct fasync_struct pointer). [Future patch will add a list anchor for wakeup coalescing] 2) Attach one of such structure to each "struct socket" created in sock_alloc_inode(). 3) Respect RCU grace period when freeing a "struct socket_wq" 4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct socket_wq" 5) Change sk_sleep() function to use new sk->sk_wq instead of sk->sk_sleep 6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside a rcu_read_lock() section. 7) Change all sk_has_sleeper() callers to : - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock) - Use wq_has_sleeper() to eventually wakeup tasks. - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock) 8) sock_wake_async() is modified to use rcu protection as well. 9) Exceptions : macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq" instead of dynamically allocated ones. They dont need rcu freeing. Some cleanups or followups are probably needed, (possible sk_callback_lock conversion to a spinlock for example...). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 87c0360eaa25..fef2cc5e9d2b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk) static void unix_write_space(struct sock *sk) { - read_lock(&sk->sk_callback_lock); + struct socket_wq *wq; + + rcu_read_lock(); if (unix_writable(sk)) { - if (sk_has_sleeper(sk)) - wake_up_interruptible_sync(sk_sleep(sk)); + wq = rcu_dereference(sk->sk_wq); + if (wq_has_sleeper(wq)) + wake_up_interruptible_sync(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } - read_unlock(&sk->sk_callback_lock); + rcu_read_unlock(); } /* When dgram socket disconnects (or changes its peer), we clear its receive @@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion) skpair->sk_err = ECONNRESET; unix_state_unlock(skpair); skpair->sk_state_change(skpair); - read_lock(&skpair->sk_callback_lock); sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); - read_unlock(&skpair->sk_callback_lock); } sock_put(skpair); /* It may now die */ unix_peer(sk) = NULL; @@ -1142,7 +1143,7 @@ restart: newsk->sk_peercred.pid = task_tgid_vnr(current); current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid); newu = unix_sk(newsk); - newsk->sk_sleep = &newu->peer_wait; + newsk->sk_wq = &newu->peer_wq; otheru = unix_sk(other); /* copy address information from listening to new sock*/ @@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode) other->sk_shutdown |= peer_mode; unix_state_unlock(other); other->sk_state_change(other); - read_lock(&other->sk_callback_lock); if (peer_mode == SHUTDOWN_MASK) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - read_unlock(&other->sk_callback_lock); } if (other) sock_put(other); -- cgit v1.2.2 From 109f6e39fa07c48f580125f531f46cb7c245b528 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Jun 2010 03:30:14 +0000 Subject: af_unix: Allow SO_PEERCRED to work across namespaces. Use struct pid and struct cred to store the peer credentials on struct sock. This gives enough information to convert the peer credential information to a value relative to whatever namespace the socket is in at the time. This removes nasty surprises when using SO_PEERCRED on socket connetions where the processes on either side are in different pid and user namespaces. Signed-off-by: Eric W. Biederman Acked-by: Daniel Lezcano Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fef2cc5e9d2b..e1f1349fae86 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -450,11 +450,31 @@ static int unix_release_sock(struct sock *sk, int embrion) return 0; } +static void init_peercred(struct sock *sk) +{ + put_pid(sk->sk_peer_pid); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + sk->sk_peer_pid = get_pid(task_tgid(current)); + sk->sk_peer_cred = get_current_cred(); +} + +static void copy_peercred(struct sock *sk, struct sock *peersk) +{ + put_pid(sk->sk_peer_pid); + if (sk->sk_peer_cred) + put_cred(sk->sk_peer_cred); + sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); +} + static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); + struct pid *old_pid = NULL; + const struct cred *old_cred = NULL; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -470,12 +490,14 @@ static int unix_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; /* set credentials so connect can copy them */ - sk->sk_peercred.pid = task_tgid_vnr(current); - current_euid_egid(&sk->sk_peercred.uid, &sk->sk_peercred.gid); + init_peercred(sk); err = 0; out_unlock: unix_state_unlock(sk); + put_pid(old_pid); + if (old_cred) + put_cred(old_cred); out: return err; } @@ -1140,8 +1162,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - newsk->sk_peercred.pid = task_tgid_vnr(current); - current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid); + init_peercred(newsk); newu = unix_sk(newsk); newsk->sk_wq = &newu->peer_wq; otheru = unix_sk(other); @@ -1157,7 +1178,7 @@ restart: } /* Set credentials */ - sk->sk_peercred = other->sk_peercred; + copy_peercred(sk, other); sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; @@ -1199,10 +1220,8 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; - ska->sk_peercred.pid = skb->sk_peercred.pid = task_tgid_vnr(current); - current_euid_egid(&skb->sk_peercred.uid, &skb->sk_peercred.gid); - ska->sk_peercred.uid = skb->sk_peercred.uid; - ska->sk_peercred.gid = skb->sk_peercred.gid; + init_peercred(ska); + init_peercred(skb); if (ska->sk_type != SOCK_DGRAM) { ska->sk_state = TCP_ESTABLISHED; -- cgit v1.2.2 From 7361c36c5224519b258219fe3d0e8abc865d8134 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Jun 2010 03:34:33 +0000 Subject: af_unix: Allow credentials to work across user and pid namespaces. In unix_skb_parms store pointers to struct pid and struct cred instead of raw uid, gid, and pid values, then translate the credentials on reception into values that are meaningful in the receiving processes namespaces. Signed-off-by: Eric W. Biederman Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e1f1349fae86..5fe9d6fe08b8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1316,18 +1316,20 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) int i; scm->fp = UNIXCB(skb).fp; - skb->destructor = sock_wfree; UNIXCB(skb).fp = NULL; for (i = scm->fp->count-1; i >= 0; i--) unix_notinflight(scm->fp->fp[i]); } -static void unix_destruct_fds(struct sk_buff *skb) +static void unix_destruct_scm(struct sk_buff *skb) { struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - unix_detach_fds(&scm, skb); + scm.pid = UNIXCB(skb).pid; + scm.cred = UNIXCB(skb).cred; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); /* Alas, it calls VFS */ /* So fscking what? fput() had been SMP-safe since the last Summer */ @@ -1350,10 +1352,22 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count-1; i >= 0; i--) unix_inflight(scm->fp->fp[i]); - skb->destructor = unix_destruct_fds; return 0; } +static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) +{ + int err = 0; + UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).cred = get_cred(scm->cred); + UNIXCB(skb).fp = NULL; + if (scm->fp && send_fds) + err = unix_attach_fds(scm, skb); + + skb->destructor = unix_destruct_scm; + return err; +} + /* * Send AF_UNIX data. */ @@ -1410,12 +1424,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (skb == NULL) goto out; - memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); - if (siocb->scm->fp) { - err = unix_attach_fds(siocb->scm, skb); - if (err) - goto out_free; - } + err = unix_scm_to_skb(siocb->scm, skb, true); + if (err) + goto out_free; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1585,16 +1596,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, */ size = min_t(int, size, skb_tailroom(skb)); - memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + /* Only send the fds in the first buffer */ - if (siocb->scm->fp && !fds_sent) { - err = unix_attach_fds(siocb->scm, skb); - if (err) { - kfree_skb(skb); - goto out_err; - } - fds_sent = true; + err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); + if (err) { + kfree_skb(skb); + goto out_err; } + fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err) { @@ -1711,7 +1720,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); } - siocb->scm->creds = *UNIXCREDS(skb); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); unix_set_secdata(siocb->scm, skb); if (!(flags & MSG_PEEK)) { @@ -1860,14 +1869,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (check_creds) { /* Never glue messages from different writers */ - if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, - sizeof(siocb->scm->creds)) != 0) { + if ((UNIXCB(skb).pid != siocb->scm->pid) || + (UNIXCB(skb).cred != siocb->scm->cred)) { skb_queue_head(&sk->sk_receive_queue, skb); break; } } else { /* Copy credentials */ - siocb->scm->creds = *UNIXCREDS(skb); + scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); check_creds = 1; } -- cgit v1.2.2 From 6616f7888c6088324727363276f6de05a1dca6fc Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 13 Jun 2010 03:35:48 +0000 Subject: af_unix: Allow connecting to sockets in other network namespaces. Remove the restriction that only allows connecting to a unix domain socket identified by unix path that is in the same network namespace. Crossing network namespaces is always tricky and we did not support this at first, because of a strict policy of don't mix the namespaces. Later after Pavel proposed this we did not support this because no one had performed the audit to make certain using unix domain sockets across namespaces is safe. What fundamentally makes connecting to af_unix sockets in other namespaces is safe is that you have to have the proper permissions on the unix domain socket inode that lives in the filesystem. If you want strict isolation you just don't create inodes where unfriendlys can get at them, or with permissions that allow unfriendlys to open them. All nicely handled for us by the mount namespace and other standard file system facilities. I looked through unix domain sockets and they are a very controlled environment so none of the work that goes on in dev_forward_skb to make crossing namespaces safe appears needed, we are not loosing controll of the skb and so do not need to set up the skb to look like it is comming in fresh from the outside world. Further the fields in struct unix_skb_parms should not have any problems crossing network namespaces. Now that we handle SCM_CREDENTIALS in a way that gives useable values across namespaces. There does not appear to be any operational problems with encouraging the use of unix domain sockets across containers either. Signed-off-by: Eric W. Biederman Acked-by: Daniel Lezcano Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/unix/af_unix.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5fe9d6fe08b8..75ba48b0d12a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -282,7 +282,7 @@ static inline struct sock *unix_find_socket_byname(struct net *net, return s; } -static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) +static struct sock *unix_find_socket_byinode(struct inode *i) { struct sock *s; struct hlist_node *node; @@ -292,9 +292,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; - if (!net_eq(sock_net(s), net)) - continue; - if (dentry && dentry->d_inode == i) { sock_hold(s); goto found; @@ -758,7 +755,7 @@ static struct sock *unix_find_other(struct net *net, err = -ECONNREFUSED; if (!S_ISSOCK(inode->i_mode)) goto put_fail; - u = unix_find_socket_byinode(net, inode); + u = unix_find_socket_byinode(inode); if (!u) goto put_fail; -- cgit v1.2.2 From 70d4bf6d467a330ccc947df9b2608e329d9e7708 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 20 Jul 2010 06:45:56 +0000 Subject: drop_monitor: convert some kfree_skb call sites to consume_skb Convert a few calls from kfree_skb to consume_skb Noticed while I was working on dropwatch that I was detecting lots of internal skb drops in several places. While some are legitimate, several were not, freeing skbs that were at the end of their life, rather than being discarded due to an error. This patch converts those calls sites from using kfree_skb to consume_skb, which quiets the in-kernel drop_monitor code from detecting them as drops. Tested successfully by myself Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 75ba48b0d12a..4414a18c63b4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1906,7 +1906,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, break; } - kfree_skb(skb); + consume_skb(skb); if (siocb->scm->fp) break; -- cgit v1.2.2 From db40980fcdb560d7992b0511df16cdd3f7e381f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Sep 2010 11:13:50 +0000 Subject: net: poll() optimizations No need to test twice sk->sk_shutdown & RCV_SHUTDOWN Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18c63b4..9077b4ea00c5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2024,11 +2024,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ -- cgit v1.2.2 From 8df73ff90f00f14d2c7ff7156f7ef153f7e9d3b7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 4 Sep 2010 01:34:28 +0000 Subject: UNIX: Do not loop forever at unix_autobind(). We assumed that unix_autobind() never fails if kzalloc() succeeded. But unix_autobind() allows only 1048576 names. If /proc/sys/fs/file-max is larger than 1048576 (e.g. systems with more than 10GB of RAM), a local user can consume all names using fork()/socket()/bind(). If all names are in use, those who call bind() with addr_len == sizeof(short) or connect()/sendmsg() with setsockopt(SO_PASSCRED) will continue while (1) yield(); loop at unix_autobind() till a name becomes available. This patch adds a loop counter in order to give up after 1048576 attempts. Calling yield() for once per 256 attempts may not be sufficient when many names are already in use, for __unix_find_socket_byname() can take long time under such circumstance. Therefore, this patch also adds cond_resched() call. Note that currently a local user can consume 2GB of kernel memory if the user is allowed to create and autobind 1048576 UNIX domain sockets. We should consider adding some restriction for autobind operation. Signed-off-by: Tetsuo Handa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18c63b4..0b39b2451ea5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock) static u32 ordernum = 1; struct unix_address *addr; int err; + unsigned int retries = 0; mutex_lock(&u->readlock); @@ -717,9 +718,17 @@ retry: if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { spin_unlock(&unix_table_lock); - /* Sanity yield. It is unusual case, but yet... */ - if (!(ordernum&0xFF)) - yield(); + /* + * __unix_find_socket_byname() may take long time if many names + * are already in use. + */ + cond_resched(); + /* Give up if all names seems to be in use. */ + if (retries++ == 0xFFFFF) { + err = -ENOSPC; + kfree(addr); + goto out; + } goto retry; } addr->hash ^= sk->sk_type; -- cgit v1.2.2 From 3f66116e89521ef71ab0d63dc07a639def88a577 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 4 Oct 2010 08:48:28 +0000 Subject: AF_UNIX: Implement SO_TIMESTAMP and SO_TIMETAMPNS on Unix sockets Userspace applications can already request to receive timestamps with: setsockopt(sockfd, SOL_SOCKET, SO_TIMESTAMP, ...) Although setsockopt() returns zero (success), timestamps are not added to the ancillary data. This patch fixes that on SOCK_DGRAM and SOCK_SEQPACKET Unix sockets. Signed-off-by: Alban Crequy Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c586da3f4f18..0ebc777a6660 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1511,6 +1511,8 @@ restart: goto restart; } + if (sock_flag(other, SOCK_RCVTSTAMP)) + __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other, len); @@ -1722,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + if (!siocb->scm) { siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); -- cgit v1.2.2 From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/unix/af_unix.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..3c95304a0817 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); -- cgit v1.2.2 From 67426b756c4d52c511c4b22b269accea171692a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 29 Oct 2010 20:44:44 +0000 Subject: af_unix: use keyed wakeups Instead of wakeup all sleepers, use wake_up_interruptible_sync_poll() to wakeup only ones interested into writing the socket. This patch is a specialization of commit 37e5540b3c9d (epoll keyed wakeups: make sockets use keyed wakeups). On a test program provided by Alan Crequy : Before: real 0m3.101s user 0m0.000s sys 0m6.104s After: real 0m0.211s user 0m0.000s sys 0m0.208s Reported-by: Alban Crequy Signed-off-by: Eric Dumazet Cc: Davide Libenzi Signed-off-by: David S. Miller --- net/unix/af_unix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3c95304a0817..f33c5958dbb2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -316,7 +316,8 @@ static void unix_write_space(struct sock *sk) if (unix_writable(sk)) { wq = rcu_dereference(sk->sk_wq); if (wq_has_sleeper(wq)) - wake_up_interruptible_sync(&wq->wait); + wake_up_interruptible_sync_poll(&wq->wait, + POLLOUT | POLLWRNORM | POLLWRBAND); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); @@ -1710,7 +1711,8 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, goto out_unlock; } - wake_up_interruptible_sync(&u->peer_wait); + wake_up_interruptible_sync_poll(&u->peer_wait, + POLLOUT | POLLWRNORM | POLLWRBAND); if (msg->msg_name) unix_copy_addr(msg, skb->sk); -- cgit v1.2.2 From 5456f09aaf88731e16dbcea7522cb330b6846415 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 31 Oct 2010 05:36:23 +0000 Subject: af_unix: fix unix_dgram_poll() behavior for EPOLLOUT event Alban Crequy reported a problem with connected dgram af_unix sockets and provided a test program. epoll() would miss to send an EPOLLOUT event when a thread unqueues a packet from the other peer, making its receive queue not full. This is because unix_dgram_poll() fails to call sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); if the socket is not writeable at the time epoll_ctl(ADD) is called. We must call sock_poll_wait(), regardless of 'writable' status, so that epoll can be notified later of states changes. Misc: avoids testing twice (sk->sk_shutdown & RCV_SHUTDOWN) Reported-by: Alban Crequy Cc: Davide Libenzi Signed-off-by: Eric Dumazet Acked-by: Davide Libenzi Signed-off-by: David S. Miller --- net/unix/af_unix.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f33c5958dbb2..e8898758dd31 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2074,13 +2074,12 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -2092,20 +2091,15 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; } - /* writable? */ writable = unix_writable(sk); - if (writable) { - other = unix_peer_get(sk); - if (other) { - if (unix_peer(other) != sk) { - sock_poll_wait(file, &unix_sk(other)->peer_wait, - wait); - if (unix_recvq_full(other)) - writable = 0; - } - - sock_put(other); + other = unix_peer_get(sk); + if (other) { + if (unix_peer(other) != sk) { + sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); + if (unix_recvq_full(other)) + writable = 0; } + sock_put(other); } if (writable) -- cgit v1.2.2 From 973a34aa8593dbfe84386343c694f5beecb51d8a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 31 Oct 2010 05:38:25 +0000 Subject: af_unix: optimize unix_dgram_poll() unix_dgram_poll() is pretty expensive to check POLLOUT status, because it has to lock the socket to get its peer, take a reference on the peer to check its receive queue status, and queue another poll_wait on peer_wait. This all can be avoided if the process calling unix_dgram_poll() is not interested in POLLOUT status. It makes unix_dgram_recvmsg() faster by not queueing irrelevant pollers in peer_wait. On a test program provided by Alan Crequy : Before: real 0m0.211s user 0m0.000s sys 0m0.208s After: real 0m0.044s user 0m0.000s sys 0m0.040s Suggested-by: Davide Libenzi Reported-by: Alban Crequy Acked-by: Davide Libenzi Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e8898758dd31..7ff31c60186a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2091,6 +2091,10 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, return mask; } + /* No write status requested, avoid expensive OUT tests. */ + if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT))) + return mask; + writable = unix_writable(sk); other = unix_peer_get(sk); if (other) { -- cgit v1.2.2 From 25888e30319f8896fc656fc68643e6a078263060 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Nov 2010 04:11:39 +0000 Subject: af_unix: limit recursion level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its easy to eat all kernel memory and trigger NMI watchdog, using an exploit program that queues unix sockets on top of others. lkml ref : http://lkml.org/lkml/2010/11/25/8 This mechanism is used in applications, one choice we have is to have a recursion limit. Other limits might be needed as well (if we queue other types of files), since the passfd mechanism is currently limited by socket receive queue sizes only. Add a recursion_level to unix socket, allowing up to 4 levels. Each time we send an unix socket through sendfd mechanism, we copy its recursion level (plus one) to receiver. This recursion level is cleared when socket receive queue is emptied. Reported-by: Марк Коренберг Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3c95304a0817..2268e6798124 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +#define MAX_RECURSION_LEVEL 4 + static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + unsigned char max_level = 0; + int unix_sock_count = 0; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) { + unix_sock_count++; + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; /* * Need to duplicate file references for the sake of garbage @@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count-1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - return 0; + if (unix_sock_count) { + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); + } + return max_level; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; long timeo; struct scm_cookie tmp_scm; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; err = unix_scm_to_skb(siocb->scm, skb, true); - if (err) + if (err < 0) goto out_free; + max_level = err + 1; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1514,6 +1534,8 @@ restart: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); @@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); - if (err) { + if (err < 0) { kfree_skb(skb); goto out_err; } + max_level = err + 1; fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err_free; skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, size); sent += size; @@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, unix_state_lock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { + unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; -- cgit v1.2.2 From 3610cda53f247e176bcbb7a7cca64bc53b12acdb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 5 Jan 2011 15:38:53 -0800 Subject: af_unix: Avoid socket->sk NULL OOPS in stream connect security hooks. unix_release() can asynchornously set socket->sk to NULL, and it does so without holding the unix_state_lock() on "other" during stream connects. However, the reverse mapping, sk->sk_socket, is only transitioned to NULL under the unix_state_lock(). Therefore make the security hooks follow the reverse mapping instead of the forward mapping. Reported-by: Jeremy Fitzhardinge Reported-by: Linus Torvalds Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 417d7a6c36cf..dd419d286204 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1157,7 +1157,7 @@ restart: goto restart; } - err = security_unix_stream_connect(sock, other->sk_socket, newsk); + err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; -- cgit v1.2.2