From db40980fcdb560d7992b0511df16cdd3f7e381f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Sep 2010 11:13:50 +0000 Subject: net: poll() optimizations No need to test twice sk->sk_shutdown & RCV_SHUTDOWN Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18c63b4..9077b4ea00c5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2024,11 +2024,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ -- cgit v1.2.2 From 8df73ff90f00f14d2c7ff7156f7ef153f7e9d3b7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sat, 4 Sep 2010 01:34:28 +0000 Subject: UNIX: Do not loop forever at unix_autobind(). We assumed that unix_autobind() never fails if kzalloc() succeeded. But unix_autobind() allows only 1048576 names. If /proc/sys/fs/file-max is larger than 1048576 (e.g. systems with more than 10GB of RAM), a local user can consume all names using fork()/socket()/bind(). If all names are in use, those who call bind() with addr_len == sizeof(short) or connect()/sendmsg() with setsockopt(SO_PASSCRED) will continue while (1) yield(); loop at unix_autobind() till a name becomes available. This patch adds a loop counter in order to give up after 1048576 attempts. Calling yield() for once per 256 attempts may not be sufficient when many names are already in use, for __unix_find_socket_byname() can take long time under such circumstance. Therefore, this patch also adds cond_resched() call. Note that currently a local user can consume 2GB of kernel memory if the user is allowed to create and autobind 1048576 UNIX domain sockets. We should consider adding some restriction for autobind operation. Signed-off-by: Tetsuo Handa Signed-off-by: David S. Miller --- net/unix/af_unix.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18c63b4..0b39b2451ea5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock) static u32 ordernum = 1; struct unix_address *addr; int err; + unsigned int retries = 0; mutex_lock(&u->readlock); @@ -717,9 +718,17 @@ retry: if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { spin_unlock(&unix_table_lock); - /* Sanity yield. It is unusual case, but yet... */ - if (!(ordernum&0xFF)) - yield(); + /* + * __unix_find_socket_byname() may take long time if many names + * are already in use. + */ + cond_resched(); + /* Give up if all names seems to be in use. */ + if (retries++ == 0xFFFFF) { + err = -ENOSPC; + kfree(addr); + goto out; + } goto retry; } addr->hash ^= sk->sk_type; -- cgit v1.2.2 From 3f66116e89521ef71ab0d63dc07a639def88a577 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 4 Oct 2010 08:48:28 +0000 Subject: AF_UNIX: Implement SO_TIMESTAMP and SO_TIMETAMPNS on Unix sockets Userspace applications can already request to receive timestamps with: setsockopt(sockfd, SOL_SOCKET, SO_TIMESTAMP, ...) Although setsockopt() returns zero (success), timestamps are not added to the ancillary data. This patch fixes that on SOCK_DGRAM and SOCK_SEQPACKET Unix sockets. Signed-off-by: Alban Crequy Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c586da3f4f18..0ebc777a6660 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1511,6 +1511,8 @@ restart: goto restart; } + if (sock_flag(other, SOCK_RCVTSTAMP)) + __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other, len); @@ -1722,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + if (!siocb->scm) { siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); -- cgit v1.2.2 From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Oct 2010 14:22:44 -0700 Subject: fs: allow for more than 2^31 files Robin Holt tried to boot a 16TB system and found af_unix was overflowing a 32bit value : We were seeing a failure which prevented boot. The kernel was incapable of creating either a named pipe or unix domain socket. This comes down to a common kernel function called unix_create1() which does: atomic_inc(&unix_nr_socks); if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; The function get_max_files() is a simple return of files_stat.max_files. files_stat.max_files is a signed integer and is computed in fs/file_table.c's files_init(). n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = n; In our case, mempages (total_ram_pages) is approx 3,758,096,384 (0xe0000000). That leaves max_files at approximately 1,503,238,553. This causes 2 * get_max_files() to integer overflow. Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long integers, and change af_unix to use an atomic_long_t instead of atomic_t. get_max_files() is changed to return an unsigned long. get_nr_files() is changed to return a long. unix_nr_socks is changed from atomic_t to atomic_long_t, while not strictly needed to address Robin problem. Before patch (on a 64bit kernel) : # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max -18446744071562067968 After patch: # echo 2147483648 >/proc/sys/fs/file-max # cat /proc/sys/fs/file-max 2147483648 # cat /proc/sys/fs/file-nr 704 0 2147483648 Reported-by: Robin Holt Signed-off-by: Eric Dumazet Acked-by: David Miller Reviewed-by: Robin Holt Tested-by: Robin Holt Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/unix/af_unix.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0ebc777a6660..3c95304a0817 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -117,7 +117,7 @@ static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static DEFINE_SPINLOCK(unix_table_lock); -static atomic_t unix_nr_socks = ATOMIC_INIT(0); +static atomic_long_t unix_nr_socks; #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk) if (u->addr) unix_release_addr(u->addr); - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); local_bh_enable(); #ifdef UNIX_REFCNT_DEBUG - printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, - atomic_read(&unix_nr_socks)); + printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk, + atomic_long_read(&unix_nr_socks)); #endif } @@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - atomic_inc(&unix_nr_socks); - if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) + atomic_long_inc(&unix_nr_socks); + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) goto out; sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); @@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock) unix_insert_socket(unix_sockets_unbound, sk); out: if (sk == NULL) - atomic_dec(&unix_nr_socks); + atomic_long_dec(&unix_nr_socks); else { local_bh_disable(); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); -- cgit v1.2.2 From 25888e30319f8896fc656fc68643e6a078263060 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 25 Nov 2010 04:11:39 +0000 Subject: af_unix: limit recursion level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its easy to eat all kernel memory and trigger NMI watchdog, using an exploit program that queues unix sockets on top of others. lkml ref : http://lkml.org/lkml/2010/11/25/8 This mechanism is used in applications, one choice we have is to have a recursion limit. Other limits might be needed as well (if we queue other types of files), since the passfd mechanism is currently limited by socket receive queue sizes only. Add a recursion_level to unix socket, allowing up to 4 levels. Each time we send an unix socket through sendfd mechanism, we copy its recursion level (plus one) to receiver. This recursion level is cleared when socket receive queue is emptied. Reported-by: Марк Коренберг Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3c95304a0817..2268e6798124 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb) sock_wfree(skb); } +#define MAX_RECURSION_LEVEL 4 + static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { int i; + unsigned char max_level = 0; + int unix_sock_count = 0; + + for (i = scm->fp->count - 1; i >= 0; i--) { + struct sock *sk = unix_get_socket(scm->fp->fp[i]); + + if (sk) { + unix_sock_count++; + max_level = max(max_level, + unix_sk(sk)->recursion_level); + } + } + if (unlikely(max_level > MAX_RECURSION_LEVEL)) + return -ETOOMANYREFS; /* * Need to duplicate file references for the sake of garbage @@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count-1; i >= 0; i--) - unix_inflight(scm->fp->fp[i]); - return 0; + if (unix_sock_count) { + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->fp[i]); + } + return max_level; } static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) @@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, struct sk_buff *skb; long timeo; struct scm_cookie tmp_scm; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; err = unix_scm_to_skb(siocb->scm, skb, true); - if (err) + if (err < 0) goto out_free; + max_level = err + 1; unix_get_secdata(siocb->scm, skb); skb_reset_transport_header(skb); @@ -1514,6 +1534,8 @@ restart: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); @@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, int sent = 0; struct scm_cookie tmp_scm; bool fds_sent = false; + int max_level; if (NULL == siocb->scm) siocb->scm = &tmp_scm; @@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); - if (err) { + if (err < 0) { kfree_skb(skb); goto out_err; } + max_level = err + 1; fds_sent = true; err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); @@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto pipe_err_free; skb_queue_tail(&other->sk_receive_queue, skb); + if (max_level > unix_sk(other)->recursion_level) + unix_sk(other)->recursion_level = max_level; unix_state_unlock(other); other->sk_data_ready(other, size); sent += size; @@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, unix_state_lock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb == NULL) { + unix_sk(sk)->recursion_level = 0; if (copied >= target) goto unlock; -- cgit v1.2.2