From a57de0b4336e48db2811a2030bb68dba8dd09d88 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 8 Jul 2009 12:09:13 +0000
Subject: net: adding memory barrier to the poll and receive callbacks

Adding memory barrier after the poll_wait function, paired with
receive callbacks. Adding fuctions sock_poll_wait and sk_has_sleeper
to wrap the memory barrier.

Without the memory barrier, following race can happen.
The race fires, when following code paths meet, and the tp->rcv_nxt
and __add_wait_queue updates stay in CPU caches.

CPU1                         CPU2

sys_select                   receive packet
  ...                        ...
  __add_wait_queue           update tp->rcv_nxt
  ...                        ...
  tp->rcv_nxt check          sock_def_readable
  ...                        {
  schedule                      ...
                                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                                        wake_up_interruptible(sk->sk_sleep)
                                ...
                             }

If there was no cache the code would work ok, since the wait_queue and
rcv_nxt are opposit to each other.

Meaning that once tp->rcv_nxt is updated by CPU2, the CPU1 either already
passed the tp->rcv_nxt check and sleeps, or will get the new value for
tp->rcv_nxt and will return with new data mask.
In both cases the process (CPU1) is being added to the wait queue, so the
waitqueue_active (CPU2) call cannot miss and will wake up CPU1.

The bad case is when the __add_wait_queue changes done by CPU1 stay in its
cache, and so does the tp->rcv_nxt update on CPU2 side.  The CPU1 will then
endup calling schedule and sleep forever if there are no more data on the
socket.

Calls to poll_wait in following modules were ommited:
	net/bluetooth/af_bluetooth.c
	net/irda/af_irda.c
	net/irda/irnet/irnet_ppp.c
	net/mac80211/rc80211_pid_debugfs.c
	net/phonet/socket.c
	net/rds/af_rds.c
	net/rfkill/core.c
	net/sunrpc/cache.c
	net/sunrpc/rpc_pipe.c
	net/tipc/socket.c

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 36d4e44d6233..fc3ebb906911 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -315,7 +315,7 @@ static void unix_write_space(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (unix_writable(sk)) {
-		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
+		if (sk_has_sleeper(sk))
 			wake_up_interruptible_sync(sk->sk_sleep);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
@@ -1985,7 +1985,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk->sk_sleep, wait);
 	mask = 0;
 
 	/* exceptional events? */
@@ -2022,7 +2022,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk, *other;
 	unsigned int mask, writable;
 
-	poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk->sk_sleep, wait);
 	mask = 0;
 
 	/* exceptional events? */
@@ -2053,7 +2053,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 		other = unix_peer_get(sk);
 		if (other) {
 			if (unix_peer(other) != sk) {
-				poll_wait(file, &unix_sk(other)->peer_wait,
+				sock_poll_wait(file, &unix_sk(other)->peer_wait,
 					  wait);
 				if (unix_recvq_full(other))
 					writable = 0;
-- 
cgit v1.2.2


From 8ba69ba6a324b13e1190fc31e41954d190fd4f1d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 11 Sep 2009 11:31:45 -0700
Subject: net: unix: fix sending fds in multiple buffers

Kalle Olavi Niemitalo reported that:

  "..., when one process calls sendmsg once to send 43804 bytes of
  data and one file descriptor, and another process then calls recvmsg
  three times to receive the 16032+16032+11740 bytes, each of those
  recvmsg calls returns the file descriptor in the ancillary data.  I
  confirmed this with strace.  The behaviour differs from Linux
  2.6.26, where reportedly only one of those recvmsg calls (I think
  the first one) returned the file descriptor."

This bug was introduced by a patch from me titled "net: unix: fix inflight
counting bug in garbage collector", commit 6209344f5.

And the reason is, quoting Kalle:

  "Before your patch, unix_attach_fds() would set scm->fp = NULL, so
  that if the loop in unix_stream_sendmsg() ran multiple iterations,
  it could not call unix_attach_fds() again.  But now,
  unix_attach_fds() leaves scm->fp unchanged, and I think this causes
  it to be called multiple times and duplicate the same file
  descriptors to each struct sk_buff."

Fix this by introducing a flag that is cleared at the start and set
when the fds attached to the first buffer.  The resulting code should
work equivalently to the one on 2.6.26.

Reported-by: Kalle Olavi Niemitalo <kon@iki.fi>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fc3ebb906911..51ab497115eb 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1501,6 +1501,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct sk_buff *skb;
 	int sent = 0;
 	struct scm_cookie tmp_scm;
+	bool fds_sent = false;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1562,12 +1563,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		size = min_t(int, size, skb_tailroom(skb));
 
 		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-		if (siocb->scm->fp) {
+		/* Only send the fds in the first buffer */
+		if (siocb->scm->fp && !fds_sent) {
 			err = unix_attach_fds(siocb->scm, skb);
 			if (err) {
 				kfree_skb(skb);
 				goto out_err;
 			}
+			fds_sent = true;
 		}
 
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
-- 
cgit v1.2.2


From ec1b4cf74c81bfd0fbe5bf62bafc86c45917e72f Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 5 Oct 2009 05:58:39 +0000
Subject: net: mark net_proto_ops as const

All usages of structure net_proto_ops should be declared const.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 51ab497115eb..0f133c5a8d3c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2214,7 +2214,7 @@ static const struct file_operations unix_seq_fops = {
 
 #endif
 
-static struct net_proto_family unix_family_ops = {
+static const struct net_proto_family unix_family_ops = {
 	.family = PF_UNIX,
 	.create = unix_create,
 	.owner	= THIS_MODULE,
-- 
cgit v1.2.2


From 77238f2b942b38ab4e7f3aced44084493e4a8675 Mon Sep 17 00:00:00 2001
From: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Date: Sun, 18 Oct 2009 23:17:37 -0700
Subject: AF_UNIX: Fix deadlock on connecting to shutdown socket

I found a deadlock bug in UNIX domain socket, which makes able to DoS
attack against the local machine by non-root users.

How to reproduce:
1. Make a listening AF_UNIX/SOCK_STREAM socket with an abstruct
    namespace(*), and shutdown(2) it.
 2. Repeat connect(2)ing to the listening socket from the other sockets
    until the connection backlog is full-filled.
 3. connect(2) takes the CPU forever. If every core is taken, the
    system hangs.

PoC code: (Run as many times as cores on SMP machines.)

int main(void)
{
	int ret;
	int csd;
	int lsd;
	struct sockaddr_un sun;

	/* make an abstruct name address (*) */
	memset(&sun, 0, sizeof(sun));
	sun.sun_family = PF_UNIX;
	sprintf(&sun.sun_path[1], "%d", getpid());

	/* create the listening socket and shutdown */
	lsd = socket(AF_UNIX, SOCK_STREAM, 0);
	bind(lsd, (struct sockaddr *)&sun, sizeof(sun));
	listen(lsd, 1);
	shutdown(lsd, SHUT_RDWR);

	/* connect loop */
	alarm(15); /* forcely exit the loop after 15 sec */
	for (;;) {
		csd = socket(AF_UNIX, SOCK_STREAM, 0);
		ret = connect(csd, (struct sockaddr *)&sun, sizeof(sun));
		if (-1 == ret) {
			perror("connect()");
			break;
		}
		puts("Connection OK");
	}
	return 0;
}

(*) Make sun_path[0] = 0 to use the abstruct namespace.
    If a file-based socket is used, the system doesn't deadlock because
    of context switches in the file system layer.

Why this happens:
 Error checks between unix_socket_connect() and unix_wait_for_peer() are
 inconsistent. The former calls the latter to wait until the backlog is
 processed. Despite the latter returns without doing anything when the
 socket is shutdown, the former doesn't check the shutdown state and
 just retries calling the latter forever.

Patch:
 The patch below adds shutdown check into unix_socket_connect(), so
 connect(2) to the shutdown socket will return -ECONREFUSED.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Signed-off-by: Masanori Yoshida <masanori.yoshida.tv@hitachi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 51ab497115eb..fc820cd75453 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1074,6 +1074,8 @@ restart:
 	err = -ECONNREFUSED;
 	if (other->sk_state != TCP_LISTEN)
 		goto out_unlock;
+	if (other->sk_shutdown & RCV_SHUTDOWN)
+		goto out_unlock;
 
 	if (unix_recvq_full(other)) {
 		err = -EAGAIN;
-- 
cgit v1.2.2


From 3f378b684453f2a028eda463ce383370545d9cc9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 5 Nov 2009 22:18:14 -0800
Subject: net: pass kern to net_proto_family create function

The generic __sock_create function has a kern argument which allows the
security system to make decisions based on if a socket is being created by
the kernel or by userspace.  This patch passes that flag to the
net_proto_family specific create function, so it can do the same thing.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3291902f0b88..178d3af2a605 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -621,7 +621,8 @@ out:
 	return sk;
 }
 
-static int unix_create(struct net *net, struct socket *sock, int protocol)
+static int unix_create(struct net *net, struct socket *sock, int protocol,
+		       int kern)
 {
 	if (protocol && protocol != PF_UNIX)
 		return -EPROTONOSUPPORT;
-- 
cgit v1.2.2


From 13cfa97bef0f1172879f98307ac716acf3e9cea9 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 8 Nov 2009 05:51:19 +0000
Subject: net: netlink_getname, packet_getname -- use DECLARE_SOCKADDR guard

Use guard DECLARE_SOCKADDR in a few more places which allow
us to catch if the structure copied back is too big.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 178d3af2a605..7553ea6edd8f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1259,7 +1259,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_
 {
 	struct sock *sk = sock->sk;
 	struct unix_sock *u;
-	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
+	DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
 	int err = 0;
 
 	if (peer) {
-- 
cgit v1.2.2


From f64f9e719261a87818dd192a3a2352e5b20fbd0f Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Sun, 29 Nov 2009 16:55:45 -0800
Subject: net: Move && and || to end of previous line

Not including net/atm/

Compiled tested x86 allyesconfig only
Added a > 80 column line or two, which I ignored.
Existing checkpatch plaints willfully, cheerfully ignored.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7553ea6edd8f..f25511903115 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1033,8 +1033,8 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		goto out;
 	addr_len = err;
 
-	if (test_bit(SOCK_PASSCRED, &sock->flags)
-		&& !u->addr && (err = unix_autobind(sock)) != 0)
+	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
+	    (err = unix_autobind(sock)) != 0)
 		goto out;
 
 	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
@@ -1378,8 +1378,8 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			goto out;
 	}
 
-	if (test_bit(SOCK_PASSCRED, &sock->flags)
-		&& !u->addr && (err = unix_autobind(sock)) != 0)
+	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
+	    && (err = unix_autobind(sock)) != 0)
 		goto out;
 
 	err = -EMSGSIZE;
-- 
cgit v1.2.2


From 2c8c1e7297e19bdef3c178c3ea41d898a7716e3e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 17 Jan 2010 03:35:32 +0000
Subject: net: spread __net_init, __net_exit

__net_init/__net_exit are apparently not going away, so use them
to full extent.

In some cases __net_init was removed, because it was called from
__net_exit code.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f25511903115..9bc9b92bc099 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2224,7 +2224,7 @@ static const struct net_proto_family unix_family_ops = {
 };
 
 
-static int unix_net_init(struct net *net)
+static int __net_init unix_net_init(struct net *net)
 {
 	int error = -ENOMEM;
 
@@ -2243,7 +2243,7 @@ out:
 	return error;
 }
 
-static void unix_net_exit(struct net *net)
+static void __net_exit unix_net_exit(struct net *net)
 {
 	unix_sysctl_unregister(net);
 	proc_net_remove(net, "unix");
-- 
cgit v1.2.2


From 663717f65c075eb4c6da7a123041295bd5295cc0 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Thu, 18 Feb 2010 14:12:06 -0800
Subject: AF_UNIX: update locking comment

The lock used in unix_state_lock() is a spin_lock not reader-writer.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 9bc9b92bc099..3d9122e78f41 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -144,7 +144,7 @@ static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 /*
  *  SMP locking strategy:
  *    hash table is protected with spinlock unix_table_lock
- *    each socket state is protected by separate rwlock.
+ *    each socket state is protected by separate spin lock.
  */
 
 static inline unsigned unix_hash_fold(__wsum n)
-- 
cgit v1.2.2


From aa395145165cb06a0d0885221bbe0ce4a564391d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 20 Apr 2010 13:03:51 +0000
Subject: net: sk_sleep() helper

Define a new function to return the waitqueue of a "struct sock".

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	return sk->sk_sleep;
}

Change all read occurrences of sk_sleep by a call to this function.

Needed for a future RCU conversion. sk_sleep wont be a field directly
available.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3d9122e78f41..87c0360eaa25 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -316,7 +316,7 @@ static void unix_write_space(struct sock *sk)
 	read_lock(&sk->sk_callback_lock);
 	if (unix_writable(sk)) {
 		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync(sk->sk_sleep);
+			wake_up_interruptible_sync(sk_sleep(sk));
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	read_unlock(&sk->sk_callback_lock);
@@ -1736,7 +1736,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo)
 	unix_state_lock(sk);
 
 	for (;;) {
-		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
 		if (!skb_queue_empty(&sk->sk_receive_queue) ||
 		    sk->sk_err ||
@@ -1752,7 +1752,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo)
 		clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
 	}
 
-	finish_wait(sk->sk_sleep, &wait);
+	finish_wait(sk_sleep(sk), &wait);
 	unix_state_unlock(sk);
 	return timeo;
 }
@@ -1991,7 +1991,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
 	struct sock *sk = sock->sk;
 	unsigned int mask;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* exceptional events? */
@@ -2028,7 +2028,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	struct sock *sk = sock->sk, *other;
 	unsigned int mask, writable;
 
-	sock_poll_wait(file, sk->sk_sleep, wait);
+	sock_poll_wait(file, sk_sleep(sk), wait);
 	mask = 0;
 
 	/* exceptional events? */
-- 
cgit v1.2.2


From 43815482370c510c569fd18edb57afcb0fa8cab6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 29 Apr 2010 11:01:49 +0000
Subject: net: sock_def_readable() and friends RCU conversion

sk_callback_lock rwlock actually protects sk->sk_sleep pointer, so we
need two atomic operations (and associated dirtying) per incoming
packet.

RCU conversion is pretty much needed :

1) Add a new structure, called "struct socket_wq" to hold all fields
that will need rcu_read_lock() protection (currently: a
wait_queue_head_t and a struct fasync_struct pointer).

[Future patch will add a list anchor for wakeup coalescing]

2) Attach one of such structure to each "struct socket" created in
sock_alloc_inode().

3) Respect RCU grace period when freeing a "struct socket_wq"

4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct
socket_wq"

5) Change sk_sleep() function to use new sk->sk_wq instead of
sk->sk_sleep

6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside
a rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to :
  - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
  - Use wq_has_sleeper() to eventually wakeup tasks.
  - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)

8) sock_wake_async() is modified to use rcu protection as well.

9) Exceptions :
  macvtap, drivers/net/tun.c, af_unix use integrated "struct socket_wq"
instead of dynamically allocated ones. They dont need rcu freeing.

Some cleanups or followups are probably needed, (possible
sk_callback_lock conversion to a spinlock for example...).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 87c0360eaa25..fef2cc5e9d2b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -313,13 +313,16 @@ static inline int unix_writable(struct sock *sk)
 
 static void unix_write_space(struct sock *sk)
 {
-	read_lock(&sk->sk_callback_lock);
+	struct socket_wq *wq;
+
+	rcu_read_lock();
 	if (unix_writable(sk)) {
-		if (sk_has_sleeper(sk))
-			wake_up_interruptible_sync(sk_sleep(sk));
+		wq = rcu_dereference(sk->sk_wq);
+		if (wq_has_sleeper(wq))
+			wake_up_interruptible_sync(&wq->wait);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
-	read_unlock(&sk->sk_callback_lock);
+	rcu_read_unlock();
 }
 
 /* When dgram socket disconnects (or changes its peer), we clear its receive
@@ -406,9 +409,7 @@ static int unix_release_sock(struct sock *sk, int embrion)
 				skpair->sk_err = ECONNRESET;
 			unix_state_unlock(skpair);
 			skpair->sk_state_change(skpair);
-			read_lock(&skpair->sk_callback_lock);
 			sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
-			read_unlock(&skpair->sk_callback_lock);
 		}
 		sock_put(skpair); /* It may now die */
 		unix_peer(sk) = NULL;
@@ -1142,7 +1143,7 @@ restart:
 	newsk->sk_peercred.pid	= task_tgid_vnr(current);
 	current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid);
 	newu = unix_sk(newsk);
-	newsk->sk_sleep		= &newu->peer_wait;
+	newsk->sk_wq		= &newu->peer_wq;
 	otheru = unix_sk(other);
 
 	/* copy address information from listening to new sock*/
@@ -1931,12 +1932,10 @@ static int unix_shutdown(struct socket *sock, int mode)
 			other->sk_shutdown |= peer_mode;
 			unix_state_unlock(other);
 			other->sk_state_change(other);
-			read_lock(&other->sk_callback_lock);
 			if (peer_mode == SHUTDOWN_MASK)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
 			else if (peer_mode & RCV_SHUTDOWN)
 				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
-			read_unlock(&other->sk_callback_lock);
 		}
 		if (other)
 			sock_put(other);
-- 
cgit v1.2.2


From 109f6e39fa07c48f580125f531f46cb7c245b528 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 13 Jun 2010 03:30:14 +0000
Subject: af_unix: Allow SO_PEERCRED to work across namespaces.

Use struct pid and struct cred to store the peer credentials on struct
sock.  This gives enough information to convert the peer credential
information to a value relative to whatever namespace the socket is in
at the time.

This removes nasty surprises when using SO_PEERCRED on socket
connetions where the processes on either side are in different pid and
user namespaces.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index fef2cc5e9d2b..e1f1349fae86 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -450,11 +450,31 @@ static int unix_release_sock(struct sock *sk, int embrion)
 	return 0;
 }
 
+static void init_peercred(struct sock *sk)
+{
+	put_pid(sk->sk_peer_pid);
+	if (sk->sk_peer_cred)
+		put_cred(sk->sk_peer_cred);
+	sk->sk_peer_pid  = get_pid(task_tgid(current));
+	sk->sk_peer_cred = get_current_cred();
+}
+
+static void copy_peercred(struct sock *sk, struct sock *peersk)
+{
+	put_pid(sk->sk_peer_pid);
+	if (sk->sk_peer_cred)
+		put_cred(sk->sk_peer_cred);
+	sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
+	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
+}
+
 static int unix_listen(struct socket *sock, int backlog)
 {
 	int err;
 	struct sock *sk = sock->sk;
 	struct unix_sock *u = unix_sk(sk);
+	struct pid *old_pid = NULL;
+	const struct cred *old_cred = NULL;
 
 	err = -EOPNOTSUPP;
 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
@@ -470,12 +490,14 @@ static int unix_listen(struct socket *sock, int backlog)
 	sk->sk_max_ack_backlog	= backlog;
 	sk->sk_state		= TCP_LISTEN;
 	/* set credentials so connect can copy them */
-	sk->sk_peercred.pid	= task_tgid_vnr(current);
-	current_euid_egid(&sk->sk_peercred.uid, &sk->sk_peercred.gid);
+	init_peercred(sk);
 	err = 0;
 
 out_unlock:
 	unix_state_unlock(sk);
+	put_pid(old_pid);
+	if (old_cred)
+		put_cred(old_cred);
 out:
 	return err;
 }
@@ -1140,8 +1162,7 @@ restart:
 	unix_peer(newsk)	= sk;
 	newsk->sk_state		= TCP_ESTABLISHED;
 	newsk->sk_type		= sk->sk_type;
-	newsk->sk_peercred.pid	= task_tgid_vnr(current);
-	current_euid_egid(&newsk->sk_peercred.uid, &newsk->sk_peercred.gid);
+	init_peercred(newsk);
 	newu = unix_sk(newsk);
 	newsk->sk_wq		= &newu->peer_wq;
 	otheru = unix_sk(other);
@@ -1157,7 +1178,7 @@ restart:
 	}
 
 	/* Set credentials */
-	sk->sk_peercred = other->sk_peercred;
+	copy_peercred(sk, other);
 
 	sock->state	= SS_CONNECTED;
 	sk->sk_state	= TCP_ESTABLISHED;
@@ -1199,10 +1220,8 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb)
 	sock_hold(skb);
 	unix_peer(ska) = skb;
 	unix_peer(skb) = ska;
-	ska->sk_peercred.pid = skb->sk_peercred.pid = task_tgid_vnr(current);
-	current_euid_egid(&skb->sk_peercred.uid, &skb->sk_peercred.gid);
-	ska->sk_peercred.uid = skb->sk_peercred.uid;
-	ska->sk_peercred.gid = skb->sk_peercred.gid;
+	init_peercred(ska);
+	init_peercred(skb);
 
 	if (ska->sk_type != SOCK_DGRAM) {
 		ska->sk_state = TCP_ESTABLISHED;
-- 
cgit v1.2.2


From 7361c36c5224519b258219fe3d0e8abc865d8134 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 13 Jun 2010 03:34:33 +0000
Subject: af_unix: Allow credentials to work across user and pid namespaces.

In unix_skb_parms store pointers to struct pid and struct cred instead
of raw uid, gid, and pid values, then translate the credentials on
reception into values that are meaningful in the receiving processes
namespaces.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 53 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e1f1349fae86..5fe9d6fe08b8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1316,18 +1316,20 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	int i;
 
 	scm->fp = UNIXCB(skb).fp;
-	skb->destructor = sock_wfree;
 	UNIXCB(skb).fp = NULL;
 
 	for (i = scm->fp->count-1; i >= 0; i--)
 		unix_notinflight(scm->fp->fp[i]);
 }
 
-static void unix_destruct_fds(struct sk_buff *skb)
+static void unix_destruct_scm(struct sk_buff *skb)
 {
 	struct scm_cookie scm;
 	memset(&scm, 0, sizeof(scm));
-	unix_detach_fds(&scm, skb);
+	scm.pid  = UNIXCB(skb).pid;
+	scm.cred = UNIXCB(skb).cred;
+	if (UNIXCB(skb).fp)
+		unix_detach_fds(&scm, skb);
 
 	/* Alas, it calls VFS */
 	/* So fscking what? fput() had been SMP-safe since the last Summer */
@@ -1350,10 +1352,22 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 
 	for (i = scm->fp->count-1; i >= 0; i--)
 		unix_inflight(scm->fp->fp[i]);
-	skb->destructor = unix_destruct_fds;
 	return 0;
 }
 
+static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+{
+	int err = 0;
+	UNIXCB(skb).pid  = get_pid(scm->pid);
+	UNIXCB(skb).cred = get_cred(scm->cred);
+	UNIXCB(skb).fp = NULL;
+	if (scm->fp && send_fds)
+		err = unix_attach_fds(scm, skb);
+
+	skb->destructor = unix_destruct_scm;
+	return err;
+}
+
 /*
  *	Send AF_UNIX data.
  */
@@ -1410,12 +1424,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
-	memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
-	if (siocb->scm->fp) {
-		err = unix_attach_fds(siocb->scm, skb);
-		if (err)
-			goto out_free;
-	}
+	err = unix_scm_to_skb(siocb->scm, skb, true);
+	if (err)
+		goto out_free;
 	unix_get_secdata(siocb->scm, skb);
 
 	skb_reset_transport_header(skb);
@@ -1585,16 +1596,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		 */
 		size = min_t(int, size, skb_tailroom(skb));
 
-		memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
+
 		/* Only send the fds in the first buffer */
-		if (siocb->scm->fp && !fds_sent) {
-			err = unix_attach_fds(siocb->scm, skb);
-			if (err) {
-				kfree_skb(skb);
-				goto out_err;
-			}
-			fds_sent = true;
+		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
+		if (err) {
+			kfree_skb(skb);
+			goto out_err;
 		}
+		fds_sent = true;
 
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 		if (err) {
@@ -1711,7 +1720,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		siocb->scm = &tmp_scm;
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
-	siocb->scm->creds = *UNIXCREDS(skb);
+	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
 	unix_set_secdata(siocb->scm, skb);
 
 	if (!(flags & MSG_PEEK)) {
@@ -1860,14 +1869,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 		if (check_creds) {
 			/* Never glue messages from different writers */
-			if (memcmp(UNIXCREDS(skb), &siocb->scm->creds,
-				   sizeof(siocb->scm->creds)) != 0) {
+			if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
+			    (UNIXCB(skb).cred != siocb->scm->cred)) {
 				skb_queue_head(&sk->sk_receive_queue, skb);
 				break;
 			}
 		} else {
 			/* Copy credentials */
-			siocb->scm->creds = *UNIXCREDS(skb);
+			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
 			check_creds = 1;
 		}
 
-- 
cgit v1.2.2


From 6616f7888c6088324727363276f6de05a1dca6fc Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sun, 13 Jun 2010 03:35:48 +0000
Subject: af_unix: Allow connecting to sockets in other network namespaces.

Remove the restriction that only allows connecting to a unix domain
socket identified by unix path that is in the same network namespace.

Crossing network namespaces is always tricky and we did not support
this at first, because of a strict policy of don't mix the namespaces.
Later after Pavel proposed this we did not support this because no one
had performed the audit to make certain using unix domain sockets
across namespaces is safe.

What fundamentally makes connecting to af_unix sockets in other
namespaces is safe is that you have to have the proper permissions on
the unix domain socket inode that lives in the filesystem.  If you
want strict isolation you just don't create inodes where unfriendlys
can get at them, or with permissions that allow unfriendlys to open
them.  All nicely handled for us by the mount namespace and other
standard file system facilities.

I looked through unix domain sockets and they are a very controlled
environment so none of the work that goes on in dev_forward_skb to
make crossing namespaces safe appears needed, we are not loosing
controll of the skb and so do not need to set up the skb to look like
it is comming in fresh from the outside world.  Further the fields in
struct unix_skb_parms should not have any problems crossing network
namespaces.

Now that we handle SCM_CREDENTIALS in a way that gives useable values
across namespaces.  There does not appear to be any operational
problems with encouraging the use of unix domain sockets across
containers either.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>
Acked-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5fe9d6fe08b8..75ba48b0d12a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -282,7 +282,7 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
 	return s;
 }
 
-static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
+static struct sock *unix_find_socket_byinode(struct inode *i)
 {
 	struct sock *s;
 	struct hlist_node *node;
@@ -292,9 +292,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->dentry;
 
-		if (!net_eq(sock_net(s), net))
-			continue;
-
 		if (dentry && dentry->d_inode == i) {
 			sock_hold(s);
 			goto found;
@@ -758,7 +755,7 @@ static struct sock *unix_find_other(struct net *net,
 		err = -ECONNREFUSED;
 		if (!S_ISSOCK(inode->i_mode))
 			goto put_fail;
-		u = unix_find_socket_byinode(net, inode);
+		u = unix_find_socket_byinode(inode);
 		if (!u)
 			goto put_fail;
 
-- 
cgit v1.2.2


From 70d4bf6d467a330ccc947df9b2608e329d9e7708 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Tue, 20 Jul 2010 06:45:56 +0000
Subject: drop_monitor: convert some kfree_skb call sites to consume_skb

Convert a few calls from kfree_skb to consume_skb

Noticed while I was working on dropwatch that I was detecting lots of internal
skb drops in several places.  While some are legitimate, several were not,
freeing skbs that were at the end of their life, rather than being discarded due
to an error.  This patch converts those calls sites from using kfree_skb to
consume_skb, which quiets the in-kernel drop_monitor code from detecting them as
drops.  Tested successfully by myself

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 75ba48b0d12a..4414a18c63b4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1906,7 +1906,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 				break;
 			}
 
-			kfree_skb(skb);
+			consume_skb(skb);
 
 			if (siocb->scm->fp)
 				break;
-- 
cgit v1.2.2


From db40980fcdb560d7992b0511df16cdd3f7e381f3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 6 Sep 2010 11:13:50 +0000
Subject: net: poll() optimizations

No need to test twice sk->sk_shutdown & RCV_SHUTDOWN

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4414a18c63b4..9077b4ea00c5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2024,11 +2024,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
-- 
cgit v1.2.2


From 8df73ff90f00f14d2c7ff7156f7ef153f7e9d3b7 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sat, 4 Sep 2010 01:34:28 +0000
Subject: UNIX: Do not loop forever at unix_autobind().

We assumed that unix_autobind() never fails if kzalloc() succeeded.
But unix_autobind() allows only 1048576 names. If /proc/sys/fs/file-max is
larger than 1048576 (e.g. systems with more than 10GB of RAM), a local user can
consume all names using fork()/socket()/bind().

If all names are in use, those who call bind() with addr_len == sizeof(short)
or connect()/sendmsg() with setsockopt(SO_PASSCRED) will continue

  while (1)
        yield();

loop at unix_autobind() till a name becomes available.
This patch adds a loop counter in order to give up after 1048576 attempts.

Calling yield() for once per 256 attempts may not be sufficient when many names
are already in use, for __unix_find_socket_byname() can take long time under
such circumstance. Therefore, this patch also adds cond_resched() call.

Note that currently a local user can consume 2GB of kernel memory if the user
is allowed to create and autobind 1048576 UNIX domain sockets. We should
consider adding some restriction for autobind operation.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 4414a18c63b4..0b39b2451ea5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -692,6 +692,7 @@ static int unix_autobind(struct socket *sock)
 	static u32 ordernum = 1;
 	struct unix_address *addr;
 	int err;
+	unsigned int retries = 0;
 
 	mutex_lock(&u->readlock);
 
@@ -717,9 +718,17 @@ retry:
 	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 				      addr->hash)) {
 		spin_unlock(&unix_table_lock);
-		/* Sanity yield. It is unusual case, but yet... */
-		if (!(ordernum&0xFF))
-			yield();
+		/*
+		 * __unix_find_socket_byname() may take long time if many names
+		 * are already in use.
+		 */
+		cond_resched();
+		/* Give up if all names seems to be in use. */
+		if (retries++ == 0xFFFFF) {
+			err = -ENOSPC;
+			kfree(addr);
+			goto out;
+		}
 		goto retry;
 	}
 	addr->hash ^= sk->sk_type;
-- 
cgit v1.2.2


From 3f66116e89521ef71ab0d63dc07a639def88a577 Mon Sep 17 00:00:00 2001
From: Alban Crequy <alban.crequy@collabora.co.uk>
Date: Mon, 4 Oct 2010 08:48:28 +0000
Subject: AF_UNIX: Implement SO_TIMESTAMP and SO_TIMETAMPNS on Unix sockets

Userspace applications can already request to receive timestamps with:
setsockopt(sockfd, SOL_SOCKET, SO_TIMESTAMP, ...)

Although setsockopt() returns zero (success), timestamps are not added to the
ancillary data. This patch fixes that on SOCK_DGRAM and SOCK_SEQPACKET Unix
sockets.

Signed-off-by: Alban Crequy <alban.crequy@collabora.co.uk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c586da3f4f18..0ebc777a6660 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1511,6 +1511,8 @@ restart:
 		goto restart;
 	}
 
+	if (sock_flag(other, SOCK_RCVTSTAMP))
+		__net_timestamp(skb);
 	skb_queue_tail(&other->sk_receive_queue, skb);
 	unix_state_unlock(other);
 	other->sk_data_ready(other, len);
@@ -1722,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	if (err)
 		goto out_free;
 
+	if (sock_flag(sk, SOCK_RCVTSTAMP))
+		__sock_recv_timestamp(msg, sk, skb);
+
 	if (!siocb->scm) {
 		siocb->scm = &tmp_scm;
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
-- 
cgit v1.2.2


From 518de9b39e854542de59bfb8b9f61c8f7ecf808b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 26 Oct 2010 14:22:44 -0700
Subject: fs: allow for more than 2^31 files

Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32bit value :

<quote>

We were seeing a failure which prevented boot.  The kernel was incapable
of creating either a named pipe or unix domain socket.  This comes down
to a common kernel function called unix_create1() which does:

        atomic_inc(&unix_nr_socks);
        if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
                goto out;

The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().

        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = n;

In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000).  That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.

</quote>

Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long
integers, and change af_unix to use an atomic_long_t instead of atomic_t.

get_max_files() is changed to return an unsigned long.  get_nr_files() is
changed to return a long.

unix_nr_socks is changed from atomic_t to atomic_long_t, while not
strictly needed to address Robin problem.

Before patch (on a 64bit kernel) :
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968

After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704     0       2147483648

Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/unix/af_unix.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0ebc777a6660..3c95304a0817 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,7 +117,7 @@
 
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
-static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static atomic_long_t unix_nr_socks;
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
@@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
 	if (u->addr)
 		unix_release_addr(u->addr);
 
-	atomic_dec(&unix_nr_socks);
+	atomic_long_dec(&unix_nr_socks);
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 #ifdef UNIX_REFCNT_DEBUG
-	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk,
-		atomic_read(&unix_nr_socks));
+	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
+		atomic_long_read(&unix_nr_socks));
 #endif
 }
 
@@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	atomic_inc(&unix_nr_socks);
-	if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
+	atomic_long_inc(&unix_nr_socks);
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
-		atomic_dec(&unix_nr_socks);
+		atomic_long_dec(&unix_nr_socks);
 	else {
 		local_bh_disable();
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-- 
cgit v1.2.2


From 67426b756c4d52c511c4b22b269accea171692a8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 29 Oct 2010 20:44:44 +0000
Subject: af_unix: use keyed wakeups

Instead of wakeup all sleepers, use wake_up_interruptible_sync_poll() to
wakeup only ones interested into writing the socket.

This patch is a specialization of commit 37e5540b3c9d (epoll keyed
wakeups: make sockets use keyed wakeups).

On a test program provided by Alan Crequy :

Before:
real    0m3.101s
user    0m0.000s
sys     0m6.104s

After:

real	0m0.211s
user	0m0.000s
sys	0m0.208s

Reported-by: Alban Crequy <alban.crequy@collabora.co.uk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3c95304a0817..f33c5958dbb2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -316,7 +316,8 @@ static void unix_write_space(struct sock *sk)
 	if (unix_writable(sk)) {
 		wq = rcu_dereference(sk->sk_wq);
 		if (wq_has_sleeper(wq))
-			wake_up_interruptible_sync(&wq->wait);
+			wake_up_interruptible_sync_poll(&wq->wait,
+				POLLOUT | POLLWRNORM | POLLWRBAND);
 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
@@ -1710,7 +1711,8 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		goto out_unlock;
 	}
 
-	wake_up_interruptible_sync(&u->peer_wait);
+	wake_up_interruptible_sync_poll(&u->peer_wait,
+					POLLOUT | POLLWRNORM | POLLWRBAND);
 
 	if (msg->msg_name)
 		unix_copy_addr(msg, skb->sk);
-- 
cgit v1.2.2


From 5456f09aaf88731e16dbcea7522cb330b6846415 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 31 Oct 2010 05:36:23 +0000
Subject: af_unix: fix unix_dgram_poll() behavior for EPOLLOUT event

Alban Crequy reported a problem with connected dgram af_unix sockets and
provided a test program. epoll() would miss to send an EPOLLOUT event
when a thread unqueues a packet from the other peer, making its receive
queue not full.

This is because unix_dgram_poll() fails to call sock_poll_wait(file,
&unix_sk(other)->peer_wait, wait);
if the socket is not writeable at the time epoll_ctl(ADD) is called.

We must call sock_poll_wait(), regardless of 'writable' status, so that
epoll can be notified later of states changes.

Misc: avoids testing twice (sk->sk_shutdown & RCV_SHUTDOWN)

Reported-by: Alban Crequy <alban.crequy@collabora.co.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f33c5958dbb2..e8898758dd31 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2074,13 +2074,12 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
@@ -2092,20 +2091,15 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 			return mask;
 	}
 
-	/* writable? */
 	writable = unix_writable(sk);
-	if (writable) {
-		other = unix_peer_get(sk);
-		if (other) {
-			if (unix_peer(other) != sk) {
-				sock_poll_wait(file, &unix_sk(other)->peer_wait,
-					  wait);
-				if (unix_recvq_full(other))
-					writable = 0;
-			}
-
-			sock_put(other);
+	other = unix_peer_get(sk);
+	if (other) {
+		if (unix_peer(other) != sk) {
+			sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
+			if (unix_recvq_full(other))
+				writable = 0;
 		}
+		sock_put(other);
 	}
 
 	if (writable)
-- 
cgit v1.2.2


From 973a34aa8593dbfe84386343c694f5beecb51d8a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 31 Oct 2010 05:38:25 +0000
Subject: af_unix: optimize unix_dgram_poll()

unix_dgram_poll() is pretty expensive to check POLLOUT status, because
it has to lock the socket to get its peer, take a reference on the peer
to check its receive queue status, and queue another poll_wait on
peer_wait. This all can be avoided if the process calling
unix_dgram_poll() is not interested in POLLOUT status. It makes
unix_dgram_recvmsg() faster by not queueing irrelevant pollers in
peer_wait.

On a test program provided by Alan Crequy :

Before:

real    0m0.211s
user    0m0.000s
sys     0m0.208s

After:

real    0m0.044s
user    0m0.000s
sys     0m0.040s

Suggested-by: Davide Libenzi <davidel@xmailserver.org>
Reported-by: Alban Crequy <alban.crequy@collabora.co.uk>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e8898758dd31..7ff31c60186a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2091,6 +2091,10 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
 			return mask;
 	}
 
+	/* No write status requested, avoid expensive OUT tests. */
+	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+		return mask;
+
 	writable = unix_writable(sk);
 	other = unix_peer_get(sk);
 	if (other) {
-- 
cgit v1.2.2


From 25888e30319f8896fc656fc68643e6a078263060 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 25 Nov 2010 04:11:39 +0000
Subject: af_unix: limit recursion level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Its easy to eat all kernel memory and trigger NMI watchdog, using an
exploit program that queues unix sockets on top of others.

lkml ref : http://lkml.org/lkml/2010/11/25/8

This mechanism is used in applications, one choice we have is to have a
recursion limit.

Other limits might be needed as well (if we queue other types of files),
since the passfd mechanism is currently limited by socket receive queue
sizes only.

Add a recursion_level to unix socket, allowing up to 4 levels.

Each time we send an unix socket through sendfd mechanism, we copy its
recursion level (plus one) to receiver. This recursion level is cleared
when socket receive queue is emptied.

Reported-by: Марк Коренберг <socketpair@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3c95304a0817..2268e6798124 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1343,9 +1343,25 @@ static void unix_destruct_scm(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
+#define MAX_RECURSION_LEVEL 4
+
 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	int i;
+	unsigned char max_level = 0;
+	int unix_sock_count = 0;
+
+	for (i = scm->fp->count - 1; i >= 0; i--) {
+		struct sock *sk = unix_get_socket(scm->fp->fp[i]);
+
+		if (sk) {
+			unix_sock_count++;
+			max_level = max(max_level,
+					unix_sk(sk)->recursion_level);
+		}
+	}
+	if (unlikely(max_level > MAX_RECURSION_LEVEL))
+		return -ETOOMANYREFS;
 
 	/*
 	 * Need to duplicate file references for the sake of garbage
@@ -1356,9 +1372,11 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	if (!UNIXCB(skb).fp)
 		return -ENOMEM;
 
-	for (i = scm->fp->count-1; i >= 0; i--)
-		unix_inflight(scm->fp->fp[i]);
-	return 0;
+	if (unix_sock_count) {
+		for (i = scm->fp->count - 1; i >= 0; i--)
+			unix_inflight(scm->fp->fp[i]);
+	}
+	return max_level;
 }
 
 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
@@ -1393,6 +1411,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct sk_buff *skb;
 	long timeo;
 	struct scm_cookie tmp_scm;
+	int max_level;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1431,8 +1450,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 
 	err = unix_scm_to_skb(siocb->scm, skb, true);
-	if (err)
+	if (err < 0)
 		goto out_free;
+	max_level = err + 1;
 	unix_get_secdata(siocb->scm, skb);
 
 	skb_reset_transport_header(skb);
@@ -1514,6 +1534,8 @@ restart:
 	if (sock_flag(other, SOCK_RCVTSTAMP))
 		__net_timestamp(skb);
 	skb_queue_tail(&other->sk_receive_queue, skb);
+	if (max_level > unix_sk(other)->recursion_level)
+		unix_sk(other)->recursion_level = max_level;
 	unix_state_unlock(other);
 	other->sk_data_ready(other, len);
 	sock_put(other);
@@ -1544,6 +1566,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int sent = 0;
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
+	int max_level;
 
 	if (NULL == siocb->scm)
 		siocb->scm = &tmp_scm;
@@ -1607,10 +1630,11 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 
 		/* Only send the fds in the first buffer */
 		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
-		if (err) {
+		if (err < 0) {
 			kfree_skb(skb);
 			goto out_err;
 		}
+		max_level = err + 1;
 		fds_sent = true;
 
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
@@ -1626,6 +1650,8 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 			goto pipe_err_free;
 
 		skb_queue_tail(&other->sk_receive_queue, skb);
+		if (max_level > unix_sk(other)->recursion_level)
+			unix_sk(other)->recursion_level = max_level;
 		unix_state_unlock(other);
 		other->sk_data_ready(other, size);
 		sent += size;
@@ -1845,6 +1871,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		unix_state_lock(sk);
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		if (skb == NULL) {
+			unix_sk(sk)->recursion_level = 0;
 			if (copied >= target)
 				goto unlock;
 
-- 
cgit v1.2.2


From 3610cda53f247e176bcbb7a7cca64bc53b12acdb Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Wed, 5 Jan 2011 15:38:53 -0800
Subject: af_unix: Avoid socket->sk NULL OOPS in stream connect security hooks.

unix_release() can asynchornously set socket->sk to NULL, and
it does so without holding the unix_state_lock() on "other"
during stream connects.

However, the reverse mapping, sk->sk_socket, is only transitioned
to NULL under the unix_state_lock().

Therefore make the security hooks follow the reverse mapping instead
of the forward mapping.

Reported-by: Jeremy Fitzhardinge <jeremy@goop.org>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 417d7a6c36cf..dd419d286204 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1157,7 +1157,7 @@ restart:
 		goto restart;
 	}
 
-	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
+	err = security_unix_stream_connect(sk, other, newsk);
 	if (err) {
 		unix_state_unlock(sk);
 		goto out_unlock;
-- 
cgit v1.2.2