author	Rainer Weikusat <rweikusat@mssgmbh.com>	2008-06-27 22:34:18 -0400
committer	David S. Miller <davem@davemloft.net>	2008-06-27 22:34:18 -0400
commit	ec0d215f9420564fc8286dcf93d2d068bb53a07e (patch)
tree	f28d77219b4983c5e8c042202d5e37a746f05e60
parent	db43a282d3ec92ea45109c5551fff3dcc5afef02 (diff)
af_unix: fix 'poll for write'/connected DGRAM sockets
For n:1 'datagram connections' (e.g. /dev/log), the unix_dgram_sendmsg routine implements a form of receiver-imposed flow control by comparing the length of the receive queue of the 'peer socket' with the max_ack_backlog value stored in the corresponding sock structure, either blocking the thread which caused the send routine to be called or returning EAGAIN. This routine is used by both SOCK_DGRAM and SOCK_SEQPACKET sockets. The poll implementation for these socket types is datagram_poll from core/datagram.c. A socket is deemed to be writeable by this routine when the memory presently consumed by datagrams owned by it is less than the configured socket send buffer size. This is always wrong for PF_UNIX non-stream sockets connected to server sockets dealing with (potentially) multiple clients if the above-mentioned receive queue is currently considered to be full. 'poll' will then return, indicating that the socket is writeable, but a subsequent write results in EAGAIN, effectively causing a (typical) application to 'poll for writeability by repeated send requests with O_NONBLOCK set' until it has consumed its time quantum.

The change below uses a suitably modified variant of the datagram_poll routine for both types of PF_UNIX sockets, which tests whether the receive queue of the peer a socket is connected to is presently considered to be 'full' as part of the 'is this socket writeable' check. The socket being polled is additionally put onto the peer_wait wait queue associated with its peer, because the unix_dgram_recvmsg routine does a wake-up on this queue after a datagram was received and the 'other wakeup call' is done implicitly as part of skb destruction; this means a process blocked in poll because of a full peer receive queue could otherwise sleep forever if no datagram owned by its socket was already sitting on this queue. Part of this change is a small inline helper routine named 'unix_recvq_full', which consolidates the actual testing code (previously in three different places) into a single location.

Signed-off-by: Rainer Weikusat <rweikusat@mssgmbh.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
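To make the described failure mode concrete, here is a userspace sketch (not part of the patch; the socket path and payload are made up for illustration) of the 'poll for writeability, then send with O_NONBLOCK' pattern the message refers to. On a kernel without this fix, poll() can report POLLOUT while the peer's receive queue is already full, so the subsequent send() keeps failing with EAGAIN and the loop spins instead of sleeping in poll().

/*
 * Hypothetical client connected to an AF_UNIX SOCK_DGRAM server
 * (e.g. a syslog-style socket); path and message are made up.
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un sun = { .sun_family = AF_UNIX };
	const char msg[] = "hello";
	struct pollfd pfd;
	int fd;

	strcpy(sun.sun_path, "/tmp/example.sock");	/* hypothetical path */

	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
	if (fd < 0 || connect(fd, (struct sockaddr *)&sun, sizeof(sun)) < 0) {
		perror("socket/connect");
		return 1;
	}
	fcntl(fd, F_SETFL, O_NONBLOCK);

	pfd.fd = fd;
	pfd.events = POLLOUT;

	for (;;) {
		/* Without the fix this may report POLLOUT although the
		 * peer's receive queue is already full ... */
		if (poll(&pfd, 1, -1) < 0)
			break;
		/* ... so this send can still fail with EAGAIN and the
		 * loop burns CPU instead of blocking in poll(). */
		if (send(fd, msg, sizeof(msg), 0) < 0 && errno == EAGAIN)
			continue;
		break;
	}
	close(fd);
	return 0;
}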
-rw-r--r--	net/unix/af_unix.c	52
1 file changed, 24 insertions, 28 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 657835f227d3..783317dacd30 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -487,8 +487,8 @@ static int unix_socketpair(struct socket *, struct socket *);
 static int unix_accept(struct socket *, struct socket *, int);
 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
-static unsigned int unix_datagram_poll(struct file *, struct socket *,
-				       poll_table *);
+static unsigned int unix_dgram_poll(struct file *, struct socket *,
+				    poll_table *);
 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
 static int unix_shutdown(struct socket *, int);
 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
@@ -534,7 +534,7 @@ static const struct proto_ops unix_dgram_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	unix_getname,
-	.poll =		unix_datagram_poll,
+	.poll =		unix_dgram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	unix_shutdown,
@@ -555,7 +555,7 @@ static const struct proto_ops unix_seqpacket_ops = {
 	.socketpair =	unix_socketpair,
 	.accept =	unix_accept,
 	.getname =	unix_getname,
-	.poll =		unix_datagram_poll,
+	.poll =		unix_dgram_poll,
 	.ioctl =	unix_ioctl,
 	.listen =	unix_listen,
 	.shutdown =	unix_shutdown,
@@ -1994,29 +1994,13 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
 	return mask;
 }
 
-static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
-				       poll_table *wait)
+static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
+				    poll_table *wait)
 {
-	struct sock *sk = sock->sk, *peer;
-	unsigned int mask;
+	struct sock *sk = sock->sk, *other;
+	unsigned int mask, writable;
 
 	poll_wait(file, sk->sk_sleep, wait);
-
-	peer = unix_peer_get(sk);
-	if (peer) {
-		if (peer != sk) {
-			/*
-			 * Writability of a connected socket additionally
-			 * depends on the state of the receive queue of the
-			 * peer.
-			 */
-			poll_wait(file, &unix_sk(peer)->peer_wait, wait);
-		} else {
-			sock_put(peer);
-			peer = NULL;
-		}
-	}
-
 	mask = 0;
 
 	/* exceptional events? */
@@ -2042,14 +2026,26 @@ static unsigned int unix_datagram_poll(struct file *file, struct socket *sock,
 	}
 
 	/* writable? */
-	if (unix_writable(sk) && !(peer && unix_recvq_full(peer)))
+	writable = unix_writable(sk);
+	if (writable) {
+		other = unix_peer_get(sk);
+		if (other) {
+			if (unix_peer(other) != sk) {
+				poll_wait(file, &unix_sk(other)->peer_wait,
+					  wait);
+				if (unix_recvq_full(other))
+					writable = 0;
+			}
+
+			sock_put(other);
+		}
+	}
+
+	if (writable)
 		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
 	else
 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
-	if (peer)
-		sock_put(peer);
-
 	return mask;
 }
 
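The unix_recvq_full() test used in the new writability check is a helper whose definition is not part of the hunks shown above. Based on the flow-control test described in the commit message (receive-queue length compared against the sock's max_ack_backlog), its shape is presumably something like the following sketch:

/* Assumed shape of the helper named in the commit message; kernel context
 * (struct sock, skb_queue_len) is required.  It encapsulates the
 * receiver-imposed flow-control test: is the receive queue longer than
 * the configured backlog limit? */
static inline int unix_recvq_full(struct sock const *sk)
{
	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}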