diff options
author | Eric Dumazet <edumazet@google.com> | 2013-08-08 17:37:32 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2013-08-10 04:16:44 -0400 |
commit | e370a7236321773245c5522d8bb299380830d3b2 (patch) | |
tree | e374e13b5fcccdd9aa28fcb5ab0dd3df05b4d384 /net/unix | |
parent | 149479d019e06df5a7f4096f95c00cfb1380309c (diff) |
af_unix: improve STREAM behavior with fragmented memory
unix_stream_sendmsg() currently uses order-2 allocations,
and we had numerous reports this can fail.
The __GFP_REPEAT flag present in sock_alloc_send_pskb() is
not helping.
This patch extends to stream sockets the work done in commit
eb6a24816b247c ("af_unix: reduce high order page allocations")
for datagram sockets.
This opens the possibility of zero copy IO (splice() and
friends).
The trick is to not use skb_pull() anymore in the recvmsg() path,
and instead add a @consumed field in UNIXCB() to track the amount
of already-read payload in the skb.
There is a performance regression for large sends
because of extra page allocations; this will be addressed
in a follow-up patch that allows sock_alloc_send_pskb()
to attempt high order page allocations.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/unix')
-rw-r--r-- | net/unix/af_unix.c | 65 |
1 files changed, 30 insertions, 35 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c4ce243824bb..99dc760cdd95 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -1596,6 +1596,10 @@ out: | |||
1596 | return err; | 1596 | return err; |
1597 | } | 1597 | } |
1598 | 1598 | ||
1599 | /* We use paged skbs for stream sockets, and limit occupancy to 32768 | ||
1600 | * bytes, and a minimun of a full page. | ||
1601 | */ | ||
1602 | #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) | ||
1599 | 1603 | ||
1600 | static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | 1604 | static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, |
1601 | struct msghdr *msg, size_t len) | 1605 | struct msghdr *msg, size_t len) |
@@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
1609 | struct scm_cookie tmp_scm; | 1613 | struct scm_cookie tmp_scm; |
1610 | bool fds_sent = false; | 1614 | bool fds_sent = false; |
1611 | int max_level; | 1615 | int max_level; |
1616 | int data_len; | ||
1612 | 1617 | ||
1613 | if (NULL == siocb->scm) | 1618 | if (NULL == siocb->scm) |
1614 | siocb->scm = &tmp_scm; | 1619 | siocb->scm = &tmp_scm; |
@@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
1635 | goto pipe_err; | 1640 | goto pipe_err; |
1636 | 1641 | ||
1637 | while (sent < len) { | 1642 | while (sent < len) { |
1638 | /* | 1643 | size = len - sent; |
1639 | * Optimisation for the fact that under 0.01% of X | ||
1640 | * messages typically need breaking up. | ||
1641 | */ | ||
1642 | |||
1643 | size = len-sent; | ||
1644 | 1644 | ||
1645 | /* Keep two messages in the pipe so it schedules better */ | 1645 | /* Keep two messages in the pipe so it schedules better */ |
1646 | if (size > ((sk->sk_sndbuf >> 1) - 64)) | 1646 | size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64); |
1647 | size = (sk->sk_sndbuf >> 1) - 64; | ||
1648 | 1647 | ||
1649 | if (size > SKB_MAX_ALLOC) | 1648 | /* allow fallback to order-0 allocations */ |
1650 | size = SKB_MAX_ALLOC; | 1649 | size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ); |
1651 | |||
1652 | /* | ||
1653 | * Grab a buffer | ||
1654 | */ | ||
1655 | 1650 | ||
1656 | skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, | 1651 | data_len = max_t(int, 0, size - SKB_MAX_HEAD(0)); |
1657 | &err); | ||
1658 | 1652 | ||
1659 | if (skb == NULL) | 1653 | skb = sock_alloc_send_pskb(sk, size - data_len, data_len, |
1654 | msg->msg_flags & MSG_DONTWAIT, &err); | ||
1655 | if (!skb) | ||
1660 | goto out_err; | 1656 | goto out_err; |
1661 | 1657 | ||
1662 | /* | ||
1663 | * If you pass two values to the sock_alloc_send_skb | ||
1664 | * it tries to grab the large buffer with GFP_NOFS | ||
1665 | * (which can fail easily), and if it fails grab the | ||
1666 | * fallback size buffer which is under a page and will | ||
1667 | * succeed. [Alan] | ||
1668 | */ | ||
1669 | size = min_t(int, size, skb_tailroom(skb)); | ||
1670 | |||
1671 | |||
1672 | /* Only send the fds in the first buffer */ | 1658 | /* Only send the fds in the first buffer */ |
1673 | err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); | 1659 | err = unix_scm_to_skb(siocb->scm, skb, !fds_sent); |
1674 | if (err < 0) { | 1660 | if (err < 0) { |
@@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, | |||
1678 | max_level = err + 1; | 1664 | max_level = err + 1; |
1679 | fds_sent = true; | 1665 | fds_sent = true; |
1680 | 1666 | ||
1681 | err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); | 1667 | skb_put(skb, size - data_len); |
1668 | skb->data_len = data_len; | ||
1669 | skb->len = size; | ||
1670 | err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size); | ||
1682 | if (err) { | 1671 | if (err) { |
1683 | kfree_skb(skb); | 1672 | kfree_skb(skb); |
1684 | goto out_err; | 1673 | goto out_err; |
@@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, | |||
1890 | return timeo; | 1879 | return timeo; |
1891 | } | 1880 | } |
1892 | 1881 | ||
1882 | static unsigned int unix_skb_len(const struct sk_buff *skb) | ||
1883 | { | ||
1884 | return skb->len - UNIXCB(skb).consumed; | ||
1885 | } | ||
1886 | |||
1893 | static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, | 1887 | static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, |
1894 | struct msghdr *msg, size_t size, | 1888 | struct msghdr *msg, size_t size, |
1895 | int flags) | 1889 | int flags) |
@@ -1977,8 +1971,8 @@ again: | |||
1977 | } | 1971 | } |
1978 | 1972 | ||
1979 | skip = sk_peek_offset(sk, flags); | 1973 | skip = sk_peek_offset(sk, flags); |
1980 | while (skip >= skb->len) { | 1974 | while (skip >= unix_skb_len(skb)) { |
1981 | skip -= skb->len; | 1975 | skip -= unix_skb_len(skb); |
1982 | last = skb; | 1976 | last = skb; |
1983 | skb = skb_peek_next(skb, &sk->sk_receive_queue); | 1977 | skb = skb_peek_next(skb, &sk->sk_receive_queue); |
1984 | if (!skb) | 1978 | if (!skb) |
@@ -2005,8 +1999,9 @@ again: | |||
2005 | sunaddr = NULL; | 1999 | sunaddr = NULL; |
2006 | } | 2000 | } |
2007 | 2001 | ||
2008 | chunk = min_t(unsigned int, skb->len - skip, size); | 2002 | chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); |
2009 | if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) { | 2003 | if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, |
2004 | msg->msg_iov, chunk)) { | ||
2010 | if (copied == 0) | 2005 | if (copied == 0) |
2011 | copied = -EFAULT; | 2006 | copied = -EFAULT; |
2012 | break; | 2007 | break; |
@@ -2016,14 +2011,14 @@ again: | |||
2016 | 2011 | ||
2017 | /* Mark read part of skb as used */ | 2012 | /* Mark read part of skb as used */ |
2018 | if (!(flags & MSG_PEEK)) { | 2013 | if (!(flags & MSG_PEEK)) { |
2019 | skb_pull(skb, chunk); | 2014 | UNIXCB(skb).consumed += chunk; |
2020 | 2015 | ||
2021 | sk_peek_offset_bwd(sk, chunk); | 2016 | sk_peek_offset_bwd(sk, chunk); |
2022 | 2017 | ||
2023 | if (UNIXCB(skb).fp) | 2018 | if (UNIXCB(skb).fp) |
2024 | unix_detach_fds(siocb->scm, skb); | 2019 | unix_detach_fds(siocb->scm, skb); |
2025 | 2020 | ||
2026 | if (skb->len) | 2021 | if (unix_skb_len(skb)) |
2027 | break; | 2022 | break; |
2028 | 2023 | ||
2029 | skb_unlink(skb, &sk->sk_receive_queue); | 2024 | skb_unlink(skb, &sk->sk_receive_queue); |
@@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk) | |||
2107 | if (sk->sk_type == SOCK_STREAM || | 2102 | if (sk->sk_type == SOCK_STREAM || |
2108 | sk->sk_type == SOCK_SEQPACKET) { | 2103 | sk->sk_type == SOCK_SEQPACKET) { |
2109 | skb_queue_walk(&sk->sk_receive_queue, skb) | 2104 | skb_queue_walk(&sk->sk_receive_queue, skb) |
2110 | amount += skb->len; | 2105 | amount += unix_skb_len(skb); |
2111 | } else { | 2106 | } else { |
2112 | skb = skb_peek(&sk->sk_receive_queue); | 2107 | skb = skb_peek(&sk->sk_receive_queue); |
2113 | if (skb) | 2108 | if (skb) |