author		Eric Dumazet <edumazet@google.com>	2012-09-23 19:04:42 -0400
committer	David S. Miller <davem@davemloft.net>	2012-09-24 16:31:37 -0400
commit		5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
tree		fb7660173338a45c27d610eb59ba20cf5c2b91b8 /net/core
parent		b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per-socket order-0 page cache for tcp_sendmsg()
operations. This page is used to build fragments for skbs. This is done
to increase the probability of coalescing small write()s into single
segments in skbs still in the write queue (not yet sent).

But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It's also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the
page allocator more often than we'd like.

This patch adds a per-task frag allocator and uses bigger pages, if
available. An automatic fallback is done in case of memory pressure.
(Up to 32768 bytes per frag, that's order-3 pages on x86.)

This increases TCP stream performance by 20% on the loopback device,
but also benefits other network devices, since 8x fewer frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG-enabled hardware can't cope with bigger
fragments, but their ndo_start_xmit() should already handle this,
splitting a fragment into sub-fragments, since some arches have
PAGE_SIZE = 65536.

Successfully tested on various ethernet devices
(ixgbe, igb, bnx2x, tg3, mellanox mlx4).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/skbuff.c	37
-rw-r--r--	net/core/sock.c		49
2 files changed, 56 insertions(+), 30 deletions(-)
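Everything below manipulates one small descriptor, struct page_frag,
which the commit introduces in a shared header (outside the net/core
diffstat above), embedded both in struct sock (sk->sk_frag) and in
struct task_struct (current->task_frag). Its definition is roughly the
following; the field widths depend on whether a frag offset can exceed
16 bits on the target arch:

/* Roughly the descriptor added by this commit in a shared header
 * (outside net/core); paraphrased here for context.
 */
struct page_frag {
	struct page	*page;	/* current order-N page, or NULL */
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
	__u32		offset;	/* next unused byte inside the page(s) */
	__u32		size;	/* PAGE_SIZE << order */
#else
	__u16		offset;
	__u16		size;
#endif
};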
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fe00d1208167..2ede3cfa8ffa 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1655,38 +1655,19 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
 				   unsigned int *offset,
 				   struct sk_buff *skb, struct sock *sk)
 {
-	struct page *p = sk->sk_sndmsg_page;
-	unsigned int off;
+	struct page_frag *pfrag = sk_page_frag(sk);
 
-	if (!p) {
-new_page:
-		p = sk->sk_sndmsg_page = alloc_pages(sk->sk_allocation, 0);
-		if (!p)
-			return NULL;
-
-		off = sk->sk_sndmsg_off = 0;
-		/* hold one ref to this page until it's full */
-	} else {
-		unsigned int mlen;
-
-		/* If we are the only user of the page, we can reset offset */
-		if (page_count(p) == 1)
-			sk->sk_sndmsg_off = 0;
-		off = sk->sk_sndmsg_off;
-		mlen = PAGE_SIZE - off;
-		if (mlen < 64 && mlen < *len) {
-			put_page(p);
-			goto new_page;
-		}
+	if (!sk_page_frag_refill(sk, pfrag))
+		return NULL;
 
-		*len = min_t(unsigned int, *len, mlen);
-	}
+	*len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);
 
-	memcpy(page_address(p) + off, page_address(page) + *offset, *len);
-	sk->sk_sndmsg_off += *len;
-	*offset = off;
+	memcpy(page_address(pfrag->page) + pfrag->offset,
+	       page_address(page) + *offset, *len);
+	*offset = pfrag->offset;
+	pfrag->offset += *len;
 
-	return p;
+	return pfrag->page;
 }
 
 static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
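The rewritten linear_to_page() above calls sk_page_frag() to pick which
frag to use. That helper is added by this commit in include/net/sock.h,
so it does not appear in the net/core diffstat; its logic is roughly:

/* Roughly the selector added in include/net/sock.h: if the socket's
 * allocations may sleep, we are in process context, so the per-task
 * frag is safe to use (and uncontended); atomic-context users keep
 * the private per-socket frag.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
	if (sk->sk_allocation & __GFP_WAIT)
		return &current->task_frag;

	return &sk->sk_frag;
}

Keying the choice on __GFP_WAIT works because only process-context
callers can meaningfully dereference current; softirq transmit paths
never touch the task frag.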
diff --git a/net/core/sock.c b/net/core/sock.c
index 2693f7649222..727114cd6f7e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1744,6 +1744,45 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 }
 EXPORT_SYMBOL(sock_alloc_send_skb);
 
+/* On 32bit arches, an skb frag is limited to 2^15 */
+#define SKB_FRAG_PAGE_ORDER	get_order(32768)
+
+bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+	int order;
+
+	if (pfrag->page) {
+		if (atomic_read(&pfrag->page->_count) == 1) {
+			pfrag->offset = 0;
+			return true;
+		}
+		if (pfrag->offset < pfrag->size)
+			return true;
+		put_page(pfrag->page);
+	}
+
+	/* We restrict high order allocations to users that can afford to wait */
+	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
+
+	do {
+		gfp_t gfp = sk->sk_allocation;
+
+		if (order)
+			gfp |= __GFP_COMP | __GFP_NOWARN;
+		pfrag->page = alloc_pages(gfp, order);
+		if (likely(pfrag->page)) {
+			pfrag->offset = 0;
+			pfrag->size = PAGE_SIZE << order;
+			return true;
+		}
+	} while (--order >= 0);
+
+	sk_enter_memory_pressure(sk);
+	sk_stream_moderate_sndbuf(sk);
+	return false;
+}
+EXPORT_SYMBOL(sk_page_frag_refill);
+
 static void __lock_sock(struct sock *sk)
 	__releases(&sk->sk_lock.slock)
 	__acquires(&sk->sk_lock.slock)
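To put numbers on the changelog: with PAGE_SIZE = 4096,
get_order(32768) = 3, so a sleep-capable socket first asks for an
order-3 compound page (32KB, i.e. 8 pages) and retries at order 2, 1
and finally 0 when the allocator is under pressure. A 64KB TSO skb then
needs 2 frags instead of 16, which is the "8x fewer frags" cited above.
A hypothetical caller of the new API might look like the sketch below;
copy_to_sk_frag and its signature are illustrative, not part of the
patch, and only sk_page_frag()/sk_page_frag_refill() are the real API:

/* Illustrative caller sketch (not from this patch): copy len bytes of
 * data into the socket's frag and report where they landed, so the
 * caller can attach the page to an skb.
 */
static int copy_to_sk_frag(struct sock *sk, const char *data, int len,
			   struct page **pagep, unsigned int *offp)
{
	struct page_frag *pfrag = sk_page_frag(sk);
	int copy;

	/* refill guarantees at least one free byte on success */
	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;		/* memory pressure: caller backs off */

	copy = min_t(int, len, pfrag->size - pfrag->offset);
	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);

	*pagep = pfrag->page;		/* caller takes its own page ref */
	*offp = pfrag->offset;
	pfrag->offset += copy;
	return copy;
}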
@@ -2173,8 +2212,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_error_report	=	sock_def_error_report;
 	sk->sk_destruct		=	sock_def_destruct;
 
-	sk->sk_sndmsg_page	=	NULL;
-	sk->sk_sndmsg_off	=	0;
+	sk->sk_frag.page	=	NULL;
+	sk->sk_frag.offset	=	0;
 	sk->sk_peek_off		=	-1;
 
 	sk->sk_peer_pid		=	NULL;
@@ -2417,6 +2456,12 @@ void sk_common_release(struct sock *sk)
 	xfrm_sk_free_policy(sk);
 
 	sk_refcnt_debug_release(sk);
+
+	if (sk->sk_frag.page) {
+		put_page(sk->sk_frag.page);
+		sk->sk_frag.page = NULL;
+	}
+
 	sock_put(sk);
 }
 EXPORT_SYMBOL(sk_common_release);
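The hunk above releases the per-socket half of the state; the per-task
half cannot be freed here. Elsewhere in this commit (outside the
net/core diffstat shown), task_struct gains a task_frag field that
starts out NULL on fork and is dropped on the process exit path,
roughly:

/* Paraphrased sketch of the matching per-task cleanup on exit;
 * the exact placement lives outside net/core.
 */
if (tsk->task_frag.page)
	put_page(tsk->task_frag.page);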