author:    Eric Dumazet <edumazet@google.com>     2012-09-23 19:04:42 -0400
committer: David S. Miller <davem@davemloft.net>  2012-09-24 16:31:37 -0400
commit:    5640f7685831e088fe6c2e1f863a6805962f8e81
tree:      fb7660173338a45c27d610eb59ba20cf5c2b91b8
parent:    b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. This is done to increase the probability of coalescing small write() calls into single segments in skbs still in the write queue (not yet sent).

But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It is also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more often than wanted.

This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure (up to 32768 bytes per frag, that's order-3 pages on x86).

This increases TCP stream performance by 20% on the loopback device, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub-fragments, since some arches have PAGE_SIZE=65536.

Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, Mellanox mlx4).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
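Editorial note: the allocator itself lands outside this file (the diff below only shows ip6_output.c, the consumer side). A simplified sketch of the refill policy the message describes — keep writing into the current page while it has room, otherwise allocate a fresh one, preferring order-3 (32KB) and stepping the order down under memory pressure — might look like the following; the real sk_page_frag_refill() added in net/core/sock.c also recycles a page whose refcount proves no one else still holds it, which this sketch omits:

/* Illustrative sketch only, not the verbatim helper. */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)

static bool frag_refill(struct page_frag *pfrag, gfp_t gfp)
{
	int order = SKB_FRAG_PAGE_ORDER;

	if (pfrag->page) {
		if (pfrag->offset < pfrag->size)
			return true;		/* room left, keep using it */
		put_page(pfrag->page);		/* exhausted, drop our ref */
	}
	do {
		gfp_t mask = order ? gfp | __GFP_COMP | __GFP_NOWARN : gfp;

		pfrag->page = alloc_pages(mask, order);
		if (pfrag->page) {
			pfrag->offset = 0;
			pfrag->size = PAGE_SIZE << order;
			return true;
		}
	} while (--order >= 0);	/* automatic fallback under pressure */

	return false;
}

Because the cached page hangs off the task rather than the socket, a process juggling thousands of mostly idle sockets pays for at most one page of this cache instead of one per socket.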
Diffstat (limited to 'net/ipv6/ip6_output.c')
-rw-r--r--  net/ipv6/ip6_output.c | 65
1 file changed, 25 insertions(+), 40 deletions(-)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3dd4a37488d5..aece3e792f84 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1279,8 +1279,6 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
 	if (dst_allfrag(rt->dst.path))
 		cork->flags |= IPCORK_ALLFRAG;
 	cork->length = 0;
-	sk->sk_sndmsg_page = NULL;
-	sk->sk_sndmsg_off = 0;
 	exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
 	length += exthdrlen;
 	transhdrlen += exthdrlen;
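Editorial note on this hunk: the two deleted assignments are its whole point — the (page, offset) cursor no longer lives on each socket. It moves into a small struct page_frag reached through sk_page_frag(sk), which hands back the per-task copy when the allocation may sleep and a per-socket fallback otherwise. Roughly (field widths in the real header vary with BITS_PER_LONG and PAGE_SIZE):

struct page_frag {
	struct page	*page;		/* current page frags are carved from */
	__u32		offset;		/* next free byte inside that page */
	__u32		size;		/* usable bytes: PAGE_SIZE << order */
};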
@@ -1504,48 +1502,31 @@ alloc_new_skb:
 		}
 	} else {
 		int i = skb_shinfo(skb)->nr_frags;
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-		struct page *page = sk->sk_sndmsg_page;
-		int off = sk->sk_sndmsg_off;
-		unsigned int left;
-
-		if (page && (left = PAGE_SIZE - off) > 0) {
-			if (copy >= left)
-				copy = left;
-			if (page != skb_frag_page(frag)) {
-				if (i == MAX_SKB_FRAGS) {
-					err = -EMSGSIZE;
-					goto error;
-				}
-				skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
-				skb_frag_ref(skb, i);
-				frag = &skb_shinfo(skb)->frags[i];
-			}
-		} else if(i < MAX_SKB_FRAGS) {
-			if (copy > PAGE_SIZE)
-				copy = PAGE_SIZE;
-			page = alloc_pages(sk->sk_allocation, 0);
-			if (page == NULL) {
-				err = -ENOMEM;
-				goto error;
-			}
-			sk->sk_sndmsg_page = page;
-			sk->sk_sndmsg_off = 0;
+		struct page_frag *pfrag = sk_page_frag(sk);
 
-			skb_fill_page_desc(skb, i, page, 0, 0);
-			frag = &skb_shinfo(skb)->frags[i];
-		} else {
-			err = -EMSGSIZE;
+		err = -ENOMEM;
+		if (!sk_page_frag_refill(sk, pfrag))
 			goto error;
+
+		if (!skb_can_coalesce(skb, i, pfrag->page,
+				      pfrag->offset)) {
+			err = -EMSGSIZE;
+			if (i == MAX_SKB_FRAGS)
+				goto error;
+
+			__skb_fill_page_desc(skb, i, pfrag->page,
+					     pfrag->offset, 0);
+			skb_shinfo(skb)->nr_frags = ++i;
+			get_page(pfrag->page);
 		}
+		copy = min_t(int, copy, pfrag->size - pfrag->offset);
 		if (getfrag(from,
-			    skb_frag_address(frag) + skb_frag_size(frag),
-			    offset, copy, skb->len, skb) < 0) {
-			err = -EFAULT;
-			goto error;
-		}
-		sk->sk_sndmsg_off += copy;
-		skb_frag_size_add(frag, copy);
+			    page_address(pfrag->page) + pfrag->offset,
+			    offset, copy, skb->len, skb) < 0)
+			goto error_efault;
+
+		pfrag->offset += copy;
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->truesize += copy;
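Editorial note: the heart of the new path is the coalesce test. When the fresh bytes land in the same page immediately after frag i-1, that frag is simply grown and no new frag slot (or extra page reference) is consumed; only when the test fails does the code claim slot i and take a reference with get_page(). A sketch close to the skb_can_coalesce() helper this hunk relies on:

static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
				    const struct page *page, int off)
{
	if (i) {
		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];

		/* mergeable iff same page and contiguous offset */
		return page == skb_frag_page(frag) &&
		       off == frag->page_offset + skb_frag_size(frag);
	}
	return false;
}

This is what makes a shared per-task page safe across sockets: each skb only ever grows its own tail frag, and every new frag slot pins the page with its own reference.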
@@ -1554,7 +1535,11 @@ alloc_new_skb:
 		offset += copy;
 		length -= copy;
 	}
+
 	return 0;
+
+error_efault:
+	err = -EFAULT;
 error:
 	cork->length -= length;
 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);