author	Eric Dumazet <edumazet@google.com>	2012-09-23 19:04:42 -0400
committer	David S. Miller <davem@davemloft.net>	2012-09-24 16:31:37 -0400
commit	5640f7685831e088fe6c2e1f863a6805962f8e81 (patch)
tree	fb7660173338a45c27d610eb59ba20cf5c2b91b8	/net/ipv4/ip_output.c
parent	b98b8babd6e3370fadb7c6eaacb00eb2f6344a6c (diff)
net: use a per task frag allocator
We currently use a per-socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs.

It's done to increase the probability of coalescing small write() calls into single segments in skbs still in the write queue (not yet sent). But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page.

It's also quite inefficient for building TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit the page allocator more than wanted.

This patch adds a per-task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (Up to 32768 bytes per frag; that's order-3 pages on x86.)

This increases TCP stream performance by 20% on loopback devices, but also benefits other network devices, since 8x fewer frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled.

It's possible some SG-enabled hardware can't cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment into sub-fragments, since some arches have PAGE_SIZE = 65536.

Successfully tested on various ethernet devices (ixgbe, igb, bnx2x, tg3, mellanox mlx4).

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
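To make the allocation policy concrete, here is a minimal userspace-flavored sketch of the refill-with-fallback idea the changelog describes: try an order-3 (32768-byte) allocation first and step down toward order-0 under memory pressure. This is an illustrative model, not the kernel's actual sk_page_frag_refill() (which also recycles the page via its refcount); the names page_frag_sketch and frag_refill_sketch are hypothetical.

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

#define PAGE_SIZE_SKETCH 4096
#define FRAG_PAGE_ORDER  3       /* 4096 << 3 = 32768 bytes per frag */

struct page_frag_sketch {
	void   *page;            /* current backing allocation */
	size_t  offset;          /* first free byte inside it */
	size_t  size;            /* total usable bytes */
};

/* Ensure the frag has free room; prefer big allocations, then fall
 * back order by order when memory is tight (the automatic fallback). */
static bool frag_refill_sketch(struct page_frag_sketch *pfrag)
{
	int order;

	if (pfrag->page && pfrag->offset < pfrag->size)
		return true;                 /* room left, nothing to do */

	free(pfrag->page);                   /* kernel: put_page() */

	for (order = FRAG_PAGE_ORDER; order >= 0; order--) {
		pfrag->page = malloc((size_t)PAGE_SIZE_SKETCH << order);
		if (pfrag->page) {
			pfrag->offset = 0;
			pfrag->size = (size_t)PAGE_SIZE_SKETCH << order;
			return true;
		}
	}
	return false;                        /* caller returns -ENOMEM */
}

With a 32KB frag, a 64KB TSO skb needs two fragments instead of sixteen order-0 pages, which is where the 8x reduction in map/unmap work comes from.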
Diffstat (limited to 'net/ipv4/ip_output.c')
-rw-r--r--	net/ipv4/ip_output.c	| 70
1 file changed, 28 insertions(+), 42 deletions(-)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a5beab1dc958..24a29a39e9a8 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -793,6 +793,7 @@ static int __ip_append_data(struct sock *sk,
 			    struct flowi4 *fl4,
 			    struct sk_buff_head *queue,
 			    struct inet_cork *cork,
+			    struct page_frag *pfrag,
 			    int getfrag(void *from, char *to, int offset,
 					int len, int odd, struct sk_buff *skb),
 			    void *from, int length, int transhdrlen,
@@ -987,47 +988,30 @@ alloc_new_skb:
 			}
 		} else {
 			int i = skb_shinfo(skb)->nr_frags;
-			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
-			struct page *page = cork->page;
-			int off = cork->off;
-			unsigned int left;
-
-			if (page && (left = PAGE_SIZE - off) > 0) {
-				if (copy >= left)
-					copy = left;
-				if (page != skb_frag_page(frag)) {
-					if (i == MAX_SKB_FRAGS) {
-						err = -EMSGSIZE;
-						goto error;
-					}
-					skb_fill_page_desc(skb, i, page, off, 0);
-					skb_frag_ref(skb, i);
-					frag = &skb_shinfo(skb)->frags[i];
-				}
-			} else if (i < MAX_SKB_FRAGS) {
-				if (copy > PAGE_SIZE)
-					copy = PAGE_SIZE;
-				page = alloc_pages(sk->sk_allocation, 0);
-				if (page == NULL) {
-					err = -ENOMEM;
-					goto error;
-				}
-				cork->page = page;
-				cork->off = 0;
 
-				skb_fill_page_desc(skb, i, page, 0, 0);
-				frag = &skb_shinfo(skb)->frags[i];
-			} else {
-				err = -EMSGSIZE;
-				goto error;
-			}
-			if (getfrag(from, skb_frag_address(frag)+skb_frag_size(frag),
-				    offset, copy, skb->len, skb) < 0) {
-				err = -EFAULT;
+			err = -ENOMEM;
+			if (!sk_page_frag_refill(sk, pfrag))
 				goto error;
+
+			if (!skb_can_coalesce(skb, i, pfrag->page,
+					      pfrag->offset)) {
+				err = -EMSGSIZE;
+				if (i == MAX_SKB_FRAGS)
+					goto error;
+
+				__skb_fill_page_desc(skb, i, pfrag->page,
+						     pfrag->offset, 0);
+				skb_shinfo(skb)->nr_frags = ++i;
+				get_page(pfrag->page);
 			}
-			cork->off += copy;
-			skb_frag_size_add(frag, copy);
+			copy = min_t(int, copy, pfrag->size - pfrag->offset);
+			if (getfrag(from,
+				    page_address(pfrag->page) + pfrag->offset,
+				    offset, copy, skb->len, skb) < 0)
+				goto error_efault;
+
+			pfrag->offset += copy;
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 			skb->len += copy;
 			skb->data_len += copy;
 			skb->truesize += copy;
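The skb_can_coalesce() test above is what lets successive copies grow the last fragment in place instead of consuming a new frag slot. A rough standalone model of the check, assuming illustrative struct and field names (the real helper lives in include/linux/skbuff.h):

#include <stdbool.h>
#include <stddef.h>

struct frag_sketch {
	const void *page;         /* backing page of this fragment */
	size_t      page_offset;  /* where the fragment starts in it */
	size_t      size;         /* bytes already described */
};

/* New data can merge into the last fragment only when it sits in the
 * same page, directly after the bytes that fragment already covers. */
static bool can_coalesce_sketch(const struct frag_sketch *frags,
				int nr_frags,
				const void *page, size_t offset)
{
	const struct frag_sketch *last;

	if (nr_frags == 0)
		return false;     /* nothing to merge with */

	last = &frags[nr_frags - 1];
	return page == last->page &&
	       offset == last->page_offset + last->size;
}

When the test fails, the patched code starts a new fragment at pfrag->offset and takes a page reference; when it succeeds, skb_frag_size_add() simply extends frags[i - 1].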
@@ -1039,6 +1023,8 @@ alloc_new_skb:
 
 	return 0;
 
+error_efault:
+	err = -EFAULT;
 error:
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
@@ -1079,8 +1065,6 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
 	cork->dst = &rt->dst;
 	cork->length = 0;
 	cork->tx_flags = ipc->tx_flags;
-	cork->page = NULL;
-	cork->off = 0;
 
 	return 0;
 }
@@ -1117,7 +1101,8 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
 		transhdrlen = 0;
 	}
 
-	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base, getfrag,
+	return __ip_append_data(sk, fl4, &sk->sk_write_queue, &inet->cork.base,
+				sk_page_frag(sk), getfrag,
 				from, length, transhdrlen, flags);
 }
 
@@ -1439,7 +1424,8 @@ struct sk_buff *ip_make_skb(struct sock *sk,
 	if (err)
 		return ERR_PTR(err);
 
-	err = __ip_append_data(sk, fl4, &queue, &cork, getfrag,
+	err = __ip_append_data(sk, fl4, &queue, &cork,
+			       &current->task_frag, getfrag,
 			       from, length, transhdrlen, flags);
 	if (err) {
 		__ip_flush_pending_frames(sk, &queue, &cork);
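Note the two call sites pick different frag sources: ip_append_data() goes through sk_page_frag(sk), while ip_make_skb() uses &current->task_frag directly. A kernel-style sketch of the selection this implies, modeled on sk_page_frag() from include/net/sock.h as of this series; treat the exact condition as an assumption, not a quote:

/* Sockets whose allocations may sleep are serviced from task context,
 * so they can safely share the per-task frag; atomic-context users
 * keep a per-socket frag instead. (Sketch; condition is assumed.) */
static inline struct page_frag *sk_page_frag_sketch(struct sock *sk)
{
	if (sk->sk_allocation & __GFP_WAIT)	/* blocking allocation ok */
		return &current->task_frag;	/* per-task frag */

	return &sk->sk_frag;			/* per-socket fallback */
}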