author	Eric Dumazet <edumazet@google.com>	2012-04-26 20:33:38 -0400
committer	David S. Miller <davem@davemloft.net>	2012-04-30 21:35:11 -0400
commit	d3836f21b0af5513ef55701dd3f50b8c42e44c7a
tree	69a471411b1dbbc2bb0997dd5f9f53fce6c74a7e /net/core
parent	49cbb1c1e6fd8fb069ef9fbfadc97042168f93bf
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient but has the drawback that the data cannot be converted to a page fragment if needed.

We have three spots where it hurts:

1) GRO aggregation

   When a linear skb must be appended to another skb, GRO uses the frag_list fallback, which is very inefficient since we keep all struct sk_buff around. So drivers enabling GRO but delivering linear skbs to the network stack aren't enabling full GRO power.

2) splice(socket -> pipe)

   We must copy the linear part to a page fragment. This rather defeats the purpose of splice() (its zero-copy claim).

3) TCP coalescing

   Recently introduced, this permits grouping several contiguous segments into a single skb. This shortens queue lengths, saves kernel memory, and greatly reduces the probability of TCP collapses. This coalescing doesn't work on linear skbs (or we would need to copy data, which would be too slow).

Given all these issues, the following patch introduces the possibility of having skb->head be a page fragment in itself. We use a new skb flag, skb->head_frag, to carry this information.

build_skb() is changed to accept a frag_size argument. Drivers willing to provide a page fragment instead of kmalloc() data will set a non-zero value, equal to the fragment size.

Then, in situations where we need to convert the skb head to a frag in itself, we can check whether skb->head_frag is set and avoid the copies or the various fallbacks we have.

This means drivers currently using frags could be updated to avoid the current skb->head allocation and reduce their memory footprint (aka skb truesize). (That's 512 or 1024 bytes saved per skb.)

This also makes BPF/netfilter faster since the 'first frag' will be part of the skb linear part, with no need to copy data.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
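[Editor's note] For illustration only, here is a minimal sketch of how a receive path could hand build_skb() a page fragment instead of kmalloc()ed data. The helper example_rx_build_skb, the RX_FRAG_SIZE constant and the page/offset/len parameters are hypothetical and not part of this patch; the sketch only assumes the build_skb(data, frag_size) signature introduced here and that the fragment leaves room for struct skb_shared_info at its end.

    /*
     * Hypothetical RX helper (not part of this patch): build an skb whose
     * head is a page fragment.  A non-zero frag_size makes build_skb() set
     * skb->head_frag, so the stack can later reuse the head as a fragment
     * (GRO, splice, TCP coalescing) instead of copying it.
     *
     * The fragment is assumed to be RX_FRAG_SIZE bytes; build_skb() computes
     * size = frag_size - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
     * so the shared info area lives at the end of the fragment.
     */
    #include <linux/skbuff.h>
    #include <linux/mm.h>

    #define RX_FRAG_SIZE	2048	/* assumed per-buffer fragment size */

    static struct sk_buff *example_rx_build_skb(struct page *page,
    					     unsigned int offset,
    					     unsigned int len)
    {
    	void *buf = page_address(page) + offset;
    	struct sk_buff *skb;

    	/* The caller passes in ownership of one page reference; it is
    	 * released later by skb_free_head() -> put_page() when the skb
    	 * is freed, because head_frag gets set.
    	 */
    	skb = build_skb(buf, RX_FRAG_SIZE);
    	if (!skb) {
    		put_page(page);	/* drop the reference ourselves on failure */
    		return NULL;
    	}

    	skb_put(skb, len);	/* received payload already sits in the buffer */
    	return skb;
    }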
Diffstat (limited to 'net/core')
-rw-r--r--	net/core/skbuff.c	24
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2342a7250391..effa75d0e318 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -245,6 +245,7 @@ EXPORT_SYMBOL(__alloc_skb);
 /**
  * build_skb - build a network buffer
  * @data: data buffer provided by caller
+ * @frag_size: size of fragment, or 0 if head was kmalloced
  *
  * Allocate a new &sk_buff. Caller provides space holding head and
  * skb_shared_info. @data must have been allocated by kmalloc()
@@ -258,20 +259,21 @@ EXPORT_SYMBOL(__alloc_skb);
  * before giving packet to stack.
  * RX rings only contains data buffers, not full skbs.
  */
-struct sk_buff *build_skb(void *data)
+struct sk_buff *build_skb(void *data, unsigned int frag_size)
 {
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
-	unsigned int size;
+	unsigned int size = frag_size ? : ksize(data);
 
 	skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
 	if (!skb)
 		return NULL;
 
-	size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = SKB_TRUESIZE(size);
+	skb->head_frag = frag_size != 0;
 	atomic_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
@@ -376,6 +378,14 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
+static void skb_free_head(struct sk_buff *skb)
+{
+	if (skb->head_frag)
+		put_page(virt_to_head_page(skb->head));
+	else
+		kfree(skb->head);
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
@@ -402,7 +412,7 @@ static void skb_release_data(struct sk_buff *skb)
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
-		kfree(skb->head);
+		skb_free_head(skb);
 	}
 }
 
@@ -644,6 +654,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(tail);
 	C(end);
 	C(head);
+	C(head_frag);
 	C(data);
 	C(truesize);
 	atomic_set(&n->users, 1);
@@ -940,7 +951,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta;
 	}
 
-	if (fastpath &&
+	if (fastpath && !skb->head_frag &&
 	    size + sizeof(struct skb_shared_info) <= ksize(skb->head)) {
 		memmove(skb->head + size, skb_shinfo(skb),
 			offsetof(struct skb_shared_info,
@@ -967,7 +978,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
 	if (fastpath) {
-		kfree(skb->head);
+		skb_free_head(skb);
 	} else {
 		/* copy this zero copy skb frags */
 		if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
@@ -985,6 +996,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
+	skb->head_frag = 0;
 adjust_others:
 	skb->data += off;
 #ifdef NET_SKBUFF_DATA_USES_OFFSET