path: root/include/linux/skbuff.h
author	Eric Dumazet <edumazet@google.com>	2012-04-26 20:33:38 -0400
committer	David S. Miller <davem@davemloft.net>	2012-04-30 21:35:11 -0400
commit	d3836f21b0af5513ef55701dd3f50b8c42e44c7a (patch)
tree	69a471411b1dbbc2bb0997dd5f9f53fce6c74a7e /include/linux/skbuff.h
parent	49cbb1c1e6fd8fb069ef9fbfadc97042168f93bf (diff)
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient, but has the drawback that the data cannot be converted to a page fragment if needed.

We have three spots where it hurts:

1) GRO aggregation

When a linear skb must be appended to another skb, GRO uses the frag_list fallback, which is very inefficient since we keep every struct sk_buff around. So drivers enabling GRO but delivering linear skbs to the network stack aren't enabling full GRO power.

2) splice(socket -> pipe)

We must copy the linear part to a page fragment. This kind of defeats splice()'s purpose (the zero-copy claim).

3) TCP coalescing

Recently introduced, this permits grouping several contiguous segments into a single skb. This shortens queue lengths, saves kernel memory, and greatly reduces the probability of TCP collapses. This coalescing doesn't work on linear skbs (or we would need to copy data, which would be too slow).

Given all these issues, the following patch introduces the possibility of skb->head being a fragment in itself. We use a new skb flag, skb->head_frag, to carry this information.

build_skb() is changed to accept a frag_size argument. Drivers willing to provide a page fragment instead of kmalloc() data will pass a non-zero value, set to the fragment size.

Then, in situations where we need to convert the skb head into a frag in itself, we can check whether skb->head_frag is set and avoid the copies or the various fallbacks we have.

This means drivers currently using frags could be updated to avoid the current skb->head allocation and reduce their memory footprint (aka skb truesize). (That's 512 or 1024 bytes saved per skb.)

This also makes bpf/netfilter faster, since the 'first frag' will be part of the skb linear part, with no need to copy data.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
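For illustration only (not part of the patch), here is a minimal sketch of how a driver receive path might call the new build_skb() signature. The helper names, the NET_SKB_PAD headroom convention and the buffer sizes are assumptions; the point is that a non-zero frag_size marks the head as a page fragment, while frag_size == 0 keeps the old kmalloc() behaviour.

#include <linux/skbuff.h>
#include <linux/netdevice.h>

/*
 * Hypothetical RX path: "data" points into a page fragment the driver
 * DMA'd the frame into, "buflen" is the size of that fragment.  Passing
 * a non-zero frag_size tells build_skb() that skb->head is a page
 * fragment, so skb->head_frag gets set.
 */
static struct sk_buff *rx_build_skb_from_frag(void *data,
					      unsigned int buflen,
					      unsigned int pkt_len)
{
	struct sk_buff *skb;

	skb = build_skb(data, buflen);	/* head is a page fragment */
	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, NET_SKB_PAD);	/* headroom the driver left before the frame */
	skb_put(skb, pkt_len);		/* frame payload */
	return skb;
}

/*
 * For comparison, a kmalloc()ed buffer passes frag_size == 0 and
 * skb->head_frag stays clear.
 */
static struct sk_buff *rx_build_skb_from_kmalloc(void *data,
						 unsigned int pkt_len)
{
	struct sk_buff *skb = build_skb(data, 0);

	if (skb)
		skb_put(skb, pkt_len);
	return skb;
}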
Diffstat (limited to 'include/linux/skbuff.h')
-rw-r--r--	include/linux/skbuff.h	5
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4a656b51825e..9d28a22a8554 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -470,7 +470,8 @@ struct sk_buff {
 	__u8			wifi_acked_valid:1;
 	__u8			wifi_acked:1;
 	__u8			no_fcs:1;
-	/* 9/11 bit hole (depending on ndisc_nodetype presence) */
+	__u8			head_frag:1;
+	/* 8/10 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
 #ifdef CONFIG_NET_DMA
@@ -562,7 +563,7 @@ extern void consume_skb(struct sk_buff *skb);
 extern void __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
 				   gfp_t priority, int fclone, int node);
-extern struct sk_buff *build_skb(void *data);
+extern struct sk_buff *build_skb(void *data, unsigned int frag_size);
 static inline struct sk_buff *alloc_skb(unsigned int size,
 					gfp_t priority)
 {
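Also for illustration (this helper is not in the tree), a hedged sketch of the kind of consumer the new head_frag bit enables: when the head is a page fragment, it can be attached to another skb by reference instead of being copied.

#include <linux/skbuff.h>
#include <linux/mm.h>

/*
 * Hypothetical consumer: attach "from"'s linear head to "to" as a page
 * fragment when possible.  virt_to_head_page() recovers the page backing
 * skb->head, which is only legitimate when head_frag is set.
 */
static bool head_as_frag(struct sk_buff *to, struct sk_buff *from)
{
	struct page *page;
	unsigned int offset;

	if (!from->head_frag)
		return false;	/* kmalloc()ed head: caller must copy */

	page = virt_to_head_page(from->head);
	offset = from->data - (unsigned char *)page_address(page);

	get_page(page);		/* the new fragment holds its own reference */
	skb_add_rx_frag(to, skb_shinfo(to)->nr_frags, page, offset,
			skb_headlen(from), skb_headlen(from));
	return true;
}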