aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author: Eric Dumazet <edumazet@google.com> 2012-04-26 20:33:38 -0400
committer: David S. Miller <davem@davemloft.net> 2012-04-30 21:35:11 -0400
commit: d3836f21b0af5513ef55701dd3f50b8c42e44c7a (patch)
tree: 69a471411b1dbbc2bb0997dd5f9f53fce6c74a7e /drivers
parent: 49cbb1c1e6fd8fb069ef9fbfadc97042168f93bf (diff)
net: allow skb->head to be a page fragment
skb->head is currently allocated from kmalloc(). This is convenient, but has the drawback that the data cannot be converted to a page fragment if needed. We have three spots where it hurts: 1) GRO aggregation. When a linear skb must be appended to another skb, GRO uses the frag_list fallback, which is very inefficient since we keep all the struct sk_buff around. So drivers enabling GRO but delivering linear skbs to the network stack aren't enabling full GRO power. 2) splice(socket -> pipe). We must copy the linear part to a page fragment. This kind of defeats the purpose of splice() (its zero-copy claim). 3) TCP coalescing. Recently introduced, this permits grouping several contiguous segments into a single skb. This shortens queue lengths, saves kernel memory, and greatly reduces the probability of TCP collapses. This coalescing doesn't work on linear skbs (or we would need to copy data, which would be too slow). Given all these issues, the following patch introduces the possibility of having skb->head be a fragment in itself. We use a new skb flag, skb->head_frag, to carry this information. build_skb() is changed to accept a frag_size argument. Drivers willing to provide a page fragment instead of kmalloc() data will pass a non-zero value, set to the fragment size. Then, in situations where we need to convert the skb head to a frag in itself, we can check whether skb->head_frag is set and avoid the copies or various fallbacks we have. This means drivers currently using frags could be updated to avoid the current skb->head allocation and reduce their memory footprint (aka skb truesize); that's 512 or 1024 bytes saved per skb. This also makes bpf/netfilter faster, since the 'first frag' will be part of the skb linear part, with no need to copy data.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Maciej Żenczykowski <maze@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Matt Carlson <mcarlson@broadcom.com>
Cc: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/net/ethernet/broadcom/bnx2.c 2
-rw-r--r-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c 4
-rw-r--r-- drivers/net/ethernet/broadcom/tg3.c 2
3 files changed, 4 insertions, 4 deletions
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index ab55979b3756..ac7b74488531 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -3006,7 +3006,7 @@ error:
3006 3006
3007 dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size, 3007 dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size,
3008 PCI_DMA_FROMDEVICE); 3008 PCI_DMA_FROMDEVICE);
3009 skb = build_skb(data); 3009 skb = build_skb(data, 0);
3010 if (!skb) { 3010 if (!skb) {
3011 kfree(data); 3011 kfree(data);
3012 goto error; 3012 goto error;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index afa6cbb6b193..be0e90382d9e 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -513,7 +513,7 @@ static inline void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
513 dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping), 513 dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(rx_buf, mapping),
514 fp->rx_buf_size, DMA_FROM_DEVICE); 514 fp->rx_buf_size, DMA_FROM_DEVICE);
515 if (likely(new_data)) 515 if (likely(new_data))
516 skb = build_skb(data); 516 skb = build_skb(data, 0);
517 517
518 if (likely(skb)) { 518 if (likely(skb)) {
519#ifdef BNX2X_STOP_ON_ERROR 519#ifdef BNX2X_STOP_ON_ERROR
@@ -721,7 +721,7 @@ int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
721 dma_unmap_addr(rx_buf, mapping), 721 dma_unmap_addr(rx_buf, mapping),
722 fp->rx_buf_size, 722 fp->rx_buf_size,
723 DMA_FROM_DEVICE); 723 DMA_FROM_DEVICE);
724 skb = build_skb(data); 724 skb = build_skb(data, 0);
725 if (unlikely(!skb)) { 725 if (unlikely(!skb)) {
726 kfree(data); 726 kfree(data);
727 fp->eth_q_stats.rx_skb_alloc_failed++; 727 fp->eth_q_stats.rx_skb_alloc_failed++;
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 0c3e7c70ffbc..d481b0a99847 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -5844,7 +5844,7 @@ static int tg3_rx(struct tg3_napi *tnapi, int budget)
5844 pci_unmap_single(tp->pdev, dma_addr, skb_size, 5844 pci_unmap_single(tp->pdev, dma_addr, skb_size,
5845 PCI_DMA_FROMDEVICE); 5845 PCI_DMA_FROMDEVICE);
5846 5846
5847 skb = build_skb(data); 5847 skb = build_skb(data, 0);
5848 if (!skb) { 5848 if (!skb) {
5849 kfree(data); 5849 kfree(data);
5850 goto drop_it_no_recycle; 5850 goto drop_it_no_recycle;