aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric Dumazet <dada1@cosmosbay.com>2009-05-12 16:48:02 -0400
committerDavid S. Miller <davem@davemloft.net>2009-05-17 23:47:44 -0400
commitd62fda082c48b417b47a553860abf75d9cf8b591 (patch)
tree1b2679e4fcce72eb6ac584ecf9cc039fe9ea2c4a
parent9dc20c5f78c53bf57fb7874b6e942842e1db20d3 (diff)
bnx2: bnx2_tx_int() optimizations
When using bnx2 under a high transmit load, the cost of bnx2_tx_int() is pretty high. There are two reasons. One is an expensive call to bnx2_get_hw_tx_cons(bnapi) for each freed skb. The other is CPU stalls when accessing skb_is_gso(skb) / skb_shinfo(skb)->nr_frags, because of two cache line misses (one to get skb->end/head to compute skb_shinfo(skb), one to get is_gso/nr_frags). This patch: 1) avoids calling bnx2_get_hw_tx_cons(bnapi) too many times; 2) makes bnx2_start_xmit() cache is_gso & nr_frags in the sw_tx_bd descriptor, which uses a little bit more RAM (256 longs per device on x86) but helps a lot; 3) uses a prefetch(&skb->end) to speed up dev_kfree_skb(), bringing in the cache line that will be needed in skb_release_data(). The result is a 5% bandwidth increase in benchmarks involving UDP or TCP receive & transmit, when a CPU is dedicated to ksoftirqd for bnx2. bnx2_tx_int() goes from 3.33% CPU to 0.5% CPU in oprofile. Note: skb_dma_unmap() is still very expensive, but this is for another patch, not related to bnx2 (2.9% of CPU, while it does nothing on x86_32). Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/bnx2.c18
-rw-r--r--drivers/net/bnx2.h2
2 files changed, 13 insertions, 7 deletions
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index b0cb29d4cc01..c37acc1d10ac 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2630,14 +2630,15 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
2630 tx_buf = &txr->tx_buf_ring[sw_ring_cons]; 2630 tx_buf = &txr->tx_buf_ring[sw_ring_cons];
2631 skb = tx_buf->skb; 2631 skb = tx_buf->skb;
2632 2632
2633 /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */
2634 prefetch(&skb->end);
2635
2633 /* partial BD completions possible with TSO packets */ 2636 /* partial BD completions possible with TSO packets */
2634 if (skb_is_gso(skb)) { 2637 if (tx_buf->is_gso) {
2635 u16 last_idx, last_ring_idx; 2638 u16 last_idx, last_ring_idx;
2636 2639
2637 last_idx = sw_cons + 2640 last_idx = sw_cons + tx_buf->nr_frags + 1;
2638 skb_shinfo(skb)->nr_frags + 1; 2641 last_ring_idx = sw_ring_cons + tx_buf->nr_frags + 1;
2639 last_ring_idx = sw_ring_cons +
2640 skb_shinfo(skb)->nr_frags + 1;
2641 if (unlikely(last_ring_idx >= MAX_TX_DESC_CNT)) { 2642 if (unlikely(last_ring_idx >= MAX_TX_DESC_CNT)) {
2642 last_idx++; 2643 last_idx++;
2643 } 2644 }
@@ -2649,7 +2650,7 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
2649 skb_dma_unmap(&bp->pdev->dev, skb, DMA_TO_DEVICE); 2650 skb_dma_unmap(&bp->pdev->dev, skb, DMA_TO_DEVICE);
2650 2651
2651 tx_buf->skb = NULL; 2652 tx_buf->skb = NULL;
2652 last = skb_shinfo(skb)->nr_frags; 2653 last = tx_buf->nr_frags;
2653 2654
2654 for (i = 0; i < last; i++) { 2655 for (i = 0; i < last; i++) {
2655 sw_cons = NEXT_TX_BD(sw_cons); 2656 sw_cons = NEXT_TX_BD(sw_cons);
@@ -2662,7 +2663,8 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget)
2662 if (tx_pkt == budget) 2663 if (tx_pkt == budget)
2663 break; 2664 break;
2664 2665
2665 hw_cons = bnx2_get_hw_tx_cons(bnapi); 2666 if (hw_cons == sw_cons)
2667 hw_cons = bnx2_get_hw_tx_cons(bnapi);
2666 } 2668 }
2667 2669
2668 txr->hw_tx_cons = hw_cons; 2670 txr->hw_tx_cons = hw_cons;
@@ -6179,6 +6181,8 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
6179 txbd->tx_bd_vlan_tag_flags = vlan_tag_flags | TX_BD_FLAGS_START; 6181 txbd->tx_bd_vlan_tag_flags = vlan_tag_flags | TX_BD_FLAGS_START;
6180 6182
6181 last_frag = skb_shinfo(skb)->nr_frags; 6183 last_frag = skb_shinfo(skb)->nr_frags;
6184 tx_buf->nr_frags = last_frag;
6185 tx_buf->is_gso = skb_is_gso(skb);
6182 6186
6183 for (i = 0; i < last_frag; i++) { 6187 for (i = 0; i < last_frag; i++) {
6184 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 6188 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index 5b570e17c839..026ed1c84698 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -6552,6 +6552,8 @@ struct sw_pg {
6552 6552
6553struct sw_tx_bd { 6553struct sw_tx_bd {
6554 struct sk_buff *skb; 6554 struct sk_buff *skb;
6555 unsigned short is_gso;
6556 unsigned short nr_frags;
6555}; 6557};
6556 6558
6557#define SW_RXBD_RING_SIZE (sizeof(struct sw_bd) * RX_DESC_CNT) 6559#define SW_RXBD_RING_SIZE (sizeof(struct sw_bd) * RX_DESC_CNT)