diff options
author | Eric Dumazet <dada1@cosmosbay.com> | 2009-05-12 16:48:02 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2009-05-17 23:47:44 -0400 |
commit | d62fda082c48b417b47a553860abf75d9cf8b591 (patch) | |
tree | 1b2679e4fcce72eb6ac584ecf9cc039fe9ea2c4a | |
parent | 9dc20c5f78c53bf57fb7874b6e942842e1db20d3 (diff) |
bnx2: bnx2_tx_int() optimizations
When using bnx2 in a high transmit load, bnx2_tx_int() cost is pretty high.
There are two reasons.
One is an expensive call to bnx2_get_hw_tx_cons(bnapi) for each freed skb.
The other is CPU stalls when accessing skb_is_gso(skb) / skb_shinfo(skb)->nr_frags
because of two cache line misses
(one to get skb->end/head to compute skb_shinfo(skb),
one to get is_gso/nr_frags).
This patch :
1) avoids calling bnx2_get_hw_tx_cons(bnapi) too many times.
2) makes bnx2_start_xmit() cache is_gso & nr_frags into sw_tx_bd descriptor.
This uses a little bit more RAM (256 longs per device on x86), but helps a lot.
3) uses a prefetch(&skb->end) to speed up dev_kfree_skb(), bringing in the
cache line that will be needed in skb_release_data().
The result is a 5% bandwidth increase in benchmarks involving UDP or TCP receive
and transmit, when a CPU is dedicated to ksoftirqd for bnx2.
bnx2_tx_int() goes from 3.33% CPU to 0.5% CPU in oprofile.
Note: skb_dma_unmap() is still very expensive, but this is for another patch,
not related to bnx2 (2.9% of CPU, while it does nothing on x86_32).
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/bnx2.c | 18 | ||||
-rw-r--r-- | drivers/net/bnx2.h | 2 |
2 files changed, 13 insertions, 7 deletions
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index b0cb29d4cc01..c37acc1d10ac 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c | |||
@@ -2630,14 +2630,15 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget) | |||
2630 | tx_buf = &txr->tx_buf_ring[sw_ring_cons]; | 2630 | tx_buf = &txr->tx_buf_ring[sw_ring_cons]; |
2631 | skb = tx_buf->skb; | 2631 | skb = tx_buf->skb; |
2632 | 2632 | ||
2633 | /* prefetch skb_end_pointer() to speedup skb_shinfo(skb) */ | ||
2634 | prefetch(&skb->end); | ||
2635 | |||
2633 | /* partial BD completions possible with TSO packets */ | 2636 | /* partial BD completions possible with TSO packets */ |
2634 | if (skb_is_gso(skb)) { | 2637 | if (tx_buf->is_gso) { |
2635 | u16 last_idx, last_ring_idx; | 2638 | u16 last_idx, last_ring_idx; |
2636 | 2639 | ||
2637 | last_idx = sw_cons + | 2640 | last_idx = sw_cons + tx_buf->nr_frags + 1; |
2638 | skb_shinfo(skb)->nr_frags + 1; | 2641 | last_ring_idx = sw_ring_cons + tx_buf->nr_frags + 1; |
2639 | last_ring_idx = sw_ring_cons + | ||
2640 | skb_shinfo(skb)->nr_frags + 1; | ||
2641 | if (unlikely(last_ring_idx >= MAX_TX_DESC_CNT)) { | 2642 | if (unlikely(last_ring_idx >= MAX_TX_DESC_CNT)) { |
2642 | last_idx++; | 2643 | last_idx++; |
2643 | } | 2644 | } |
@@ -2649,7 +2650,7 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget) | |||
2649 | skb_dma_unmap(&bp->pdev->dev, skb, DMA_TO_DEVICE); | 2650 | skb_dma_unmap(&bp->pdev->dev, skb, DMA_TO_DEVICE); |
2650 | 2651 | ||
2651 | tx_buf->skb = NULL; | 2652 | tx_buf->skb = NULL; |
2652 | last = skb_shinfo(skb)->nr_frags; | 2653 | last = tx_buf->nr_frags; |
2653 | 2654 | ||
2654 | for (i = 0; i < last; i++) { | 2655 | for (i = 0; i < last; i++) { |
2655 | sw_cons = NEXT_TX_BD(sw_cons); | 2656 | sw_cons = NEXT_TX_BD(sw_cons); |
@@ -2662,7 +2663,8 @@ bnx2_tx_int(struct bnx2 *bp, struct bnx2_napi *bnapi, int budget) | |||
2662 | if (tx_pkt == budget) | 2663 | if (tx_pkt == budget) |
2663 | break; | 2664 | break; |
2664 | 2665 | ||
2665 | hw_cons = bnx2_get_hw_tx_cons(bnapi); | 2666 | if (hw_cons == sw_cons) |
2667 | hw_cons = bnx2_get_hw_tx_cons(bnapi); | ||
2666 | } | 2668 | } |
2667 | 2669 | ||
2668 | txr->hw_tx_cons = hw_cons; | 2670 | txr->hw_tx_cons = hw_cons; |
@@ -6179,6 +6181,8 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) | |||
6179 | txbd->tx_bd_vlan_tag_flags = vlan_tag_flags | TX_BD_FLAGS_START; | 6181 | txbd->tx_bd_vlan_tag_flags = vlan_tag_flags | TX_BD_FLAGS_START; |
6180 | 6182 | ||
6181 | last_frag = skb_shinfo(skb)->nr_frags; | 6183 | last_frag = skb_shinfo(skb)->nr_frags; |
6184 | tx_buf->nr_frags = last_frag; | ||
6185 | tx_buf->is_gso = skb_is_gso(skb); | ||
6182 | 6186 | ||
6183 | for (i = 0; i < last_frag; i++) { | 6187 | for (i = 0; i < last_frag; i++) { |
6184 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | 6188 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; |
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index 5b570e17c839..026ed1c84698 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h | |||
@@ -6552,6 +6552,8 @@ struct sw_pg { | |||
6552 | 6552 | ||
6553 | struct sw_tx_bd { | 6553 | struct sw_tx_bd { |
6554 | struct sk_buff *skb; | 6554 | struct sk_buff *skb; |
6555 | unsigned short is_gso; | ||
6556 | unsigned short nr_frags; | ||
6555 | }; | 6557 | }; |
6556 | 6558 | ||
6557 | #define SW_RXBD_RING_SIZE (sizeof(struct sw_bd) * RX_DESC_CNT) | 6559 | #define SW_RXBD_RING_SIZE (sizeof(struct sw_bd) * RX_DESC_CNT) |