diff options
author | Michael Chan <mchan@broadcom.com> | 2006-08-15 04:39:10 -0400 |
---|---|---|
committer | David S. Miller <davem@sunset.davemloft.net> | 2006-08-17 19:29:51 -0400 |
commit | 2f8af120a159a843948749ea88bcacda9779b132 (patch) | |
tree | e633037a72db856b8078e814e3451dd8f44c14de | |
parent | fb33f82568d32016b3b3c00819574f9526e52be3 (diff) |
[BNX2]: Fix tx race condition.
Fix a subtle race condition between bnx2_start_xmit() and bnx2_tx_int()
similar to the one in tg3 discovered by Herbert Xu:
CPU0 CPU1
bnx2_start_xmit()
if (tx_ring_full) {
tx_lock
bnx2_tx()
if (!netif_queue_stopped)
netif_stop_queue()
if (!tx_ring_full)
update_tx_ring
netif_wake_queue()
tx_unlock
}
Even though tx_ring is updated before the if statement in bnx2_tx_int() in
program order, it can be re-ordered by the CPU as shown above. This
scenario can cause the tx queue to be stopped forever if bnx2_tx_int() has
just freed up the entire tx_ring. The possibility of this happening
should be very rare though.
The following changes are made, very much identical to the tg3 fix:
1. Add memory barrier to fix the above race condition.
2. Eliminate the private tx_lock altogether and rely solely on
netif_tx_lock. This eliminates one spinlock in bnx2_start_xmit()
when the ring is full.
3. Because of 2, use netif_tx_lock in bnx2_tx_int() before calling
netif_wake_queue().
4. Add memory barrier to bnx2_tx_avail().
5. Add bp->tx_wake_thresh which is set to half the tx ring size.
6. Check for the full wake queue condition before getting
netif_tx_lock in tg3_tx(). This reduces the number of unnecessary
spinlocks when the tx ring is full in a steady-state condition.
Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | drivers/net/bnx2.c | 35 | ||||
-rw-r--r-- | drivers/net/bnx2.h | 12 |
2 files changed, 24 insertions, 23 deletions
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index db73de0d2511..2099edbbfdfd 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c | |||
@@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl); | |||
209 | 209 | ||
210 | static inline u32 bnx2_tx_avail(struct bnx2 *bp) | 210 | static inline u32 bnx2_tx_avail(struct bnx2 *bp) |
211 | { | 211 | { |
212 | u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); | 212 | u32 diff; |
213 | 213 | ||
214 | smp_mb(); | ||
215 | diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); | ||
214 | if (diff > MAX_TX_DESC_CNT) | 216 | if (diff > MAX_TX_DESC_CNT) |
215 | diff = (diff & MAX_TX_DESC_CNT) - 1; | 217 | diff = (diff & MAX_TX_DESC_CNT) - 1; |
216 | return (bp->tx_ring_size - diff); | 218 | return (bp->tx_ring_size - diff); |
@@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp) | |||
1686 | } | 1688 | } |
1687 | 1689 | ||
1688 | bp->tx_cons = sw_cons; | 1690 | bp->tx_cons = sw_cons; |
1691 | /* Need to make the tx_cons update visible to bnx2_start_xmit() | ||
1692 | * before checking for netif_queue_stopped(). Without the | ||
1693 | * memory barrier, there is a small possibility that bnx2_start_xmit() | ||
1694 | * will miss it and cause the queue to be stopped forever. | ||
1695 | */ | ||
1696 | smp_mb(); | ||
1689 | 1697 | ||
1690 | if (unlikely(netif_queue_stopped(bp->dev))) { | 1698 | if (unlikely(netif_queue_stopped(bp->dev)) && |
1691 | spin_lock(&bp->tx_lock); | 1699 | (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) { |
1700 | netif_tx_lock(bp->dev); | ||
1692 | if ((netif_queue_stopped(bp->dev)) && | 1701 | if ((netif_queue_stopped(bp->dev)) && |
1693 | (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { | 1702 | (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) |
1694 | |||
1695 | netif_wake_queue(bp->dev); | 1703 | netif_wake_queue(bp->dev); |
1696 | } | 1704 | netif_tx_unlock(bp->dev); |
1697 | spin_unlock(&bp->tx_lock); | ||
1698 | } | 1705 | } |
1699 | } | 1706 | } |
1700 | 1707 | ||
@@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp) | |||
3503 | struct tx_bd *txbd; | 3510 | struct tx_bd *txbd; |
3504 | u32 val; | 3511 | u32 val; |
3505 | 3512 | ||
3513 | bp->tx_wake_thresh = bp->tx_ring_size / 2; | ||
3514 | |||
3506 | txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT]; | 3515 | txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT]; |
3507 | 3516 | ||
3508 | txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32; | 3517 | txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32; |
@@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid) | |||
4390 | #endif | 4399 | #endif |
4391 | 4400 | ||
4392 | /* Called with netif_tx_lock. | 4401 | /* Called with netif_tx_lock. |
4393 | * hard_start_xmit is pseudo-lockless - a lock is only required when | 4402 | * bnx2_tx_int() runs without netif_tx_lock unless it needs to call |
4394 | * the tx queue is full. This way, we get the benefit of lockless | 4403 | * netif_wake_queue(). |
4395 | * operations most of the time without the complexities to handle | ||
4396 | * netif_stop_queue/wake_queue race conditions. | ||
4397 | */ | 4404 | */ |
4398 | static int | 4405 | static int |
4399 | bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) | 4406 | bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) |
@@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) | |||
4512 | dev->trans_start = jiffies; | 4519 | dev->trans_start = jiffies; |
4513 | 4520 | ||
4514 | if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { | 4521 | if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { |
4515 | spin_lock(&bp->tx_lock); | ||
4516 | netif_stop_queue(dev); | 4522 | netif_stop_queue(dev); |
4517 | 4523 | if (bnx2_tx_avail(bp) > bp->tx_wake_thresh) | |
4518 | if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS) | ||
4519 | netif_wake_queue(dev); | 4524 | netif_wake_queue(dev); |
4520 | spin_unlock(&bp->tx_lock); | ||
4521 | } | 4525 | } |
4522 | 4526 | ||
4523 | return NETDEV_TX_OK; | 4527 | return NETDEV_TX_OK; |
@@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev) | |||
5628 | bp->pdev = pdev; | 5632 | bp->pdev = pdev; |
5629 | 5633 | ||
5630 | spin_lock_init(&bp->phy_lock); | 5634 | spin_lock_init(&bp->phy_lock); |
5631 | spin_lock_init(&bp->tx_lock); | ||
5632 | INIT_WORK(&bp->reset_task, bnx2_reset_task, bp); | 5635 | INIT_WORK(&bp->reset_task, bnx2_reset_task, bp); |
5633 | 5636 | ||
5634 | dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0); | 5637 | dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0); |
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h index 658c5ee95c73..fe804763c607 100644 --- a/drivers/net/bnx2.h +++ b/drivers/net/bnx2.h | |||
@@ -3890,10 +3890,6 @@ struct bnx2 { | |||
3890 | u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES))); | 3890 | u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES))); |
3891 | u16 tx_prod; | 3891 | u16 tx_prod; |
3892 | 3892 | ||
3893 | struct tx_bd *tx_desc_ring; | ||
3894 | struct sw_bd *tx_buf_ring; | ||
3895 | int tx_ring_size; | ||
3896 | |||
3897 | u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES))); | 3893 | u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES))); |
3898 | u16 hw_tx_cons; | 3894 | u16 hw_tx_cons; |
3899 | 3895 | ||
@@ -3916,9 +3912,11 @@ struct bnx2 { | |||
3916 | struct sw_bd *rx_buf_ring; | 3912 | struct sw_bd *rx_buf_ring; |
3917 | struct rx_bd *rx_desc_ring[MAX_RX_RINGS]; | 3913 | struct rx_bd *rx_desc_ring[MAX_RX_RINGS]; |
3918 | 3914 | ||
3919 | /* Only used to synchronize netif_stop_queue/wake_queue when tx */ | 3915 | /* TX constants */ |
3920 | /* ring is full */ | 3916 | struct tx_bd *tx_desc_ring; |
3921 | spinlock_t tx_lock; | 3917 | struct sw_bd *tx_buf_ring; |
3918 | int tx_ring_size; | ||
3919 | u32 tx_wake_thresh; | ||
3922 | 3920 | ||
3923 | /* End of fields used in the performance code paths. */ | 3921 | /* End of fields used in the performance code paths. */ |
3924 | 3922 | ||