aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichael Chan <mchan@broadcom.com>2006-08-15 04:39:10 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2006-08-17 19:29:51 -0400
commit2f8af120a159a843948749ea88bcacda9779b132 (patch)
treee633037a72db856b8078e814e3451dd8f44c14de
parentfb33f82568d32016b3b3c00819574f9526e52be3 (diff)
[BNX2]: Fix tx race condition.
Fix a subtle race condition between bnx2_start_xmit() and bnx2_tx_int() similar to the one in tg3 discovered by Herbert Xu: CPU0 CPU1 bnx2_start_xmit() if (tx_ring_full) { tx_lock bnx2_tx() if (!netif_queue_stopped) netif_stop_queue() if (!tx_ring_full) update_tx_ring netif_wake_queue() tx_unlock } Even though tx_ring is updated before the if statement in bnx2_tx_int() in program order, it can be re-ordered by the CPU as shown above. This scenario can cause the tx queue to be stopped forever if bnx2_tx_int() has just freed up the entire tx_ring. The possibility of this happening should be very rare though. The following changes are made, very much identical to the tg3 fix: 1. Add memory barrier to fix the above race condition. 2. Eliminate the private tx_lock altogether and rely solely on netif_tx_lock. This eliminates one spinlock in bnx2_start_xmit() when the ring is full. 3. Because of 2, use netif_tx_lock in bnx2_tx_int() before calling netif_wake_queue(). 4. Add memory barrier to bnx2_tx_avail(). 5. Add bp->tx_wake_thresh which is set to half the tx ring size. 6. Check for the full wake queue condition before getting netif_tx_lock in tg3_tx(). This reduces the number of unnecessary spinlocks when the tx ring is full in a steady-state condition. Signed-off-by: Michael Chan <mchan@broadcom.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/bnx2.c35
-rw-r--r--drivers/net/bnx2.h12
2 files changed, 24 insertions, 23 deletions
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index db73de0d2511..2099edbbfdfd 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
209 209
210static inline u32 bnx2_tx_avail(struct bnx2 *bp) 210static inline u32 bnx2_tx_avail(struct bnx2 *bp)
211{ 211{
212 u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons); 212 u32 diff;
213 213
214 smp_mb();
215 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
214 if (diff > MAX_TX_DESC_CNT) 216 if (diff > MAX_TX_DESC_CNT)
215 diff = (diff & MAX_TX_DESC_CNT) - 1; 217 diff = (diff & MAX_TX_DESC_CNT) - 1;
216 return (bp->tx_ring_size - diff); 218 return (bp->tx_ring_size - diff);
@@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp)
1686 } 1688 }
1687 1689
1688 bp->tx_cons = sw_cons; 1690 bp->tx_cons = sw_cons;
1691 /* Need to make the tx_cons update visible to bnx2_start_xmit()
1692 * before checking for netif_queue_stopped(). Without the
1693 * memory barrier, there is a small possibility that bnx2_start_xmit()
1694 * will miss it and cause the queue to be stopped forever.
1695 */
1696 smp_mb();
1689 1697
1690 if (unlikely(netif_queue_stopped(bp->dev))) { 1698 if (unlikely(netif_queue_stopped(bp->dev)) &&
1691 spin_lock(&bp->tx_lock); 1699 (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) {
1700 netif_tx_lock(bp->dev);
1692 if ((netif_queue_stopped(bp->dev)) && 1701 if ((netif_queue_stopped(bp->dev)) &&
1693 (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) { 1702 (bnx2_tx_avail(bp) > bp->tx_wake_thresh))
1694
1695 netif_wake_queue(bp->dev); 1703 netif_wake_queue(bp->dev);
1696 } 1704 netif_tx_unlock(bp->dev);
1697 spin_unlock(&bp->tx_lock);
1698 } 1705 }
1699} 1706}
1700 1707
@@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp)
3503 struct tx_bd *txbd; 3510 struct tx_bd *txbd;
3504 u32 val; 3511 u32 val;
3505 3512
3513 bp->tx_wake_thresh = bp->tx_ring_size / 2;
3514
3506 txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT]; 3515 txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT];
3507 3516
3508 txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32; 3517 txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32;
@@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
4390#endif 4399#endif
4391 4400
4392/* Called with netif_tx_lock. 4401/* Called with netif_tx_lock.
4393 * hard_start_xmit is pseudo-lockless - a lock is only required when 4402 * bnx2_tx_int() runs without netif_tx_lock unless it needs to call
4394 * the tx queue is full. This way, we get the benefit of lockless 4403 * netif_wake_queue().
4395 * operations most of the time without the complexities to handle
4396 * netif_stop_queue/wake_queue race conditions.
4397 */ 4404 */
4398static int 4405static int
4399bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) 4406bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
4512 dev->trans_start = jiffies; 4519 dev->trans_start = jiffies;
4513 4520
4514 if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) { 4521 if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
4515 spin_lock(&bp->tx_lock);
4516 netif_stop_queue(dev); 4522 netif_stop_queue(dev);
4517 4523 if (bnx2_tx_avail(bp) > bp->tx_wake_thresh)
4518 if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
4519 netif_wake_queue(dev); 4524 netif_wake_queue(dev);
4520 spin_unlock(&bp->tx_lock);
4521 } 4525 }
4522 4526
4523 return NETDEV_TX_OK; 4527 return NETDEV_TX_OK;
@@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
5628 bp->pdev = pdev; 5632 bp->pdev = pdev;
5629 5633
5630 spin_lock_init(&bp->phy_lock); 5634 spin_lock_init(&bp->phy_lock);
5631 spin_lock_init(&bp->tx_lock);
5632 INIT_WORK(&bp->reset_task, bnx2_reset_task, bp); 5635 INIT_WORK(&bp->reset_task, bnx2_reset_task, bp);
5633 5636
5634 dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0); 5637 dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0);
diff --git a/drivers/net/bnx2.h b/drivers/net/bnx2.h
index 658c5ee95c73..fe804763c607 100644
--- a/drivers/net/bnx2.h
+++ b/drivers/net/bnx2.h
@@ -3890,10 +3890,6 @@ struct bnx2 {
3890 u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES))); 3890 u32 tx_prod_bseq __attribute__((aligned(L1_CACHE_BYTES)));
3891 u16 tx_prod; 3891 u16 tx_prod;
3892 3892
3893 struct tx_bd *tx_desc_ring;
3894 struct sw_bd *tx_buf_ring;
3895 int tx_ring_size;
3896
3897 u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES))); 3893 u16 tx_cons __attribute__((aligned(L1_CACHE_BYTES)));
3898 u16 hw_tx_cons; 3894 u16 hw_tx_cons;
3899 3895
@@ -3916,9 +3912,11 @@ struct bnx2 {
3916 struct sw_bd *rx_buf_ring; 3912 struct sw_bd *rx_buf_ring;
3917 struct rx_bd *rx_desc_ring[MAX_RX_RINGS]; 3913 struct rx_bd *rx_desc_ring[MAX_RX_RINGS];
3918 3914
3919 /* Only used to synchronize netif_stop_queue/wake_queue when tx */ 3915 /* TX constants */
3920 /* ring is full */ 3916 struct tx_bd *tx_desc_ring;
3921 spinlock_t tx_lock; 3917 struct sw_bd *tx_buf_ring;
3918 int tx_ring_size;
3919 u32 tx_wake_thresh;
3922 3920
3923 /* End of fields used in the performance code paths. */ 3921 /* End of fields used in the performance code paths. */
3924 3922