[BNX2]: Fix tx race condition.

Fix a subtle race condition between bnx2_start_xmit() and bnx2_tx_int() similar to the one in tg3 discovered by Herbert Xu: CPU0 CPU1 bnx2_start_xmit() if (tx_ring_full) { tx_lock bnx2_tx() if (!netif_queue_stopped) netif_stop_queue() if (!tx_ring_full) update_tx_ring netif_wake_queue() tx_unlock } Even though tx_ring is updated before the if statement in bnx2_tx_int() in program order, it can be re-ordered by the CPU as shown above. This scenario can cause the tx queue to be stopped forever if bnx2_tx_int() has just freed up the entire tx_ring. The possibility of this happening should be very rare though. The following changes are made, very much identical to the tg3 fix: 1. Add memory barrier to fix the above race condition. 2. Eliminate the private tx_lock altogether and rely solely on netif_tx_lock. This eliminates one spinlock in bnx2_start_xmit() when the ring is full. 3. Because of 2, use netif_tx_lock in bnx2_tx_int() before calling netif_wake_queue(). 4. Add memory barrier to bnx2_tx_avail(). 5. Add bp->tx_wake_thresh which is set to half the tx ring size. 6. Check for the full wake queue condition before getting netif_tx_lock in tg3_tx(). This reduces the number of unnecessary spinlocks when the tx ring is full in a steady-state condition. Signed-off-by: Michael Chan <mchan@broadcom.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Michael Chan <mchan@broadcom.com> 2006-08-15 04:39:10 -0400
committer: David S. Miller <davem@sunset.davemloft.net> 2006-08-17 19:29:51 -0400
commit: 2f8af120a159a843948749ea88bcacda9779b132 (patch)
tree: e633037a72db856b8078e814e3451dd8f44c14de /drivers/net/bnx2.c
parent: fb33f82568d32016b3b3c00819574f9526e52be3 (diff)
1 files changed, 19 insertions, 16 deletions
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index db73de0d2511..2099edbbfdfd 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
 static inline u32 bnx2_tx_avail(struct bnx2 *bp)
 {
-        u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
+        u32 diff;
+        smp_mb();
+        diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
        if (diff > MAX_TX_DESC_CNT)
                diff = (diff & MAX_TX_DESC_CNT) - 1;
        return (bp->tx_ring_size - diff);
@@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp)
        }
        bp->tx_cons = sw_cons;
+        /* Need to make the tx_cons update visible to bnx2_start_xmit()
+         * before checking for netif_queue_stopped().  Without the
+         * memory barrier, there is a small possibility that bnx2_start_xmit()
+         * will miss it and cause the queue to be stopped forever.
+         */
+        smp_mb();
-        if (unlikely(netif_queue_stopped(bp->dev))) {
+        if (unlikely(netif_queue_stopped(bp->dev)) &&
-                spin_lock(&bp->tx_lock);
+                     (bnx2_tx_avail(bp) > bp->tx_wake_thresh)) {
+                netif_tx_lock(bp->dev);
                if ((netif_queue_stopped(bp->dev)) &&
-                    (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) {
+                    (bnx2_tx_avail(bp) > bp->tx_wake_thresh))
                        netif_wake_queue(bp->dev);
-                }
+                netif_tx_unlock(bp->dev);
-                spin_unlock(&bp->tx_lock);
        }
 }
@@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp)
        struct tx_bd *txbd;
        u32 val;
+        bp->tx_wake_thresh = bp->tx_ring_size / 2;
        txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT];
                
        txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32;
@@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
 #endif
 /* Called with netif_tx_lock.
- * hard_start_xmit is pseudo-lockless - a lock is only required when
+ * bnx2_tx_int() runs without netif_tx_lock unless it needs to call
- * the tx queue is full. This way, we get the benefit of lockless
+ * netif_wake_queue().
- * operations most of the time without the complexities to handle
- * netif_stop_queue/wake_queue race conditions.
 */
 static int
 bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev)
        dev->trans_start = jiffies;
        if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
-                spin_lock(&bp->tx_lock);
                netif_stop_queue(dev);
-                
+                if (bnx2_tx_avail(bp) > bp->tx_wake_thresh)
-                if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
                        netif_wake_queue(dev);
-                spin_unlock(&bp->tx_lock);
        }
        return NETDEV_TX_OK;
@@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
        bp->pdev = pdev;
        spin_lock_init(&bp->phy_lock);
-        spin_lock_init(&bp->tx_lock);
        INIT_WORK(&bp->reset_task, bnx2_reset_task, bp);
        dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0);
author	Michael Chan <mchan@broadcom.com>	2006-08-15 04:39:10 -0400
committer	David S. Miller <davem@sunset.davemloft.net>	2006-08-17 19:29:51 -0400
commit	2f8af120a159a843948749ea88bcacda9779b132 (patch)
tree	e633037a72db856b8078e814e3451dd8f44c14de /drivers/net/bnx2.c
parent	fb33f82568d32016b3b3c00819574f9526e52be3 (diff)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index db73de0d2511..2099edbbfdfd 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c
@@ -209,8 +209,10 @@ MODULE_DEVICE_TABLE(pci, bnx2_pci_tbl);
209		209
210	static inline u32 bnx2_tx_avail(struct bnx2 *bp)	210	static inline u32 bnx2_tx_avail(struct bnx2 *bp)
211	{	211	{
212	u32 diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);	212	u32 diff;
213		213
		214	smp_mb();
		215	diff = TX_RING_IDX(bp->tx_prod) - TX_RING_IDX(bp->tx_cons);
214	if (diff > MAX_TX_DESC_CNT)	216	if (diff > MAX_TX_DESC_CNT)
215	diff = (diff & MAX_TX_DESC_CNT) - 1;	217	diff = (diff & MAX_TX_DESC_CNT) - 1;
216	return (bp->tx_ring_size - diff);	218	return (bp->tx_ring_size - diff);
@@ -1686,15 +1688,20 @@ bnx2_tx_int(struct bnx2 *bp)
1686	}	1688	}
1687		1689
1688	bp->tx_cons = sw_cons;	1690	bp->tx_cons = sw_cons;
		1691	/* Need to make the tx_cons update visible to bnx2_start_xmit()
		1692	* before checking for netif_queue_stopped(). Without the
		1693	* memory barrier, there is a small possibility that bnx2_start_xmit()
		1694	* will miss it and cause the queue to be stopped forever.
		1695	*/
		1696	smp_mb();
1689		1697
1690	if (unlikely(netif_queue_stopped(bp->dev))) {	1698	if (unlikely(netif_queue_stopped(bp->dev)) &&
1691	spin_lock(&bp->tx_lock);	1699	(bnx2_tx_avail(bp) > bp->tx_wake_thresh)) {
		1700	netif_tx_lock(bp->dev);
1692	if ((netif_queue_stopped(bp->dev)) &&	1701	if ((netif_queue_stopped(bp->dev)) &&
1693	(bnx2_tx_avail(bp) > MAX_SKB_FRAGS)) {	1702	(bnx2_tx_avail(bp) > bp->tx_wake_thresh))
1694
1695	netif_wake_queue(bp->dev);	1703	netif_wake_queue(bp->dev);
1696	}	1704	netif_tx_unlock(bp->dev);
1697	spin_unlock(&bp->tx_lock);
1698	}	1705	}
1699	}	1706	}
1700		1707
@@ -3503,6 +3510,8 @@ bnx2_init_tx_ring(struct bnx2 *bp)
3503	struct tx_bd *txbd;	3510	struct tx_bd *txbd;
3504	u32 val;	3511	u32 val;
3505		3512
		3513	bp->tx_wake_thresh = bp->tx_ring_size / 2;
		3514
3506	txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT];	3515	txbd = &bp->tx_desc_ring[MAX_TX_DESC_CNT];
3507		3516
3508	txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32;	3517	txbd->tx_bd_haddr_hi = (u64) bp->tx_desc_mapping >> 32;
@@ -4390,10 +4399,8 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid)
4390	#endif	4399	#endif
4391		4400
4392	/* Called with netif_tx_lock.	4401	/* Called with netif_tx_lock.
4393	* hard_start_xmit is pseudo-lockless - a lock is only required when	4402	* bnx2_tx_int() runs without netif_tx_lock unless it needs to call
4394	* the tx queue is full. This way, we get the benefit of lockless	4403	* netif_wake_queue().
4395	* operations most of the time without the complexities to handle
4396	* netif_stop_queue/wake_queue race conditions.
4397	*/	4404	*/
4398	static int	4405	static int
4399	bnx2_start_xmit(struct sk_buff skb, struct net_device dev)	4406	bnx2_start_xmit(struct sk_buff skb, struct net_device dev)
@@ -4512,12 +4519,9 @@ bnx2_start_xmit(struct sk_buff skb, struct net_device dev)
4512	dev->trans_start = jiffies;	4519	dev->trans_start = jiffies;
4513		4520
4514	if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {	4521	if (unlikely(bnx2_tx_avail(bp) <= MAX_SKB_FRAGS)) {
4515	spin_lock(&bp->tx_lock);
4516	netif_stop_queue(dev);	4522	netif_stop_queue(dev);
4517		4523	if (bnx2_tx_avail(bp) > bp->tx_wake_thresh)
4518	if (bnx2_tx_avail(bp) > MAX_SKB_FRAGS)
4519	netif_wake_queue(dev);	4524	netif_wake_queue(dev);
4520	spin_unlock(&bp->tx_lock);
4521	}	4525	}
4522		4526
4523	return NETDEV_TX_OK;	4527	return NETDEV_TX_OK;
@@ -5628,7 +5632,6 @@ bnx2_init_board(struct pci_dev pdev, struct net_device dev)
5628	bp->pdev = pdev;	5632	bp->pdev = pdev;
5629		5633
5630	spin_lock_init(&bp->phy_lock);	5634	spin_lock_init(&bp->phy_lock);
5631	spin_lock_init(&bp->tx_lock);
5632	INIT_WORK(&bp->reset_task, bnx2_reset_task, bp);	5635	INIT_WORK(&bp->reset_task, bnx2_reset_task, bp);
5633		5636
5634	dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0);	5637	dev->base_addr = dev->mem_start = pci_resource_start(pdev, 0);