author    Bruce Allan <bruce.w.allan@intel.com>    2012-08-24 16:38:11 -0400
committer David S. Miller <davem@davemloft.net>    2012-08-30 12:40:22 -0400
commit    d821a4c4d11ad160925dab2bb009b8444beff484 (patch)
tree      48e17b3bbdf314726bab228b81a02265dbd61fc1 /drivers/net
parent    99469c32f79a32d8481f87be0d3c66dad286f4ec (diff)
e1000e: DoS while TSO enabled caused by link partner with small MSS
With a low enough MSS on the link partner and TSO enabled locally, the
networking stack can periodically send a very large (e.g. 64KB) TCP
message for which the driver will attempt to use more Tx descriptors
than are available by default in the Tx ring.  This is due to a
workaround in the code that imposes a limit of only 4 MSS-sized segments
per descriptor which appears to be a carry-over from the older e1000
driver and may be applicable only to some older PCI or PCIx parts which
are not supported in e1000e.  When the driver gets a message that is too
large to fit across the configured number of Tx descriptors, it stops
the upper stack from queueing any more and gets stuck in this state.
After a timeout, the upper stack assumes the adapter is hung and calls
the driver to reset it.

Remove the unnecessary limitation of using up to only 4 MSS-sized
segments per Tx descriptor, and put in a hard failure test to catch when
attempting to check for message sizes larger than would fit in the whole
Tx ring.

Refactor the remaining logic that limits the size of data per Tx
descriptor from a seemingly arbitrary 8KB to a limit based on the
dynamic size of the Tx packet buffer as described in the hardware
specification.

Also, fix the logic in the check for space in the Tx ring for the next
largest possible packet after the current one has been successfully
queued for transmit, and use the appropriate defines for default ring
sizes in e1000_probe instead of magic values.

This issue goes back to the introduction of e1000e in 2.6.24 when it was
split off from e1000.

Reported-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Cc: Stable <stable@vger.kernel.org> [2.6.24+]
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
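To make the failure mode concrete, here is a minimal userspace sketch of the
pre-patch descriptor arithmetic (the MSS of 88 is an assumed example; the
macros mirror the limits this patch removes, and fls_u32() stands in for the
kernel's fls()):

#include <stdio.h>

/* Pre-patch limits, as removed by this commit. */
#define E1000_MAX_PER_TXD   8192
#define TXD_USE_COUNT(S, X) (((S) >> (X)) + 1)

static unsigned int fls_u32(unsigned int x)  /* stand-in for kernel fls() */
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned int mss = 88;           /* assumed small MSS from link partner */
	unsigned int len = 64 * 1024;    /* large TSO message from the stack */
	unsigned int ring = 256;         /* default Tx ring size */
	unsigned int max_per_txd = mss << 2; /* old 4 * MSS cap (below 8192 here) */
	unsigned int pwr = fls_u32(max_per_txd) - 1;

	printf("descriptors needed: %u, ring holds: %u\n",
	       TXD_USE_COUNT(len, pwr), ring);
	return 0;
}

With those numbers the linear part of the skb alone needs 257 descriptors,
one more than the default 256-entry ring, so the queue stops and can never
make progress.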
Diffstat (limited to 'drivers/net')
-rw-r--r--  drivers/net/ethernet/intel/e1000e/e1000.h  |  1
-rw-r--r--  drivers/net/ethernet/intel/e1000e/netdev.c | 48
2 files changed, 24 insertions(+), 25 deletions(-)
diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h
index cd153326c3cf..cb3356c9af80 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -310,6 +310,7 @@ struct e1000_adapter {
 	 */
 	struct e1000_ring *tx_ring /* One per active queue */
 						____cacheline_aligned_in_smp;
+	u32 tx_fifo_limit;
 
 	struct napi_struct napi;
 
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 46c3b1f9ff89..d01a099475a1 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -3517,6 +3517,15 @@ void e1000e_reset(struct e1000_adapter *adapter)
 	}
 
 	/*
+	 * Alignment of Tx data is on an arbitrary byte boundary with the
+	 * maximum size per Tx descriptor limited only to the transmit
+	 * allocation of the packet buffer minus 96 bytes with an upper
+	 * limit of 24KB due to receive synchronization limitations.
+	 */
+	adapter->tx_fifo_limit = min_t(u32, ((er32(PBA) >> 16) << 10) - 96,
+				       24 << 10);
+
+	/*
 	 * Disable Adaptive Interrupt Moderation if 2 full packets cannot
 	 * fit in receive buffer.
 	 */
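The new tx_fifo_limit is derived from the PBA (Packet Buffer Allocation)
register: the upper 16 bits hold the Tx allocation in KB, the << 10 converts
KB to bytes, 96 bytes are reserved, and the receive-synchronization
constraint caps the result at 24KB. A standalone sketch of that arithmetic,
with an assumed example PBA value (not taken from the patch):

#include <stdint.h>
#include <stdio.h>

static uint32_t tx_fifo_limit(uint32_t pba)
{
	uint32_t tx_bytes = ((pba >> 16) << 10) - 96; /* Tx alloc: KB -> bytes - 96 */
	uint32_t cap = 24 << 10;                      /* 24KB rx-sync ceiling */

	return tx_bytes < cap ? tx_bytes : cap;
}

int main(void)
{
	uint32_t pba = (20u << 16) | 20u; /* assumed 20KB Tx / 20KB Rx split */

	printf("tx_fifo_limit = %u bytes\n", tx_fifo_limit(pba)); /* 20384 */
	return 0;
}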
@@ -4785,12 +4794,9 @@ static bool e1000_tx_csum(struct e1000_ring *tx_ring, struct sk_buff *skb)
 	return 1;
 }
 
-#define E1000_MAX_PER_TXD	8192
-#define E1000_MAX_TXD_PWR	12
-
 static int e1000_tx_map(struct e1000_ring *tx_ring, struct sk_buff *skb,
 			unsigned int first, unsigned int max_per_txd,
-			unsigned int nr_frags, unsigned int mss)
+			unsigned int nr_frags)
 {
 	struct e1000_adapter *adapter = tx_ring->adapter;
 	struct pci_dev *pdev = adapter->pdev;
@@ -5023,20 +5029,19 @@ static int __e1000_maybe_stop_tx(struct e1000_ring *tx_ring, int size)
 
 static int e1000_maybe_stop_tx(struct e1000_ring *tx_ring, int size)
 {
+	BUG_ON(size > tx_ring->count);
+
 	if (e1000_desc_unused(tx_ring) >= size)
 		return 0;
 	return __e1000_maybe_stop_tx(tx_ring, size);
 }
 
-#define TXD_USE_COUNT(S, X) (((S) >> (X)) + 1)
 static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 				    struct net_device *netdev)
 {
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_ring *tx_ring = adapter->tx_ring;
 	unsigned int first;
-	unsigned int max_per_txd = E1000_MAX_PER_TXD;
-	unsigned int max_txd_pwr = E1000_MAX_TXD_PWR;
 	unsigned int tx_flags = 0;
 	unsigned int len = skb_headlen(skb);
 	unsigned int nr_frags;
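The BUG_ON is the hard failure test described above: a request for more
descriptors than the whole ring contains can never be satisfied by cleaning
completed descriptors, so the queue would otherwise stay stopped forever. A
hedged sketch of that invariant outside the driver (the structures and names
are illustrative, not the kernel's):

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-ins for ring state; not the driver's structures. */
struct ring { unsigned int count, unused; };

static int maybe_stop_tx(struct ring *r, unsigned int size)
{
	assert(size <= r->count);         /* mirrors BUG_ON(size > tx_ring->count) */
	return r->unused >= size ? 0 : 1; /* 1 = stop queue until Tx cleanup */
}

int main(void)
{
	struct ring r = { .count = 256, .unused = 40 };

	printf("stop? %d\n", maybe_stop_tx(&r, 19)); /* 40 >= 19 -> keep sending */
	printf("stop? %d\n", maybe_stop_tx(&r, 64)); /* must wait for cleanup */
	return 0;
}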
@@ -5056,18 +5061,8 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 	}
 
 	mss = skb_shinfo(skb)->gso_size;
-	/*
-	 * The controller does a simple calculation to
-	 * make sure there is enough room in the FIFO before
-	 * initiating the DMA for each buffer.  The calc is:
-	 * 4 = ceil(buffer len/mss).  To make sure we don't
-	 * overrun the FIFO, adjust the max buffer len if mss
-	 * drops.
-	 */
 	if (mss) {
 		u8 hdr_len;
-		max_per_txd = min(mss << 2, max_per_txd);
-		max_txd_pwr = fls(max_per_txd) - 1;
 
 		/*
 		 * TSO Workaround for 82571/2/3 Controllers -- if skb->data
@@ -5097,12 +5092,12 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 	count++;
 	count++;
 
-	count += TXD_USE_COUNT(len, max_txd_pwr);
+	count += DIV_ROUND_UP(len, adapter->tx_fifo_limit);
 
 	nr_frags = skb_shinfo(skb)->nr_frags;
 	for (f = 0; f < nr_frags; f++)
-		count += TXD_USE_COUNT(skb_frag_size(&skb_shinfo(skb)->frags[f]),
-				       max_txd_pwr);
+		count += DIV_ROUND_UP(skb_frag_size(&skb_shinfo(skb)->frags[f]),
+				      adapter->tx_fifo_limit);
 
 	if (adapter->hw.mac.tx_pkt_filtering)
 		e1000_transfer_dhcp_info(adapter, skb);
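Because tx_fifo_limit is no longer a power of two, the shift-based
TXD_USE_COUNT macro no longer applies; a plain ceiling division gives the
descriptor count. A minimal sketch of the replacement arithmetic (the limit
value is an assumed example):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) /* same as the kernel macro */

int main(void)
{
	unsigned int limit = 20384;    /* assumed tx_fifo_limit from e1000e_reset */
	unsigned int head = 64 * 1024; /* linear part of a large TSO skb */

	/* ceil(65536 / 20384) = 4 descriptors for the head alone */
	printf("descriptors for head: %u\n", DIV_ROUND_UP(head, limit));
	return 0;
}

Unlike TXD_USE_COUNT, which always added one extra descriptor even when the
length was an exact multiple of the limit, DIV_ROUND_UP reserves exactly the
ceiling, and it works for arbitrary (non-power-of-two) limits.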
@@ -5144,15 +5139,18 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb,
 		tx_flags |= E1000_TX_FLAGS_NO_FCS;
 
 	/* if count is 0 then mapping error has occurred */
-	count = e1000_tx_map(tx_ring, skb, first, max_per_txd, nr_frags, mss);
+	count = e1000_tx_map(tx_ring, skb, first, adapter->tx_fifo_limit,
+			     nr_frags);
 	if (count) {
 		skb_tx_timestamp(skb);
 
 		netdev_sent_queue(netdev, skb->len);
 		e1000_tx_queue(tx_ring, tx_flags, count);
 		/* Make sure there is space in the ring for the next send. */
-		e1000_maybe_stop_tx(tx_ring, MAX_SKB_FRAGS + 2);
-
+		e1000_maybe_stop_tx(tx_ring,
+				    (MAX_SKB_FRAGS *
+				     DIV_ROUND_UP(PAGE_SIZE,
+						  adapter->tx_fifo_limit) + 2));
 	} else {
 		dev_kfree_skb_any(skb);
 		tx_ring->buffer_info[first].time_stamp = 0;
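The corrected stop threshold models the worst-case next packet: each of up to
MAX_SKB_FRAGS page-sized fragments may need DIV_ROUND_UP(PAGE_SIZE,
tx_fifo_limit) descriptors, plus two for the head and the context descriptor.
A worked check under common assumptions (4KB pages, MAX_SKB_FRAGS of 17, an
assumed limit larger than a page):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define MAX_SKB_FRAGS 17   /* typical value with 4KB pages in this era */
#define PAGE_SIZE     4096

int main(void)
{
	unsigned int limit = 20384;  /* assumed tx_fifo_limit */
	unsigned int reserve = MAX_SKB_FRAGS * DIV_ROUND_UP(PAGE_SIZE, limit) + 2;

	/* 17 * 1 + 2 = 19 descriptors reserved for the next send */
	printf("descriptors reserved: %u\n", reserve);
	return 0;
}

When tx_fifo_limit is at least a page this matches the old fixed
MAX_SKB_FRAGS + 2, but the new form scales up correctly if the limit is ever
smaller than a page.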
@@ -6327,8 +6325,8 @@ static int __devinit e1000_probe(struct pci_dev *pdev,
 	adapter->hw.phy.autoneg_advertised = 0x2f;
 
 	/* ring size defaults */
-	adapter->rx_ring->count = 256;
-	adapter->tx_ring->count = 256;
+	adapter->rx_ring->count = E1000_DEFAULT_RXD;
+	adapter->tx_ring->count = E1000_DEFAULT_TXD;
 
 	/*
 	 * Initial Wake on LAN setting - If APM wake is enabled in