author		Eric Dumazet <edumazet@google.com>	2014-10-05 05:35:13 -0400
committer	David S. Miller <davem@davemloft.net>	2014-10-06 01:04:15 -0400
commit		3d03641cb4ea050f969dd3ab34113adb95287f36
tree		b1cbd2ba62c956e93d88ccde59437d46b21e13da /drivers/net/ethernet/mellanox/mlx4/en_tx.c
parent		dc9b06d156cefb95f7f6a3ac0521a3efa31d6805
net/mlx4_en: Avoid a cache line miss in TX completion for single frag skb's
Add map0_dma/map0_byte_count into mlx4_en_tx_info to avoid a cache
line miss in TX completion for frames having one DMA element. (We avoid
reading back the TX descriptor.)
Note this could be extended to 2/3 DMA elements later, as we have free
room in mlx4_en_tx_info.
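The mlx4_en.h side of the change is not part of this diffstat (it is limited
to en_tx.c), so the sketch below only illustrates the idea; the field types,
order and the surrounding fields are assumptions, not the actual definition:

struct mlx4_en_tx_info {
	struct sk_buff	*skb;
	dma_addr_t	map0_dma;	/* first (often only) DMA mapping */
	u32		map0_byte_count;
	u32		nr_txbb;
	u8		nr_maps;	/* total DMA mappings for this skb */
	u8		linear;
	u8		data_offset;
	u8		inl;
	/* ... remaining fields unchanged ... */
};

With the first mapping cached next to the skb pointer, TX completion can
unmap single-frag frames without touching the TX descriptor at all.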
Also, mlx4_en_free_tx_desc() no longer accesses skb_shinfo(). We use a
new nr_maps field in mlx4_en_tx_info to avoid 2 or 3 cache misses.
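Condensed, the new unmap logic in mlx4_en_free_tx_desc() amounts to the
sketch below (the real hunks follow; they additionally handle the ring
wraparound case that this sketch omits):

	if (!tx_info->inl) {
		/* Mapping 0 comes from tx_info: no read-back of the TX descriptor. */
		if (tx_info->linear)
			dma_unmap_single(priv->ddev, tx_info->map0_dma,
					 tx_info->map0_byte_count, PCI_DMA_TODEVICE);
		else
			dma_unmap_page(priv->ddev, tx_info->map0_dma,
				       tx_info->map0_byte_count, PCI_DMA_TODEVICE);
		/* Only multi-frag skbs still read the descriptor's data segments. */
		for (i = 1; i < tx_info->nr_maps; i++) {
			data++;
			dma_unmap_page(priv->ddev,
				       (dma_addr_t)be64_to_cpu(data->addr),
				       be32_to_cpu(data->byte_count),
				       PCI_DMA_TODEVICE);
		}
	}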
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx4/en_tx.c')
-rw-r--r--	drivers/net/ethernet/mellanox/mlx4/en_tx.c	| 83
1 file changed, 46 insertions(+), 37 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 14479068001f..edc4a8810368 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -259,38 +259,40 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 			    struct mlx4_en_tx_ring *ring,
 			    int index, u8 owner, u64 timestamp)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
 	struct mlx4_en_tx_desc *tx_desc = ring->buf + index * TXBB_SIZE;
 	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
-	struct sk_buff *skb = tx_info->skb;
-	struct skb_frag_struct *frag;
 	void *end = ring->buf + ring->buf_size;
-	int frags = skb_shinfo(skb)->nr_frags;
+	struct sk_buff *skb = tx_info->skb;
+	int nr_maps = tx_info->nr_maps;
 	int i;
-	struct skb_shared_hwtstamps hwts;
 
-	if (timestamp) {
-		mlx4_en_fill_hwtstamps(mdev, &hwts, timestamp);
+	if (unlikely(timestamp)) {
+		struct skb_shared_hwtstamps hwts;
+
+		mlx4_en_fill_hwtstamps(priv->mdev, &hwts, timestamp);
 		skb_tstamp_tx(skb, &hwts);
 	}
 
 	/* Optimize the common case when there are no wraparounds */
 	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
 		if (!tx_info->inl) {
-			if (tx_info->linear) {
+			if (tx_info->linear)
 				dma_unmap_single(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data->addr),
-					 be32_to_cpu(data->byte_count),
+					 tx_info->map0_dma,
+					 tx_info->map0_byte_count,
 					 PCI_DMA_TODEVICE);
-				++data;
-			}
-
-			for (i = 0; i < frags; i++) {
-				frag = &skb_shinfo(skb)->frags[i];
+			else
+				dma_unmap_page(priv->ddev,
+					       tx_info->map0_dma,
+					       tx_info->map0_byte_count,
+					       PCI_DMA_TODEVICE);
+			for (i = 1; i < nr_maps; i++) {
+				data++;
 				dma_unmap_page(priv->ddev,
-					(dma_addr_t) be64_to_cpu(data[i].addr),
-					skb_frag_size(frag), PCI_DMA_TODEVICE);
+					(dma_addr_t)be64_to_cpu(data->addr),
+					be32_to_cpu(data->byte_count),
+					PCI_DMA_TODEVICE);
 			}
 		}
 	} else {
@@ -299,23 +301,25 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
 			data = ring->buf + ((void *)data - end);
 		}
 
-		if (tx_info->linear) {
+		if (tx_info->linear)
 			dma_unmap_single(priv->ddev,
-				(dma_addr_t) be64_to_cpu(data->addr),
-				 be32_to_cpu(data->byte_count),
+				 tx_info->map0_dma,
+				 tx_info->map0_byte_count,
 				 PCI_DMA_TODEVICE);
-			++data;
-		}
-
-		for (i = 0; i < frags; i++) {
+		else
+			dma_unmap_page(priv->ddev,
+				       tx_info->map0_dma,
+				       tx_info->map0_byte_count,
+				       PCI_DMA_TODEVICE);
+		for (i = 1; i < nr_maps; i++) {
+			data++;
 			/* Check for wraparound before unmapping */
 			if ((void *) data >= end)
 				data = ring->buf;
-			frag = &skb_shinfo(skb)->frags[i];
 			dma_unmap_page(priv->ddev,
-				(dma_addr_t) be64_to_cpu(data->addr),
-				skb_frag_size(frag), PCI_DMA_TODEVICE);
-			++data;
+				       (dma_addr_t)be64_to_cpu(data->addr),
+				       be32_to_cpu(data->byte_count),
+				       PCI_DMA_TODEVICE);
 		}
 	}
 }
@@ -751,19 +755,22 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 	tx_info->linear = (lso_header_size < skb_headlen(skb) &&
 			   !is_inline(ring->inline_thold, skb, NULL)) ? 1 : 0;
 
-	data += skb_shinfo(skb)->nr_frags + tx_info->linear - 1;
+	tx_info->nr_maps = skb_shinfo(skb)->nr_frags + tx_info->linear;
+	data += tx_info->nr_maps - 1;
 
 	if (is_inline(ring->inline_thold, skb, &fragptr)) {
 		tx_info->inl = 1;
 	} else {
+		dma_addr_t dma = 0;
+		u32 byte_count = 0;
+
 		/* Map fragments if any */
 		for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
 			const struct skb_frag_struct *frag;
-			dma_addr_t dma;
-
 			frag = &skb_shinfo(skb)->frags[i];
+			byte_count = skb_frag_size(frag);
 			dma = skb_frag_dma_map(ddev, frag,
-					       0, skb_frag_size(frag),
+					       0, byte_count,
 					       DMA_TO_DEVICE);
 			if (dma_mapping_error(ddev, dma))
 				goto tx_drop_unmap;
@@ -771,14 +778,13 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 			data->addr = cpu_to_be64(dma);
 			data->lkey = ring->mr_key;
 			wmb();
-			data->byte_count = cpu_to_be32(skb_frag_size(frag));
+			data->byte_count = cpu_to_be32(byte_count);
 			--data;
 		}
 
 		/* Map linear part if needed */
 		if (tx_info->linear) {
-			u32 byte_count = skb_headlen(skb) - lso_header_size;
-			dma_addr_t dma;
+			byte_count = skb_headlen(skb) - lso_header_size;
 
 			dma = dma_map_single(ddev, skb->data +
 					     lso_header_size, byte_count,
@@ -792,6 +798,9 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 			data->byte_count = cpu_to_be32(byte_count);
 		}
 		tx_info->inl = 0;
+		/* tx completion can avoid cache line miss for common cases */
+		tx_info->map0_dma = dma;
+		tx_info->map0_byte_count = byte_count;
 	}
 
 	/*