diff options
author | Olof Johansson <olof@lixom.net> | 2007-10-02 17:27:15 -0400 |
---|---|---|
committer | David S. Miller <davem@sunset.davemloft.net> | 2007-10-10 19:54:25 -0400 |
commit | ad5da10a64bdca1ed39b25946727a1ce2659f3d4 (patch) | |
tree | 471d87010666ee8af865c6e573515322c798e3ec | |
parent | 8dc121a4b620090e594945fd36f878836fc5a14a (diff) |
pasemi_mac: further performance tweaks
pasemi_mac: further performance tweaks
Misc driver tweaks for pasemi_mac:
* Increase ring size (really needed mostly on 10G)
* Take out an unneeded barrier
* Move around a few prefetches and reorder a few calls
* Don't try to clean when the tx ring is full; just stop the
queue directly and let the tx interrupt take its course
* Avoid filling on the same cache line that the interface is
working on, to reduce cache line bouncing
* Avoid unneeded clearing of software state (and make the
interface shutdown code handle it)
* Fix up some of the tx ring wrap logic.
Signed-off-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
-rw-r--r-- | drivers/net/pasemi_mac.c | 92 |
1 files changed, 47 insertions, 45 deletions
diff --git a/drivers/net/pasemi_mac.c b/drivers/net/pasemi_mac.c index b3994f5d2d2b..4a451f8c6f4d 100644 --- a/drivers/net/pasemi_mac.c +++ b/drivers/net/pasemi_mac.c | |||
@@ -56,8 +56,8 @@ | |||
56 | 56 | ||
57 | 57 | ||
58 | /* Must be a power of two */ | 58 | /* Must be a power of two */ |
59 | #define RX_RING_SIZE 512 | 59 | #define RX_RING_SIZE 4096 |
60 | #define TX_RING_SIZE 512 | 60 | #define TX_RING_SIZE 4096 |
61 | 61 | ||
62 | #define DEFAULT_MSG_ENABLE \ | 62 | #define DEFAULT_MSG_ENABLE \ |
63 | (NETIF_MSG_DRV | \ | 63 | (NETIF_MSG_DRV | \ |
@@ -336,8 +336,16 @@ static void pasemi_mac_free_tx_resources(struct net_device *dev) | |||
336 | struct pasemi_mac_buffer *info; | 336 | struct pasemi_mac_buffer *info; |
337 | dma_addr_t dmas[MAX_SKB_FRAGS+1]; | 337 | dma_addr_t dmas[MAX_SKB_FRAGS+1]; |
338 | int freed; | 338 | int freed; |
339 | int start, limit; | ||
339 | 340 | ||
340 | for (i = 0; i < TX_RING_SIZE; i += freed) { | 341 | start = mac->tx->next_to_clean; |
342 | limit = mac->tx->next_to_fill; | ||
343 | |||
344 | /* Compensate for when fill has wrapped and clean has not */ | ||
345 | if (start > limit) | ||
346 | limit += TX_RING_SIZE; | ||
347 | |||
348 | for (i = start; i < limit; i += freed) { | ||
341 | info = &TX_RING_INFO(mac, i+1); | 349 | info = &TX_RING_INFO(mac, i+1); |
342 | if (info->dma && info->skb) { | 350 | if (info->dma && info->skb) { |
343 | for (j = 0; j <= skb_shinfo(info->skb)->nr_frags; j++) | 351 | for (j = 0; j <= skb_shinfo(info->skb)->nr_frags; j++) |
@@ -520,9 +528,6 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit) | |||
520 | n = mac->rx->next_to_clean; | 528 | n = mac->rx->next_to_clean; |
521 | 529 | ||
522 | for (count = limit; count; count--) { | 530 | for (count = limit; count; count--) { |
523 | |||
524 | rmb(); | ||
525 | |||
526 | macrx = RX_RING(mac, n); | 531 | macrx = RX_RING(mac, n); |
527 | 532 | ||
528 | if ((macrx & XCT_MACRX_E) || | 533 | if ((macrx & XCT_MACRX_E) || |
@@ -550,14 +555,10 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit) | |||
550 | break; | 555 | break; |
551 | } | 556 | } |
552 | 557 | ||
553 | prefetchw(info); | ||
554 | |||
555 | skb = info->skb; | 558 | skb = info->skb; |
556 | prefetchw(skb); | ||
557 | info->dma = 0; | ||
558 | 559 | ||
559 | pci_unmap_single(mac->dma_pdev, dma, skb->len, | 560 | prefetch(skb); |
560 | PCI_DMA_FROMDEVICE); | 561 | prefetch(&skb->data_len); |
561 | 562 | ||
562 | len = (macrx & XCT_MACRX_LLEN_M) >> XCT_MACRX_LLEN_S; | 563 | len = (macrx & XCT_MACRX_LLEN_M) >> XCT_MACRX_LLEN_S; |
563 | 564 | ||
@@ -576,10 +577,9 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit) | |||
576 | } else | 577 | } else |
577 | info->skb = NULL; | 578 | info->skb = NULL; |
578 | 579 | ||
579 | /* Need to zero it out since hardware doesn't, since the | 580 | pci_unmap_single(mac->dma_pdev, dma, len, PCI_DMA_FROMDEVICE); |
580 | * replenish loop uses it to tell when it's done. | 581 | |
581 | */ | 582 | info->dma = 0; |
582 | RX_BUFF(mac, i) = 0; | ||
583 | 583 | ||
584 | skb_put(skb, len); | 584 | skb_put(skb, len); |
585 | 585 | ||
@@ -599,6 +599,11 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit) | |||
599 | RX_RING(mac, n) = 0; | 599 | RX_RING(mac, n) = 0; |
600 | RX_RING(mac, n+1) = 0; | 600 | RX_RING(mac, n+1) = 0; |
601 | 601 | ||
602 | /* Need to zero it out since hardware doesn't, since the | ||
603 | * replenish loop uses it to tell when it's done. | ||
604 | */ | ||
605 | RX_BUFF(mac, i) = 0; | ||
606 | |||
602 | n += 2; | 607 | n += 2; |
603 | } | 608 | } |
604 | 609 | ||
@@ -621,27 +626,33 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit) | |||
621 | static int pasemi_mac_clean_tx(struct pasemi_mac *mac) | 626 | static int pasemi_mac_clean_tx(struct pasemi_mac *mac) |
622 | { | 627 | { |
623 | int i, j; | 628 | int i, j; |
624 | struct pasemi_mac_buffer *info; | 629 | unsigned int start, descr_count, buf_count, batch_limit; |
625 | unsigned int start, descr_count, buf_count, limit; | 630 | unsigned int ring_limit; |
626 | unsigned int total_count; | 631 | unsigned int total_count; |
627 | unsigned long flags; | 632 | unsigned long flags; |
628 | struct sk_buff *skbs[TX_CLEAN_BATCHSIZE]; | 633 | struct sk_buff *skbs[TX_CLEAN_BATCHSIZE]; |
629 | dma_addr_t dmas[TX_CLEAN_BATCHSIZE][MAX_SKB_FRAGS+1]; | 634 | dma_addr_t dmas[TX_CLEAN_BATCHSIZE][MAX_SKB_FRAGS+1]; |
630 | 635 | ||
631 | total_count = 0; | 636 | total_count = 0; |
632 | limit = TX_CLEAN_BATCHSIZE; | 637 | batch_limit = TX_CLEAN_BATCHSIZE; |
633 | restart: | 638 | restart: |
634 | spin_lock_irqsave(&mac->tx->lock, flags); | 639 | spin_lock_irqsave(&mac->tx->lock, flags); |
635 | 640 | ||
636 | start = mac->tx->next_to_clean; | 641 | start = mac->tx->next_to_clean; |
642 | ring_limit = mac->tx->next_to_fill; | ||
643 | |||
644 | /* Compensate for when fill has wrapped but clean has not */ | ||
645 | if (start > ring_limit) | ||
646 | ring_limit += TX_RING_SIZE; | ||
637 | 647 | ||
638 | buf_count = 0; | 648 | buf_count = 0; |
639 | descr_count = 0; | 649 | descr_count = 0; |
640 | 650 | ||
641 | for (i = start; | 651 | for (i = start; |
642 | descr_count < limit && i < mac->tx->next_to_fill; | 652 | descr_count < batch_limit && i < ring_limit; |
643 | i += buf_count) { | 653 | i += buf_count) { |
644 | u64 mactx = TX_RING(mac, i); | 654 | u64 mactx = TX_RING(mac, i); |
655 | struct sk_buff *skb; | ||
645 | 656 | ||
646 | if ((mactx & XCT_MACTX_E) || | 657 | if ((mactx & XCT_MACTX_E) || |
647 | (*mac->tx_status & PAS_STATUS_ERROR)) | 658 | (*mac->tx_status & PAS_STATUS_ERROR)) |
@@ -651,19 +662,15 @@ restart: | |||
651 | /* Not yet transmitted */ | 662 | /* Not yet transmitted */ |
652 | break; | 663 | break; |
653 | 664 | ||
654 | info = &TX_RING_INFO(mac, i+1); | 665 | skb = TX_RING_INFO(mac, i+1).skb; |
655 | skbs[descr_count] = info->skb; | 666 | skbs[descr_count] = skb; |
656 | 667 | ||
657 | buf_count = 2 + skb_shinfo(info->skb)->nr_frags; | 668 | buf_count = 2 + skb_shinfo(skb)->nr_frags; |
658 | for (j = 0; j <= skb_shinfo(info->skb)->nr_frags; j++) | 669 | for (j = 0; j <= skb_shinfo(skb)->nr_frags; j++) |
659 | dmas[descr_count][j] = TX_RING_INFO(mac, i+1+j).dma; | 670 | dmas[descr_count][j] = TX_RING_INFO(mac, i+1+j).dma; |
660 | 671 | ||
661 | |||
662 | info->dma = 0; | ||
663 | TX_RING(mac, i) = 0; | 672 | TX_RING(mac, i) = 0; |
664 | TX_RING(mac, i+1) = 0; | 673 | TX_RING(mac, i+1) = 0; |
665 | TX_RING_INFO(mac, i+1).skb = 0; | ||
666 | TX_RING_INFO(mac, i+1).dma = 0; | ||
667 | 674 | ||
668 | /* Since we always fill with an even number of entries, make | 675 | /* Since we always fill with an even number of entries, make |
669 | * sure we skip any unused one at the end as well. | 676 | * sure we skip any unused one at the end as well. |
@@ -672,7 +679,7 @@ restart: | |||
672 | buf_count++; | 679 | buf_count++; |
673 | descr_count++; | 680 | descr_count++; |
674 | } | 681 | } |
675 | mac->tx->next_to_clean = i; | 682 | mac->tx->next_to_clean = i & (TX_RING_SIZE-1); |
676 | 683 | ||
677 | spin_unlock_irqrestore(&mac->tx->lock, flags); | 684 | spin_unlock_irqrestore(&mac->tx->lock, flags); |
678 | netif_wake_queue(mac->netdev); | 685 | netif_wake_queue(mac->netdev); |
@@ -683,7 +690,7 @@ restart: | |||
683 | total_count += descr_count; | 690 | total_count += descr_count; |
684 | 691 | ||
685 | /* If the batch was full, try to clean more */ | 692 | /* If the batch was full, try to clean more */ |
686 | if (descr_count == limit) | 693 | if (descr_count == batch_limit) |
687 | goto restart; | 694 | goto restart; |
688 | 695 | ||
689 | return total_count; | 696 | return total_count; |
@@ -1106,19 +1113,14 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev) | |||
1106 | 1113 | ||
1107 | spin_lock_irqsave(&txring->lock, flags); | 1114 | spin_lock_irqsave(&txring->lock, flags); |
1108 | 1115 | ||
1109 | if (RING_AVAIL(txring) <= nfrags+3) { | 1116 | /* Avoid stepping on the same cache line that the DMA controller |
1110 | spin_unlock_irqrestore(&txring->lock, flags); | 1117 | * is currently about to send, so leave at least 8 words available. |
1111 | pasemi_mac_clean_tx(mac); | 1118 | * Total free space needed is mactx + fragments + 8 |
1112 | pasemi_mac_restart_tx_intr(mac); | 1119 | */ |
1113 | spin_lock_irqsave(&txring->lock, flags); | 1120 | if (RING_AVAIL(txring) < nfrags + 10) { |
1114 | 1121 | /* no room -- stop the queue and wait for tx intr */ | |
1115 | if (RING_AVAIL(txring) <= nfrags+3) { | 1122 | netif_stop_queue(dev); |
1116 | /* Still no room -- stop the queue and wait for tx | 1123 | goto out_err; |
1117 | * intr when there's room. | ||
1118 | */ | ||
1119 | netif_stop_queue(dev); | ||
1120 | goto out_err; | ||
1121 | } | ||
1122 | } | 1124 | } |
1123 | 1125 | ||
1124 | TX_RING(mac, txring->next_to_fill) = mactx; | 1126 | TX_RING(mac, txring->next_to_fill) = mactx; |
@@ -1137,8 +1139,8 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev) | |||
1137 | if (nfrags & 1) | 1139 | if (nfrags & 1) |
1138 | nfrags++; | 1140 | nfrags++; |
1139 | 1141 | ||
1140 | txring->next_to_fill += nfrags + 1; | 1142 | txring->next_to_fill = (txring->next_to_fill + nfrags + 1) & |
1141 | 1143 | (TX_RING_SIZE-1); | |
1142 | 1144 | ||
1143 | dev->stats.tx_packets++; | 1145 | dev->stats.tx_packets++; |
1144 | dev->stats.tx_bytes += skb->len; | 1146 | dev->stats.tx_bytes += skb->len; |