path: root/drivers/net/ethernet/intel/ice/ice_txrx.c
author		Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>	2018-03-20 10:58:14 -0400
committer	Jeff Kirsher <jeffrey.t.kirsher@intel.com>	2018-03-26 14:27:05 -0400
commit		2b245cb29421abbad508e93cdfedf81adc12edf1 (patch)
tree		a43188f96548d3bb9c521bbef6cc4ea200b58040 /drivers/net/ethernet/intel/ice/ice_txrx.c
parent		cdedef59deb020e78721d820a5692100128c8c73 (diff)
ice: Implement transmit and NAPI support
This patch implements ice_start_xmit (the handler for ndo_start_xmit) and related functions. ice_start_xmit ultimately calls ice_tx_map, where the Tx descriptor is built and posted to the hardware by bumping the ring tail.

This patch also implements ice_napi_poll, which is invoked when there's an interrupt on the VSI's queues. The interrupt can be due to either a completed Tx or an Rx event. In case of a completed Tx/Rx event, resources are reclaimed. Additionally, in case of an Rx event, the skb is fetched and passed up to the network stack.

Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
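For orientation, here is a minimal sketch of how entry points like these are usually wired into the networking core. The registration itself is not part of this patch (it happens elsewhere in the driver), so the helper and ops-structure names below are illustrative assumptions; ice_start_xmit, ice_napi_poll and struct ice_q_vector are the ones this patch provides.

#include <linux/netdevice.h>
#include "ice.h"

/* Illustrative sketch only, not taken from this patch: an ndo_start_xmit
 * handler is published through net_device_ops, and netif_napi_add() registers
 * the poll routine that runs Tx/Rx cleanup when the queue vector's interrupt
 * schedules NAPI.
 */
static const struct net_device_ops example_ice_netdev_ops = {
	.ndo_start_xmit = ice_start_xmit,
};

static void example_wire_up(struct net_device *netdev,
			    struct ice_q_vector *q_vector)
{
	netdev->netdev_ops = &example_ice_netdev_ops;
	netif_napi_add(netdev, &q_vector->napi, ice_napi_poll,
		       NAPI_POLL_WEIGHT);
}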
Diffstat (limited to 'drivers/net/ethernet/intel/ice/ice_txrx.c')
-rw-r--r--	drivers/net/ethernet/intel/ice/ice_txrx.c	1026
1 file changed, 1024 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 6190ea30ee01..1ccf8e69b85a 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -7,6 +7,8 @@
 #include <linux/mm.h>
 #include "ice.h"
 
+#define ICE_RX_HDR_SIZE 256
+
 /**
  * ice_unmap_and_free_tx_buf - Release a Tx buffer
  * @ring: the ring that owns the buffer
@@ -93,6 +95,129 @@ void ice_free_tx_ring(struct ice_ring *tx_ring)
 }
 
 /**
98 * ice_clean_tx_irq - Reclaim resources after transmit completes
99 * @vsi: the VSI we care about
100 * @tx_ring: Tx ring to clean
101 * @napi_budget: Used to determine if we are in netpoll
102 *
103 * Returns true if there's any budget left (e.g. the clean is finished)
104 */
105static bool ice_clean_tx_irq(struct ice_vsi *vsi, struct ice_ring *tx_ring,
106 int napi_budget)
107{
108 unsigned int total_bytes = 0, total_pkts = 0;
109 unsigned int budget = vsi->work_lmt;
110 s16 i = tx_ring->next_to_clean;
111 struct ice_tx_desc *tx_desc;
112 struct ice_tx_buf *tx_buf;
113
114 tx_buf = &tx_ring->tx_buf[i];
115 tx_desc = ICE_TX_DESC(tx_ring, i);
116 i -= tx_ring->count;
117
118 do {
119 struct ice_tx_desc *eop_desc = tx_buf->next_to_watch;
120
121 /* if next_to_watch is not set then there is no work pending */
122 if (!eop_desc)
123 break;
124
125 smp_rmb(); /* prevent any other reads prior to eop_desc */
126
127 /* if the descriptor isn't done, no work yet to do */
128 if (!(eop_desc->cmd_type_offset_bsz &
129 cpu_to_le64(ICE_TX_DESC_DTYPE_DESC_DONE)))
130 break;
131
132 /* clear next_to_watch to prevent false hangs */
133 tx_buf->next_to_watch = NULL;
134
135 /* update the statistics for this packet */
136 total_bytes += tx_buf->bytecount;
137 total_pkts += tx_buf->gso_segs;
138
139 /* free the skb */
140 napi_consume_skb(tx_buf->skb, napi_budget);
141
142 /* unmap skb header data */
143 dma_unmap_single(tx_ring->dev,
144 dma_unmap_addr(tx_buf, dma),
145 dma_unmap_len(tx_buf, len),
146 DMA_TO_DEVICE);
147
148 /* clear tx_buf data */
149 tx_buf->skb = NULL;
150 dma_unmap_len_set(tx_buf, len, 0);
151
152 /* unmap remaining buffers */
153 while (tx_desc != eop_desc) {
154 tx_buf++;
155 tx_desc++;
156 i++;
157 if (unlikely(!i)) {
158 i -= tx_ring->count;
159 tx_buf = tx_ring->tx_buf;
160 tx_desc = ICE_TX_DESC(tx_ring, 0);
161 }
162
163 /* unmap any remaining paged data */
164 if (dma_unmap_len(tx_buf, len)) {
165 dma_unmap_page(tx_ring->dev,
166 dma_unmap_addr(tx_buf, dma),
167 dma_unmap_len(tx_buf, len),
168 DMA_TO_DEVICE);
169 dma_unmap_len_set(tx_buf, len, 0);
170 }
171 }
172
173 /* move us one more past the eop_desc for start of next pkt */
174 tx_buf++;
175 tx_desc++;
176 i++;
177 if (unlikely(!i)) {
178 i -= tx_ring->count;
179 tx_buf = tx_ring->tx_buf;
180 tx_desc = ICE_TX_DESC(tx_ring, 0);
181 }
182
183 prefetch(tx_desc);
184
185 /* update budget accounting */
186 budget--;
187 } while (likely(budget));
188
189 i += tx_ring->count;
190 tx_ring->next_to_clean = i;
191 u64_stats_update_begin(&tx_ring->syncp);
192 tx_ring->stats.bytes += total_bytes;
193 tx_ring->stats.pkts += total_pkts;
194 u64_stats_update_end(&tx_ring->syncp);
195 tx_ring->q_vector->tx.total_bytes += total_bytes;
196 tx_ring->q_vector->tx.total_pkts += total_pkts;
197
198 netdev_tx_completed_queue(txring_txq(tx_ring), total_pkts,
199 total_bytes);
200
201#define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2))
202 if (unlikely(total_pkts && netif_carrier_ok(tx_ring->netdev) &&
203 (ICE_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) {
204 /* Make sure that anybody stopping the queue after this
205 * sees the new next_to_clean.
206 */
207 smp_mb();
208 if (__netif_subqueue_stopped(tx_ring->netdev,
209 tx_ring->q_index) &&
210 !test_bit(__ICE_DOWN, vsi->state)) {
211 netif_wake_subqueue(tx_ring->netdev,
212 tx_ring->q_index);
213 ++tx_ring->tx_stats.restart_q;
214 }
215 }
216
217 return !!budget;
218}
219
220/**
  * ice_setup_tx_ring - Allocate the Tx descriptors
  * @tx_ring: the tx ring to set up
  *
@@ -274,13 +399,17 @@ static bool ice_alloc_mapped_page(struct ice_ring *rx_ring,
 	dma_addr_t dma;
 
 	/* since we are recycling buffers we should seldom need to alloc */
-	if (likely(page))
+	if (likely(page)) {
+		rx_ring->rx_stats.page_reuse_count++;
 		return true;
+	}
 
 	/* alloc new page for storage */
 	page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
-	if (unlikely(!page))
+	if (unlikely(!page)) {
+		rx_ring->rx_stats.alloc_page_failed++;
 		return false;
+	}
 
 	/* map page for use */
 	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
@@ -290,6 +419,7 @@ static bool ice_alloc_mapped_page(struct ice_ring *rx_ring,
 	 */
 	if (dma_mapping_error(rx_ring->dev, dma)) {
 		__free_pages(page, 0);
+		rx_ring->rx_stats.alloc_page_failed++;
 		return false;
 	}
 
@@ -359,3 +489,895 @@ no_bufs:
 	 */
 	return true;
 }
492
493/**
494 * ice_page_is_reserved - check if reuse is possible
495 * @page: page struct to check
496 */
497static bool ice_page_is_reserved(struct page *page)
498{
499 return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
500}
501
502/**
503 * ice_add_rx_frag - Add contents of Rx buffer to sk_buff
504 * @rx_buf: buffer containing page to add
505 * @rx_desc: descriptor containing length of buffer written by hardware
506 * @skb: sk_buf to place the data into
507 *
508 * This function will add the data contained in rx_buf->page to the skb.
509 * This is done either through a direct copy if the data in the buffer is
510 * less than the skb header size, otherwise it will just attach the page as
511 * a frag to the skb.
512 *
513 * The function will then update the page offset if necessary and return
514 * true if the buffer can be reused by the adapter.
515 */
516static bool ice_add_rx_frag(struct ice_rx_buf *rx_buf,
517 union ice_32b_rx_flex_desc *rx_desc,
518 struct sk_buff *skb)
519{
520#if (PAGE_SIZE < 8192)
521 unsigned int truesize = ICE_RXBUF_2048;
522#else
523 unsigned int last_offset = PAGE_SIZE - ICE_RXBUF_2048;
524 unsigned int truesize;
525#endif /* PAGE_SIZE < 8192) */
526
527 struct page *page;
528 unsigned int size;
529
530 size = le16_to_cpu(rx_desc->wb.pkt_len) &
531 ICE_RX_FLX_DESC_PKT_LEN_M;
532
533 page = rx_buf->page;
534
535#if (PAGE_SIZE >= 8192)
536 truesize = ALIGN(size, L1_CACHE_BYTES);
537#endif /* PAGE_SIZE >= 8192) */
538
539 /* will the data fit in the skb we allocated? if so, just
540 * copy it as it is pretty small anyway
541 */
542 if (size <= ICE_RX_HDR_SIZE && !skb_is_nonlinear(skb)) {
543 unsigned char *va = page_address(page) + rx_buf->page_offset;
544
545 memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
546
547 /* page is not reserved, we can reuse buffer as-is */
548 if (likely(!ice_page_is_reserved(page)))
549 return true;
550
551 /* this page cannot be reused so discard it */
552 __free_pages(page, 0);
553 return false;
554 }
555
556 skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
557 rx_buf->page_offset, size, truesize);
558
559 /* avoid re-using remote pages */
560 if (unlikely(ice_page_is_reserved(page)))
561 return false;
562
563#if (PAGE_SIZE < 8192)
564 /* if we are only owner of page we can reuse it */
565 if (unlikely(page_count(page) != 1))
566 return false;
567
568 /* flip page offset to other buffer */
569 rx_buf->page_offset ^= truesize;
570#else
571 /* move offset up to the next cache line */
572 rx_buf->page_offset += truesize;
573
574 if (rx_buf->page_offset > last_offset)
575 return false;
576#endif /* PAGE_SIZE < 8192) */
577
578 /* Even if we own the page, we are not allowed to use atomic_set()
579 * This would break get_page_unless_zero() users.
580 */
581 get_page(rx_buf->page);
582
583 return true;
584}
585
586/**
587 * ice_reuse_rx_page - page flip buffer and store it back on the ring
588 * @rx_ring: rx descriptor ring to store buffers on
589 * @old_buf: donor buffer to have page reused
590 *
591 * Synchronizes page for reuse by the adapter
592 */
593static void ice_reuse_rx_page(struct ice_ring *rx_ring,
594 struct ice_rx_buf *old_buf)
595{
596 u16 nta = rx_ring->next_to_alloc;
597 struct ice_rx_buf *new_buf;
598
599 new_buf = &rx_ring->rx_buf[nta];
600
601 /* update, and store next to alloc */
602 nta++;
603 rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
604
605 /* transfer page from old buffer to new buffer */
606 *new_buf = *old_buf;
607}
608
609/**
610 * ice_fetch_rx_buf - Allocate skb and populate it
611 * @rx_ring: rx descriptor ring to transact packets on
612 * @rx_desc: descriptor containing info written by hardware
613 *
614 * This function allocates an skb on the fly, and populates it with the page
615 * data from the current receive descriptor, taking care to set up the skb
616 * correctly, as well as handling calling the page recycle function if
617 * necessary.
618 */
619static struct sk_buff *ice_fetch_rx_buf(struct ice_ring *rx_ring,
620 union ice_32b_rx_flex_desc *rx_desc)
621{
622 struct ice_rx_buf *rx_buf;
623 struct sk_buff *skb;
624 struct page *page;
625
626 rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
627 page = rx_buf->page;
628 prefetchw(page);
629
630 skb = rx_buf->skb;
631
632 if (likely(!skb)) {
633 u8 *page_addr = page_address(page) + rx_buf->page_offset;
634
635 /* prefetch first cache line of first page */
636 prefetch(page_addr);
637#if L1_CACHE_BYTES < 128
638 prefetch((void *)(page_addr + L1_CACHE_BYTES));
639#endif /* L1_CACHE_BYTES */
640
641 /* allocate a skb to store the frags */
642 skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
643 ICE_RX_HDR_SIZE,
644 GFP_ATOMIC | __GFP_NOWARN);
645 if (unlikely(!skb)) {
646 rx_ring->rx_stats.alloc_buf_failed++;
647 return NULL;
648 }
649
650 /* we will be copying header into skb->data in
651 * pskb_may_pull so it is in our interest to prefetch
652 * it now to avoid a possible cache miss
653 */
654 prefetchw(skb->data);
655
656 skb_record_rx_queue(skb, rx_ring->q_index);
657 } else {
658 /* we are reusing so sync this buffer for CPU use */
659 dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
660 rx_buf->page_offset,
661 ICE_RXBUF_2048,
662 DMA_FROM_DEVICE);
663
664 rx_buf->skb = NULL;
665 }
666
667 /* pull page into skb */
668 if (ice_add_rx_frag(rx_buf, rx_desc, skb)) {
669 /* hand second half of page back to the ring */
670 ice_reuse_rx_page(rx_ring, rx_buf);
671 rx_ring->rx_stats.page_reuse_count++;
672 } else {
673 /* we are not reusing the buffer so unmap it */
674 dma_unmap_page(rx_ring->dev, rx_buf->dma, PAGE_SIZE,
675 DMA_FROM_DEVICE);
676 }
677
678 /* clear contents of buffer_info */
679 rx_buf->page = NULL;
680
681 return skb;
682}
683
684/**
685 * ice_pull_tail - ice specific version of skb_pull_tail
686 * @skb: pointer to current skb being adjusted
687 *
688 * This function is an ice specific version of __pskb_pull_tail. The
689 * main difference between this version and the original function is that
690 * this function can make several assumptions about the state of things
691 * that allow for significant optimizations versus the standard function.
692 * As a result we can do things like drop a frag and maintain an accurate
693 * truesize for the skb.
694 */
695static void ice_pull_tail(struct sk_buff *skb)
696{
697 struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
698 unsigned int pull_len;
699 unsigned char *va;
700
701 /* it is valid to use page_address instead of kmap since we are
702	 * working with pages allocated out of the lowmem pool per
703 * alloc_page(GFP_ATOMIC)
704 */
705 va = skb_frag_address(frag);
706
707 /* we need the header to contain the greater of either ETH_HLEN or
708 * 60 bytes if the skb->len is less than 60 for skb_pad.
709 */
710 pull_len = eth_get_headlen(va, ICE_RX_HDR_SIZE);
711
712 /* align pull length to size of long to optimize memcpy performance */
713 skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
714
715 /* update all of the pointers */
716 skb_frag_size_sub(frag, pull_len);
717 frag->page_offset += pull_len;
718 skb->data_len -= pull_len;
719 skb->tail += pull_len;
720}
721
722/**
723 * ice_cleanup_headers - Correct empty headers
724 * @skb: pointer to current skb being fixed
725 *
726 * Also address the case where we are pulling data in on pages only
727 * and as such no data is present in the skb header.
728 *
729 * In addition if skb is not at least 60 bytes we need to pad it so that
730 * it is large enough to qualify as a valid Ethernet frame.
731 *
732 * Returns true if an error was encountered and skb was freed.
733 */
734static bool ice_cleanup_headers(struct sk_buff *skb)
735{
736 /* place header in linear portion of buffer */
737 if (skb_is_nonlinear(skb))
738 ice_pull_tail(skb);
739
740 /* if eth_skb_pad returns an error the skb was freed */
741 if (eth_skb_pad(skb))
742 return true;
743
744 return false;
745}
746
747/**
748 * ice_test_staterr - tests bits in Rx descriptor status and error fields
749 * @rx_desc: pointer to receive descriptor (in le64 format)
750 * @stat_err_bits: value to mask
751 *
752 * This function does some fast chicanery in order to return the
753 * value of the mask which is really only used for boolean tests.
754 * The status_error_len doesn't need to be shifted because it begins
755 * at offset zero.
756 */
757static bool ice_test_staterr(union ice_32b_rx_flex_desc *rx_desc,
758 const u16 stat_err_bits)
759{
760 return !!(rx_desc->wb.status_error0 &
761 cpu_to_le16(stat_err_bits));
762}
763
764/**
765 * ice_is_non_eop - process handling of non-EOP buffers
766 * @rx_ring: Rx ring being processed
767 * @rx_desc: Rx descriptor for current buffer
768 * @skb: Current socket buffer containing buffer in progress
769 *
770 * This function updates next to clean. If the buffer is an EOP buffer
771 * this function exits returning false, otherwise it will place the
772 * sk_buff in the next buffer to be chained and return true indicating
773 * that this is in fact a non-EOP buffer.
774 */
775static bool ice_is_non_eop(struct ice_ring *rx_ring,
776 union ice_32b_rx_flex_desc *rx_desc,
777 struct sk_buff *skb)
778{
779 u32 ntc = rx_ring->next_to_clean + 1;
780
781 /* fetch, update, and store next to clean */
782 ntc = (ntc < rx_ring->count) ? ntc : 0;
783 rx_ring->next_to_clean = ntc;
784
785 prefetch(ICE_RX_DESC(rx_ring, ntc));
786
787 /* if we are the last buffer then there is nothing else to do */
788#define ICE_RXD_EOF BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)
789 if (likely(ice_test_staterr(rx_desc, ICE_RXD_EOF)))
790 return false;
791
792 /* place skb in next buffer to be received */
793 rx_ring->rx_buf[ntc].skb = skb;
794 rx_ring->rx_stats.non_eop_descs++;
795
796 return true;
797}
798
799/**
800 * ice_receive_skb - Send a completed packet up the stack
801 * @rx_ring: rx ring in play
802 * @skb: packet to send up
803 * @vlan_tag: vlan tag for packet
804 *
805	 * This function sends the completed packet (via skb) up the stack using
806 * gro receive functions (with/without vlan tag)
807 */
808static void ice_receive_skb(struct ice_ring *rx_ring, struct sk_buff *skb,
809 u16 vlan_tag)
810{
811 if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
812 (vlan_tag & VLAN_VID_MASK)) {
813 __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag);
814 }
815 napi_gro_receive(&rx_ring->q_vector->napi, skb);
816}
817
818/**
819 * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
820 * @rx_ring: rx descriptor ring to transact packets on
821 * @budget: Total limit on number of packets to process
822 *
823 * This function provides a "bounce buffer" approach to Rx interrupt
824 * processing. The advantage to this is that on systems that have
825 * expensive overhead for IOMMU access this provides a means of avoiding
826 * it by maintaining the mapping of the page to the system.
827 *
828 * Returns amount of work completed
829 */
830static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
831{
832 unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
833 u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
834 bool failure = false;
835
836 /* start the loop to process RX packets bounded by 'budget' */
837 while (likely(total_rx_pkts < (unsigned int)budget)) {
838 union ice_32b_rx_flex_desc *rx_desc;
839 struct sk_buff *skb;
840 u16 stat_err_bits;
841 u16 vlan_tag = 0;
842
843 /* return some buffers to hardware, one at a time is too slow */
844 if (cleaned_count >= ICE_RX_BUF_WRITE) {
845 failure = failure ||
846 ice_alloc_rx_bufs(rx_ring, cleaned_count);
847 cleaned_count = 0;
848 }
849
850 /* get the RX desc from RX ring based on 'next_to_clean' */
851 rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
852
853 /* status_error_len will always be zero for unused descriptors
854 * because it's cleared in cleanup, and overlaps with hdr_addr
855 * which is always zero because packet split isn't used, if the
856 * hardware wrote DD then it will be non-zero
857 */
858 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S);
859 if (!ice_test_staterr(rx_desc, stat_err_bits))
860 break;
861
862 /* This memory barrier is needed to keep us from reading
863 * any other fields out of the rx_desc until we know the
864 * DD bit is set.
865 */
866 dma_rmb();
867
868 /* allocate (if needed) and populate skb */
869 skb = ice_fetch_rx_buf(rx_ring, rx_desc);
870 if (!skb)
871 break;
872
873 cleaned_count++;
874
875 /* skip if it is NOP desc */
876 if (ice_is_non_eop(rx_ring, rx_desc, skb))
877 continue;
878
879 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
880 if (unlikely(ice_test_staterr(rx_desc, stat_err_bits))) {
881 dev_kfree_skb_any(skb);
882 continue;
883 }
884
885 stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_L2TAG1P_S);
886 if (ice_test_staterr(rx_desc, stat_err_bits))
887 vlan_tag = le16_to_cpu(rx_desc->wb.l2tag1);
888
889 /* correct empty headers and pad skb if needed (to make valid
890 * ethernet frame
891 */
892 if (ice_cleanup_headers(skb)) {
893 skb = NULL;
894 continue;
895 }
896
897 /* probably a little skewed due to removing CRC */
898 total_rx_bytes += skb->len;
899
900 /* send completed skb up the stack */
901 ice_receive_skb(rx_ring, skb, vlan_tag);
902
903 /* update budget accounting */
904 total_rx_pkts++;
905 }
906
907 /* update queue and vector specific stats */
908 u64_stats_update_begin(&rx_ring->syncp);
909 rx_ring->stats.pkts += total_rx_pkts;
910 rx_ring->stats.bytes += total_rx_bytes;
911 u64_stats_update_end(&rx_ring->syncp);
912 rx_ring->q_vector->rx.total_pkts += total_rx_pkts;
913 rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
914
915 /* guarantee a trip back through this routine if there was a failure */
916 return failure ? budget : (int)total_rx_pkts;
917}
918
919/**
920 * ice_napi_poll - NAPI polling Rx/Tx cleanup routine
921 * @napi: napi struct with our devices info in it
922 * @budget: amount of work driver is allowed to do this pass, in packets
923 *
924 * This function will clean all queues associated with a q_vector.
925 *
926 * Returns the amount of work done
927 */
928int ice_napi_poll(struct napi_struct *napi, int budget)
929{
930 struct ice_q_vector *q_vector =
931 container_of(napi, struct ice_q_vector, napi);
932 struct ice_vsi *vsi = q_vector->vsi;
933 struct ice_pf *pf = vsi->back;
934 bool clean_complete = true;
935 int budget_per_ring = 0;
936 struct ice_ring *ring;
937 int work_done = 0;
938
939 /* Since the actual Tx work is minimal, we can give the Tx a larger
940 * budget and be more aggressive about cleaning up the Tx descriptors.
941 */
942 ice_for_each_ring(ring, q_vector->tx)
943 if (!ice_clean_tx_irq(vsi, ring, budget))
944 clean_complete = false;
945
946 /* Handle case where we are called by netpoll with a budget of 0 */
947 if (budget <= 0)
948 return budget;
949
950 /* We attempt to distribute budget to each Rx queue fairly, but don't
951 * allow the budget to go below 1 because that would exit polling early.
952 */
953 if (q_vector->num_ring_rx)
954 budget_per_ring = max(budget / q_vector->num_ring_rx, 1);
955
956 ice_for_each_ring(ring, q_vector->rx) {
957 int cleaned;
958
959 cleaned = ice_clean_rx_irq(ring, budget_per_ring);
960 work_done += cleaned;
961 /* if we clean as many as budgeted, we must not be done */
962 if (cleaned >= budget_per_ring)
963 clean_complete = false;
964 }
965
966 /* If work not completed, return budget and polling will return */
967 if (!clean_complete)
968 return budget;
969
970 /* Work is done so exit the polling mode and re-enable the interrupt */
971 napi_complete_done(napi, work_done);
972 if (test_bit(ICE_FLAG_MSIX_ENA, pf->flags))
973 ice_irq_dynamic_ena(&vsi->back->hw, vsi, q_vector);
974 return 0;
975}
976
977/* helper function for building cmd/type/offset */
978static __le64
979build_ctob(u64 td_cmd, u64 td_offset, unsigned int size, u64 td_tag)
980{
981 return cpu_to_le64(ICE_TX_DESC_DTYPE_DATA |
982 (td_cmd << ICE_TXD_QW1_CMD_S) |
983 (td_offset << ICE_TXD_QW1_OFFSET_S) |
984 ((u64)size << ICE_TXD_QW1_TX_BUF_SZ_S) |
985 (td_tag << ICE_TXD_QW1_L2TAG1_S));
986}
987
988/**
989 * __ice_maybe_stop_tx - 2nd level check for tx stop conditions
990 * @tx_ring: the ring to be checked
991 * @size: the size buffer we want to assure is available
992 *
993 * Returns -EBUSY if a stop is needed, else 0
994 */
995static int __ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size)
996{
997 netif_stop_subqueue(tx_ring->netdev, tx_ring->q_index);
998 /* Memory barrier before checking head and tail */
999 smp_mb();
1000
1001 /* Check again in a case another CPU has just made room available. */
1002 if (likely(ICE_DESC_UNUSED(tx_ring) < size))
1003 return -EBUSY;
1004
1005 /* A reprieve! - use start_subqueue because it doesn't call schedule */
1006 netif_start_subqueue(tx_ring->netdev, tx_ring->q_index);
1007 ++tx_ring->tx_stats.restart_q;
1008 return 0;
1009}
1010
1011/**
1012 * ice_maybe_stop_tx - 1st level check for tx stop conditions
1013 * @tx_ring: the ring to be checked
1014 * @size: the size buffer we want to assure is available
1015 *
1016 * Returns 0 if stop is not needed
1017 */
1018static int ice_maybe_stop_tx(struct ice_ring *tx_ring, unsigned int size)
1019{
1020 if (likely(ICE_DESC_UNUSED(tx_ring) >= size))
1021 return 0;
1022 return __ice_maybe_stop_tx(tx_ring, size);
1023}
1024
1025/**
1026 * ice_tx_map - Build the Tx descriptor
1027 * @tx_ring: ring to send buffer on
1028 * @first: first buffer info buffer to use
1029 *
1030 * This function loops over the skb data pointed to by *first
1031 * and gets a physical address for each memory location and programs
1032 * it and the length into the transmit descriptor.
1033 */
1034static void ice_tx_map(struct ice_ring *tx_ring, struct ice_tx_buf *first)
1035{
1036 u64 td_offset = 0, td_tag = 0, td_cmd = 0;
1037 u16 i = tx_ring->next_to_use;
1038 struct skb_frag_struct *frag;
1039 unsigned int data_len, size;
1040 struct ice_tx_desc *tx_desc;
1041 struct ice_tx_buf *tx_buf;
1042 struct sk_buff *skb;
1043 dma_addr_t dma;
1044
1045 skb = first->skb;
1046
1047 data_len = skb->data_len;
1048 size = skb_headlen(skb);
1049
1050 tx_desc = ICE_TX_DESC(tx_ring, i);
1051
1052 dma = dma_map_single(tx_ring->dev, skb->data, size, DMA_TO_DEVICE);
1053
1054 tx_buf = first;
1055
1056 for (frag = &skb_shinfo(skb)->frags[0];; frag++) {
1057 unsigned int max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
1058
1059 if (dma_mapping_error(tx_ring->dev, dma))
1060 goto dma_error;
1061
1062 /* record length, and DMA address */
1063 dma_unmap_len_set(tx_buf, len, size);
1064 dma_unmap_addr_set(tx_buf, dma, dma);
1065
1066 /* align size to end of page */
1067 max_data += -dma & (ICE_MAX_READ_REQ_SIZE - 1);
1068 tx_desc->buf_addr = cpu_to_le64(dma);
1069
1070 /* account for data chunks larger than the hardware
1071 * can handle
1072 */
1073 while (unlikely(size > ICE_MAX_DATA_PER_TXD)) {
1074 tx_desc->cmd_type_offset_bsz =
1075 build_ctob(td_cmd, td_offset, max_data, td_tag);
1076
1077 tx_desc++;
1078 i++;
1079
1080 if (i == tx_ring->count) {
1081 tx_desc = ICE_TX_DESC(tx_ring, 0);
1082 i = 0;
1083 }
1084
1085 dma += max_data;
1086 size -= max_data;
1087
1088 max_data = ICE_MAX_DATA_PER_TXD_ALIGNED;
1089 tx_desc->buf_addr = cpu_to_le64(dma);
1090 }
1091
1092 if (likely(!data_len))
1093 break;
1094
1095 tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset,
1096 size, td_tag);
1097
1098 tx_desc++;
1099 i++;
1100
1101 if (i == tx_ring->count) {
1102 tx_desc = ICE_TX_DESC(tx_ring, 0);
1103 i = 0;
1104 }
1105
1106 size = skb_frag_size(frag);
1107 data_len -= size;
1108
1109 dma = skb_frag_dma_map(tx_ring->dev, frag, 0, size,
1110 DMA_TO_DEVICE);
1111
1112 tx_buf = &tx_ring->tx_buf[i];
1113 }
1114
1115 /* record bytecount for BQL */
1116 netdev_tx_sent_queue(txring_txq(tx_ring), first->bytecount);
1117
1118 /* record SW timestamp if HW timestamp is not available */
1119 skb_tx_timestamp(first->skb);
1120
1121 i++;
1122 if (i == tx_ring->count)
1123 i = 0;
1124
1125 /* write last descriptor with RS and EOP bits */
1126 td_cmd |= (u64)(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS);
1127 tx_desc->cmd_type_offset_bsz =
1128 build_ctob(td_cmd, td_offset, size, td_tag);
1129
1130 /* Force memory writes to complete before letting h/w know there
1131 * are new descriptors to fetch.
1132 *
1133 * We also use this memory barrier to make certain all of the
1134 * status bits have been updated before next_to_watch is written.
1135 */
1136 wmb();
1137
1138 /* set next_to_watch value indicating a packet is present */
1139 first->next_to_watch = tx_desc;
1140
1141 tx_ring->next_to_use = i;
1142
1143 ice_maybe_stop_tx(tx_ring, DESC_NEEDED);
1144
1145 /* notify HW of packet */
1146 if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) {
1147 writel(i, tx_ring->tail);
1148
1149 /* we need this if more than one processor can write to our tail
1150 * at a time, it synchronizes IO on IA64/Altix systems
1151 */
1152 mmiowb();
1153 }
1154
1155 return;
1156
1157dma_error:
1158 /* clear dma mappings for failed tx_buf map */
1159 for (;;) {
1160 tx_buf = &tx_ring->tx_buf[i];
1161 ice_unmap_and_free_tx_buf(tx_ring, tx_buf);
1162 if (tx_buf == first)
1163 break;
1164 if (i == 0)
1165 i = tx_ring->count;
1166 i--;
1167 }
1168
1169 tx_ring->next_to_use = i;
1170}
1171
1172/**
1173 * ice_txd_use_count - estimate the number of descriptors needed for Tx
1174 * @size: transmit request size in bytes
1175 *
1176 * Due to hardware alignment restrictions (4K alignment), we need to
1177 * assume that we can have no more than 12K of data per descriptor, even
1178 * though each descriptor can take up to 16K - 1 bytes of aligned memory.
1179 * Thus, we need to divide by 12K. But division is slow! Instead,
1180 * we decompose the operation into shifts and one relatively cheap
1181 * multiply operation.
1182 *
1183 * To divide by 12K, we first divide by 4K, then divide by 3:
1184 * To divide by 4K, shift right by 12 bits
1185 * To divide by 3, multiply by 85, then divide by 256
1186 * (Divide by 256 is done by shifting right by 8 bits)
1187 * Finally, we add one to round up. Because 256 isn't an exact multiple of
1188 * 3, we'll underestimate near each multiple of 12K. This is actually more
1189 * accurate as we have 4K - 1 of wiggle room that we can fit into the last
1190 * segment. For our purposes this is accurate out to 1M which is orders of
1191 * magnitude greater than our largest possible GSO size.
1192 *
1193 * This would then be implemented as:
1194 * return (((size >> 12) * 85) >> 8) + 1;
1195 *
1196 * Since multiplication and division are commutative, we can reorder
1197 * operations into:
1198 * return ((size * 85) >> 20) + 1;
1199 */
1200static unsigned int ice_txd_use_count(unsigned int size)
1201{
1202 return ((size * 85) >> 20) + 1;
1203}
1204
1205/**
1206 * ice_xmit_desc_count - calculate number of tx descriptors needed
1207 * @skb: send buffer
1208 *
1209 * Returns number of data descriptors needed for this skb.
1210 */
1211static unsigned int ice_xmit_desc_count(struct sk_buff *skb)
1212{
1213 const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
1214 unsigned int nr_frags = skb_shinfo(skb)->nr_frags;
1215 unsigned int count = 0, size = skb_headlen(skb);
1216
1217 for (;;) {
1218 count += ice_txd_use_count(size);
1219
1220 if (!nr_frags--)
1221 break;
1222
1223 size = skb_frag_size(frag++);
1224 }
1225
1226 return count;
1227}
1228
1229/**
1230 * __ice_chk_linearize - Check if there are more than 8 buffers per packet
1231 * @skb: send buffer
1232 *
1233 * Note: This HW can't DMA more than 8 buffers to build a packet on the wire
1234 * and so we need to figure out the cases where we need to linearize the skb.
1235 *
1236 * For TSO we need to count the TSO header and segment payload separately.
1237 * As such we need to check cases where we have 7 fragments or more as we
1238 * can potentially require 9 DMA transactions, 1 for the TSO header, 1 for
1239 * the segment payload in the first descriptor, and another 7 for the
1240 * fragments.
1241 */
1242static bool __ice_chk_linearize(struct sk_buff *skb)
1243{
1244 const struct skb_frag_struct *frag, *stale;
1245 int nr_frags, sum;
1246
1247 /* no need to check if number of frags is less than 7 */
1248 nr_frags = skb_shinfo(skb)->nr_frags;
1249 if (nr_frags < (ICE_MAX_BUF_TXD - 1))
1250 return false;
1251
1252 /* We need to walk through the list and validate that each group
1253 * of 6 fragments totals at least gso_size.
1254 */
1255 nr_frags -= ICE_MAX_BUF_TXD - 2;
1256 frag = &skb_shinfo(skb)->frags[0];
1257
1258 /* Initialize size to the negative value of gso_size minus 1. We
1259	 * use this as the worst case scenario in which the frag ahead
1260 * of us only provides one byte which is why we are limited to 6
1261 * descriptors for a single transmit as the header and previous
1262 * fragment are already consuming 2 descriptors.
1263 */
1264 sum = 1 - skb_shinfo(skb)->gso_size;
1265
1266 /* Add size of frags 0 through 4 to create our initial sum */
1267 sum += skb_frag_size(frag++);
1268 sum += skb_frag_size(frag++);
1269 sum += skb_frag_size(frag++);
1270 sum += skb_frag_size(frag++);
1271 sum += skb_frag_size(frag++);
1272
1273 /* Walk through fragments adding latest fragment, testing it, and
1274 * then removing stale fragments from the sum.
1275 */
1276 stale = &skb_shinfo(skb)->frags[0];
1277 for (;;) {
1278 sum += skb_frag_size(frag++);
1279
1280 /* if sum is negative we failed to make sufficient progress */
1281 if (sum < 0)
1282 return true;
1283
1284 if (!nr_frags--)
1285 break;
1286
1287 sum -= skb_frag_size(stale++);
1288 }
1289
1290 return false;
1291}
1292
1293/**
1294 * ice_chk_linearize - Check if there are more than 8 fragments per packet
1295 * @skb: send buffer
1296 * @count: number of buffers used
1297 *
1298 * Note: Our HW can't scatter-gather more than 8 fragments to build
1299 * a packet on the wire and so we need to figure out the cases where we
1300 * need to linearize the skb.
1301 */
1302static bool ice_chk_linearize(struct sk_buff *skb, unsigned int count)
1303{
1304 /* Both TSO and single send will work if count is less than 8 */
1305 if (likely(count < ICE_MAX_BUF_TXD))
1306 return false;
1307
1308 if (skb_is_gso(skb))
1309 return __ice_chk_linearize(skb);
1310
1311 /* we can support up to 8 data buffers for a single send */
1312 return count != ICE_MAX_BUF_TXD;
1313}
1314
1315/**
1316 * ice_xmit_frame_ring - Sends buffer on Tx ring
1317 * @skb: send buffer
1318 * @tx_ring: ring to send buffer on
1319 *
1320 * Returns NETDEV_TX_OK if sent, else an error code
1321 */
1322static netdev_tx_t
1323ice_xmit_frame_ring(struct sk_buff *skb, struct ice_ring *tx_ring)
1324{
1325 struct ice_tx_buf *first;
1326 unsigned int count;
1327
1328 count = ice_xmit_desc_count(skb);
1329 if (ice_chk_linearize(skb, count)) {
1330 if (__skb_linearize(skb))
1331 goto out_drop;
1332 count = ice_txd_use_count(skb->len);
1333 tx_ring->tx_stats.tx_linearize++;
1334 }
1335
1336 /* need: 1 descriptor per page * PAGE_SIZE/ICE_MAX_DATA_PER_TXD,
1337 * + 1 desc for skb_head_len/ICE_MAX_DATA_PER_TXD,
1338 * + 4 desc gap to avoid the cache line where head is,
1339 * + 1 desc for context descriptor,
1340 * otherwise try next time
1341 */
1342 if (ice_maybe_stop_tx(tx_ring, count + 4 + 1)) {
1343 tx_ring->tx_stats.tx_busy++;
1344 return NETDEV_TX_BUSY;
1345 }
1346
1347 /* record the location of the first descriptor for this packet */
1348 first = &tx_ring->tx_buf[tx_ring->next_to_use];
1349 first->skb = skb;
1350 first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN);
1351 first->gso_segs = 1;
1352
1353 ice_tx_map(tx_ring, first);
1354 return NETDEV_TX_OK;
1355
1356out_drop:
1357 dev_kfree_skb_any(skb);
1358 return NETDEV_TX_OK;
1359}
1360
1361/**
1362 * ice_start_xmit - Selects the correct VSI and Tx queue to send buffer
1363 * @skb: send buffer
1364 * @netdev: network interface device structure
1365 *
1366 * Returns NETDEV_TX_OK if sent, else an error code
1367 */
1368netdev_tx_t ice_start_xmit(struct sk_buff *skb, struct net_device *netdev)
1369{
1370 struct ice_netdev_priv *np = netdev_priv(netdev);
1371 struct ice_vsi *vsi = np->vsi;
1372 struct ice_ring *tx_ring;
1373
1374 tx_ring = vsi->tx_rings[skb->queue_mapping];
1375
1376 /* hardware can't handle really short frames, hardware padding works
1377 * beyond this point
1378 */
1379 if (skb_put_padto(skb, ICE_MIN_TX_LEN))
1380 return NETDEV_TX_OK;
1381
1382 return ice_xmit_frame_ring(skb, tx_ring);
1383}
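As a quick sanity check of the descriptor-count arithmetic documented in ice_txd_use_count() above, the multiply-and-shift estimate can be compared against a plain divide-by-12K in a standalone userspace program. The 12K-per-descriptor budget and the "accurate out to 1M" claim come from that comment; everything else below is an illustrative test harness, not driver code.

#include <stdio.h>

/* Userspace mirror of the estimate used by ice_txd_use_count(). */
static unsigned int txd_use_count(unsigned int size)
{
	return ((size * 85) >> 20) + 1;
}

int main(void)
{
	const unsigned int per_desc = 12 * 1024;	/* 12K budget per descriptor */
	unsigned int size;

	for (size = 1; size <= 1024 * 1024; size++) {
		unsigned int need = (size + per_desc - 1) / per_desc;	/* ceil(size / 12K) */
		unsigned int est = txd_use_count(size);

		/* Per the comment, the estimate may come in one low right
		 * around a multiple of 12K; anything else would be a bug.
		 */
		if (est != need && est + 1 != need) {
			printf("mismatch at size %u: estimate %u, ceil %u\n",
			       size, est, need);
			return 1;
		}
	}
	printf("estimate stays within one of ceil(size/12K) for sizes up to 1 MiB\n");
	return 0;
}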