path: root/net/packet/af_packet.c
author     chetan loke <loke.chetan@gmail.com>       2011-08-19 06:18:16 -0400
committer  David S. Miller <davem@davemloft.net>     2011-08-24 22:40:40 -0400
commit     f6fb8f100b807378fda19e83e5ac6828b638603a (patch)
tree       66d30265f08fbf9745e3feb9af6f5a06fe38d71b /net/packet/af_packet.c
parent     0d4691ce112be025019999df5f2a5e00c03f03c2 (diff)
af-packet: TPACKET_V3 flexible buffer implementation.
1) Blocks can be configured with non-static frame-size.
2) Read/poll is at a block-level (as opposed to packet-level).
3) Added poll timeout to avoid indefinite user-space wait on idle links.
4) Added user-configurable knobs:
   4.1) block::timeout.
   4.2) tpkt_hdr::sk_rxhash.

Changes:
C1) tpacket_rcv()
    C1.1) packet_current_frame() is replaced by packet_current_rx_frame().
          The bulk of the processing is then moved in the following chain:
          packet_current_rx_frame()
            __packet_lookup_frame_in_block
              fill_curr_block()
              or
              retire_current_block
              dispatch_next_block
              or
              return NULL (queue is plugged/paused)

Signed-off-by: Chetan Loke <loke.chetan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--  net/packet/af_packet.c | 937
1 file changed, 891 insertions(+), 46 deletions(-)
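These knobs are driven from user space through the existing PACKET_RX_RING socket option, but with the larger struct tpacket_req3 as its argument, which carries the block timeout (block::timeout above) and the feature-request word used to ask for sk_rxhash. The following is a minimal, hedged sketch of that setup; it assumes the TPACKET_V3 user-space definitions this patch series exports through <linux/if_packet.h> (struct tpacket_req3, TP_FT_REQ_FILL_RXHASH), takes an already-created AF_PACKET socket in fd, and leaves out all error handling.

    #include <linux/if_packet.h>
    #include <sys/socket.h>
    #include <sys/mman.h>
    #include <string.h>

    /* Request a TPACKET_V3 RX ring: fixed-size blocks, flexible frame sizes. */
    static void *setup_v3_rx_ring(int fd, struct tpacket_req3 *req)
    {
    	int version = TPACKET_V3;

    	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));

    	memset(req, 0, sizeof(*req));
    	req->tp_block_size = 1 << 20;            /* 1MB blocks */
    	req->tp_block_nr   = 8;
    	req->tp_frame_size = 2048;               /* sizing hint; frames are flexible */
    	req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) * req->tp_block_nr;
    	req->tp_retire_blk_tov   = 60;           /* block::timeout in msecs; 0 lets the kernel derive it */
    	req->tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; /* ask for tp_rxhash in each tpacket3_hdr */
    	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));

    	/* One contiguous mapping covering all blocks. */
    	return mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
    		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    }

With this setup, poll() wakes once per retired block rather than once per frame, which is the "read/poll at a block-level" behaviour listed in point 2 above.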
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec0a445..4371e3a67789 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,6 +40,10 @@
40 * byte arrays at the end of sockaddr_ll
41 * and packet_mreq.
42 * Johann Baudy : Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
47 *
48 * This program is free software; you can redistribute it and/or
49 * modify it under the terms of the GNU General Public License
@@ -161,9 +165,56 @@ struct packet_mreq_max {
165 unsigned char mr_address[MAX_ADDR_LEN];
166};
167
164static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
168static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
169 int closing, int tx_ring);
170
171
172#define V3_ALIGNMENT (8)
173
174#define BLK_HDR_LEN (ALIGN(sizeof(struct block_desc), V3_ALIGNMENT))
175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
179/* kbdq - kernel block descriptor queue */
180struct kbdq_core {
181 struct pgv *pkbdq;
182 unsigned int feature_req_word;
183 unsigned int hdrlen;
184 unsigned char reset_pending_on_curr_blk;
185 unsigned char delete_blk_timer;
186 unsigned short kactive_blk_num;
187 unsigned short blk_sizeof_priv;
188
189 /* last_kactive_blk_num:
190 * trick to see if user-space has caught up
191 * in order to avoid refreshing timer when every single pkt arrives.
192 */
193 unsigned short last_kactive_blk_num;
194
195 char *pkblk_start;
196 char *pkblk_end;
197 int kblk_size;
198 unsigned int knum_blocks;
199 uint64_t knxt_seq_num;
200 char *prev;
201 char *nxt_offset;
202 struct sk_buff *skb;
203
204 atomic_t blk_fill_in_prog;
205
206 /* Default is set to 8ms */
207#define DEFAULT_PRB_RETIRE_TOV (8)
208
209 unsigned short retire_blk_tov;
210 unsigned short version;
211 unsigned long tov_in_jiffies;
212
213 /* timer to retire an outstanding block */
214 struct timer_list retire_blk_timer;
215};
216
217#define PGV_FROM_VMALLOC 1
218struct pgv {
219 char *buffer;
220};
@@ -179,12 +230,40 @@ struct packet_ring_buffer {
230 unsigned int pg_vec_pages;
231 unsigned int pg_vec_len;
232
233 struct kbdq_core prb_bdqc;
234 atomic_t pending;
235};
236
237#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
238#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
239#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
240#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
241#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
242#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
243#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
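The BLOCK_* accessors above all operate on one fixed-size block whose rough layout, as implied by BLK_HDR_LEN, BLK_PLUS_PRIV and the open/fill logic further down (offsets shown here for illustration only), is:

    pkblk_start ->  +--------------------------------------------+
                    | block descriptor (BLK_HDR_LEN =            |
                    |   ALIGN(sizeof(struct block_desc), 8))     |
    BLOCK_O2PRIV -> +--------------------------------------------+
                    | per-block private area (blk_sizeof_priv)   |
    BLOCK_O2FP   -> +--------------------------------------------+  = BLK_PLUS_PRIV(priv)
                    | pkt 0: tpacket3_hdr + data  --tp_next_offset-->
                    | pkt 1: tpacket3_hdr + data                 |
                    | ...                                        |
    nxt_offset   -> | (free space)                               |
    pkblk_end    -> +--------------------------------------------+  = pkblk_start + kblk_size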
244
245struct packet_sock;
246static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
247
248static void *packet_previous_frame(struct packet_sock *po,
249 struct packet_ring_buffer *rb,
250 int status);
251static void packet_increment_head(struct packet_ring_buffer *buff);
252static int prb_curr_blk_in_use(struct kbdq_core *,
253 struct block_desc *);
254static void *prb_dispatch_next_block(struct kbdq_core *,
255 struct packet_sock *);
256static void prb_retire_current_block(struct kbdq_core *,
257 struct packet_sock *, unsigned int status);
258static int prb_queue_frozen(struct kbdq_core *);
259static void prb_open_block(struct kbdq_core *, struct block_desc *);
260static void prb_retire_rx_blk_timer_expired(unsigned long);
261static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *);
262static void prb_init_blk_timer(struct packet_sock *, struct kbdq_core *,
263 void (*func) (unsigned long));
264static void prb_fill_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
265static void prb_clear_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
266static void prb_fill_vlan_info(struct kbdq_core *, struct tpacket3_hdr *);
267static void packet_flush_mclist(struct sock *sk);
268
269struct packet_fanout;
@@ -193,6 +272,7 @@ struct packet_sock {
272 struct sock sk;
273 struct packet_fanout *fanout;
274 struct tpacket_stats stats;
275 union tpacket_stats_u stats_u;
276 struct packet_ring_buffer rx_ring;
277 struct packet_ring_buffer tx_ring;
278 int copy_thresh;
@@ -242,6 +322,15 @@ struct packet_skb_cb {
322
323#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
324
325#define GET_PBDQC_FROM_RB(x) ((struct kbdq_core *)(&(x)->prb_bdqc))
326#define GET_PBLOCK_DESC(x, bid) \
327 ((struct block_desc *)((x)->pkbdq[(bid)].buffer))
328#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
329 ((struct block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
330#define GET_NEXT_PRB_BLK_NUM(x) \
331 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
332 ((x)->kactive_blk_num+1) : 0)
333
334static inline struct packet_sock *pkt_sk(struct sock *sk)
335{
336 return (struct packet_sock *)sk;
@@ -325,8 +414,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
414 h.h2->tp_status = status;
415 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
416 break;
417 case TPACKET_V3:
418 default:
329 pr_err("TPACKET version not supported\n");
419 WARN(1, "TPACKET version not supported.\n");
420 BUG();
421 }
422
@@ -351,8 +441,9 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
441 case TPACKET_V2:
442 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
443 return h.h2->tp_status;
444 case TPACKET_V3:
445 default:
355 pr_err("TPACKET version not supported\n");
446 WARN(1, "TPACKET version not supported.\n");
447 BUG();
448 return 0;
449 }
@@ -389,6 +480,665 @@ static inline void *packet_current_frame(struct packet_sock *po,
480 return packet_lookup_frame(po, rb, rb->head, status);
481}
482
483static void prb_del_retire_blk_timer(struct kbdq_core *pkc)
484{
485 del_timer_sync(&pkc->retire_blk_timer);
486}
487
488static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
489 int tx_ring,
490 struct sk_buff_head *rb_queue)
491{
492 struct kbdq_core *pkc;
493
494 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
495
496 spin_lock(&rb_queue->lock);
497 pkc->delete_blk_timer = 1;
498 spin_unlock(&rb_queue->lock);
499
500 prb_del_retire_blk_timer(pkc);
501}
502
503static void prb_init_blk_timer(struct packet_sock *po,
504 struct kbdq_core *pkc,
505 void (*func) (unsigned long))
506{
507 init_timer(&pkc->retire_blk_timer);
508 pkc->retire_blk_timer.data = (long)po;
509 pkc->retire_blk_timer.function = func;
510 pkc->retire_blk_timer.expires = jiffies;
511}
512
513static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
514{
515 struct kbdq_core *pkc;
516
517 if (tx_ring)
518 BUG();
519
520 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
521 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
522}
523
524static int prb_calc_retire_blk_tmo(struct packet_sock *po,
525 int blk_size_in_bytes)
526{
527 struct net_device *dev;
528 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
529
530 dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
531 if (unlikely(dev == NULL))
532 return DEFAULT_PRB_RETIRE_TOV;
533
534 if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
535 struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
536
537 if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
538 switch (ecmd.speed) {
539 case SPEED_10000:
540 msec = 1;
541 div = 10000/1000;
542 break;
543 case SPEED_1000:
544 msec = 1;
545 div = 1000/1000;
546 break;
547 /*
548 * If the link speed is so slow you don't really
549 * need to worry about perf anyways
550 */
551 case SPEED_100:
552 case SPEED_10:
553 default:
554 return DEFAULT_PRB_RETIRE_TOV;
555 }
556 }
557 }
558
559 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
560
561 if (div)
562 mbits /= div;
563
564 tmo = mbits * msec;
565
566 if (div)
567 return tmo+1;
568 return tmo;
569}
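To make the heuristic above concrete (numbers worked out here, not taken from the patch): a 1MB block gives mbits = (1048576 * 8) / (1024 * 1024) = 8, so on a 1Gbps link (msec = 1, div = 1) the derived timeout is 8 * 1 + 1 = 9ms, just over the ~8ms it takes to fill the block at line rate, while on a 10Gbps link (div = 10, integer division) it collapses to 0 + 1 = 1ms. Links at 100Mbps or slower simply fall back to DEFAULT_PRB_RETIRE_TOV.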
570
571static void prb_init_ft_ops(struct kbdq_core *p1,
572 union tpacket_req_u *req_u)
573{
574 p1->feature_req_word = req_u->req3.tp_feature_req_word;
575}
576
577static void init_prb_bdqc(struct packet_sock *po,
578 struct packet_ring_buffer *rb,
579 struct pgv *pg_vec,
580 union tpacket_req_u *req_u, int tx_ring)
581{
582 struct kbdq_core *p1 = &rb->prb_bdqc;
583 struct block_desc *pbd;
584
585 memset(p1, 0x0, sizeof(*p1));
586
587 p1->knxt_seq_num = 1;
588 p1->pkbdq = pg_vec;
589 pbd = (struct block_desc *)pg_vec[0].buffer;
590 p1->pkblk_start = (char *)pg_vec[0].buffer;
591 p1->kblk_size = req_u->req3.tp_block_size;
592 p1->knum_blocks = req_u->req3.tp_block_nr;
593 p1->hdrlen = po->tp_hdrlen;
594 p1->version = po->tp_version;
595 p1->last_kactive_blk_num = 0;
596 po->stats_u.stats3.tp_freeze_q_cnt = 0;
597 if (req_u->req3.tp_retire_blk_tov)
598 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
599 else
600 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
601 req_u->req3.tp_block_size);
602 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
603 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
604
605 prb_init_ft_ops(p1, req_u);
606 prb_setup_retire_blk_timer(po, tx_ring);
607 prb_open_block(p1, pbd);
608}
609
610/* Do NOT update the last_blk_num first.
611 * Assumes sk_buff_head lock is held.
612 */
613static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc)
614{
615 mod_timer(&pkc->retire_blk_timer,
616 jiffies + pkc->tov_in_jiffies);
617 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
618}
619
620/*
621 * Timer logic:
622 * 1) We refresh the timer only when we open a block.
623 * By doing this we don't waste cycles refreshing the timer
624 * on packet-by-packet basis.
625 *
626 * With a 1MB block-size, on a 1Gbps line, it will take
627 * i) ~8 ms to fill a block + ii) memcpy etc.
628 * In this cut we are not accounting for the memcpy time.
629 *
630 * So, if the user sets the 'tmo' to 10ms then the timer
631 * will never fire while the block is still getting filled
632 * (which is what we want). However, the user could choose
633 * to close a block early and that's fine.
634 *
635 * But when the timer does fire, we check whether or not to refresh it.
636 * Since the tmo granularity is in msecs, it is not too expensive
637 * to refresh the timer, lets say every '8' msecs.
638 * Either the user can set the 'tmo' or we can derive it based on
639 * a) line-speed and b) block-size.
640 * prb_calc_retire_blk_tmo() calculates the tmo.
641 *
642 */
643static void prb_retire_rx_blk_timer_expired(unsigned long data)
644{
645 struct packet_sock *po = (struct packet_sock *)data;
646 struct kbdq_core *pkc = &po->rx_ring.prb_bdqc;
647 unsigned int frozen;
648 struct block_desc *pbd;
649
650 spin_lock(&po->sk.sk_receive_queue.lock);
651
652 frozen = prb_queue_frozen(pkc);
653 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
654
655 if (unlikely(pkc->delete_blk_timer))
656 goto out;
657
658 /* We only need to plug the race when the block is partially filled.
659 * tpacket_rcv:
660 * lock(); increment BLOCK_NUM_PKTS; unlock()
661 * copy_bits() is in progress ...
662 * timer fires on other cpu:
663 * we can't retire the current block because copy_bits
664 * is in progress.
665 *
666 */
667 if (BLOCK_NUM_PKTS(pbd)) {
668 while (atomic_read(&pkc->blk_fill_in_prog)) {
669 /* Waiting for skb_copy_bits to finish... */
670 cpu_relax();
671 }
672 }
673
674 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
675 if (!frozen) {
676 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
677 if (!prb_dispatch_next_block(pkc, po))
678 goto refresh_timer;
679 else
680 goto out;
681 } else {
682 /* Case 1. Queue was frozen because user-space was
683 * lagging behind.
684 */
685 if (prb_curr_blk_in_use(pkc, pbd)) {
686 /*
687 * Ok, user-space is still behind.
688 * So just refresh the timer.
689 */
690 goto refresh_timer;
691 } else {
692 /* Case 2. queue was frozen,user-space caught up,
693 * now the link went idle && the timer fired.
694 * We don't have a block to close.So we open this
695 * block and restart the timer.
696 * opening a block thaws the queue,restarts timer
697 * Thawing/timer-refresh is a side effect.
698 */
699 prb_open_block(pkc, pbd);
700 goto out;
701 }
702 }
703 }
704
705refresh_timer:
706 _prb_refresh_rx_retire_blk_timer(pkc);
707
708out:
709 spin_unlock(&po->sk.sk_receive_queue.lock);
710}
711
712static inline void prb_flush_block(struct kbdq_core *pkc1,
713 struct block_desc *pbd1, __u32 status)
714{
715 /* Flush everything minus the block header */
716
717#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
718 u8 *start, *end;
719
720 start = (u8 *)pbd1;
721
722 /* Skip the block header(we know header WILL fit in 4K) */
723 start += PAGE_SIZE;
724
725 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
726 for (; start < end; start += PAGE_SIZE)
727 flush_dcache_page(pgv_to_page(start));
728
729 smp_wmb();
730#endif
731
732 /* Now update the block status. */
733
734 BLOCK_STATUS(pbd1) = status;
735
736 /* Flush the block header */
737
738#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
739 start = (u8 *)pbd1;
740 flush_dcache_page(pgv_to_page(start));
741
742 smp_wmb();
743#endif
744}
745
746/*
747 * Side effect:
748 *
749 * 1) flush the block
750 * 2) Increment active_blk_num
751 *
752 * Note:We DONT refresh the timer on purpose.
753 * Because almost always the next block will be opened.
754 */
755static void prb_close_block(struct kbdq_core *pkc1, struct block_desc *pbd1,
756 struct packet_sock *po, unsigned int stat)
757{
758 __u32 status = TP_STATUS_USER | stat;
759
760 struct tpacket3_hdr *last_pkt;
761 struct hdr_v1 *h1 = &pbd1->hdr.bh1;
762
763 if (po->stats.tp_drops)
764 status |= TP_STATUS_LOSING;
765
766 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
767 last_pkt->tp_next_offset = 0;
768
769 /* Get the ts of the last pkt */
770 if (BLOCK_NUM_PKTS(pbd1)) {
771 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
772 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
773 } else {
774 /* Ok, we tmo'd - so get the current time */
775 struct timespec ts;
776 getnstimeofday(&ts);
777 h1->ts_last_pkt.ts_sec = ts.tv_sec;
778 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
779 }
780
781 smp_wmb();
782
783 /* Flush the block */
784 prb_flush_block(pkc1, pbd1, status);
785
786 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
787}
788
789static inline void prb_thaw_queue(struct kbdq_core *pkc)
790{
791 pkc->reset_pending_on_curr_blk = 0;
792}
793
794/*
795 * Side effect of opening a block:
796 *
797 * 1) prb_queue is thawed.
798 * 2) retire_blk_timer is refreshed.
799 *
800 */
801static void prb_open_block(struct kbdq_core *pkc1, struct block_desc *pbd1)
802{
803 struct timespec ts;
804 struct hdr_v1 *h1 = &pbd1->hdr.bh1;
805
806 smp_rmb();
807
808 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
809
810 /* We could have just memset this but we will lose the
811 * flexibility of making the priv area sticky
812 */
813 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
814 BLOCK_NUM_PKTS(pbd1) = 0;
815 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
816 getnstimeofday(&ts);
817 h1->ts_first_pkt.ts_sec = ts.tv_sec;
818 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
819 pkc1->pkblk_start = (char *)pbd1;
820 pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
821 BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
822 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
823 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
824 pbd1->version = pkc1->version;
825 pkc1->prev = pkc1->nxt_offset;
826 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
827 prb_thaw_queue(pkc1);
828 _prb_refresh_rx_retire_blk_timer(pkc1);
829
830 smp_wmb();
831
832 return;
833 }
834
835 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
836 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
837 dump_stack();
838 BUG();
839}
840
841/*
842 * Queue freeze logic:
843 * 1) Assume tp_block_nr = 8 blocks.
844 * 2) At time 't0', user opens Rx ring.
845 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
846 * 4) user-space is either sleeping or processing block '0'.
847 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
848 * it will close block-7,loop around and try to fill block '0'.
849 * call-flow:
850 * __packet_lookup_frame_in_block
851 * prb_retire_current_block()
852 * prb_dispatch_next_block()
853 * |->(BLOCK_STATUS == USER) evaluates to true
854 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
855 * 6) Now there are two cases:
856 * 6.1) Link goes idle right after the queue is frozen.
857 * But remember, the last open_block() refreshed the timer.
858 * When this timer expires,it will refresh itself so that we can
859 * re-open block-0 in near future.
860 * 6.2) Link is busy and keeps on receiving packets. This is a simple
861 * case and __packet_lookup_frame_in_block will check if block-0
862 * is free and can now be re-used.
863 */
864static inline void prb_freeze_queue(struct kbdq_core *pkc,
865 struct packet_sock *po)
866{
867 pkc->reset_pending_on_curr_blk = 1;
868 po->stats_u.stats3.tp_freeze_q_cnt++;
869}
870
871#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
872
873/*
874 * If the next block is free then we will dispatch it
875 * and return a good offset.
876 * Else, we will freeze the queue.
877 * So, caller must check the return value.
878 */
879static void *prb_dispatch_next_block(struct kbdq_core *pkc,
880 struct packet_sock *po)
881{
882 struct block_desc *pbd;
883
884 smp_rmb();
885
886 /* 1. Get current block num */
887 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
888
889 /* 2. If this block is currently in_use then freeze the queue */
890 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
891 prb_freeze_queue(pkc, po);
892 return NULL;
893 }
894
895 /*
896 * 3.
897 * open this block and return the offset where the first packet
898 * needs to get stored.
899 */
900 prb_open_block(pkc, pbd);
901 return (void *)pkc->nxt_offset;
902}
903
904static void prb_retire_current_block(struct kbdq_core *pkc,
905 struct packet_sock *po, unsigned int status)
906{
907 struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
908
909 /* retire/close the current block */
910 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
911 /*
912 * Plug the case where copy_bits() is in progress on
913 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
914 * have space to copy the pkt in the current block and
915 * called prb_retire_current_block()
916 *
917 * We don't need to worry about the TMO case because
918 * the timer-handler already handled this case.
919 */
920 if (!(status & TP_STATUS_BLK_TMO)) {
921 while (atomic_read(&pkc->blk_fill_in_prog)) {
922 /* Waiting for skb_copy_bits to finish... */
923 cpu_relax();
924 }
925 }
926 prb_close_block(pkc, pbd, po, status);
927 return;
928 }
929
930 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
931 dump_stack();
932 BUG();
933}
934
935static inline int prb_curr_blk_in_use(struct kbdq_core *pkc,
936 struct block_desc *pbd)
937{
938 return TP_STATUS_USER & BLOCK_STATUS(pbd);
939}
940
941static inline int prb_queue_frozen(struct kbdq_core *pkc)
942{
943 return pkc->reset_pending_on_curr_blk;
944}
945
946static inline void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
947{
948 struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
949 atomic_dec(&pkc->blk_fill_in_prog);
950}
951
952static inline void prb_fill_rxhash(struct kbdq_core *pkc,
953 struct tpacket3_hdr *ppd)
954{
955 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
956}
957
958static inline void prb_clear_rxhash(struct kbdq_core *pkc,
959 struct tpacket3_hdr *ppd)
960{
961 ppd->hv1.tp_rxhash = 0;
962}
963
964static inline void prb_fill_vlan_info(struct kbdq_core *pkc,
965 struct tpacket3_hdr *ppd)
966{
967 if (vlan_tx_tag_present(pkc->skb)) {
968 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
969 ppd->tp_status = TP_STATUS_VLAN_VALID;
970 } else {
971 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
972 }
973}
974
975static void prb_run_all_ft_ops(struct kbdq_core *pkc,
976 struct tpacket3_hdr *ppd)
977{
978 prb_fill_vlan_info(pkc, ppd);
979
980 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
981 prb_fill_rxhash(pkc, ppd);
982 else
983 prb_clear_rxhash(pkc, ppd);
984}
985
986static inline void prb_fill_curr_block(char *curr, struct kbdq_core *pkc,
987 struct block_desc *pbd,
988 unsigned int len)
989{
990 struct tpacket3_hdr *ppd;
991
992 ppd = (struct tpacket3_hdr *)curr;
993 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
994 pkc->prev = curr;
995 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
996 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
997 BLOCK_NUM_PKTS(pbd) += 1;
998 atomic_inc(&pkc->blk_fill_in_prog);
999 prb_run_all_ft_ops(pkc, ppd);
1000}
1001
1002/* Assumes caller has the sk->rx_queue.lock */
1003static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1004 struct sk_buff *skb,
1005 int status,
1006 unsigned int len
1007 )
1008{
1009 struct kbdq_core *pkc;
1010 struct block_desc *pbd;
1011 char *curr, *end;
1012
1013 pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
1014 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1015
1016 /* Queue is frozen when user space is lagging behind */
1017 if (prb_queue_frozen(pkc)) {
1018 /*
1019 * Check if that last block which caused the queue to freeze,
1020 * is still in_use by user-space.
1021 */
1022 if (prb_curr_blk_in_use(pkc, pbd)) {
1023 /* Can't record this packet */
1024 return NULL;
1025 } else {
1026 /*
1027 * Ok, the block was released by user-space.
1028 * Now let's open that block.
1029 * opening a block also thaws the queue.
1030 * Thawing is a side effect.
1031 */
1032 prb_open_block(pkc, pbd);
1033 }
1034 }
1035
1036 smp_mb();
1037 curr = pkc->nxt_offset;
1038 pkc->skb = skb;
1039 end = (char *) ((char *)pbd + pkc->kblk_size);
1040
1041 /* first try the current block */
1042 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1043 prb_fill_curr_block(curr, pkc, pbd, len);
1044 return (void *)curr;
1045 }
1046
1047 /* Ok, close the current block */
1048 prb_retire_current_block(pkc, po, 0);
1049
1050 /* Now, try to dispatch the next block */
1051 curr = (char *)prb_dispatch_next_block(pkc, po);
1052 if (curr) {
1053 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1054 prb_fill_curr_block(curr, pkc, pbd, len);
1055 return (void *)curr;
1056 }
1057
1058 /*
1059 * No free blocks are available.user_space hasn't caught up yet.
1060 * Queue was just frozen and now this packet will get dropped.
1061 */
1062 return NULL;
1063}
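__packet_lookup_frame_in_block() above is the producer half of the protocol: packets are appended back to back inside the active block, and the block is only handed to user space when it fills up or the retire timer fires. The consumer half, sketched below, is the mirror image. This is a hedged illustration assuming the user-space block/packet descriptors exported via <linux/if_packet.h> (struct tpacket_block_desc with hdr.bh1, struct tpacket3_hdr); ring and req would come from a setup like the sketch near the top of this page, and memory barriers/error handling are omitted.

    #include <linux/if_packet.h>
    #include <poll.h>

    /* Consume one retired block, then hand it back to the kernel. */
    static void walk_block(struct tpacket_block_desc *bd)
    {
    	struct tpacket3_hdr *ppd;
    	unsigned int i;

    	/* First packet sits at offset_to_first_pkt (BLK_PLUS_PRIV of the priv size). */
    	ppd = (struct tpacket3_hdr *)((char *)bd + bd->hdr.bh1.offset_to_first_pkt);

    	for (i = 0; i < bd->hdr.bh1.num_pkts; i++) {
    		/* Frame payload starts at tp_mac; hv1.tp_rxhash is valid if requested. */
    		/* ... process ppd->tp_snaplen bytes at (char *)ppd + ppd->tp_mac ... */
    		ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
    	}

    	bd->hdr.bh1.block_status = TP_STATUS_KERNEL;	/* release: may thaw a frozen queue */
    }

    static void rx_loop(int fd, void *ring, struct tpacket_req3 *req)
    {
    	unsigned int blk = 0;
    	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };

    	for (;;) {
    		struct tpacket_block_desc *bd = (struct tpacket_block_desc *)
    			((char *)ring + (size_t)blk * req->tp_block_size);

    		while (!(bd->hdr.bh1.block_status & TP_STATUS_USER))
    			poll(&pfd, 1, -1);	/* one wakeup per retired block, or on block timeout */

    		walk_block(bd);
    		blk = (blk + 1) % req->tp_block_nr;
    	}
    }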
1064
1065static inline void *packet_current_rx_frame(struct packet_sock *po,
1066 struct sk_buff *skb,
1067 int status, unsigned int len)
1068{
1069 char *curr = NULL;
1070 switch (po->tp_version) {
1071 case TPACKET_V1:
1072 case TPACKET_V2:
1073 curr = packet_lookup_frame(po, &po->rx_ring,
1074 po->rx_ring.head, status);
1075 return curr;
1076 case TPACKET_V3:
1077 return __packet_lookup_frame_in_block(po, skb, status, len);
1078 default:
1079 WARN(1, "TPACKET version not supported\n");
1080 BUG();
1081 return 0;
1082 }
1083}
1084
1085static inline void *prb_lookup_block(struct packet_sock *po,
1086 struct packet_ring_buffer *rb,
1087 unsigned int previous,
1088 int status)
1089{
1090 struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1091 struct block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
1092
1093 if (status != BLOCK_STATUS(pbd))
1094 return NULL;
1095 return pbd;
1096}
1097
1098static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
1099{
1100 unsigned int prev;
1101 if (rb->prb_bdqc.kactive_blk_num)
1102 prev = rb->prb_bdqc.kactive_blk_num-1;
1103 else
1104 prev = rb->prb_bdqc.knum_blocks-1;
1105 return prev;
1106}
1107
1108/* Assumes caller has held the rx_queue.lock */
1109static inline void *__prb_previous_block(struct packet_sock *po,
1110 struct packet_ring_buffer *rb,
1111 int status)
1112{
1113 unsigned int previous = prb_previous_blk_num(rb);
1114 return prb_lookup_block(po, rb, previous, status);
1115}
1116
1117static inline void *packet_previous_rx_frame(struct packet_sock *po,
1118 struct packet_ring_buffer *rb,
1119 int status)
1120{
1121 if (po->tp_version <= TPACKET_V2)
1122 return packet_previous_frame(po, rb, status);
1123
1124 return __prb_previous_block(po, rb, status);
1125}
1126
1127static inline void packet_increment_rx_head(struct packet_sock *po,
1128 struct packet_ring_buffer *rb)
1129{
1130 switch (po->tp_version) {
1131 case TPACKET_V1:
1132 case TPACKET_V2:
1133 return packet_increment_head(rb);
1134 case TPACKET_V3:
1135 default:
1136 WARN(1, "TPACKET version not supported.\n");
1137 BUG();
1138 return;
1139 }
1140}
1141
1142static inline void *packet_previous_frame(struct packet_sock *po,
1143 struct packet_ring_buffer *rb,
1144 int status)
@@ -982,12 +1732,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1732 union {
1733 struct tpacket_hdr *h1;
1734 struct tpacket2_hdr *h2;
1735 struct tpacket3_hdr *h3;
1736 void *raw;
1737 } h;
1738 u8 *skb_head = skb->data;
1739 int skb_len = skb->len;
1740 unsigned int snaplen, res;
990 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
1741 unsigned long status = TP_STATUS_USER;
1742 unsigned short macoff, netoff, hdrlen;
1743 struct sk_buff *copy_skb = NULL;
1744 struct timeval tv;
@@ -1033,37 +1784,46 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1784 po->tp_reserve;
1785 macoff = netoff - maclen;
1786 }
1787 if (po->tp_version <= TPACKET_V2) {
1788 if (macoff + snaplen > po->rx_ring.frame_size) {
1789 if (po->copy_thresh &&
1039 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
1790 atomic_read(&sk->sk_rmem_alloc) + skb->truesize
1040 (unsigned)sk->sk_rcvbuf) {
1791 < (unsigned)sk->sk_rcvbuf) {
1792 if (skb_shared(skb)) {
1793 copy_skb = skb_clone(skb, GFP_ATOMIC);
1794 } else {
1795 copy_skb = skb_get(skb);
1796 skb_head = skb->data;
1797 }
1798 if (copy_skb)
1799 skb_set_owner_r(copy_skb, sk);
1800 }
1047 if (copy_skb)
1048 skb_set_owner_r(copy_skb, sk);
1801 snaplen = po->rx_ring.frame_size - macoff;
1802 if ((int)snaplen < 0)
1803 snaplen = 0;
1804 }
1050 snaplen = po->rx_ring.frame_size - macoff;
1051 if ((int)snaplen < 0)
1052 snaplen = 0;
1805 }
1806 spin_lock(&sk->sk_receive_queue.lock);
1056 h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
1807 h.raw = packet_current_rx_frame(po, skb,
1808 TP_STATUS_KERNEL, (macoff+snaplen));
1809 if (!h.raw)
1810 goto ring_is_full;
1059 packet_increment_head(&po->rx_ring);
1811 if (po->tp_version <= TPACKET_V2) {
1812 packet_increment_rx_head(po, &po->rx_ring);
1813 /*
1814 * LOSING will be reported till you read the stats,
1815 * because it's COR - Clear On Read.
1816 * Anyways, moving it for V1/V2 only as V3 doesn't need this
1817 * at packet level.
1818 */
1819 if (po->stats.tp_drops)
1820 status |= TP_STATUS_LOSING;
1821 }
1822 po->stats.tp_packets++;
1823 if (copy_skb) {
1824 status |= TP_STATUS_COPY;
1825 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
1826 }
1065 if (!po->stats.tp_drops)
1066 status &= ~TP_STATUS_LOSING;
1827 spin_unlock(&sk->sk_receive_queue.lock);
1828
1829 skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
@@ -1114,6 +1874,29 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1874 h.h2->tp_padding = 0;
1875 hdrlen = sizeof(*h.h2);
1876 break;
1877 case TPACKET_V3:
1878 /* tp_nxt_offset,vlan are already populated above.
1879 * So DONT clear those fields here
1880 */
1881 h.h3->tp_status |= status;
1882 h.h3->tp_len = skb->len;
1883 h.h3->tp_snaplen = snaplen;
1884 h.h3->tp_mac = macoff;
1885 h.h3->tp_net = netoff;
1886 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1887 && shhwtstamps->syststamp.tv64)
1888 ts = ktime_to_timespec(shhwtstamps->syststamp);
1889 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1890 && shhwtstamps->hwtstamp.tv64)
1891 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1892 else if (skb->tstamp.tv64)
1893 ts = ktime_to_timespec(skb->tstamp);
1894 else
1895 getnstimeofday(&ts);
1896 h.h3->tp_sec = ts.tv_sec;
1897 h.h3->tp_nsec = ts.tv_nsec;
1898 hdrlen = sizeof(*h.h3);
1899 break;
1900 default:
1901 BUG();
1902 }
@@ -1134,13 +1917,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
1917 {
1918 u8 *start, *end;
1919
1137 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
1138 for (start = h.raw; start < end; start += PAGE_SIZE)
1139 flush_dcache_page(pgv_to_page(start));
1920 if (po->tp_version <= TPACKET_V2) {
1921 end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
1922 + macoff + snaplen);
1923 for (start = h.raw; start < end; start += PAGE_SIZE)
1924 flush_dcache_page(pgv_to_page(start));
1925 }
1926 smp_wmb();
1927 }
1928#endif
1143 __packet_set_status(po, h.raw, status);
1929 if (po->tp_version <= TPACKET_V2)
1930 __packet_set_status(po, h.raw, status);
1931 else
1932 prb_clear_blk_fill_status(&po->rx_ring);
1933
1934 sk->sk_data_ready(sk, 0);
1935
@@ -1631,7 +2420,7 @@ static int packet_release(struct socket *sock)
2420 struct sock *sk = sock->sk;
2421 struct packet_sock *po;
2422 struct net *net;
1634 struct tpacket_req req;
2423 union tpacket_req_u req_u;
2424
2425 if (!sk)
2426 return 0;
@@ -1654,13 +2443,13 @@ static int packet_release(struct socket *sock)
2443
2444 packet_flush_mclist(sk);
2445
1657 memset(&req, 0, sizeof(req));
2446 memset(&req_u, 0, sizeof(req_u));
2447
2448 if (po->rx_ring.pg_vec)
1660 packet_set_ring(sk, &req, 1, 0);
2449 packet_set_ring(sk, &req_u, 1, 0);
2450
2451 if (po->tx_ring.pg_vec)
1663 packet_set_ring(sk, &req, 1, 1);
2452 packet_set_ring(sk, &req_u, 1, 1);
2453
2454 fanout_release(sk);
2455
@@ -2280,15 +3069,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3069 case PACKET_RX_RING:
3070 case PACKET_TX_RING:
3071 {
2283 struct tpacket_req req;
3072 union tpacket_req_u req_u;
3073 int len;
3074
2285 if (optlen < sizeof(req))
3075 switch (po->tp_version) {
3076 case TPACKET_V1:
3077 case TPACKET_V2:
3078 len = sizeof(req_u.req);
3079 break;
3080 case TPACKET_V3:
3081 default:
3082 len = sizeof(req_u.req3);
3083 break;
3084 }
3085 if (optlen < len)
3086 return -EINVAL;
3087 if (pkt_sk(sk)->has_vnet_hdr)
3088 return -EINVAL;
2289 if (copy_from_user(&req, optval, sizeof(req)))
3089 if (copy_from_user(&req_u.req, optval, len))
3090 return -EFAULT;
2291 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
3091 return packet_set_ring(sk, &req_u, 0,
3092 optname == PACKET_TX_RING);
3093 }
3094 case PACKET_COPY_THRESH:
3095 {
@@ -2315,6 +3116,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
3116 switch (val) {
3117 case TPACKET_V1:
3118 case TPACKET_V2:
3119 case TPACKET_V3:
3120 po->tp_version = val;
3121 return 0;
3122 default:
@@ -2424,6 +3226,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3226 struct packet_sock *po = pkt_sk(sk);
3227 void *data;
3228 struct tpacket_stats st;
3229 union tpacket_stats_u st_u;
3230
3231 if (level != SOL_PACKET)
3232 return -ENOPROTOOPT;
@@ -2436,15 +3239,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3239
3240 switch (optname) {
3241 case PACKET_STATISTICS:
2439 if (len > sizeof(struct tpacket_stats))
2440 len = sizeof(struct tpacket_stats);
3242 if (po->tp_version == TPACKET_V3) {
3243 len = sizeof(struct tpacket_stats_v3);
3244 } else {
3245 if (len > sizeof(struct tpacket_stats))
3246 len = sizeof(struct tpacket_stats);
3247 }
3248 spin_lock_bh(&sk->sk_receive_queue.lock);
2442 st = po->stats;
3249 if (po->tp_version == TPACKET_V3) {
3250 memcpy(&st_u.stats3, &po->stats,
3251 sizeof(struct tpacket_stats));
3252 st_u.stats3.tp_freeze_q_cnt =
3253 po->stats_u.stats3.tp_freeze_q_cnt;
3254 st_u.stats3.tp_packets += po->stats.tp_drops;
3255 data = &st_u.stats3;
3256 } else {
3257 st = po->stats;
3258 st.tp_packets += st.tp_drops;
3259 data = &st;
3260 }
3261 memset(&po->stats, 0, sizeof(st));
3262 spin_unlock_bh(&sk->sk_receive_queue.lock);
2445 st.tp_packets += st.tp_drops;
2446
2447 data = &st;
3263 break;
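On the user-space side, the V3 statistics above, including the new tp_freeze_q_cnt counter bumped by prb_freeze_queue(), would be read roughly as in the sketch below; this assumes struct tpacket_stats_v3 as exported via <linux/if_packet.h>, and note the counters are clear-on-read.

    #include <linux/if_packet.h>
    #include <sys/socket.h>
    #include <stdio.h>

    static void dump_v3_stats(int fd)
    {
    	struct tpacket_stats_v3 st;
    	socklen_t len = sizeof(st);

    	/* Clear-on-read: counters cover the window since the previous call. */
    	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
    		printf("pkts=%u drops=%u freeze_q=%u\n",
    		       st.tp_packets, st.tp_drops, st.tp_freeze_q_cnt);
    }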
3264 case PACKET_AUXDATA:
3265 if (len > sizeof(int))
@@ -2485,6 +3300,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
3300 case TPACKET_V2:
3301 val = sizeof(struct tpacket2_hdr);
3302 break;
3303 case TPACKET_V3:
3304 val = sizeof(struct tpacket3_hdr);
3305 break;
3306 default:
3307 return -EINVAL;
3308 }
@@ -2641,7 +3459,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
3459
3460 spin_lock_bh(&sk->sk_receive_queue.lock);
3461 if (po->rx_ring.pg_vec) {
2644 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
3462 if (!packet_previous_rx_frame(po, &po->rx_ring,
3463 TP_STATUS_KERNEL))
3464 mask |= POLLIN | POLLRDNORM;
3465 }
3466 spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -2760,7 +3579,7 @@ out_free_pgvec:
3579 goto out;
3580}
3581
2763static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
3582static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
3583 int closing, int tx_ring)
3584{
3585 struct pgv *pg_vec = NULL;
@@ -2769,7 +3588,15 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
3588 struct packet_ring_buffer *rb;
3589 struct sk_buff_head *rb_queue;
3590 __be16 num;
2772 int err;
3591 int err = -EINVAL;
3592 /* Added to avoid minimal code churn */
3593 struct tpacket_req *req = &req_u->req;
3594
3595 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3596 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3597 WARN(1, "Tx-ring is not supported.\n");
3598 goto out;
3599 }
3600
3601 rb = tx_ring ? &po->tx_ring : &po->rx_ring;
3602 rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -2795,6 +3622,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
3622 case TPACKET_V2:
3623 po->tp_hdrlen = TPACKET2_HDRLEN;
3624 break;
3625 case TPACKET_V3:
3626 po->tp_hdrlen = TPACKET3_HDRLEN;
3627 break;
3628 }
3629
3630 err = -EINVAL;
@@ -2820,6 +3650,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
3650 pg_vec = alloc_pg_vec(req, order);
3651 if (unlikely(!pg_vec))
3652 goto out;
3653 switch (po->tp_version) {
3654 case TPACKET_V3:
3655 /* Transmit path is not supported. We checked
3656 * it above but just being paranoid
3657 */
3658 if (!tx_ring)
3659 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3660 break;
3661 default:
3662 break;
3663 }
3664 }
3665 /* Done */
3666 else {
@@ -2872,7 +3713,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
3713 register_prot_hook(sk);
3714 }
3715 spin_unlock(&po->bind_lock);
3716 if (closing && (po->tp_version > TPACKET_V2)) {
3717 /* Because we don't support block-based V3 on tx-ring */
3718 if (!tx_ring)
3719 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3720 }
3721 release_sock(sk);
3722
3723 if (pg_vec)