path: root/net/packet/af_packet.c
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--  net/packet/af_packet.c  1012
1 files changed, 915 insertions, 97 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec0a445..82a6f34d39d0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,6 +40,10 @@
 *		byte arrays at the end of sockaddr_ll
 *		and packet_mreq.
 *		Johann Baudy	:	Added TX RING.
43 * Chetan Loke : Implemented TPACKET_V3 block abstraction
44 * layer.
45 * Copyright (C) 2011, <lokec@ccs.neu.edu>
46 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
@@ -161,9 +165,56 @@ struct packet_mreq_max {
 	unsigned char	mr_address[MAX_ADDR_LEN];
 };
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring);
 
171
172#define V3_ALIGNMENT (8)
173
174#define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
175
176#define BLK_PLUS_PRIV(sz_of_priv) \
177 (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
178
179/* kbdq - kernel block descriptor queue */
180struct tpacket_kbdq_core {
181 struct pgv *pkbdq;
182 unsigned int feature_req_word;
183 unsigned int hdrlen;
184 unsigned char reset_pending_on_curr_blk;
185 unsigned char delete_blk_timer;
186 unsigned short kactive_blk_num;
187 unsigned short blk_sizeof_priv;
188
189 /* last_kactive_blk_num:
190 * trick to see if user-space has caught up
191 * in order to avoid refreshing timer when every single pkt arrives.
192 */
193 unsigned short last_kactive_blk_num;
194
195 char *pkblk_start;
196 char *pkblk_end;
197 int kblk_size;
198 unsigned int knum_blocks;
199 uint64_t knxt_seq_num;
200 char *prev;
201 char *nxt_offset;
202 struct sk_buff *skb;
203
204 atomic_t blk_fill_in_prog;
205
206 /* Default is set to 8ms */
207#define DEFAULT_PRB_RETIRE_TOV (8)
208
209 unsigned short retire_blk_tov;
210 unsigned short version;
211 unsigned long tov_in_jiffies;
212
213 /* timer to retire an outstanding block */
214 struct timer_list retire_blk_timer;
215};
216
217#define PGV_FROM_VMALLOC 1
 struct pgv {
 	char *buffer;
 };
@@ -179,12 +230,44 @@ struct packet_ring_buffer {
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
 
+	struct tpacket_kbdq_core	prb_bdqc;
 	atomic_t		pending;
 };
 
237#define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
238#define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts)
239#define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt)
240#define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len)
241#define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num)
242#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
243#define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
244
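
The BLOCK_* accessors above mirror the layout that user space sees in the mmap'ed V3 ring: each block starts with a struct tpacket_block_desc, the optional private area sits at offset_to_priv, and the packets inside the block are chained through tpacket3_hdr.tp_next_offset starting at offset_to_first_pkt. As an illustrative sketch only (not part of this patch; walk_block is a hypothetical helper, and the structures come from the companion linux/if_packet.h change), a consumer could walk one retired block like this:

#include <linux/if_packet.h>
#include <stdio.h>

/* Hypothetical user-space helper: print every packet of one block that
 * the kernel has already handed over (block_status carries TP_STATUS_USER).
 */
static void walk_block(struct tpacket_block_desc *pbd)
{
        struct tpacket3_hdr *ppd;
        __u32 i, num_pkts = pbd->hdr.bh1.num_pkts;

        ppd = (struct tpacket3_hdr *)((char *)pbd +
                                      pbd->hdr.bh1.offset_to_first_pkt);
        for (i = 0; i < num_pkts; i++) {
                printf("len=%u snaplen=%u ts=%u.%09u\n",
                       ppd->tp_len, ppd->tp_snaplen,
                       ppd->tp_sec, ppd->tp_nsec);
                /* tp_next_offset chains packets within the block. */
                ppd = (struct tpacket3_hdr *)((char *)ppd +
                                              ppd->tp_next_offset);
        }
}
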
 struct packet_sock;
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
248static void *packet_previous_frame(struct packet_sock *po,
249 struct packet_ring_buffer *rb,
250 int status);
251static void packet_increment_head(struct packet_ring_buffer *buff);
252static int prb_curr_blk_in_use(struct tpacket_kbdq_core *,
253 struct tpacket_block_desc *);
254static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
255 struct packet_sock *);
256static void prb_retire_current_block(struct tpacket_kbdq_core *,
257 struct packet_sock *, unsigned int status);
258static int prb_queue_frozen(struct tpacket_kbdq_core *);
259static void prb_open_block(struct tpacket_kbdq_core *,
260 struct tpacket_block_desc *);
261static void prb_retire_rx_blk_timer_expired(unsigned long);
262static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
263static void prb_init_blk_timer(struct packet_sock *,
264 struct tpacket_kbdq_core *,
265 void (*func) (unsigned long));
266static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
267static void prb_clear_rxhash(struct tpacket_kbdq_core *,
268 struct tpacket3_hdr *);
269static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
270 struct tpacket3_hdr *);
 static void packet_flush_mclist(struct sock *sk);
 
 struct packet_fanout;
@@ -193,6 +276,7 @@ struct packet_sock {
 	struct sock		sk;
 	struct packet_fanout	*fanout;
 	struct tpacket_stats	stats;
+	union  tpacket_stats_u	stats_u;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
 	int			copy_thresh;
@@ -242,7 +326,16 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
 
-static inline struct packet_sock *pkt_sk(struct sock *sk)
+#define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
330#define GET_PBLOCK_DESC(x, bid) \
331 ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
332#define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \
333 ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
334#define GET_NEXT_PRB_BLK_NUM(x) \
335 (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
336 ((x)->kactive_blk_num+1) : 0)
337
338static struct packet_sock *pkt_sk(struct sock *sk)
 {
 	return (struct packet_sock *)sk;
 }
@@ -325,8 +418,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		h.h2->tp_status = status;
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
+	case TPACKET_V3:
 	default:
-		pr_err("TPACKET version not supported\n");
+		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 	}
 
@@ -351,8 +445,9 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 	case TPACKET_V2:
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
+	case TPACKET_V3:
 	default:
-		pr_err("TPACKET version not supported\n");
+		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 		return 0;
 	}
@@ -382,14 +477,678 @@ static void *packet_lookup_frame(struct packet_sock *po,
 	return h.raw;
 }
 
-static inline void *packet_current_frame(struct packet_sock *po,
+static void *packet_current_frame(struct packet_sock *po,
 					struct packet_ring_buffer *rb,
 					int status)
 {
 	return packet_lookup_frame(po, rb, rb->head, status);
 }
 
-static inline void *packet_previous_frame(struct packet_sock *po,
+static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
488{
489 del_timer_sync(&pkc->retire_blk_timer);
490}
491
492static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
493 int tx_ring,
494 struct sk_buff_head *rb_queue)
495{
496 struct tpacket_kbdq_core *pkc;
497
498 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
499
500 spin_lock(&rb_queue->lock);
501 pkc->delete_blk_timer = 1;
502 spin_unlock(&rb_queue->lock);
503
504 prb_del_retire_blk_timer(pkc);
505}
506
507static void prb_init_blk_timer(struct packet_sock *po,
508 struct tpacket_kbdq_core *pkc,
509 void (*func) (unsigned long))
510{
511 init_timer(&pkc->retire_blk_timer);
512 pkc->retire_blk_timer.data = (long)po;
513 pkc->retire_blk_timer.function = func;
514 pkc->retire_blk_timer.expires = jiffies;
515}
516
517static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
518{
519 struct tpacket_kbdq_core *pkc;
520
521 if (tx_ring)
522 BUG();
523
524 pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
525 prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
526}
527
528static int prb_calc_retire_blk_tmo(struct packet_sock *po,
529 int blk_size_in_bytes)
530{
531 struct net_device *dev;
532 unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
533 struct ethtool_cmd ecmd;
534 int err;
535
536 rtnl_lock();
537 dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
538 if (unlikely(!dev)) {
539 rtnl_unlock();
540 return DEFAULT_PRB_RETIRE_TOV;
541 }
542 err = __ethtool_get_settings(dev, &ecmd);
543 rtnl_unlock();
544 if (!err) {
545 switch (ecmd.speed) {
546 case SPEED_10000:
547 msec = 1;
548 div = 10000/1000;
549 break;
550 case SPEED_1000:
551 msec = 1;
552 div = 1000/1000;
553 break;
554 /*
555 * If the link speed is so slow you don't really
556 * need to worry about perf anyways
557 */
558 case SPEED_100:
559 case SPEED_10:
560 default:
561 return DEFAULT_PRB_RETIRE_TOV;
562 }
563 }
564
565 mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
566
567 if (div)
568 mbits /= div;
569
570 tmo = mbits * msec;
571
572 if (div)
573 return tmo+1;
574 return tmo;
575}
576
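
For illustration only (not part of the patch), the arithmetic in prb_calc_retire_blk_tmo() above works out as follows for a 1 MB block: mbits = (1048576 * 8) / (1024 * 1024) = 8, so at 1 Gbps (msec = 1, div = 1) the derived timeout is 8 + 1 = 9 ms, while at 10 Gbps (div = 10) the integer division yields 0 and the trailing "+1" clamps the result to 1 ms. A hypothetical user-space replica of the same calculation:

/* Sketch only: mirrors the kernel arithmetic above so the derived
 * timeout can be reproduced for a given block size and link speed.
 * The kernel handles links slower than 1 Gbps separately by returning
 * DEFAULT_PRB_RETIRE_TOV before reaching this point.
 */
static unsigned int example_retire_tmo(unsigned int blk_size_in_bytes,
                                       unsigned int speed_mbps)
{
        unsigned int msec = 1, div = speed_mbps / 1000;   /* 1G -> 1, 10G -> 10 */
        unsigned int mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

        if (div)
                mbits /= div;
        return div ? mbits * msec + 1 : mbits * msec;
}
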
577static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
578 union tpacket_req_u *req_u)
579{
580 p1->feature_req_word = req_u->req3.tp_feature_req_word;
581}
582
583static void init_prb_bdqc(struct packet_sock *po,
584 struct packet_ring_buffer *rb,
585 struct pgv *pg_vec,
586 union tpacket_req_u *req_u, int tx_ring)
587{
588 struct tpacket_kbdq_core *p1 = &rb->prb_bdqc;
589 struct tpacket_block_desc *pbd;
590
591 memset(p1, 0x0, sizeof(*p1));
592
593 p1->knxt_seq_num = 1;
594 p1->pkbdq = pg_vec;
595 pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
596 p1->pkblk_start = (char *)pg_vec[0].buffer;
597 p1->kblk_size = req_u->req3.tp_block_size;
598 p1->knum_blocks = req_u->req3.tp_block_nr;
599 p1->hdrlen = po->tp_hdrlen;
600 p1->version = po->tp_version;
601 p1->last_kactive_blk_num = 0;
602 po->stats_u.stats3.tp_freeze_q_cnt = 0;
603 if (req_u->req3.tp_retire_blk_tov)
604 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
605 else
606 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
607 req_u->req3.tp_block_size);
608 p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
609 p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
610
611 prb_init_ft_ops(p1, req_u);
612 prb_setup_retire_blk_timer(po, tx_ring);
613 prb_open_block(p1, pbd);
614}
615
616/* Do NOT update the last_blk_num first.
617 * Assumes sk_buff_head lock is held.
618 */
619static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
620{
621 mod_timer(&pkc->retire_blk_timer,
622 jiffies + pkc->tov_in_jiffies);
623 pkc->last_kactive_blk_num = pkc->kactive_blk_num;
624}
625
626/*
627 * Timer logic:
628 * 1) We refresh the timer only when we open a block.
629 * By doing this we don't waste cycles refreshing the timer
630 * on packet-by-packet basis.
631 *
632 * With a 1MB block-size, on a 1Gbps line, it will take
633 * i) ~8 ms to fill a block + ii) memcpy etc.
634 * In this cut we are not accounting for the memcpy time.
635 *
636 * So, if the user sets the 'tmo' to 10ms then the timer
637 * will never fire while the block is still getting filled
638 * (which is what we want). However, the user could choose
639 * to close a block early and that's fine.
640 *
641 * But when the timer does fire, we check whether or not to refresh it.
642 * Since the tmo granularity is in msecs, it is not too expensive
643 * to refresh the timer, lets say every '8' msecs.
644 * Either the user can set the 'tmo' or we can derive it based on
645 * a) line-speed and b) block-size.
646 * prb_calc_retire_blk_tmo() calculates the tmo.
647 *
648 */
649static void prb_retire_rx_blk_timer_expired(unsigned long data)
650{
651 struct packet_sock *po = (struct packet_sock *)data;
652 struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc;
653 unsigned int frozen;
654 struct tpacket_block_desc *pbd;
655
656 spin_lock(&po->sk.sk_receive_queue.lock);
657
658 frozen = prb_queue_frozen(pkc);
659 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
660
661 if (unlikely(pkc->delete_blk_timer))
662 goto out;
663
664 /* We only need to plug the race when the block is partially filled.
665 * tpacket_rcv:
666 * lock(); increment BLOCK_NUM_PKTS; unlock()
667 * copy_bits() is in progress ...
668 * timer fires on other cpu:
669 * we can't retire the current block because copy_bits
670 * is in progress.
671 *
672 */
673 if (BLOCK_NUM_PKTS(pbd)) {
674 while (atomic_read(&pkc->blk_fill_in_prog)) {
675 /* Waiting for skb_copy_bits to finish... */
676 cpu_relax();
677 }
678 }
679
680 if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
681 if (!frozen) {
682 prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
683 if (!prb_dispatch_next_block(pkc, po))
684 goto refresh_timer;
685 else
686 goto out;
687 } else {
688 /* Case 1. Queue was frozen because user-space was
689 * lagging behind.
690 */
691 if (prb_curr_blk_in_use(pkc, pbd)) {
692 /*
693 * Ok, user-space is still behind.
694 * So just refresh the timer.
695 */
696 goto refresh_timer;
697 } else {
698 /* Case 2. queue was frozen,user-space caught up,
699 * now the link went idle && the timer fired.
700 * We don't have a block to close.So we open this
701 * block and restart the timer.
702 * opening a block thaws the queue,restarts timer
703 * Thawing/timer-refresh is a side effect.
704 */
705 prb_open_block(pkc, pbd);
706 goto out;
707 }
708 }
709 }
710
711refresh_timer:
712 _prb_refresh_rx_retire_blk_timer(pkc);
713
714out:
715 spin_unlock(&po->sk.sk_receive_queue.lock);
716}
717
718static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
719 struct tpacket_block_desc *pbd1, __u32 status)
720{
721 /* Flush everything minus the block header */
722
723#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
724 u8 *start, *end;
725
726 start = (u8 *)pbd1;
727
728 /* Skip the block header(we know header WILL fit in 4K) */
729 start += PAGE_SIZE;
730
731 end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
732 for (; start < end; start += PAGE_SIZE)
733 flush_dcache_page(pgv_to_page(start));
734
735 smp_wmb();
736#endif
737
738 /* Now update the block status. */
739
740 BLOCK_STATUS(pbd1) = status;
741
742 /* Flush the block header */
743
744#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
745 start = (u8 *)pbd1;
746 flush_dcache_page(pgv_to_page(start));
747
748 smp_wmb();
749#endif
750}
751
752/*
753 * Side effect:
754 *
755 * 1) flush the block
756 * 2) Increment active_blk_num
757 *
758 * Note:We DONT refresh the timer on purpose.
759 * Because almost always the next block will be opened.
760 */
761static void prb_close_block(struct tpacket_kbdq_core *pkc1,
762 struct tpacket_block_desc *pbd1,
763 struct packet_sock *po, unsigned int stat)
764{
765 __u32 status = TP_STATUS_USER | stat;
766
767 struct tpacket3_hdr *last_pkt;
768 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
769
770 if (po->stats.tp_drops)
771 status |= TP_STATUS_LOSING;
772
773 last_pkt = (struct tpacket3_hdr *)pkc1->prev;
774 last_pkt->tp_next_offset = 0;
775
776 /* Get the ts of the last pkt */
777 if (BLOCK_NUM_PKTS(pbd1)) {
778 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
779 h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
780 } else {
781 /* Ok, we tmo'd - so get the current time */
782 struct timespec ts;
783 getnstimeofday(&ts);
784 h1->ts_last_pkt.ts_sec = ts.tv_sec;
785 h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
786 }
787
788 smp_wmb();
789
790 /* Flush the block */
791 prb_flush_block(pkc1, pbd1, status);
792
793 pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
794}
795
796static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
797{
798 pkc->reset_pending_on_curr_blk = 0;
799}
800
801/*
802 * Side effect of opening a block:
803 *
804 * 1) prb_queue is thawed.
805 * 2) retire_blk_timer is refreshed.
806 *
807 */
808static void prb_open_block(struct tpacket_kbdq_core *pkc1,
809 struct tpacket_block_desc *pbd1)
810{
811 struct timespec ts;
812 struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
813
814 smp_rmb();
815
816 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
817
818 /* We could have just memset this but we will lose the
819 * flexibility of making the priv area sticky
820 */
821 BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
822 BLOCK_NUM_PKTS(pbd1) = 0;
823 BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
824 getnstimeofday(&ts);
825 h1->ts_first_pkt.ts_sec = ts.tv_sec;
826 h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
827 pkc1->pkblk_start = (char *)pbd1;
828 pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
829 BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
830 BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
831 BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
832 pbd1->version = pkc1->version;
833 pkc1->prev = pkc1->nxt_offset;
834 pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
835 prb_thaw_queue(pkc1);
836 _prb_refresh_rx_retire_blk_timer(pkc1);
837
838 smp_wmb();
839
840 return;
841 }
842
843 WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
844 pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
845 dump_stack();
846 BUG();
847}
848
849/*
850 * Queue freeze logic:
851 * 1) Assume tp_block_nr = 8 blocks.
852 * 2) At time 't0', user opens Rx ring.
853 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
854 * 4) user-space is either sleeping or processing block '0'.
855 * 5) tpacket_rcv is currently filling block '7', since there is no space left,
856 * it will close block-7,loop around and try to fill block '0'.
857 * call-flow:
858 * __packet_lookup_frame_in_block
859 * prb_retire_current_block()
860 * prb_dispatch_next_block()
861 * |->(BLOCK_STATUS == USER) evaluates to true
862 * 5.1) Since block-0 is currently in-use, we just freeze the queue.
863 * 6) Now there are two cases:
864 * 6.1) Link goes idle right after the queue is frozen.
865 * But remember, the last open_block() refreshed the timer.
866 * When this timer expires,it will refresh itself so that we can
867 * re-open block-0 in near future.
868 * 6.2) Link is busy and keeps on receiving packets. This is a simple
869 * case and __packet_lookup_frame_in_block will check if block-0
870 * is free and can now be re-used.
871 */
872static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
873 struct packet_sock *po)
874{
875 pkc->reset_pending_on_curr_blk = 1;
876 po->stats_u.stats3.tp_freeze_q_cnt++;
877}
878
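
The freeze path above only triggers when tpacket_rcv() wraps around to a block that user space still owns; the user-space side of the contract is therefore to hand each block back by writing TP_STATUS_KERNEL into block_status once it has been consumed. A hedged sketch (not part of this patch; ring, block_nr and block_sz are hypothetical names for the mmap'ed area and the tpacket_req3 geometry):

#include <linux/if_packet.h>
#include <poll.h>

/* Hypothetical receive loop: consume blocks in order and release them
 * promptly so the kernel never has to freeze the queue.
 */
static void rx_loop(int fd, char *ring, unsigned int block_nr,
                    unsigned int block_sz)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLERR };
        unsigned int cur = 0;

        for (;;) {
                struct tpacket_block_desc *pbd =
                        (struct tpacket_block_desc *)(ring + cur * block_sz);

                if (!(pbd->hdr.bh1.block_status & TP_STATUS_USER)) {
                        poll(&pfd, 1, -1);      /* wait for the block to retire */
                        continue;
                }
                /* ... walk the packets in pbd here ... */
                __sync_synchronize();
                pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;   /* hand it back */
                cur = (cur + 1) % block_nr;
        }
}
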
879#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
880
881/*
882 * If the next block is free then we will dispatch it
883 * and return a good offset.
884 * Else, we will freeze the queue.
885 * So, caller must check the return value.
886 */
887static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
888 struct packet_sock *po)
889{
890 struct tpacket_block_desc *pbd;
891
892 smp_rmb();
893
894 /* 1. Get current block num */
895 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
896
897 /* 2. If this block is currently in_use then freeze the queue */
898 if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
899 prb_freeze_queue(pkc, po);
900 return NULL;
901 }
902
903 /*
904 * 3.
905 * open this block and return the offset where the first packet
906 * needs to get stored.
907 */
908 prb_open_block(pkc, pbd);
909 return (void *)pkc->nxt_offset;
910}
911
912static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
913 struct packet_sock *po, unsigned int status)
914{
915 struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
916
917 /* retire/close the current block */
918 if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
919 /*
920 * Plug the case where copy_bits() is in progress on
921 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
922 * have space to copy the pkt in the current block and
923 * called prb_retire_current_block()
924 *
925 * We don't need to worry about the TMO case because
926 * the timer-handler already handled this case.
927 */
928 if (!(status & TP_STATUS_BLK_TMO)) {
929 while (atomic_read(&pkc->blk_fill_in_prog)) {
930 /* Waiting for skb_copy_bits to finish... */
931 cpu_relax();
932 }
933 }
934 prb_close_block(pkc, pbd, po, status);
935 return;
936 }
937
938 WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
939 dump_stack();
940 BUG();
941}
942
943static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc,
944 struct tpacket_block_desc *pbd)
945{
946 return TP_STATUS_USER & BLOCK_STATUS(pbd);
947}
948
949static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
950{
951 return pkc->reset_pending_on_curr_blk;
952}
953
954static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
955{
956 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
957 atomic_dec(&pkc->blk_fill_in_prog);
958}
959
960static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
961 struct tpacket3_hdr *ppd)
962{
963 ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
964}
965
966static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
967 struct tpacket3_hdr *ppd)
968{
969 ppd->hv1.tp_rxhash = 0;
970}
971
972static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
973 struct tpacket3_hdr *ppd)
974{
975 if (vlan_tx_tag_present(pkc->skb)) {
976 ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
977 ppd->tp_status = TP_STATUS_VLAN_VALID;
978 } else {
979 ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
980 }
981}
982
983static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
984 struct tpacket3_hdr *ppd)
985{
986 prb_fill_vlan_info(pkc, ppd);
987
988 if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
989 prb_fill_rxhash(pkc, ppd);
990 else
991 prb_clear_rxhash(pkc, ppd);
992}
993
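
prb_run_all_ft_ops() above fills the per-packet rxhash only when the ring was created with TP_FT_REQ_FILL_RXHASH set in tp_feature_req_word; the VLAN info is always filled. A sketch of requesting the feature from user space (illustrative only; the geometry values are arbitrary examples):

/* Hypothetical request: ask the kernel to copy skb->rxhash into each
 * tpacket3_hdr, read back later as ppd->hv1.tp_rxhash.
 */
struct tpacket_req3 req3 = {
        .tp_block_size       = 1 << 20,
        .tp_block_nr         = 64,
        .tp_frame_size       = 2048,
        .tp_frame_nr         = (1 << 20) / 2048 * 64,
        .tp_feature_req_word = TP_FT_REQ_FILL_RXHASH,
};
/* ... passed to setsockopt(fd, SOL_PACKET, PACKET_RX_RING, ...) ... */
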
994static void prb_fill_curr_block(char *curr,
995 struct tpacket_kbdq_core *pkc,
996 struct tpacket_block_desc *pbd,
997 unsigned int len)
998{
999 struct tpacket3_hdr *ppd;
1000
1001 ppd = (struct tpacket3_hdr *)curr;
1002 ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
1003 pkc->prev = curr;
1004 pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
1005 BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
1006 BLOCK_NUM_PKTS(pbd) += 1;
1007 atomic_inc(&pkc->blk_fill_in_prog);
1008 prb_run_all_ft_ops(pkc, ppd);
1009}
1010
1011/* Assumes caller has the sk->rx_queue.lock */
1012static void *__packet_lookup_frame_in_block(struct packet_sock *po,
1013 struct sk_buff *skb,
1014 int status,
1015 unsigned int len
1016 )
1017{
1018 struct tpacket_kbdq_core *pkc;
1019 struct tpacket_block_desc *pbd;
1020 char *curr, *end;
1021
1022 pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
1023 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1024
1025 /* Queue is frozen when user space is lagging behind */
1026 if (prb_queue_frozen(pkc)) {
1027 /*
1028 * Check if that last block which caused the queue to freeze,
1029 * is still in_use by user-space.
1030 */
1031 if (prb_curr_blk_in_use(pkc, pbd)) {
1032 /* Can't record this packet */
1033 return NULL;
1034 } else {
1035 /*
1036 * Ok, the block was released by user-space.
1037 * Now let's open that block.
1038 * opening a block also thaws the queue.
1039 * Thawing is a side effect.
1040 */
1041 prb_open_block(pkc, pbd);
1042 }
1043 }
1044
1045 smp_mb();
1046 curr = pkc->nxt_offset;
1047 pkc->skb = skb;
1048 end = (char *) ((char *)pbd + pkc->kblk_size);
1049
1050 /* first try the current block */
1051 if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
1052 prb_fill_curr_block(curr, pkc, pbd, len);
1053 return (void *)curr;
1054 }
1055
1056 /* Ok, close the current block */
1057 prb_retire_current_block(pkc, po, 0);
1058
1059 /* Now, try to dispatch the next block */
1060 curr = (char *)prb_dispatch_next_block(pkc, po);
1061 if (curr) {
1062 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
1063 prb_fill_curr_block(curr, pkc, pbd, len);
1064 return (void *)curr;
1065 }
1066
1067 /*
1068 * No free blocks are available.user_space hasn't caught up yet.
1069 * Queue was just frozen and now this packet will get dropped.
1070 */
1071 return NULL;
1072}
1073
1074static void *packet_current_rx_frame(struct packet_sock *po,
1075 struct sk_buff *skb,
1076 int status, unsigned int len)
1077{
1078 char *curr = NULL;
1079 switch (po->tp_version) {
1080 case TPACKET_V1:
1081 case TPACKET_V2:
1082 curr = packet_lookup_frame(po, &po->rx_ring,
1083 po->rx_ring.head, status);
1084 return curr;
1085 case TPACKET_V3:
1086 return __packet_lookup_frame_in_block(po, skb, status, len);
1087 default:
1088 WARN(1, "TPACKET version not supported\n");
1089 BUG();
1090 return 0;
1091 }
1092}
1093
1094static void *prb_lookup_block(struct packet_sock *po,
1095 struct packet_ring_buffer *rb,
1096 unsigned int previous,
1097 int status)
1098{
1099 struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
1100 struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
1101
1102 if (status != BLOCK_STATUS(pbd))
1103 return NULL;
1104 return pbd;
1105}
1106
1107static int prb_previous_blk_num(struct packet_ring_buffer *rb)
1108{
1109 unsigned int prev;
1110 if (rb->prb_bdqc.kactive_blk_num)
1111 prev = rb->prb_bdqc.kactive_blk_num-1;
1112 else
1113 prev = rb->prb_bdqc.knum_blocks-1;
1114 return prev;
1115}
1116
1117/* Assumes caller has held the rx_queue.lock */
1118static void *__prb_previous_block(struct packet_sock *po,
1119 struct packet_ring_buffer *rb,
1120 int status)
1121{
1122 unsigned int previous = prb_previous_blk_num(rb);
1123 return prb_lookup_block(po, rb, previous, status);
1124}
1125
1126static void *packet_previous_rx_frame(struct packet_sock *po,
1127 struct packet_ring_buffer *rb,
1128 int status)
1129{
1130 if (po->tp_version <= TPACKET_V2)
1131 return packet_previous_frame(po, rb, status);
1132
1133 return __prb_previous_block(po, rb, status);
1134}
1135
1136static void packet_increment_rx_head(struct packet_sock *po,
1137 struct packet_ring_buffer *rb)
1138{
1139 switch (po->tp_version) {
1140 case TPACKET_V1:
1141 case TPACKET_V2:
1142 return packet_increment_head(rb);
1143 case TPACKET_V3:
1144 default:
1145 WARN(1, "TPACKET version not supported.\n");
1146 BUG();
1147 return;
1148 }
1149}
1150
1151static void *packet_previous_frame(struct packet_sock *po,
 					struct packet_ring_buffer *rb,
 					int status)
 {
@@ -397,7 +1156,7 @@ static inline void *packet_previous_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, previous, status);
 }
 
-static inline void packet_increment_head(struct packet_ring_buffer *buff)
+static void packet_increment_head(struct packet_ring_buffer *buff)
 {
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
 }
@@ -454,43 +1213,6 @@ static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *sk
 	return f->arr[cpu % num];
 }
 
-static struct sk_buff *fanout_check_defrag(struct sk_buff *skb)
-{
-#ifdef CONFIG_INET
-	const struct iphdr *iph;
-	u32 len;
-
-	if (skb->protocol != htons(ETH_P_IP))
-		return skb;
-
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		return skb;
-
-	iph = ip_hdr(skb);
-	if (iph->ihl < 5 || iph->version != 4)
-		return skb;
-	if (!pskb_may_pull(skb, iph->ihl*4))
-		return skb;
-	iph = ip_hdr(skb);
-	len = ntohs(iph->tot_len);
-	if (skb->len < len || len < (iph->ihl * 4))
-		return skb;
-
-	if (ip_is_fragment(ip_hdr(skb))) {
-		skb = skb_share_check(skb, GFP_ATOMIC);
-		if (skb) {
-			if (pskb_trim_rcsum(skb, len))
-				return skb;
-			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-			if (ip_defrag(skb, IP_DEFRAG_AF_PACKET))
-				return NULL;
-			skb->rxhash = 0;
-		}
-	}
-#endif
-	return skb;
-}
-
 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 			     struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -509,7 +1231,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 	case PACKET_FANOUT_HASH:
 	default:
 		if (f->defrag) {
-			skb = fanout_check_defrag(skb);
+			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
 			if (!skb)
 				return 0;
 		}
@@ -836,7 +1558,7 @@ out_free:
 	return err;
 }
 
-static inline unsigned int run_filter(const struct sk_buff *skb,
+static unsigned int run_filter(const struct sk_buff *skb,
 				      const struct sock *sk,
 				      unsigned int res)
 {
@@ -961,7 +1683,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	return 0;
 
 drop_n_acct:
-	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
+	spin_lock(&sk->sk_receive_queue.lock);
+	po->stats.tp_drops++;
+	atomic_inc(&sk->sk_drops);
+	spin_unlock(&sk->sk_receive_queue.lock);
 
 drop_n_restore:
 	if (skb_head != skb->data && skb_shared(skb)) {
@@ -982,12 +1707,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	union {
 		struct tpacket_hdr *h1;
 		struct tpacket2_hdr *h2;
+		struct tpacket3_hdr *h3;
 		void *raw;
 	} h;
 	u8 *skb_head = skb->data;
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
-	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+	unsigned long status = TP_STATUS_USER;
 	unsigned short macoff, netoff, hdrlen;
 	struct sk_buff *copy_skb = NULL;
 	struct timeval tv;
@@ -1033,37 +1759,46 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			po->tp_reserve;
 		macoff = netoff - maclen;
 	}
-
-	if (macoff + snaplen > po->rx_ring.frame_size) {
-		if (po->copy_thresh &&
-		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
-		    (unsigned)sk->sk_rcvbuf) {
-			if (skb_shared(skb)) {
-				copy_skb = skb_clone(skb, GFP_ATOMIC);
-			} else {
-				copy_skb = skb_get(skb);
-				skb_head = skb->data;
-			}
-			if (copy_skb)
-				skb_set_owner_r(copy_skb, sk);
-		}
-		snaplen = po->rx_ring.frame_size - macoff;
-		if ((int)snaplen < 0)
-			snaplen = 0;
+	if (po->tp_version <= TPACKET_V2) {
+		if (macoff + snaplen > po->rx_ring.frame_size) {
+			if (po->copy_thresh &&
+			    atomic_read(&sk->sk_rmem_alloc) + skb->truesize
+			    < (unsigned)sk->sk_rcvbuf) {
+				if (skb_shared(skb)) {
+					copy_skb = skb_clone(skb, GFP_ATOMIC);
+				} else {
+					copy_skb = skb_get(skb);
+					skb_head = skb->data;
+				}
+				if (copy_skb)
+					skb_set_owner_r(copy_skb, sk);
+			}
+			snaplen = po->rx_ring.frame_size - macoff;
+			if ((int)snaplen < 0)
+				snaplen = 0;
+		}
 	}
-
 	spin_lock(&sk->sk_receive_queue.lock);
-	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
+	h.raw = packet_current_rx_frame(po, skb,
+					TP_STATUS_KERNEL, (macoff+snaplen));
 	if (!h.raw)
 		goto ring_is_full;
-	packet_increment_head(&po->rx_ring);
+	if (po->tp_version <= TPACKET_V2) {
+		packet_increment_rx_head(po, &po->rx_ring);
+		/*
+		 * LOSING will be reported till you read the stats,
+		 * because it's COR - Clear On Read.
+		 * Anyways, moving it for V1/V2 only as V3 doesn't need this
+		 * at packet level.
+		 */
+		if (po->stats.tp_drops)
+			status |= TP_STATUS_LOSING;
+	}
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
 	}
-	if (!po->stats.tp_drops)
-		status &= ~TP_STATUS_LOSING;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
@@ -1114,6 +1849,29 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_padding = 0;
 		hdrlen = sizeof(*h.h2);
 		break;
1852 case TPACKET_V3:
1853 /* tp_nxt_offset,vlan are already populated above.
1854 * So DONT clear those fields here
1855 */
1856 h.h3->tp_status |= status;
1857 h.h3->tp_len = skb->len;
1858 h.h3->tp_snaplen = snaplen;
1859 h.h3->tp_mac = macoff;
1860 h.h3->tp_net = netoff;
1861 if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
1862 && shhwtstamps->syststamp.tv64)
1863 ts = ktime_to_timespec(shhwtstamps->syststamp);
1864 else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
1865 && shhwtstamps->hwtstamp.tv64)
1866 ts = ktime_to_timespec(shhwtstamps->hwtstamp);
1867 else if (skb->tstamp.tv64)
1868 ts = ktime_to_timespec(skb->tstamp);
1869 else
1870 getnstimeofday(&ts);
1871 h.h3->tp_sec = ts.tv_sec;
1872 h.h3->tp_nsec = ts.tv_nsec;
1873 hdrlen = sizeof(*h.h3);
1874 break;
 	default:
 		BUG();
 	}
@@ -1134,13 +1892,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	{
 		u8 *start, *end;
 
-		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
-		for (start = h.raw; start < end; start += PAGE_SIZE)
-			flush_dcache_page(pgv_to_page(start));
+		if (po->tp_version <= TPACKET_V2) {
+			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
+				+ macoff + snaplen);
+			for (start = h.raw; start < end; start += PAGE_SIZE)
+				flush_dcache_page(pgv_to_page(start));
+		}
 		smp_wmb();
 	}
 #endif
-	__packet_set_status(po, h.raw, status);
+	if (po->tp_version <= TPACKET_V2)
+		__packet_set_status(po, h.raw, status);
+	else
+		prb_clear_blk_fill_status(&po->rx_ring);
 
 	sk->sk_data_ready(sk, 0);
 
@@ -1167,8 +1931,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 	struct packet_sock *po = pkt_sk(skb->sk);
 	void *ph;
 
-	BUG_ON(skb == NULL);
-
 	if (likely(po->tx_ring.pg_vec)) {
 		ph = skb_shinfo(skb)->destructor_arg;
 		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
@@ -1405,10 +2167,10 @@ out:
 	return err;
 }
 
-static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
+static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 					       size_t reserve, size_t len,
 					       size_t linear, int noblock,
 					       int *err)
 {
 	struct sk_buff *skb;
 
@@ -1631,7 +2393,7 @@ static int packet_release(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
 	struct net *net;
-	struct tpacket_req req;
+	union tpacket_req_u req_u;
 
 	if (!sk)
 		return 0;
@@ -1654,13 +2416,13 @@ static int packet_release(struct socket *sock)
 
 	packet_flush_mclist(sk);
 
-	memset(&req, 0, sizeof(req));
+	memset(&req_u, 0, sizeof(req_u));
 
 	if (po->rx_ring.pg_vec)
-		packet_set_ring(sk, &req, 1, 0);
+		packet_set_ring(sk, &req_u, 1, 0);
 
 	if (po->tx_ring.pg_vec)
-		packet_set_ring(sk, &req, 1, 1);
+		packet_set_ring(sk, &req_u, 1, 1);
 
 	fanout_release(sk);
 
@@ -2280,15 +3042,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 	case PACKET_RX_RING:
 	case PACKET_TX_RING:
 	{
-		struct tpacket_req req;
+		union tpacket_req_u req_u;
+		int len;
 
-		if (optlen < sizeof(req))
+		switch (po->tp_version) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			len = sizeof(req_u.req);
+			break;
+		case TPACKET_V3:
+		default:
+			len = sizeof(req_u.req3);
+			break;
+		}
+		if (optlen < len)
 			return -EINVAL;
 		if (pkt_sk(sk)->has_vnet_hdr)
 			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
+		if (copy_from_user(&req_u.req, optval, len))
 			return -EFAULT;
-		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
+		return packet_set_ring(sk, &req_u, 0,
+			optname == PACKET_TX_RING);
 	}
 	case PACKET_COPY_THRESH:
 	{
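
Taken together with the PACKET_VERSION case below, the user-space setup sequence for the new block-based ring would look roughly like this (illustrative sketch only, not part of the patch; the sizes are arbitrary examples and error handling is omitted):

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <sys/mman.h>
#include <sys/socket.h>

/* Hypothetical helper: create a packet socket, switch it to TPACKET_V3
 * and map a block-based RX ring.
 */
static int setup_v3_rx_ring(void **ring_out)
{
        int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        int ver = TPACKET_V3;
        struct tpacket_req3 req3 = {
                .tp_block_size     = 1 << 20,   /* 1 MB per block */
                .tp_block_nr       = 64,
                .tp_frame_size     = 2048,      /* still validated by packet_set_ring() */
                .tp_frame_nr       = (1 << 20) / 2048 * 64,
                .tp_retire_blk_tov = 60,        /* ms; 0 = derive from link speed */
                .tp_sizeof_priv    = 0,
        };

        setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
        setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req3, sizeof(req3));
        *ring_out = mmap(NULL, (size_t)req3.tp_block_size * req3.tp_block_nr,
                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        return fd;
}
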
@@ -2315,6 +3089,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 	switch (val) {
 	case TPACKET_V1:
 	case TPACKET_V2:
+	case TPACKET_V3:
 		po->tp_version = val;
 		return 0;
 	default:
@@ -2424,6 +3199,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	struct packet_sock *po = pkt_sk(sk);
 	void *data;
 	struct tpacket_stats st;
+	union tpacket_stats_u st_u;
 
 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
@@ -2436,15 +3212,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 
 	switch (optname) {
 	case PACKET_STATISTICS:
-		if (len > sizeof(struct tpacket_stats))
-			len = sizeof(struct tpacket_stats);
+		if (po->tp_version == TPACKET_V3) {
+			len = sizeof(struct tpacket_stats_v3);
+		} else {
+			if (len > sizeof(struct tpacket_stats))
+				len = sizeof(struct tpacket_stats);
+		}
 		spin_lock_bh(&sk->sk_receive_queue.lock);
-		st = po->stats;
+		if (po->tp_version == TPACKET_V3) {
+			memcpy(&st_u.stats3, &po->stats,
+			sizeof(struct tpacket_stats));
+			st_u.stats3.tp_freeze_q_cnt =
+			po->stats_u.stats3.tp_freeze_q_cnt;
+			st_u.stats3.tp_packets += po->stats.tp_drops;
+			data = &st_u.stats3;
+		} else {
+			st = po->stats;
+			st.tp_packets += st.tp_drops;
+			data = &st;
+		}
 		memset(&po->stats, 0, sizeof(st));
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		st.tp_packets += st.tp_drops;
-
-		data = &st;
 		break;
 	case PACKET_AUXDATA:
 		if (len > sizeof(int))
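
With TPACKET_V3 the same PACKET_STATISTICS option now returns the larger struct tpacket_stats_v3, which adds tp_freeze_q_cnt to the classic packet and drop counters. A hedged sketch of reading it from user space (fd being a V3 packet socket such as the one from the setup sketch above):

#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_v3_stats(int fd)
{
        struct tpacket_stats_v3 st;
        socklen_t len = sizeof(st);

        /* Counters are clear-on-read, exactly as for V1/V2. */
        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                printf("packets=%u drops=%u queue freezes=%u\n",
                       st.tp_packets, st.tp_drops, st.tp_freeze_q_cnt);
}
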
@@ -2485,6 +3273,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case TPACKET_V2:
 		val = sizeof(struct tpacket2_hdr);
 		break;
+	case TPACKET_V3:
+		val = sizeof(struct tpacket3_hdr);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -2641,7 +3432,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
-		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
+		if (!packet_previous_rx_frame(po, &po->rx_ring,
+			TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -2702,7 +3494,7 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
 	kfree(pg_vec);
 }
 
-static inline char *alloc_one_pg_vec_page(unsigned long order)
+static char *alloc_one_pg_vec_page(unsigned long order)
 {
 	char *buffer = NULL;
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
@@ -2760,7 +3552,7 @@ out_free_pgvec:
 	goto out;
 }
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
 	struct pgv *pg_vec = NULL;
@@ -2769,7 +3561,15 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	struct packet_ring_buffer *rb;
 	struct sk_buff_head *rb_queue;
 	__be16 num;
-	int err;
+	int err = -EINVAL;
3565 /* Added to avoid minimal code churn */
3566 struct tpacket_req *req = &req_u->req;
3567
3568 /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
3569 if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
3570 WARN(1, "Tx-ring is not supported.\n");
3571 goto out;
3572 }
 
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -2795,6 +3595,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	case TPACKET_V2:
 		po->tp_hdrlen = TPACKET2_HDRLEN;
 		break;
3598 case TPACKET_V3:
3599 po->tp_hdrlen = TPACKET3_HDRLEN;
3600 break;
 	}
 
 	err = -EINVAL;
@@ -2820,6 +3623,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		pg_vec = alloc_pg_vec(req, order);
 		if (unlikely(!pg_vec))
 			goto out;
3626 switch (po->tp_version) {
3627 case TPACKET_V3:
3628 /* Transmit path is not supported. We checked
3629 * it above but just being paranoid
3630 */
3631 if (!tx_ring)
3632 init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
3633 break;
3634 default:
3635 break;
3636 }
 	}
 	/* Done */
 	else {
@@ -2872,7 +3686,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		register_prot_hook(sk);
 	}
 	spin_unlock(&po->bind_lock);
-
+	if (closing && (po->tp_version > TPACKET_V2)) {
3690 /* Because we don't support block-based V3 on tx-ring */
3691 if (!tx_ring)
3692 prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
3693 }
 	release_sock(sk);
 
 	if (pg_vec)