author     Chetan Loke <loke.chetan@gmail.com>       2011-08-19 06:18:16 -0400
committer  David S. Miller <davem@davemloft.net>     2011-08-24 22:40:40 -0400
commit     f6fb8f100b807378fda19e83e5ac6828b638603a (patch)
tree       66d30265f08fbf9745e3feb9af6f5a06fe38d71b /net/packet/af_packet.c
parent     0d4691ce112be025019999df5f2a5e00c03f03c2 (diff)
af-packet: TPACKET_V3 flexible buffer implementation.
1) Blocks can be configured with non-static frame-size.
2) Read/poll is at a block level (as opposed to packet level).
3) Added poll timeout to avoid indefinite user-space wait on idle links.
4) Added user-configurable knobs:
4.1) block::timeout.
4.2) tpkt_hdr::sk_rxhash.
Changes:
C1) tpacket_rcv()
C1.1) packet_current_frame() is replaced by packet_current_rx_frame()
The bulk of the processing is then moved into the following chain:
  packet_current_rx_frame()
    __packet_lookup_frame_in_block()
      fill_curr_block()
      or
      retire_current_block()
      dispatch_next_block()
      or
      return NULL (queue is plugged/paused)
Signed-off-by: Chetan Loke <loke.chetan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/packet/af_packet.c')
 net/packet/af_packet.c | 937 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 891 insertions(+), 46 deletions(-)
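Before the diff itself, a quick orientation for readers new to the interface: the sketch below shows the user-space side this patch is designed against. It is not part of the patch; it assumes the TPACKET_V3 uapi added by the parent commit (struct tpacket_req3 and friends) and elides all error handling.

```c
/* Hedged sketch: minimal TPACKET_V3 RX-ring setup. Assumes the v3
 * uapi from the parent commit; error handling elided. */
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>

static int setup_v3_rx_ring(struct tpacket_req3 *req, void **ring)
{
        int fd, ver = TPACKET_V3;

        fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
        setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));

        memset(req, 0, sizeof(*req));
        req->tp_block_size = 1 << 20;   /* 1MB blocks; see the timer math below */
        req->tp_block_nr   = 64;
        req->tp_frame_size = 2048;      /* still validated, though v3 packs
                                         * frames with non-static sizes */
        req->tp_frame_nr   = (req->tp_block_size / req->tp_frame_size) *
                             req->tp_block_nr;
        req->tp_retire_blk_tov   = 60;  /* knob 4.1: block timeout in ms;
                                         * 0 lets the kernel derive it */
        req->tp_feature_req_word = TP_FT_REQ_FILL_RXHASH;  /* knob 4.2 */
        setsockopt(fd, SOL_PACKET, PACKET_RX_RING, req, sizeof(*req));

        *ring = mmap(NULL, (size_t)req->tp_block_size * req->tp_block_nr,
                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        return fd;
}
```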
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec0a445..4371e3a67789 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,6 +40,10 @@
  * byte arrays at the end of sockaddr_ll
  * and packet_mreq.
  * Johann Baudy : Added TX RING.
+ * Chetan Loke : Implemented TPACKET_V3 block abstraction
+ * layer.
+ * Copyright (C) 2011, <lokec@ccs.neu.edu>
+ *
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -161,9 +165,56 @@ struct packet_mreq_max {
         unsigned char mr_address[MAX_ADDR_LEN];
 };
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                 int closing, int tx_ring);
 
+
+#define V3_ALIGNMENT    (8)
+
+#define BLK_HDR_LEN     (ALIGN(sizeof(struct block_desc), V3_ALIGNMENT))
+
+#define BLK_PLUS_PRIV(sz_of_priv) \
+        (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
+
+/* kbdq - kernel block descriptor queue */
+struct kbdq_core {
+        struct pgv *pkbdq;
+        unsigned int feature_req_word;
+        unsigned int hdrlen;
+        unsigned char reset_pending_on_curr_blk;
+        unsigned char delete_blk_timer;
+        unsigned short kactive_blk_num;
+        unsigned short blk_sizeof_priv;
+
+        /* last_kactive_blk_num:
+         * trick to see if user-space has caught up
+         * in order to avoid refreshing timer when every single pkt arrives.
+         */
+        unsigned short last_kactive_blk_num;
+
+        char *pkblk_start;
+        char *pkblk_end;
+        int kblk_size;
+        unsigned int knum_blocks;
+        uint64_t knxt_seq_num;
+        char *prev;
+        char *nxt_offset;
+        struct sk_buff *skb;
+
+        atomic_t blk_fill_in_prog;
+
+        /* Default is set to 8ms */
+#define DEFAULT_PRB_RETIRE_TOV  (8)
+
+        unsigned short retire_blk_tov;
+        unsigned short version;
+        unsigned long tov_in_jiffies;
+
+        /* timer to retire an outstanding block */
+        struct timer_list retire_blk_timer;
+};
+
+#define PGV_FROM_VMALLOC 1
 struct pgv {
         char *buffer;
 };
@@ -179,12 +230,40 @@ struct packet_ring_buffer {
         unsigned int pg_vec_pages;
         unsigned int pg_vec_len;
 
+        struct kbdq_core prb_bdqc;
         atomic_t pending;
 };
 
+#define BLOCK_STATUS(x)         ((x)->hdr.bh1.block_status)
+#define BLOCK_NUM_PKTS(x)       ((x)->hdr.bh1.num_pkts)
+#define BLOCK_O2FP(x)           ((x)->hdr.bh1.offset_to_first_pkt)
+#define BLOCK_LEN(x)            ((x)->hdr.bh1.blk_len)
+#define BLOCK_SNUM(x)           ((x)->hdr.bh1.seq_num)
+#define BLOCK_O2PRIV(x)         ((x)->offset_to_priv)
+#define BLOCK_PRIV(x)           ((void *)((char *)(x) + BLOCK_O2PRIV(x)))
+
 struct packet_sock;
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
+static void *packet_previous_frame(struct packet_sock *po,
+                struct packet_ring_buffer *rb,
+                int status);
+static void packet_increment_head(struct packet_ring_buffer *buff);
+static int prb_curr_blk_in_use(struct kbdq_core *,
+                struct block_desc *);
+static void *prb_dispatch_next_block(struct kbdq_core *,
+                struct packet_sock *);
+static void prb_retire_current_block(struct kbdq_core *,
+                struct packet_sock *, unsigned int status);
+static int prb_queue_frozen(struct kbdq_core *);
+static void prb_open_block(struct kbdq_core *, struct block_desc *);
+static void prb_retire_rx_blk_timer_expired(unsigned long);
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *);
+static void prb_init_blk_timer(struct packet_sock *, struct kbdq_core *,
+                void (*func) (unsigned long));
+static void prb_fill_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
+static void prb_clear_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
+static void prb_fill_vlan_info(struct kbdq_core *, struct tpacket3_hdr *);
 static void packet_flush_mclist(struct sock *sk);
 
 struct packet_fanout;
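The BLOCK_* accessors above are the kernel-side mirror of how user space walks a retired block. A hedged sketch of the consumer loop follows (structure names as in this series' headers, struct block_desc with hdr.bh1; later kernels rename them with a tpacket_ prefix; consume_pkt() is a hypothetical application callback):

```c
/* Hedged sketch: draining one block after poll() reports it readable. */
#include <linux/if_packet.h>

static void consume_pkt(const char *data, unsigned int len)
{
        /* application-specific; hypothetical stub */
        (void)data; (void)len;
}

static void walk_block(struct block_desc *pbd)
{
        struct tpacket3_hdr *ppd;
        unsigned int i;

        if (!(pbd->hdr.bh1.block_status & TP_STATUS_USER))
                return;         /* block still owned by the kernel */

        ppd = (struct tpacket3_hdr *)((char *)pbd +
                        pbd->hdr.bh1.offset_to_first_pkt);
        for (i = 0; i < pbd->hdr.bh1.num_pkts; i++) {
                /* payload starts at tp_mac; tp_snaplen bytes are valid */
                consume_pkt((char *)ppd + ppd->tp_mac, ppd->tp_snaplen);
                ppd = (struct tpacket3_hdr *)((char *)ppd +
                                ppd->tp_next_offset);
        }

        __sync_synchronize();   /* finish reads before releasing the block */
        pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
}
```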
@@ -193,6 +272,7 @@ struct packet_sock {
         struct sock sk;
         struct packet_fanout *fanout;
         struct tpacket_stats stats;
+        union tpacket_stats_u stats_u;
         struct packet_ring_buffer rx_ring;
         struct packet_ring_buffer tx_ring;
         int copy_thresh;
@@ -242,6 +322,15 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
 
+#define GET_PBDQC_FROM_RB(x)    ((struct kbdq_core *)(&(x)->prb_bdqc))
+#define GET_PBLOCK_DESC(x, bid) \
+        ((struct block_desc *)((x)->pkbdq[(bid)].buffer))
+#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)       \
+        ((struct block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
+#define GET_NEXT_PRB_BLK_NUM(x) \
+        (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
+        ((x)->kactive_blk_num+1) : 0)
+
 static inline struct packet_sock *pkt_sk(struct sock *sk)
 {
         return (struct packet_sock *)sk;
@@ -325,8 +414,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
                 h.h2->tp_status = status;
                 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                 break;
+        case TPACKET_V3:
         default:
-                pr_err("TPACKET version not supported\n");
+                WARN(1, "TPACKET version not supported.\n");
                 BUG();
         }
 
@@ -351,8 +441,9 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
         case TPACKET_V2:
                 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                 return h.h2->tp_status;
+        case TPACKET_V3:
         default:
-                pr_err("TPACKET version not supported\n");
+                WARN(1, "TPACKET version not supported.\n");
                 BUG();
                 return 0;
         }
@@ -389,6 +480,665 @@ static inline void *packet_current_frame(struct packet_sock *po,
         return packet_lookup_frame(po, rb, rb->head, status);
 }
 
+static void prb_del_retire_blk_timer(struct kbdq_core *pkc)
+{
+        del_timer_sync(&pkc->retire_blk_timer);
+}
+
+static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
+                int tx_ring,
+                struct sk_buff_head *rb_queue)
+{
+        struct kbdq_core *pkc;
+
+        pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+
+        spin_lock(&rb_queue->lock);
+        pkc->delete_blk_timer = 1;
+        spin_unlock(&rb_queue->lock);
+
+        prb_del_retire_blk_timer(pkc);
+}
+
+static void prb_init_blk_timer(struct packet_sock *po,
+                struct kbdq_core *pkc,
+                void (*func) (unsigned long))
+{
+        init_timer(&pkc->retire_blk_timer);
+        pkc->retire_blk_timer.data = (long)po;
+        pkc->retire_blk_timer.function = func;
+        pkc->retire_blk_timer.expires = jiffies;
+}
+
+static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+{
+        struct kbdq_core *pkc;
+
+        if (tx_ring)
+                BUG();
+
+        pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+        prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
+}
+
+static int prb_calc_retire_blk_tmo(struct packet_sock *po,
+                                int blk_size_in_bytes)
+{
+        struct net_device *dev;
+        unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
+
+        dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
+        if (unlikely(dev == NULL))
+                return DEFAULT_PRB_RETIRE_TOV;
+
+        if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
+                struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
+
+                if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
+                        switch (ecmd.speed) {
+                        case SPEED_10000:
+                                msec = 1;
+                                div = 10000/1000;
+                                break;
+                        case SPEED_1000:
+                                msec = 1;
+                                div = 1000/1000;
+                                break;
+                        /*
+                         * If the link speed is so slow you don't really
+                         * need to worry about perf anyways
+                         */
+                        case SPEED_100:
+                        case SPEED_10:
+                        default:
+                                return DEFAULT_PRB_RETIRE_TOV;
+                        }
+                }
+        }
+
+        mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
+
+        if (div)
+                mbits /= div;
+
+        tmo = mbits * msec;
+
+        if (div)
+                return tmo+1;
+        return tmo;
+}
+
+static void prb_init_ft_ops(struct kbdq_core *p1,
+                        union tpacket_req_u *req_u)
+{
+        p1->feature_req_word = req_u->req3.tp_feature_req_word;
+}
+
+static void init_prb_bdqc(struct packet_sock *po,
+                        struct packet_ring_buffer *rb,
+                        struct pgv *pg_vec,
+                        union tpacket_req_u *req_u, int tx_ring)
+{
+        struct kbdq_core *p1 = &rb->prb_bdqc;
+        struct block_desc *pbd;
+
+        memset(p1, 0x0, sizeof(*p1));
+
+        p1->knxt_seq_num = 1;
+        p1->pkbdq = pg_vec;
+        pbd = (struct block_desc *)pg_vec[0].buffer;
+        p1->pkblk_start = (char *)pg_vec[0].buffer;
+        p1->kblk_size = req_u->req3.tp_block_size;
+        p1->knum_blocks = req_u->req3.tp_block_nr;
+        p1->hdrlen = po->tp_hdrlen;
+        p1->version = po->tp_version;
+        p1->last_kactive_blk_num = 0;
+        po->stats_u.stats3.tp_freeze_q_cnt = 0;
+        if (req_u->req3.tp_retire_blk_tov)
+                p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+        else
+                p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
+                                                req_u->req3.tp_block_size);
+        p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+        p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
+
+        prb_init_ft_ops(p1, req_u);
+        prb_setup_retire_blk_timer(po, tx_ring);
+        prb_open_block(p1, pbd);
+}
+
+/* Do NOT update the last_blk_num first.
+ * Assumes sk_buff_head lock is held.
+ */
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc)
+{
+        mod_timer(&pkc->retire_blk_timer,
+                        jiffies + pkc->tov_in_jiffies);
+        pkc->last_kactive_blk_num = pkc->kactive_blk_num;
+}
+
+/*
+ * Timer logic:
+ * 1) We refresh the timer only when we open a block.
+ *    By doing this we don't waste cycles refreshing the timer
+ *    on packet-by-packet basis.
+ *
+ * With a 1MB block-size, on a 1Gbps line, it will take
+ * i) ~8 ms to fill a block + ii) memcpy etc.
+ * In this cut we are not accounting for the memcpy time.
+ *
+ * So, if the user sets the 'tmo' to 10ms then the timer
+ * will never fire while the block is still getting filled
+ * (which is what we want). However, the user could choose
+ * to close a block early and that's fine.
+ *
+ * But when the timer does fire, we check whether or not to refresh it.
+ * Since the tmo granularity is in msecs, it is not too expensive
+ * to refresh the timer, lets say every '8' msecs.
+ * Either the user can set the 'tmo' or we can derive it based on
+ * a) line-speed and b) block-size.
+ * prb_calc_retire_blk_tmo() calculates the tmo.
+ *
+ */
+static void prb_retire_rx_blk_timer_expired(unsigned long data)
+{
+        struct packet_sock *po = (struct packet_sock *)data;
+        struct kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+        unsigned int frozen;
+        struct block_desc *pbd;
+
+        spin_lock(&po->sk.sk_receive_queue.lock);
+
+        frozen = prb_queue_frozen(pkc);
+        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+        if (unlikely(pkc->delete_blk_timer))
+                goto out;
+
+        /* We only need to plug the race when the block is partially filled.
+         * tpacket_rcv:
+         *              lock(); increment BLOCK_NUM_PKTS; unlock()
+         *              copy_bits() is in progress ...
+         *              timer fires on other cpu:
+         *              we can't retire the current block because copy_bits
+         *              is in progress.
+         *
+         */
+        if (BLOCK_NUM_PKTS(pbd)) {
+                while (atomic_read(&pkc->blk_fill_in_prog)) {
+                        /* Waiting for skb_copy_bits to finish... */
+                        cpu_relax();
+                }
+        }
+
+        if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
+                if (!frozen) {
+                        prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+                        if (!prb_dispatch_next_block(pkc, po))
+                                goto refresh_timer;
+                        else
+                                goto out;
+                } else {
+                        /* Case 1. Queue was frozen because user-space was
+                         *         lagging behind.
+                         */
+                        if (prb_curr_blk_in_use(pkc, pbd)) {
+                                /*
+                                 * Ok, user-space is still behind.
+                                 * So just refresh the timer.
+                                 */
+                                goto refresh_timer;
+                        } else {
+                                /* Case 2. queue was frozen,user-space caught up,
+                                 * now the link went idle && the timer fired.
+                                 * We don't have a block to close.So we open this
+                                 * block and restart the timer.
+                                 * opening a block thaws the queue,restarts timer
+                                 * Thawing/timer-refresh is a side effect.
+                                 */
+                                prb_open_block(pkc, pbd);
+                                goto out;
+                        }
+                }
+        }
+
+refresh_timer:
+        _prb_refresh_rx_retire_blk_timer(pkc);
+
+out:
+        spin_unlock(&po->sk.sk_receive_queue.lock);
+}
+
+static inline void prb_flush_block(struct kbdq_core *pkc1,
+                struct block_desc *pbd1, __u32 status)
+{
+        /* Flush everything minus the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+        u8 *start, *end;
+
+        start = (u8 *)pbd1;
+
+        /* Skip the block header(we know header WILL fit in 4K) */
+        start += PAGE_SIZE;
+
+        end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
+        for (; start < end; start += PAGE_SIZE)
+                flush_dcache_page(pgv_to_page(start));
+
+        smp_wmb();
+#endif
+
+        /* Now update the block status. */
+
+        BLOCK_STATUS(pbd1) = status;
+
+        /* Flush the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+        start = (u8 *)pbd1;
+        flush_dcache_page(pgv_to_page(start));
+
+        smp_wmb();
+#endif
+}
+
+/*
+ * Side effect:
+ *
+ * 1) flush the block
+ * 2) Increment active_blk_num
+ *
+ * Note:We DONT refresh the timer on purpose.
+ *      Because almost always the next block will be opened.
+ */
+static void prb_close_block(struct kbdq_core *pkc1, struct block_desc *pbd1,
+                struct packet_sock *po, unsigned int stat)
+{
+        __u32 status = TP_STATUS_USER | stat;
+
+        struct tpacket3_hdr *last_pkt;
+        struct hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+        if (po->stats.tp_drops)
+                status |= TP_STATUS_LOSING;
+
+        last_pkt = (struct tpacket3_hdr *)pkc1->prev;
+        last_pkt->tp_next_offset = 0;
+
+        /* Get the ts of the last pkt */
+        if (BLOCK_NUM_PKTS(pbd1)) {
+                h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
+                h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec;
+        } else {
+                /* Ok, we tmo'd - so get the current time */
+                struct timespec ts;
+                getnstimeofday(&ts);
+                h1->ts_last_pkt.ts_sec = ts.tv_sec;
+                h1->ts_last_pkt.ts_nsec = ts.tv_nsec;
+        }
+
+        smp_wmb();
+
+        /* Flush the block */
+        prb_flush_block(pkc1, pbd1, status);
+
+        pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
+}
+
+static inline void prb_thaw_queue(struct kbdq_core *pkc)
+{
+        pkc->reset_pending_on_curr_blk = 0;
+}
+
+/*
+ * Side effect of opening a block:
+ *
+ * 1) prb_queue is thawed.
+ * 2) retire_blk_timer is refreshed.
+ *
+ */
+static void prb_open_block(struct kbdq_core *pkc1, struct block_desc *pbd1)
+{
+        struct timespec ts;
+        struct hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+        smp_rmb();
+
+        if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
+
+                /* We could have just memset this but we will lose the
+                 * flexibility of making the priv area sticky
+                 */
+                BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
+                BLOCK_NUM_PKTS(pbd1) = 0;
+                BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+                getnstimeofday(&ts);
+                h1->ts_first_pkt.ts_sec = ts.tv_sec;
+                h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
+                pkc1->pkblk_start = (char *)pbd1;
+                pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
+                                BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
+                BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+                BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
+                pbd1->version = pkc1->version;
+                pkc1->prev = pkc1->nxt_offset;
+                pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
+                prb_thaw_queue(pkc1);
+                _prb_refresh_rx_retire_blk_timer(pkc1);
+
+                smp_wmb();
+
+                return;
+        }
+
+        WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
+                pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
+        dump_stack();
+        BUG();
+}
+
+/*
+ * Queue freeze logic:
+ * 1) Assume tp_block_nr = 8 blocks.
+ * 2) At time 't0', user opens Rx ring.
+ * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
+ * 4) user-space is either sleeping or processing block '0'.
+ * 5) tpacket_rcv is currently filling block '7', since there is no space left,
+ *    it will close block-7,loop around and try to fill block '0'.
+ *    call-flow:
+ *    __packet_lookup_frame_in_block
+ *      prb_retire_current_block()
+ *      prb_dispatch_next_block()
+ *        |->(BLOCK_STATUS == USER) evaluates to true
+ *    5.1) Since block-0 is currently in-use, we just freeze the queue.
+ * 6) Now there are two cases:
+ *    6.1) Link goes idle right after the queue is frozen.
+ *         But remember, the last open_block() refreshed the timer.
+ *         When this timer expires,it will refresh itself so that we can
+ *         re-open block-0 in near future.
+ *    6.2) Link is busy and keeps on receiving packets. This is a simple
+ *         case and __packet_lookup_frame_in_block will check if block-0
+ *         is free and can now be re-used.
+ */
+static inline void prb_freeze_queue(struct kbdq_core *pkc,
+                                struct packet_sock *po)
+{
+        pkc->reset_pending_on_curr_blk = 1;
+        po->stats_u.stats3.tp_freeze_q_cnt++;
+}
+
+#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
+
+/*
+ * If the next block is free then we will dispatch it
+ * and return a good offset.
+ * Else, we will freeze the queue.
+ * So, caller must check the return value.
+ */
+static void *prb_dispatch_next_block(struct kbdq_core *pkc,
+                struct packet_sock *po)
+{
+        struct block_desc *pbd;
+
+        smp_rmb();
+
+        /* 1. Get current block num */
+        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+        /* 2. If this block is currently in_use then freeze the queue */
+        if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
+                prb_freeze_queue(pkc, po);
+                return NULL;
+        }
+
+        /*
+         * 3.
+         * open this block and return the offset where the first packet
+         * needs to get stored.
+         */
+        prb_open_block(pkc, pbd);
+        return (void *)pkc->nxt_offset;
+}
+
+static void prb_retire_current_block(struct kbdq_core *pkc,
+                struct packet_sock *po, unsigned int status)
+{
+        struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+        /* retire/close the current block */
+        if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
+                /*
+                 * Plug the case where copy_bits() is in progress on
+                 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
+                 * have space to copy the pkt in the current block and
+                 * called prb_retire_current_block()
+                 *
+                 * We don't need to worry about the TMO case because
+                 * the timer-handler already handled this case.
+                 */
+                if (!(status & TP_STATUS_BLK_TMO)) {
+                        while (atomic_read(&pkc->blk_fill_in_prog)) {
+                                /* Waiting for skb_copy_bits to finish... */
+                                cpu_relax();
+                        }
+                }
+                prb_close_block(pkc, pbd, po, status);
+                return;
+        }
+
+        WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
+        dump_stack();
+        BUG();
+}
+
+static inline int prb_curr_blk_in_use(struct kbdq_core *pkc,
+                                      struct block_desc *pbd)
+{
+        return TP_STATUS_USER & BLOCK_STATUS(pbd);
+}
+
+static inline int prb_queue_frozen(struct kbdq_core *pkc)
+{
+        return pkc->reset_pending_on_curr_blk;
+}
+
+static inline void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
+{
+        struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
+        atomic_dec(&pkc->blk_fill_in_prog);
+}
+
+static inline void prb_fill_rxhash(struct kbdq_core *pkc,
+                        struct tpacket3_hdr *ppd)
+{
+        ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
+}
+
+static inline void prb_clear_rxhash(struct kbdq_core *pkc,
+                        struct tpacket3_hdr *ppd)
+{
+        ppd->hv1.tp_rxhash = 0;
+}
+
+static inline void prb_fill_vlan_info(struct kbdq_core *pkc,
+                        struct tpacket3_hdr *ppd)
+{
+        if (vlan_tx_tag_present(pkc->skb)) {
+                ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
+                ppd->tp_status = TP_STATUS_VLAN_VALID;
+        } else {
+                ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
+        }
+}
+
+static void prb_run_all_ft_ops(struct kbdq_core *pkc,
+                        struct tpacket3_hdr *ppd)
+{
+        prb_fill_vlan_info(pkc, ppd);
+
+        if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
+                prb_fill_rxhash(pkc, ppd);
+        else
+                prb_clear_rxhash(pkc, ppd);
+}
+
+static inline void prb_fill_curr_block(char *curr, struct kbdq_core *pkc,
+                        struct block_desc *pbd,
+                        unsigned int len)
+{
+        struct tpacket3_hdr *ppd;
+
+        ppd = (struct tpacket3_hdr *)curr;
+        ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
+        pkc->prev = curr;
+        pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
+        BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
+        BLOCK_NUM_PKTS(pbd) += 1;
+        atomic_inc(&pkc->blk_fill_in_prog);
+        prb_run_all_ft_ops(pkc, ppd);
+}
+
+/* Assumes caller has the sk->rx_queue.lock */
+static void *__packet_lookup_frame_in_block(struct packet_sock *po,
+                                            struct sk_buff *skb,
+                                            int status,
+                                            unsigned int len
+                                            )
+{
+        struct kbdq_core *pkc;
+        struct block_desc *pbd;
+        char *curr, *end;
+
+        pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
+        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+        /* Queue is frozen when user space is lagging behind */
+        if (prb_queue_frozen(pkc)) {
+                /*
+                 * Check if that last block which caused the queue to freeze,
+                 * is still in_use by user-space.
+                 */
+                if (prb_curr_blk_in_use(pkc, pbd)) {
+                        /* Can't record this packet */
+                        return NULL;
+                } else {
+                        /*
+                         * Ok, the block was released by user-space.
+                         * Now let's open that block.
+                         * opening a block also thaws the queue.
+                         * Thawing is a side effect.
+                         */
+                        prb_open_block(pkc, pbd);
+                }
+        }
+
+        smp_mb();
+        curr = pkc->nxt_offset;
+        pkc->skb = skb;
+        end = (char *) ((char *)pbd + pkc->kblk_size);
+
+        /* first try the current block */
+        if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
+                prb_fill_curr_block(curr, pkc, pbd, len);
+                return (void *)curr;
+        }
+
+        /* Ok, close the current block */
+        prb_retire_current_block(pkc, po, 0);
+
+        /* Now, try to dispatch the next block */
+        curr = (char *)prb_dispatch_next_block(pkc, po);
+        if (curr) {
+                pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+                prb_fill_curr_block(curr, pkc, pbd, len);
+                return (void *)curr;
+        }
+
+        /*
+         * No free blocks are available.user_space hasn't caught up yet.
+         * Queue was just frozen and now this packet will get dropped.
+         */
+        return NULL;
+}
+
+static inline void *packet_current_rx_frame(struct packet_sock *po,
+                                            struct sk_buff *skb,
+                                            int status, unsigned int len)
+{
+        char *curr = NULL;
+        switch (po->tp_version) {
+        case TPACKET_V1:
+        case TPACKET_V2:
+                curr = packet_lookup_frame(po, &po->rx_ring,
+                                        po->rx_ring.head, status);
+                return curr;
+        case TPACKET_V3:
+                return __packet_lookup_frame_in_block(po, skb, status, len);
+        default:
+                WARN(1, "TPACKET version not supported\n");
+                BUG();
+                return 0;
+        }
+}
+
+static inline void *prb_lookup_block(struct packet_sock *po,
+                                     struct packet_ring_buffer *rb,
+                                     unsigned int previous,
+                                     int status)
+{
+        struct kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
+        struct block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
+
+        if (status != BLOCK_STATUS(pbd))
+                return NULL;
+        return pbd;
+}
+
+static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
+{
+        unsigned int prev;
+        if (rb->prb_bdqc.kactive_blk_num)
+                prev = rb->prb_bdqc.kactive_blk_num-1;
+        else
+                prev = rb->prb_bdqc.knum_blocks-1;
+        return prev;
+}
+
+/* Assumes caller has held the rx_queue.lock */
+static inline void *__prb_previous_block(struct packet_sock *po,
+                                         struct packet_ring_buffer *rb,
+                                         int status)
+{
+        unsigned int previous = prb_previous_blk_num(rb);
+        return prb_lookup_block(po, rb, previous, status);
+}
+
+static inline void *packet_previous_rx_frame(struct packet_sock *po,
+                                             struct packet_ring_buffer *rb,
+                                             int status)
+{
+        if (po->tp_version <= TPACKET_V2)
+                return packet_previous_frame(po, rb, status);
+
+        return __prb_previous_block(po, rb, status);
+}
+
+static inline void packet_increment_rx_head(struct packet_sock *po,
+                                            struct packet_ring_buffer *rb)
+{
+        switch (po->tp_version) {
+        case TPACKET_V1:
+        case TPACKET_V2:
+                return packet_increment_head(rb);
+        case TPACKET_V3:
+        default:
+                WARN(1, "TPACKET version not supported.\n");
+                BUG();
+                return;
+        }
+}
+
 static inline void *packet_previous_frame(struct packet_sock *po,
                 struct packet_ring_buffer *rb,
                 int status)
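Worked example of the prb_calc_retire_blk_tmo() heuristic above: with a 1MB block, mbits = (1048576 * 8) / (1024 * 1024) = 8. On a 1Gbps link div = 1, so the timeout is 8 * 1 + 1 = 9ms; on 10Gbps, 8/10 truncates to 0 and the +1 gives a 1ms floor. A user-side replica follows (a sketch, assuming you want to pre-compute knob 4.1 rather than let the kernel derive it):

```c
/* Hedged sketch: same heuristic as prb_calc_retire_blk_tmo(), usable
 * to pre-compute tp_retire_blk_tov in user space. */
static unsigned int retire_tmo_ms(unsigned int blk_size_bytes,
                                  unsigned int link_mbps)
{
        unsigned int mbits = (blk_size_bytes * 8) / (1024 * 1024);
        unsigned int div = link_mbps / 1000;    /* 1G -> 1, 10G -> 10 */

        if (!div)               /* <= 100Mbps: perf doesn't matter */
                return 8;       /* DEFAULT_PRB_RETIRE_TOV */
        return (mbits / div) + 1;
}
```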
@@ -982,12 +1732,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
         union {
                 struct tpacket_hdr *h1;
                 struct tpacket2_hdr *h2;
+                struct tpacket3_hdr *h3;
                 void *raw;
         } h;
         u8 *skb_head = skb->data;
         int skb_len = skb->len;
         unsigned int snaplen, res;
-        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+        unsigned long status = TP_STATUS_USER;
         unsigned short macoff, netoff, hdrlen;
         struct sk_buff *copy_skb = NULL;
         struct timeval tv;
@@ -1033,37 +1784,46 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                                 po->tp_reserve;
                 macoff = netoff - maclen;
         }
-
-        if (macoff + snaplen > po->rx_ring.frame_size) {
-                if (po->copy_thresh &&
-                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
-                    (unsigned)sk->sk_rcvbuf) {
-                        if (skb_shared(skb)) {
-                                copy_skb = skb_clone(skb, GFP_ATOMIC);
-                        } else {
-                                copy_skb = skb_get(skb);
-                                skb_head = skb->data;
-                        }
-                        if (copy_skb)
-                                skb_set_owner_r(copy_skb, sk);
-                }
-                snaplen = po->rx_ring.frame_size - macoff;
-                if ((int)snaplen < 0)
-                        snaplen = 0;
-        }
-
+        if (po->tp_version <= TPACKET_V2) {
+                if (macoff + snaplen > po->rx_ring.frame_size) {
+                        if (po->copy_thresh &&
+                            atomic_read(&sk->sk_rmem_alloc) + skb->truesize
+                            < (unsigned)sk->sk_rcvbuf) {
+                                if (skb_shared(skb)) {
+                                        copy_skb = skb_clone(skb, GFP_ATOMIC);
+                                } else {
+                                        copy_skb = skb_get(skb);
+                                        skb_head = skb->data;
+                                }
+                                if (copy_skb)
+                                        skb_set_owner_r(copy_skb, sk);
+                        }
+                        snaplen = po->rx_ring.frame_size - macoff;
+                        if ((int)snaplen < 0)
+                                snaplen = 0;
+                }
+        }
         spin_lock(&sk->sk_receive_queue.lock);
-        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
+        h.raw = packet_current_rx_frame(po, skb,
+                        TP_STATUS_KERNEL, (macoff+snaplen));
         if (!h.raw)
                 goto ring_is_full;
-        packet_increment_head(&po->rx_ring);
+        if (po->tp_version <= TPACKET_V2) {
+                packet_increment_rx_head(po, &po->rx_ring);
+                /*
+                 * LOSING will be reported till you read the stats,
+                 * because it's COR - Clear On Read.
+                 * Anyways, moving it for V1/V2 only as V3 doesn't need this
+                 * at packet level.
+                 */
+                if (po->stats.tp_drops)
+                        status |= TP_STATUS_LOSING;
+        }
         po->stats.tp_packets++;
         if (copy_skb) {
                 status |= TP_STATUS_COPY;
                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
         }
-        if (!po->stats.tp_drops)
-                status &= ~TP_STATUS_LOSING;
         spin_unlock(&sk->sk_receive_queue.lock);
 
         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
@@ -1114,6 +1874,29 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                 h.h2->tp_padding = 0;
                 hdrlen = sizeof(*h.h2);
                 break;
+        case TPACKET_V3:
+                /* tp_nxt_offset,vlan are already populated above.
+                 * So DONT clear those fields here
+                 */
+                h.h3->tp_status |= status;
+                h.h3->tp_len = skb->len;
+                h.h3->tp_snaplen = snaplen;
+                h.h3->tp_mac = macoff;
+                h.h3->tp_net = netoff;
+                if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+                                && shhwtstamps->syststamp.tv64)
+                        ts = ktime_to_timespec(shhwtstamps->syststamp);
+                else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+                                && shhwtstamps->hwtstamp.tv64)
+                        ts = ktime_to_timespec(shhwtstamps->hwtstamp);
+                else if (skb->tstamp.tv64)
+                        ts = ktime_to_timespec(skb->tstamp);
+                else
+                        getnstimeofday(&ts);
+                h.h3->tp_sec = ts.tv_sec;
+                h.h3->tp_nsec = ts.tv_nsec;
+                hdrlen = sizeof(*h.h3);
+                break;
         default:
                 BUG();
         }
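User space reads the result of this timestamp selection straight out of each tpacket3_hdr; the block descriptor additionally carries ts_first_pkt/ts_last_pkt, filled by prb_open_block()/prb_close_block() above. A trivial hedged sketch:

```c
/* Hedged sketch: recovering the per-packet timestamp chosen above. */
#include <linux/if_packet.h>
#include <time.h>

static struct timespec pkt_ts(const struct tpacket3_hdr *ppd)
{
        struct timespec ts = {
                .tv_sec  = ppd->tp_sec,
                .tv_nsec = ppd->tp_nsec,
        };
        return ts;
}
```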
@@ -1134,13 +1917,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
         {
                 u8 *start, *end;
 
-                end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
-                for (start = h.raw; start < end; start += PAGE_SIZE)
-                        flush_dcache_page(pgv_to_page(start));
+                if (po->tp_version <= TPACKET_V2) {
+                        end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
+                                + macoff + snaplen);
+                        for (start = h.raw; start < end; start += PAGE_SIZE)
+                                flush_dcache_page(pgv_to_page(start));
+                }
                 smp_wmb();
         }
 #endif
-        __packet_set_status(po, h.raw, status);
+        if (po->tp_version <= TPACKET_V2)
+                __packet_set_status(po, h.raw, status);
+        else
+                prb_clear_blk_fill_status(&po->rx_ring);
 
         sk->sk_data_ready(sk, 0);
 
@@ -1631,7 +2420,7 @@ static int packet_release(struct socket *sock)
         struct sock *sk = sock->sk;
         struct packet_sock *po;
         struct net *net;
-        struct tpacket_req req;
+        union tpacket_req_u req_u;
 
         if (!sk)
                 return 0;
@@ -1654,13 +2443,13 @@ static int packet_release(struct socket *sock)
 
         packet_flush_mclist(sk);
 
-        memset(&req, 0, sizeof(req));
+        memset(&req_u, 0, sizeof(req_u));
 
         if (po->rx_ring.pg_vec)
-                packet_set_ring(sk, &req, 1, 0);
+                packet_set_ring(sk, &req_u, 1, 0);
 
         if (po->tx_ring.pg_vec)
-                packet_set_ring(sk, &req, 1, 1);
+                packet_set_ring(sk, &req_u, 1, 1);
 
         fanout_release(sk);
 
@@ -2280,15 +3069,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
         case PACKET_RX_RING:
         case PACKET_TX_RING:
         {
-                struct tpacket_req req;
+                union tpacket_req_u req_u;
+                int len;
 
-                if (optlen < sizeof(req))
+                switch (po->tp_version) {
+                case TPACKET_V1:
+                case TPACKET_V2:
+                        len = sizeof(req_u.req);
+                        break;
+                case TPACKET_V3:
+                default:
+                        len = sizeof(req_u.req3);
+                        break;
+                }
+                if (optlen < len)
                         return -EINVAL;
                 if (pkt_sk(sk)->has_vnet_hdr)
                         return -EINVAL;
-                if (copy_from_user(&req, optval, sizeof(req)))
+                if (copy_from_user(&req_u.req, optval, len))
                         return -EFAULT;
-                return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
+                return packet_set_ring(sk, &req_u, 0,
+                        optname == PACKET_TX_RING);
         }
         case PACKET_COPY_THRESH:
         {
@@ -2315,6 +3116,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                 switch (val) {
                 case TPACKET_V1:
                 case TPACKET_V2:
+                case TPACKET_V3:
                         po->tp_version = val;
                         return 0;
                 default:
@@ -2424,6 +3226,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
         struct packet_sock *po = pkt_sk(sk);
         void *data;
         struct tpacket_stats st;
+        union tpacket_stats_u st_u;
 
         if (level != SOL_PACKET)
                 return -ENOPROTOOPT;
@@ -2436,15 +3239,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 
         switch (optname) {
         case PACKET_STATISTICS:
-                if (len > sizeof(struct tpacket_stats))
-                        len = sizeof(struct tpacket_stats);
+                if (po->tp_version == TPACKET_V3) {
+                        len = sizeof(struct tpacket_stats_v3);
+                } else {
+                        if (len > sizeof(struct tpacket_stats))
+                                len = sizeof(struct tpacket_stats);
+                }
                 spin_lock_bh(&sk->sk_receive_queue.lock);
-                st = po->stats;
+                if (po->tp_version == TPACKET_V3) {
+                        memcpy(&st_u.stats3, &po->stats,
+                                sizeof(struct tpacket_stats));
+                        st_u.stats3.tp_freeze_q_cnt =
+                                po->stats_u.stats3.tp_freeze_q_cnt;
+                        st_u.stats3.tp_packets += po->stats.tp_drops;
+                        data = &st_u.stats3;
+                } else {
+                        st = po->stats;
+                        st.tp_packets += st.tp_drops;
+                        data = &st;
+                }
                 memset(&po->stats, 0, sizeof(st));
                 spin_unlock_bh(&sk->sk_receive_queue.lock);
-                st.tp_packets += st.tp_drops;
-
-                data = &st;
                 break;
         case PACKET_AUXDATA:
                 if (len > sizeof(int))
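The v3 statistics now expose the freeze count maintained by prb_freeze_queue(). A hedged sketch of reading them; note that PACKET_STATISTICS stays clear-on-read, so each call returns deltas since the previous one:

```c
/* Hedged sketch: fetching TPACKET_V3 socket statistics. */
#include <linux/if_packet.h>
#include <stdio.h>
#include <sys/socket.h>

static void dump_v3_stats(int fd)
{
        struct tpacket_stats_v3 st;
        socklen_t len = sizeof(st);

        if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
                printf("pkts=%u drops=%u queue-freezes=%u\n",
                       st.tp_packets, st.tp_drops, st.tp_freeze_q_cnt);
}
```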
@@ -2485,6 +3300,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
         case TPACKET_V2:
                 val = sizeof(struct tpacket2_hdr);
                 break;
+        case TPACKET_V3:
+                val = sizeof(struct tpacket3_hdr);
+                break;
         default:
                 return -EINVAL;
         }
@@ -2641,7 +3459,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 
         spin_lock_bh(&sk->sk_receive_queue.lock);
         if (po->rx_ring.pg_vec) {
-                if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
+                if (!packet_previous_rx_frame(po, &po->rx_ring,
+                        TP_STATUS_KERNEL))
                         mask |= POLLIN | POLLRDNORM;
         }
         spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -2760,7 +3579,7 @@ out_free_pgvec:
         goto out;
 }
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                 int closing, int tx_ring)
 {
         struct pgv *pg_vec = NULL;
@@ -2769,7 +3588,15 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
         struct packet_ring_buffer *rb;
         struct sk_buff_head *rb_queue;
         __be16 num;
-        int err;
+        int err = -EINVAL;
+        /* Added to avoid minimal code churn */
+        struct tpacket_req *req = &req_u->req;
+
+        /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
+        if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
+                WARN(1, "Tx-ring is not supported.\n");
+                goto out;
+        }
 
         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -2795,6 +3622,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
         case TPACKET_V2:
                 po->tp_hdrlen = TPACKET2_HDRLEN;
                 break;
+        case TPACKET_V3:
+                po->tp_hdrlen = TPACKET3_HDRLEN;
+                break;
         }
 
         err = -EINVAL;
@@ -2820,6 +3650,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                 pg_vec = alloc_pg_vec(req, order);
                 if (unlikely(!pg_vec))
                         goto out;
+                switch (po->tp_version) {
+                case TPACKET_V3:
+                        /* Transmit path is not supported. We checked
+                         * it above but just being paranoid
+                         */
+                        if (!tx_ring)
+                                init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+                        break;
+                default:
+                        break;
+                }
         }
         /* Done */
         else {
@@ -2872,7 +3713,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                 register_prot_hook(sk);
         }
         spin_unlock(&po->bind_lock);
-
+        if (closing && (po->tp_version > TPACKET_V2)) {
+                /* Because we don't support block-based V3 on tx-ring */
+                if (!tx_ring)
+                        prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+        }
         release_sock(sk);
 
         if (pg_vec)
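Teardown, for completeness: user space only has to unmap and close; packet_release() then zeroes a req_u and calls packet_set_ring(sk, &req_u, 1, ...) as shown above, which is also what finally stops the retire-block timer. A hedged sketch:

```c
/* Hedged sketch: releasing a v3 ring from user space. */
#include <sys/mman.h>
#include <unistd.h>

static void teardown_v3_ring(int fd, void *ring, size_t ring_sz)
{
        munmap(ring, ring_sz);  /* ring_sz = tp_block_size * tp_block_nr */
        close(fd);              /* drives packet_set_ring(..., closing=1) */
}
```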