Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--	net/packet/af_packet.c	1012
1 file changed, 915 insertions, 97 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec0a445..82a6f34d39d0 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,6 +40,10 @@ | |||
40 | * byte arrays at the end of sockaddr_ll | 40 | * byte arrays at the end of sockaddr_ll |
41 | * and packet_mreq. | 41 | * and packet_mreq. |
42 | * Johann Baudy : Added TX RING. | 42 | * Johann Baudy : Added TX RING. |
43 | * Chetan Loke : Implemented TPACKET_V3 block abstraction | ||
44 | * layer. | ||
45 | * Copyright (C) 2011, <lokec@ccs.neu.edu> | ||
46 | * | ||
43 | * | 47 | * |
44 | * This program is free software; you can redistribute it and/or | 48 | * This program is free software; you can redistribute it and/or |
45 | * modify it under the terms of the GNU General Public License | 49 | * modify it under the terms of the GNU General Public License |
@@ -161,9 +165,56 @@ struct packet_mreq_max { | |||
161 | unsigned char mr_address[MAX_ADDR_LEN]; | 165 | unsigned char mr_address[MAX_ADDR_LEN]; |
162 | }; | 166 | }; |
163 | 167 | ||
164 | static int packet_set_ring(struct sock *sk, struct tpacket_req *req, | 168 | static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, |
165 | int closing, int tx_ring); | 169 | int closing, int tx_ring); |
166 | 170 | ||
171 | |||
172 | #define V3_ALIGNMENT (8) | ||
173 | |||
174 | #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) | ||
175 | |||
176 | #define BLK_PLUS_PRIV(sz_of_priv) \ | ||
177 | (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) | ||
178 | |||
179 | /* kbdq - kernel block descriptor queue */ | ||
180 | struct tpacket_kbdq_core { | ||
181 | struct pgv *pkbdq; | ||
182 | unsigned int feature_req_word; | ||
183 | unsigned int hdrlen; | ||
184 | unsigned char reset_pending_on_curr_blk; | ||
185 | unsigned char delete_blk_timer; | ||
186 | unsigned short kactive_blk_num; | ||
187 | unsigned short blk_sizeof_priv; | ||
188 | |||
189 | /* last_kactive_blk_num: | ||
190 | * trick to see if user-space has caught up | ||
191 | * in order to avoid refreshing timer when every single pkt arrives. | ||
192 | */ | ||
193 | unsigned short last_kactive_blk_num; | ||
194 | |||
195 | char *pkblk_start; | ||
196 | char *pkblk_end; | ||
197 | int kblk_size; | ||
198 | unsigned int knum_blocks; | ||
199 | uint64_t knxt_seq_num; | ||
200 | char *prev; | ||
201 | char *nxt_offset; | ||
202 | struct sk_buff *skb; | ||
203 | |||
204 | atomic_t blk_fill_in_prog; | ||
205 | |||
206 | /* Default is set to 8ms */ | ||
207 | #define DEFAULT_PRB_RETIRE_TOV (8) | ||
208 | |||
209 | unsigned short retire_blk_tov; | ||
210 | unsigned short version; | ||
211 | unsigned long tov_in_jiffies; | ||
212 | |||
213 | /* timer to retire an outstanding block */ | ||
214 | struct timer_list retire_blk_timer; | ||
215 | }; | ||
216 | |||
217 | #define PGV_FROM_VMALLOC 1 | ||
167 | struct pgv { | 218 | struct pgv { |
168 | char *buffer; | 219 | char *buffer; |
169 | }; | 220 | }; |
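
For orientation, the V3_ALIGNMENT/BLK_HDR_LEN/BLK_PLUS_PRIV macros above fix the block layout: an 8-byte-aligned block descriptor, an optional per-block private area, then the packets. A minimal user-space sketch (not part of this patch; the MY_* names are local stand-ins and ALIGN() is re-implemented because it is a kernel-only macro) shows where the first packet of a block ends up for a given tp_sizeof_priv:

/* Sketch, not from the commit: mirrors V3_ALIGNMENT/BLK_HDR_LEN/BLK_PLUS_PRIV
 * from user space to see where the first packet of a block will start.
 */
#include <stdio.h>
#include <linux/if_packet.h>	/* struct tpacket_block_desc (TPACKET_V3 ABI) */

#define MY_ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define MY_V3_ALIGNMENT	8
#define MY_BLK_HDR_LEN	MY_ALIGN(sizeof(struct tpacket_block_desc), MY_V3_ALIGNMENT)

int main(void)
{
	size_t sizeof_priv = 16;	/* example value for tp_sizeof_priv */
	size_t first_pkt_off = MY_BLK_HDR_LEN + MY_ALIGN(sizeof_priv, MY_V3_ALIGNMENT);

	/* first_pkt_off is what the kernel later stores in offset_to_first_pkt */
	printf("block header %zu bytes, first packet at offset %zu\n",
	       (size_t)MY_BLK_HDR_LEN, first_pkt_off);
	return 0;
}
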
@@ -179,12 +230,44 @@ struct packet_ring_buffer { | |||
179 | unsigned int pg_vec_pages; | 230 | unsigned int pg_vec_pages; |
180 | unsigned int pg_vec_len; | 231 | unsigned int pg_vec_len; |
181 | 232 | ||
233 | struct tpacket_kbdq_core prb_bdqc; | ||
182 | atomic_t pending; | 234 | atomic_t pending; |
183 | }; | 235 | }; |
184 | 236 | ||
237 | #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) | ||
238 | #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) | ||
239 | #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) | ||
240 | #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) | ||
241 | #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) | ||
242 | #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) | ||
243 | #define BLOCK_PRIV(x) ((void *)((char *)(x) + BLOCK_O2PRIV(x))) | ||
244 | |||
185 | struct packet_sock; | 245 | struct packet_sock; |
186 | static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); | 246 | static int tpacket_snd(struct packet_sock *po, struct msghdr *msg); |
187 | 247 | ||
248 | static void *packet_previous_frame(struct packet_sock *po, | ||
249 | struct packet_ring_buffer *rb, | ||
250 | int status); | ||
251 | static void packet_increment_head(struct packet_ring_buffer *buff); | ||
252 | static int prb_curr_blk_in_use(struct tpacket_kbdq_core *, | ||
253 | struct tpacket_block_desc *); | ||
254 | static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, | ||
255 | struct packet_sock *); | ||
256 | static void prb_retire_current_block(struct tpacket_kbdq_core *, | ||
257 | struct packet_sock *, unsigned int status); | ||
258 | static int prb_queue_frozen(struct tpacket_kbdq_core *); | ||
259 | static void prb_open_block(struct tpacket_kbdq_core *, | ||
260 | struct tpacket_block_desc *); | ||
261 | static void prb_retire_rx_blk_timer_expired(unsigned long); | ||
262 | static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); | ||
263 | static void prb_init_blk_timer(struct packet_sock *, | ||
264 | struct tpacket_kbdq_core *, | ||
265 | void (*func) (unsigned long)); | ||
266 | static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); | ||
267 | static void prb_clear_rxhash(struct tpacket_kbdq_core *, | ||
268 | struct tpacket3_hdr *); | ||
269 | static void prb_fill_vlan_info(struct tpacket_kbdq_core *, | ||
270 | struct tpacket3_hdr *); | ||
188 | static void packet_flush_mclist(struct sock *sk); | 271 | static void packet_flush_mclist(struct sock *sk); |
189 | 272 | ||
190 | struct packet_fanout; | 273 | struct packet_fanout; |
@@ -193,6 +276,7 @@ struct packet_sock { | |||
193 | struct sock sk; | 276 | struct sock sk; |
194 | struct packet_fanout *fanout; | 277 | struct packet_fanout *fanout; |
195 | struct tpacket_stats stats; | 278 | struct tpacket_stats stats; |
279 | union tpacket_stats_u stats_u; | ||
196 | struct packet_ring_buffer rx_ring; | 280 | struct packet_ring_buffer rx_ring; |
197 | struct packet_ring_buffer tx_ring; | 281 | struct packet_ring_buffer tx_ring; |
198 | int copy_thresh; | 282 | int copy_thresh; |
@@ -242,7 +326,16 @@ struct packet_skb_cb { | |||
242 | 326 | ||
243 | #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) | 327 | #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) |
244 | 328 | ||
245 | static inline struct packet_sock *pkt_sk(struct sock *sk) | 329 | #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) |
330 | #define GET_PBLOCK_DESC(x, bid) \ | ||
331 | ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) | ||
332 | #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ | ||
333 | ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) | ||
334 | #define GET_NEXT_PRB_BLK_NUM(x) \ | ||
335 | (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ | ||
336 | ((x)->kactive_blk_num+1) : 0) | ||
337 | |||
338 | static struct packet_sock *pkt_sk(struct sock *sk) | ||
246 | { | 339 | { |
247 | return (struct packet_sock *)sk; | 340 | return (struct packet_sock *)sk; |
248 | } | 341 | } |
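
A small user-space counterpart of these accessors (not part of this patch): in the kernel, block 'bid' is reached through pg_vec[bid].buffer, while after mmap() the same blocks appear back to back, so block i starts at map + i * tp_block_size; the modulo below mirrors GET_NEXT_PRB_BLK_NUM's wrap-around.

/* Sketch, not from the commit: the user-space view of the same block ring. */
#include <stddef.h>
#include <linux/if_packet.h>

static struct tpacket_block_desc *user_block(void *map, size_t block_size,
					     unsigned int block_nr, unsigned int i)
{
	return (struct tpacket_block_desc *)
		((char *)map + (size_t)(i % block_nr) * block_size);
}
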
@@ -325,8 +418,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status) | |||
325 | h.h2->tp_status = status; | 418 | h.h2->tp_status = status; |
326 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); | 419 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); |
327 | break; | 420 | break; |
421 | case TPACKET_V3: | ||
328 | default: | 422 | default: |
329 | pr_err("TPACKET version not supported\n"); | 423 | WARN(1, "TPACKET version not supported.\n"); |
330 | BUG(); | 424 | BUG(); |
331 | } | 425 | } |
332 | 426 | ||
@@ -351,8 +445,9 @@ static int __packet_get_status(struct packet_sock *po, void *frame) | |||
351 | case TPACKET_V2: | 445 | case TPACKET_V2: |
352 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); | 446 | flush_dcache_page(pgv_to_page(&h.h2->tp_status)); |
353 | return h.h2->tp_status; | 447 | return h.h2->tp_status; |
448 | case TPACKET_V3: | ||
354 | default: | 449 | default: |
355 | pr_err("TPACKET version not supported\n"); | 450 | WARN(1, "TPACKET version not supported.\n"); |
356 | BUG(); | 451 | BUG(); |
357 | return 0; | 452 | return 0; |
358 | } | 453 | } |
@@ -382,14 +477,678 @@ static void *packet_lookup_frame(struct packet_sock *po, | |||
382 | return h.raw; | 477 | return h.raw; |
383 | } | 478 | } |
384 | 479 | ||
385 | static inline void *packet_current_frame(struct packet_sock *po, | 480 | static void *packet_current_frame(struct packet_sock *po, |
386 | struct packet_ring_buffer *rb, | 481 | struct packet_ring_buffer *rb, |
387 | int status) | 482 | int status) |
388 | { | 483 | { |
389 | return packet_lookup_frame(po, rb, rb->head, status); | 484 | return packet_lookup_frame(po, rb, rb->head, status); |
390 | } | 485 | } |
391 | 486 | ||
392 | static inline void *packet_previous_frame(struct packet_sock *po, | 487 | static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) |
488 | { | ||
489 | del_timer_sync(&pkc->retire_blk_timer); | ||
490 | } | ||
491 | |||
492 | static void prb_shutdown_retire_blk_timer(struct packet_sock *po, | ||
493 | int tx_ring, | ||
494 | struct sk_buff_head *rb_queue) | ||
495 | { | ||
496 | struct tpacket_kbdq_core *pkc; | ||
497 | |||
498 | pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; | ||
499 | |||
500 | spin_lock(&rb_queue->lock); | ||
501 | pkc->delete_blk_timer = 1; | ||
502 | spin_unlock(&rb_queue->lock); | ||
503 | |||
504 | prb_del_retire_blk_timer(pkc); | ||
505 | } | ||
506 | |||
507 | static void prb_init_blk_timer(struct packet_sock *po, | ||
508 | struct tpacket_kbdq_core *pkc, | ||
509 | void (*func) (unsigned long)) | ||
510 | { | ||
511 | init_timer(&pkc->retire_blk_timer); | ||
512 | pkc->retire_blk_timer.data = (long)po; | ||
513 | pkc->retire_blk_timer.function = func; | ||
514 | pkc->retire_blk_timer.expires = jiffies; | ||
515 | } | ||
516 | |||
517 | static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring) | ||
518 | { | ||
519 | struct tpacket_kbdq_core *pkc; | ||
520 | |||
521 | if (tx_ring) | ||
522 | BUG(); | ||
523 | |||
524 | pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc; | ||
525 | prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired); | ||
526 | } | ||
527 | |||
528 | static int prb_calc_retire_blk_tmo(struct packet_sock *po, | ||
529 | int blk_size_in_bytes) | ||
530 | { | ||
531 | struct net_device *dev; | ||
532 | unsigned int mbits = 0, msec = 0, div = 0, tmo = 0; | ||
533 | struct ethtool_cmd ecmd; | ||
534 | int err; | ||
535 | |||
536 | rtnl_lock(); | ||
537 | dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); | ||
538 | if (unlikely(!dev)) { | ||
539 | rtnl_unlock(); | ||
540 | return DEFAULT_PRB_RETIRE_TOV; | ||
541 | } | ||
542 | err = __ethtool_get_settings(dev, &ecmd); | ||
543 | rtnl_unlock(); | ||
544 | if (!err) { | ||
545 | switch (ecmd.speed) { | ||
546 | case SPEED_10000: | ||
547 | msec = 1; | ||
548 | div = 10000/1000; | ||
549 | break; | ||
550 | case SPEED_1000: | ||
551 | msec = 1; | ||
552 | div = 1000/1000; | ||
553 | break; | ||
554 | /* | ||
555 | * If the link speed is so slow you don't really | ||
556 | * need to worry about perf anyways | ||
557 | */ | ||
558 | case SPEED_100: | ||
559 | case SPEED_10: | ||
560 | default: | ||
561 | return DEFAULT_PRB_RETIRE_TOV; | ||
562 | } | ||
563 | } | ||
564 | |||
565 | mbits = (blk_size_in_bytes * 8) / (1024 * 1024); | ||
566 | |||
567 | if (div) | ||
568 | mbits /= div; | ||
569 | |||
570 | tmo = mbits * msec; | ||
571 | |||
572 | if (div) | ||
573 | return tmo+1; | ||
574 | return tmo; | ||
575 | } | ||
576 | |||
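
To make the arithmetic in prb_calc_retire_blk_tmo() concrete, here is a stand-alone sketch of the same formula (not kernel code; the speed-to-divisor mapping is simplified from the SPEED_1000/SPEED_10000 switch above). For a 1 MiB block it yields 9 ms on a 1 Gbit/s link, just above the ~8 ms the block takes to fill, and 1 ms at 10 Gbit/s:

/* Sketch, not from the commit: only the SPEED_1000/SPEED_10000 cases;
 * slower or unknown links fall back to DEFAULT_PRB_RETIRE_TOV in the kernel.
 */
#include <stdio.h>

static unsigned int retire_tmo_msec(unsigned int blk_size_in_bytes,
				    unsigned int speed_mbit)
{
	unsigned int msec = 1, div = speed_mbit / 1000;
	unsigned int mbits = (blk_size_in_bytes * 8) / (1024 * 1024);

	if (div)
		mbits /= div;
	return div ? mbits * msec + 1 : mbits * msec;
}

int main(void)
{
	printf("1 MiB @ 1G : %u ms\n", retire_tmo_msec(1 << 20, 1000));
	printf("1 MiB @ 10G: %u ms\n", retire_tmo_msec(1 << 20, 10000));
	return 0;
}
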
577 | static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, | ||
578 | union tpacket_req_u *req_u) | ||
579 | { | ||
580 | p1->feature_req_word = req_u->req3.tp_feature_req_word; | ||
581 | } | ||
582 | |||
583 | static void init_prb_bdqc(struct packet_sock *po, | ||
584 | struct packet_ring_buffer *rb, | ||
585 | struct pgv *pg_vec, | ||
586 | union tpacket_req_u *req_u, int tx_ring) | ||
587 | { | ||
588 | struct tpacket_kbdq_core *p1 = &rb->prb_bdqc; | ||
589 | struct tpacket_block_desc *pbd; | ||
590 | |||
591 | memset(p1, 0x0, sizeof(*p1)); | ||
592 | |||
593 | p1->knxt_seq_num = 1; | ||
594 | p1->pkbdq = pg_vec; | ||
595 | pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; | ||
596 | p1->pkblk_start = (char *)pg_vec[0].buffer; | ||
597 | p1->kblk_size = req_u->req3.tp_block_size; | ||
598 | p1->knum_blocks = req_u->req3.tp_block_nr; | ||
599 | p1->hdrlen = po->tp_hdrlen; | ||
600 | p1->version = po->tp_version; | ||
601 | p1->last_kactive_blk_num = 0; | ||
602 | po->stats_u.stats3.tp_freeze_q_cnt = 0; | ||
603 | if (req_u->req3.tp_retire_blk_tov) | ||
604 | p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; | ||
605 | else | ||
606 | p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, | ||
607 | req_u->req3.tp_block_size); | ||
608 | p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); | ||
609 | p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; | ||
610 | |||
611 | prb_init_ft_ops(p1, req_u); | ||
612 | prb_setup_retire_blk_timer(po, tx_ring); | ||
613 | prb_open_block(p1, pbd); | ||
614 | } | ||
615 | |||
616 | /* Do NOT update the last_blk_num first. | ||
617 | * Assumes sk_buff_head lock is held. | ||
618 | */ | ||
619 | static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) | ||
620 | { | ||
621 | mod_timer(&pkc->retire_blk_timer, | ||
622 | jiffies + pkc->tov_in_jiffies); | ||
623 | pkc->last_kactive_blk_num = pkc->kactive_blk_num; | ||
624 | } | ||
625 | |||
626 | /* | ||
627 | * Timer logic: | ||
628 | * 1) We refresh the timer only when we open a block. | ||
629 | * By doing this we don't waste cycles refreshing the timer | ||
630 | * on packet-by-packet basis. | ||
631 | * | ||
632 | * With a 1MB block-size, on a 1Gbps line, it will take | ||
633 | * i) ~8 ms to fill a block + ii) memcpy etc. | ||
634 | * In this cut we are not accounting for the memcpy time. | ||
635 | * | ||
636 | * So, if the user sets the 'tmo' to 10ms then the timer | ||
637 | * will never fire while the block is still getting filled | ||
638 | * (which is what we want). However, the user could choose | ||
639 | * to close a block early and that's fine. | ||
640 | * | ||
641 | * But when the timer does fire, we check whether or not to refresh it. | ||
642 | * Since the tmo granularity is in msecs, it is not too expensive | ||
643 | * to refresh the timer, lets say every '8' msecs. | ||
644 | * Either the user can set the 'tmo' or we can derive it based on | ||
645 | * a) line-speed and b) block-size. | ||
646 | * prb_calc_retire_blk_tmo() calculates the tmo. | ||
647 | * | ||
648 | */ | ||
649 | static void prb_retire_rx_blk_timer_expired(unsigned long data) | ||
650 | { | ||
651 | struct packet_sock *po = (struct packet_sock *)data; | ||
652 | struct tpacket_kbdq_core *pkc = &po->rx_ring.prb_bdqc; | ||
653 | unsigned int frozen; | ||
654 | struct tpacket_block_desc *pbd; | ||
655 | |||
656 | spin_lock(&po->sk.sk_receive_queue.lock); | ||
657 | |||
658 | frozen = prb_queue_frozen(pkc); | ||
659 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
660 | |||
661 | if (unlikely(pkc->delete_blk_timer)) | ||
662 | goto out; | ||
663 | |||
664 | /* We only need to plug the race when the block is partially filled. | ||
665 | * tpacket_rcv: | ||
666 | * lock(); increment BLOCK_NUM_PKTS; unlock() | ||
667 | * copy_bits() is in progress ... | ||
668 | * timer fires on other cpu: | ||
669 | * we can't retire the current block because copy_bits | ||
670 | * is in progress. | ||
671 | * | ||
672 | */ | ||
673 | if (BLOCK_NUM_PKTS(pbd)) { | ||
674 | while (atomic_read(&pkc->blk_fill_in_prog)) { | ||
675 | /* Waiting for skb_copy_bits to finish... */ | ||
676 | cpu_relax(); | ||
677 | } | ||
678 | } | ||
679 | |||
680 | if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { | ||
681 | if (!frozen) { | ||
682 | prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); | ||
683 | if (!prb_dispatch_next_block(pkc, po)) | ||
684 | goto refresh_timer; | ||
685 | else | ||
686 | goto out; | ||
687 | } else { | ||
688 | /* Case 1. Queue was frozen because user-space was | ||
689 | * lagging behind. | ||
690 | */ | ||
691 | if (prb_curr_blk_in_use(pkc, pbd)) { | ||
692 | /* | ||
693 | * Ok, user-space is still behind. | ||
694 | * So just refresh the timer. | ||
695 | */ | ||
696 | goto refresh_timer; | ||
697 | } else { | ||
698 | /* Case 2. queue was frozen,user-space caught up, | ||
699 | * now the link went idle && the timer fired. | ||
700 | * We don't have a block to close.So we open this | ||
701 | * block and restart the timer. | ||
702 | * opening a block thaws the queue,restarts timer | ||
703 | * Thawing/timer-refresh is a side effect. | ||
704 | */ | ||
705 | prb_open_block(pkc, pbd); | ||
706 | goto out; | ||
707 | } | ||
708 | } | ||
709 | } | ||
710 | |||
711 | refresh_timer: | ||
712 | _prb_refresh_rx_retire_blk_timer(pkc); | ||
713 | |||
714 | out: | ||
715 | spin_unlock(&po->sk.sk_receive_queue.lock); | ||
716 | } | ||
717 | |||
718 | static void prb_flush_block(struct tpacket_kbdq_core *pkc1, | ||
719 | struct tpacket_block_desc *pbd1, __u32 status) | ||
720 | { | ||
721 | /* Flush everything minus the block header */ | ||
722 | |||
723 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 | ||
724 | u8 *start, *end; | ||
725 | |||
726 | start = (u8 *)pbd1; | ||
727 | |||
728 | /* Skip the block header(we know header WILL fit in 4K) */ | ||
729 | start += PAGE_SIZE; | ||
730 | |||
731 | end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); | ||
732 | for (; start < end; start += PAGE_SIZE) | ||
733 | flush_dcache_page(pgv_to_page(start)); | ||
734 | |||
735 | smp_wmb(); | ||
736 | #endif | ||
737 | |||
738 | /* Now update the block status. */ | ||
739 | |||
740 | BLOCK_STATUS(pbd1) = status; | ||
741 | |||
742 | /* Flush the block header */ | ||
743 | |||
744 | #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 | ||
745 | start = (u8 *)pbd1; | ||
746 | flush_dcache_page(pgv_to_page(start)); | ||
747 | |||
748 | smp_wmb(); | ||
749 | #endif | ||
750 | } | ||
751 | |||
752 | /* | ||
753 | * Side effect: | ||
754 | * | ||
755 | * 1) flush the block | ||
756 | * 2) Increment active_blk_num | ||
757 | * | ||
758 | * Note:We DONT refresh the timer on purpose. | ||
759 | * Because almost always the next block will be opened. | ||
760 | */ | ||
761 | static void prb_close_block(struct tpacket_kbdq_core *pkc1, | ||
762 | struct tpacket_block_desc *pbd1, | ||
763 | struct packet_sock *po, unsigned int stat) | ||
764 | { | ||
765 | __u32 status = TP_STATUS_USER | stat; | ||
766 | |||
767 | struct tpacket3_hdr *last_pkt; | ||
768 | struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; | ||
769 | |||
770 | if (po->stats.tp_drops) | ||
771 | status |= TP_STATUS_LOSING; | ||
772 | |||
773 | last_pkt = (struct tpacket3_hdr *)pkc1->prev; | ||
774 | last_pkt->tp_next_offset = 0; | ||
775 | |||
776 | /* Get the ts of the last pkt */ | ||
777 | if (BLOCK_NUM_PKTS(pbd1)) { | ||
778 | h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; | ||
779 | h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; | ||
780 | } else { | ||
781 | /* Ok, we tmo'd - so get the current time */ | ||
782 | struct timespec ts; | ||
783 | getnstimeofday(&ts); | ||
784 | h1->ts_last_pkt.ts_sec = ts.tv_sec; | ||
785 | h1->ts_last_pkt.ts_nsec = ts.tv_nsec; | ||
786 | } | ||
787 | |||
788 | smp_wmb(); | ||
789 | |||
790 | /* Flush the block */ | ||
791 | prb_flush_block(pkc1, pbd1, status); | ||
792 | |||
793 | pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); | ||
794 | } | ||
795 | |||
796 | static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) | ||
797 | { | ||
798 | pkc->reset_pending_on_curr_blk = 0; | ||
799 | } | ||
800 | |||
801 | /* | ||
802 | * Side effect of opening a block: | ||
803 | * | ||
804 | * 1) prb_queue is thawed. | ||
805 | * 2) retire_blk_timer is refreshed. | ||
806 | * | ||
807 | */ | ||
808 | static void prb_open_block(struct tpacket_kbdq_core *pkc1, | ||
809 | struct tpacket_block_desc *pbd1) | ||
810 | { | ||
811 | struct timespec ts; | ||
812 | struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; | ||
813 | |||
814 | smp_rmb(); | ||
815 | |||
816 | if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) { | ||
817 | |||
818 | /* We could have just memset this but we will lose the | ||
819 | * flexibility of making the priv area sticky | ||
820 | */ | ||
821 | BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; | ||
822 | BLOCK_NUM_PKTS(pbd1) = 0; | ||
823 | BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); | ||
824 | getnstimeofday(&ts); | ||
825 | h1->ts_first_pkt.ts_sec = ts.tv_sec; | ||
826 | h1->ts_first_pkt.ts_nsec = ts.tv_nsec; | ||
827 | pkc1->pkblk_start = (char *)pbd1; | ||
828 | pkc1->nxt_offset = (char *)(pkc1->pkblk_start + | ||
829 | BLK_PLUS_PRIV(pkc1->blk_sizeof_priv)); | ||
830 | BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); | ||
831 | BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; | ||
832 | pbd1->version = pkc1->version; | ||
833 | pkc1->prev = pkc1->nxt_offset; | ||
834 | pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; | ||
835 | prb_thaw_queue(pkc1); | ||
836 | _prb_refresh_rx_retire_blk_timer(pkc1); | ||
837 | |||
838 | smp_wmb(); | ||
839 | |||
840 | return; | ||
841 | } | ||
842 | |||
843 | WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n", | ||
844 | pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num); | ||
845 | dump_stack(); | ||
846 | BUG(); | ||
847 | } | ||
848 | |||
849 | /* | ||
850 | * Queue freeze logic: | ||
851 | * 1) Assume tp_block_nr = 8 blocks. | ||
852 | * 2) At time 't0', user opens Rx ring. | ||
853 | * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 | ||
854 | * 4) user-space is either sleeping or processing block '0'. | ||
855 | * 5) tpacket_rcv is currently filling block '7', since there is no space left, | ||
856 | * it will close block-7,loop around and try to fill block '0'. | ||
857 | * call-flow: | ||
858 | * __packet_lookup_frame_in_block | ||
859 | * prb_retire_current_block() | ||
860 | * prb_dispatch_next_block() | ||
861 | * |->(BLOCK_STATUS == USER) evaluates to true | ||
862 | * 5.1) Since block-0 is currently in-use, we just freeze the queue. | ||
863 | * 6) Now there are two cases: | ||
864 | * 6.1) Link goes idle right after the queue is frozen. | ||
865 | * But remember, the last open_block() refreshed the timer. | ||
866 | * When this timer expires,it will refresh itself so that we can | ||
867 | * re-open block-0 in near future. | ||
868 | * 6.2) Link is busy and keeps on receiving packets. This is a simple | ||
869 | * case and __packet_lookup_frame_in_block will check if block-0 | ||
870 | * is free and can now be re-used. | ||
871 | */ | ||
872 | static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, | ||
873 | struct packet_sock *po) | ||
874 | { | ||
875 | pkc->reset_pending_on_curr_blk = 1; | ||
876 | po->stats_u.stats3.tp_freeze_q_cnt++; | ||
877 | } | ||
878 | |||
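
The queue-freeze walkthrough above relies on user space eventually handing the blocking block back to the kernel. A minimal sketch of that consumer-side release (conventional TPACKET_V3 usage, not part of this patch): writing TP_STATUS_KERNEL into block_status is what makes prb_curr_blk_in_use() return false so the block can be re-opened.

/* Sketch, not from the commit: the user-space side of the freeze/thaw dance. */
#include <linux/if_packet.h>

static void user_release_block(struct tpacket_block_desc *pbd)
{
	/* make sure all reads of the block's packets complete before release */
	__sync_synchronize();
	pbd->hdr.bh1.block_status = TP_STATUS_KERNEL;
}
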
879 | #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) | ||
880 | |||
881 | /* | ||
882 | * If the next block is free then we will dispatch it | ||
883 | * and return a good offset. | ||
884 | * Else, we will freeze the queue. | ||
885 | * So, caller must check the return value. | ||
886 | */ | ||
887 | static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, | ||
888 | struct packet_sock *po) | ||
889 | { | ||
890 | struct tpacket_block_desc *pbd; | ||
891 | |||
892 | smp_rmb(); | ||
893 | |||
894 | /* 1. Get current block num */ | ||
895 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
896 | |||
897 | /* 2. If this block is currently in_use then freeze the queue */ | ||
898 | if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { | ||
899 | prb_freeze_queue(pkc, po); | ||
900 | return NULL; | ||
901 | } | ||
902 | |||
903 | /* | ||
904 | * 3. | ||
905 | * open this block and return the offset where the first packet | ||
906 | * needs to get stored. | ||
907 | */ | ||
908 | prb_open_block(pkc, pbd); | ||
909 | return (void *)pkc->nxt_offset; | ||
910 | } | ||
911 | |||
912 | static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, | ||
913 | struct packet_sock *po, unsigned int status) | ||
914 | { | ||
915 | struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
916 | |||
917 | /* retire/close the current block */ | ||
918 | if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { | ||
919 | /* | ||
920 | * Plug the case where copy_bits() is in progress on | ||
921 | * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't | ||
922 | * have space to copy the pkt in the current block and | ||
923 | * called prb_retire_current_block() | ||
924 | * | ||
925 | * We don't need to worry about the TMO case because | ||
926 | * the timer-handler already handled this case. | ||
927 | */ | ||
928 | if (!(status & TP_STATUS_BLK_TMO)) { | ||
929 | while (atomic_read(&pkc->blk_fill_in_prog)) { | ||
930 | /* Waiting for skb_copy_bits to finish... */ | ||
931 | cpu_relax(); | ||
932 | } | ||
933 | } | ||
934 | prb_close_block(pkc, pbd, po, status); | ||
935 | return; | ||
936 | } | ||
937 | |||
938 | WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd); | ||
939 | dump_stack(); | ||
940 | BUG(); | ||
941 | } | ||
942 | |||
943 | static int prb_curr_blk_in_use(struct tpacket_kbdq_core *pkc, | ||
944 | struct tpacket_block_desc *pbd) | ||
945 | { | ||
946 | return TP_STATUS_USER & BLOCK_STATUS(pbd); | ||
947 | } | ||
948 | |||
949 | static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) | ||
950 | { | ||
951 | return pkc->reset_pending_on_curr_blk; | ||
952 | } | ||
953 | |||
954 | static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) | ||
955 | { | ||
956 | struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); | ||
957 | atomic_dec(&pkc->blk_fill_in_prog); | ||
958 | } | ||
959 | |||
960 | static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, | ||
961 | struct tpacket3_hdr *ppd) | ||
962 | { | ||
963 | ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb); | ||
964 | } | ||
965 | |||
966 | static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, | ||
967 | struct tpacket3_hdr *ppd) | ||
968 | { | ||
969 | ppd->hv1.tp_rxhash = 0; | ||
970 | } | ||
971 | |||
972 | static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, | ||
973 | struct tpacket3_hdr *ppd) | ||
974 | { | ||
975 | if (vlan_tx_tag_present(pkc->skb)) { | ||
976 | ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); | ||
977 | ppd->tp_status = TP_STATUS_VLAN_VALID; | ||
978 | } else { | ||
979 | ppd->hv1.tp_vlan_tci = ppd->tp_status = 0; | ||
980 | } | ||
981 | } | ||
982 | |||
983 | static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, | ||
984 | struct tpacket3_hdr *ppd) | ||
985 | { | ||
986 | prb_fill_vlan_info(pkc, ppd); | ||
987 | |||
988 | if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) | ||
989 | prb_fill_rxhash(pkc, ppd); | ||
990 | else | ||
991 | prb_clear_rxhash(pkc, ppd); | ||
992 | } | ||
993 | |||
994 | static void prb_fill_curr_block(char *curr, | ||
995 | struct tpacket_kbdq_core *pkc, | ||
996 | struct tpacket_block_desc *pbd, | ||
997 | unsigned int len) | ||
998 | { | ||
999 | struct tpacket3_hdr *ppd; | ||
1000 | |||
1001 | ppd = (struct tpacket3_hdr *)curr; | ||
1002 | ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); | ||
1003 | pkc->prev = curr; | ||
1004 | pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); | ||
1005 | BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); | ||
1006 | BLOCK_NUM_PKTS(pbd) += 1; | ||
1007 | atomic_inc(&pkc->blk_fill_in_prog); | ||
1008 | prb_run_all_ft_ops(pkc, ppd); | ||
1009 | } | ||
1010 | |||
1011 | /* Assumes caller has the sk->rx_queue.lock */ | ||
1012 | static void *__packet_lookup_frame_in_block(struct packet_sock *po, | ||
1013 | struct sk_buff *skb, | ||
1014 | int status, | ||
1015 | unsigned int len | ||
1016 | ) | ||
1017 | { | ||
1018 | struct tpacket_kbdq_core *pkc; | ||
1019 | struct tpacket_block_desc *pbd; | ||
1020 | char *curr, *end; | ||
1021 | |||
1022 | pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring)); | ||
1023 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
1024 | |||
1025 | /* Queue is frozen when user space is lagging behind */ | ||
1026 | if (prb_queue_frozen(pkc)) { | ||
1027 | /* | ||
1028 | * Check if that last block which caused the queue to freeze, | ||
1029 | * is still in_use by user-space. | ||
1030 | */ | ||
1031 | if (prb_curr_blk_in_use(pkc, pbd)) { | ||
1032 | /* Can't record this packet */ | ||
1033 | return NULL; | ||
1034 | } else { | ||
1035 | /* | ||
1036 | * Ok, the block was released by user-space. | ||
1037 | * Now let's open that block. | ||
1038 | * opening a block also thaws the queue. | ||
1039 | * Thawing is a side effect. | ||
1040 | */ | ||
1041 | prb_open_block(pkc, pbd); | ||
1042 | } | ||
1043 | } | ||
1044 | |||
1045 | smp_mb(); | ||
1046 | curr = pkc->nxt_offset; | ||
1047 | pkc->skb = skb; | ||
1048 | end = (char *) ((char *)pbd + pkc->kblk_size); | ||
1049 | |||
1050 | /* first try the current block */ | ||
1051 | if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { | ||
1052 | prb_fill_curr_block(curr, pkc, pbd, len); | ||
1053 | return (void *)curr; | ||
1054 | } | ||
1055 | |||
1056 | /* Ok, close the current block */ | ||
1057 | prb_retire_current_block(pkc, po, 0); | ||
1058 | |||
1059 | /* Now, try to dispatch the next block */ | ||
1060 | curr = (char *)prb_dispatch_next_block(pkc, po); | ||
1061 | if (curr) { | ||
1062 | pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); | ||
1063 | prb_fill_curr_block(curr, pkc, pbd, len); | ||
1064 | return (void *)curr; | ||
1065 | } | ||
1066 | |||
1067 | /* | ||
1068 | * No free blocks are available.user_space hasn't caught up yet. | ||
1069 | * Queue was just frozen and now this packet will get dropped. | ||
1070 | */ | ||
1071 | return NULL; | ||
1072 | } | ||
1073 | |||
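
For reference, this is how a consumer walks the packets that prb_fill_curr_block() chained inside one block (a user-space sketch, not part of this patch): the first tpacket3_hdr sits at offset_to_first_pkt from the block start, and each header points at the next via tp_next_offset, which prb_close_block() zeroes for the last packet.

/* Sketch, not from the commit: iterate one TP_STATUS_USER block. */
#include <stdio.h>
#include <linux/if_packet.h>

static void user_walk_block(struct tpacket_block_desc *pbd)
{
	struct tpacket3_hdr *ppd;
	unsigned int i, num = pbd->hdr.bh1.num_pkts;

	ppd = (struct tpacket3_hdr *)((char *)pbd + pbd->hdr.bh1.offset_to_first_pkt);
	for (i = 0; i < num; i++) {
		printf("pkt %u: snaplen %u, mac off %u\n",
		       i, ppd->tp_snaplen, (unsigned int)ppd->tp_mac);
		ppd = (struct tpacket3_hdr *)((char *)ppd + ppd->tp_next_offset);
	}
}
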
1074 | static void *packet_current_rx_frame(struct packet_sock *po, | ||
1075 | struct sk_buff *skb, | ||
1076 | int status, unsigned int len) | ||
1077 | { | ||
1078 | char *curr = NULL; | ||
1079 | switch (po->tp_version) { | ||
1080 | case TPACKET_V1: | ||
1081 | case TPACKET_V2: | ||
1082 | curr = packet_lookup_frame(po, &po->rx_ring, | ||
1083 | po->rx_ring.head, status); | ||
1084 | return curr; | ||
1085 | case TPACKET_V3: | ||
1086 | return __packet_lookup_frame_in_block(po, skb, status, len); | ||
1087 | default: | ||
1088 | WARN(1, "TPACKET version not supported\n"); | ||
1089 | BUG(); | ||
1090 | return 0; | ||
1091 | } | ||
1092 | } | ||
1093 | |||
1094 | static void *prb_lookup_block(struct packet_sock *po, | ||
1095 | struct packet_ring_buffer *rb, | ||
1096 | unsigned int previous, | ||
1097 | int status) | ||
1098 | { | ||
1099 | struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); | ||
1100 | struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous); | ||
1101 | |||
1102 | if (status != BLOCK_STATUS(pbd)) | ||
1103 | return NULL; | ||
1104 | return pbd; | ||
1105 | } | ||
1106 | |||
1107 | static int prb_previous_blk_num(struct packet_ring_buffer *rb) | ||
1108 | { | ||
1109 | unsigned int prev; | ||
1110 | if (rb->prb_bdqc.kactive_blk_num) | ||
1111 | prev = rb->prb_bdqc.kactive_blk_num-1; | ||
1112 | else | ||
1113 | prev = rb->prb_bdqc.knum_blocks-1; | ||
1114 | return prev; | ||
1115 | } | ||
1116 | |||
1117 | /* Assumes caller has held the rx_queue.lock */ | ||
1118 | static void *__prb_previous_block(struct packet_sock *po, | ||
1119 | struct packet_ring_buffer *rb, | ||
1120 | int status) | ||
1121 | { | ||
1122 | unsigned int previous = prb_previous_blk_num(rb); | ||
1123 | return prb_lookup_block(po, rb, previous, status); | ||
1124 | } | ||
1125 | |||
1126 | static void *packet_previous_rx_frame(struct packet_sock *po, | ||
1127 | struct packet_ring_buffer *rb, | ||
1128 | int status) | ||
1129 | { | ||
1130 | if (po->tp_version <= TPACKET_V2) | ||
1131 | return packet_previous_frame(po, rb, status); | ||
1132 | |||
1133 | return __prb_previous_block(po, rb, status); | ||
1134 | } | ||
1135 | |||
1136 | static void packet_increment_rx_head(struct packet_sock *po, | ||
1137 | struct packet_ring_buffer *rb) | ||
1138 | { | ||
1139 | switch (po->tp_version) { | ||
1140 | case TPACKET_V1: | ||
1141 | case TPACKET_V2: | ||
1142 | return packet_increment_head(rb); | ||
1143 | case TPACKET_V3: | ||
1144 | default: | ||
1145 | WARN(1, "TPACKET version not supported.\n"); | ||
1146 | BUG(); | ||
1147 | return; | ||
1148 | } | ||
1149 | } | ||
1150 | |||
1151 | static void *packet_previous_frame(struct packet_sock *po, | ||
393 | struct packet_ring_buffer *rb, | 1152 | struct packet_ring_buffer *rb, |
394 | int status) | 1153 | int status) |
395 | { | 1154 | { |
@@ -397,7 +1156,7 @@ static inline void *packet_previous_frame(struct packet_sock *po, | |||
397 | return packet_lookup_frame(po, rb, previous, status); | 1156 | return packet_lookup_frame(po, rb, previous, status); |
398 | } | 1157 | } |
399 | 1158 | ||
400 | static inline void packet_increment_head(struct packet_ring_buffer *buff) | 1159 | static void packet_increment_head(struct packet_ring_buffer *buff) |
401 | { | 1160 | { |
402 | buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; | 1161 | buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; |
403 | } | 1162 | } |
@@ -454,43 +1213,6 @@ static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *sk | |||
454 | return f->arr[cpu % num]; | 1213 | return f->arr[cpu % num]; |
455 | } | 1214 | } |
456 | 1215 | ||
457 | static struct sk_buff *fanout_check_defrag(struct sk_buff *skb) | ||
458 | { | ||
459 | #ifdef CONFIG_INET | ||
460 | const struct iphdr *iph; | ||
461 | u32 len; | ||
462 | |||
463 | if (skb->protocol != htons(ETH_P_IP)) | ||
464 | return skb; | ||
465 | |||
466 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
467 | return skb; | ||
468 | |||
469 | iph = ip_hdr(skb); | ||
470 | if (iph->ihl < 5 || iph->version != 4) | ||
471 | return skb; | ||
472 | if (!pskb_may_pull(skb, iph->ihl*4)) | ||
473 | return skb; | ||
474 | iph = ip_hdr(skb); | ||
475 | len = ntohs(iph->tot_len); | ||
476 | if (skb->len < len || len < (iph->ihl * 4)) | ||
477 | return skb; | ||
478 | |||
479 | if (ip_is_fragment(ip_hdr(skb))) { | ||
480 | skb = skb_share_check(skb, GFP_ATOMIC); | ||
481 | if (skb) { | ||
482 | if (pskb_trim_rcsum(skb, len)) | ||
483 | return skb; | ||
484 | memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); | ||
485 | if (ip_defrag(skb, IP_DEFRAG_AF_PACKET)) | ||
486 | return NULL; | ||
487 | skb->rxhash = 0; | ||
488 | } | ||
489 | } | ||
490 | #endif | ||
491 | return skb; | ||
492 | } | ||
493 | |||
494 | static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | 1216 | static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, |
495 | struct packet_type *pt, struct net_device *orig_dev) | 1217 | struct packet_type *pt, struct net_device *orig_dev) |
496 | { | 1218 | { |
@@ -509,7 +1231,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, | |||
509 | case PACKET_FANOUT_HASH: | 1231 | case PACKET_FANOUT_HASH: |
510 | default: | 1232 | default: |
511 | if (f->defrag) { | 1233 | if (f->defrag) { |
512 | skb = fanout_check_defrag(skb); | 1234 | skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET); |
513 | if (!skb) | 1235 | if (!skb) |
514 | return 0; | 1236 | return 0; |
515 | } | 1237 | } |
@@ -836,7 +1558,7 @@ out_free: | |||
836 | return err; | 1558 | return err; |
837 | } | 1559 | } |
838 | 1560 | ||
839 | static inline unsigned int run_filter(const struct sk_buff *skb, | 1561 | static unsigned int run_filter(const struct sk_buff *skb, |
840 | const struct sock *sk, | 1562 | const struct sock *sk, |
841 | unsigned int res) | 1563 | unsigned int res) |
842 | { | 1564 | { |
@@ -961,7 +1683,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, | |||
961 | return 0; | 1683 | return 0; |
962 | 1684 | ||
963 | drop_n_acct: | 1685 | drop_n_acct: |
964 | po->stats.tp_drops = atomic_inc_return(&sk->sk_drops); | 1686 | spin_lock(&sk->sk_receive_queue.lock); |
1687 | po->stats.tp_drops++; | ||
1688 | atomic_inc(&sk->sk_drops); | ||
1689 | spin_unlock(&sk->sk_receive_queue.lock); | ||
965 | 1690 | ||
966 | drop_n_restore: | 1691 | drop_n_restore: |
967 | if (skb_head != skb->data && skb_shared(skb)) { | 1692 | if (skb_head != skb->data && skb_shared(skb)) { |
@@ -982,12 +1707,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
982 | union { | 1707 | union { |
983 | struct tpacket_hdr *h1; | 1708 | struct tpacket_hdr *h1; |
984 | struct tpacket2_hdr *h2; | 1709 | struct tpacket2_hdr *h2; |
1710 | struct tpacket3_hdr *h3; | ||
985 | void *raw; | 1711 | void *raw; |
986 | } h; | 1712 | } h; |
987 | u8 *skb_head = skb->data; | 1713 | u8 *skb_head = skb->data; |
988 | int skb_len = skb->len; | 1714 | int skb_len = skb->len; |
989 | unsigned int snaplen, res; | 1715 | unsigned int snaplen, res; |
990 | unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; | 1716 | unsigned long status = TP_STATUS_USER; |
991 | unsigned short macoff, netoff, hdrlen; | 1717 | unsigned short macoff, netoff, hdrlen; |
992 | struct sk_buff *copy_skb = NULL; | 1718 | struct sk_buff *copy_skb = NULL; |
993 | struct timeval tv; | 1719 | struct timeval tv; |
@@ -1033,37 +1759,46 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1033 | po->tp_reserve; | 1759 | po->tp_reserve; |
1034 | macoff = netoff - maclen; | 1760 | macoff = netoff - maclen; |
1035 | } | 1761 | } |
1036 | 1762 | if (po->tp_version <= TPACKET_V2) { | |
1037 | if (macoff + snaplen > po->rx_ring.frame_size) { | 1763 | if (macoff + snaplen > po->rx_ring.frame_size) { |
1038 | if (po->copy_thresh && | 1764 | if (po->copy_thresh && |
1039 | atomic_read(&sk->sk_rmem_alloc) + skb->truesize < | 1765 | atomic_read(&sk->sk_rmem_alloc) + skb->truesize |
1040 | (unsigned)sk->sk_rcvbuf) { | 1766 | < (unsigned)sk->sk_rcvbuf) { |
1041 | if (skb_shared(skb)) { | 1767 | if (skb_shared(skb)) { |
1042 | copy_skb = skb_clone(skb, GFP_ATOMIC); | 1768 | copy_skb = skb_clone(skb, GFP_ATOMIC); |
1043 | } else { | 1769 | } else { |
1044 | copy_skb = skb_get(skb); | 1770 | copy_skb = skb_get(skb); |
1045 | skb_head = skb->data; | 1771 | skb_head = skb->data; |
1772 | } | ||
1773 | if (copy_skb) | ||
1774 | skb_set_owner_r(copy_skb, sk); | ||
1046 | } | 1775 | } |
1047 | if (copy_skb) | 1776 | snaplen = po->rx_ring.frame_size - macoff; |
1048 | skb_set_owner_r(copy_skb, sk); | 1777 | if ((int)snaplen < 0) |
1778 | snaplen = 0; | ||
1049 | } | 1779 | } |
1050 | snaplen = po->rx_ring.frame_size - macoff; | ||
1051 | if ((int)snaplen < 0) | ||
1052 | snaplen = 0; | ||
1053 | } | 1780 | } |
1054 | |||
1055 | spin_lock(&sk->sk_receive_queue.lock); | 1781 | spin_lock(&sk->sk_receive_queue.lock); |
1056 | h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL); | 1782 | h.raw = packet_current_rx_frame(po, skb, |
1783 | TP_STATUS_KERNEL, (macoff+snaplen)); | ||
1057 | if (!h.raw) | 1784 | if (!h.raw) |
1058 | goto ring_is_full; | 1785 | goto ring_is_full; |
1059 | packet_increment_head(&po->rx_ring); | 1786 | if (po->tp_version <= TPACKET_V2) { |
1787 | packet_increment_rx_head(po, &po->rx_ring); | ||
1788 | /* | ||
1789 | * LOSING will be reported till you read the stats, | ||
1790 | * because it's COR - Clear On Read. | ||
1791 | * Anyways, moving it for V1/V2 only as V3 doesn't need this | ||
1792 | * at packet level. | ||
1793 | */ | ||
1794 | if (po->stats.tp_drops) | ||
1795 | status |= TP_STATUS_LOSING; | ||
1796 | } | ||
1060 | po->stats.tp_packets++; | 1797 | po->stats.tp_packets++; |
1061 | if (copy_skb) { | 1798 | if (copy_skb) { |
1062 | status |= TP_STATUS_COPY; | 1799 | status |= TP_STATUS_COPY; |
1063 | __skb_queue_tail(&sk->sk_receive_queue, copy_skb); | 1800 | __skb_queue_tail(&sk->sk_receive_queue, copy_skb); |
1064 | } | 1801 | } |
1065 | if (!po->stats.tp_drops) | ||
1066 | status &= ~TP_STATUS_LOSING; | ||
1067 | spin_unlock(&sk->sk_receive_queue.lock); | 1802 | spin_unlock(&sk->sk_receive_queue.lock); |
1068 | 1803 | ||
1069 | skb_copy_bits(skb, 0, h.raw + macoff, snaplen); | 1804 | skb_copy_bits(skb, 0, h.raw + macoff, snaplen); |
@@ -1114,6 +1849,29 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1114 | h.h2->tp_padding = 0; | 1849 | h.h2->tp_padding = 0; |
1115 | hdrlen = sizeof(*h.h2); | 1850 | hdrlen = sizeof(*h.h2); |
1116 | break; | 1851 | break; |
1852 | case TPACKET_V3: | ||
1853 | /* tp_nxt_offset,vlan are already populated above. | ||
1854 | * So DONT clear those fields here | ||
1855 | */ | ||
1856 | h.h3->tp_status |= status; | ||
1857 | h.h3->tp_len = skb->len; | ||
1858 | h.h3->tp_snaplen = snaplen; | ||
1859 | h.h3->tp_mac = macoff; | ||
1860 | h.h3->tp_net = netoff; | ||
1861 | if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE) | ||
1862 | && shhwtstamps->syststamp.tv64) | ||
1863 | ts = ktime_to_timespec(shhwtstamps->syststamp); | ||
1864 | else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE) | ||
1865 | && shhwtstamps->hwtstamp.tv64) | ||
1866 | ts = ktime_to_timespec(shhwtstamps->hwtstamp); | ||
1867 | else if (skb->tstamp.tv64) | ||
1868 | ts = ktime_to_timespec(skb->tstamp); | ||
1869 | else | ||
1870 | getnstimeofday(&ts); | ||
1871 | h.h3->tp_sec = ts.tv_sec; | ||
1872 | h.h3->tp_nsec = ts.tv_nsec; | ||
1873 | hdrlen = sizeof(*h.h3); | ||
1874 | break; | ||
1117 | default: | 1875 | default: |
1118 | BUG(); | 1876 | BUG(); |
1119 | } | 1877 | } |
@@ -1134,13 +1892,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, | |||
1134 | { | 1892 | { |
1135 | u8 *start, *end; | 1893 | u8 *start, *end; |
1136 | 1894 | ||
1137 | end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen); | 1895 | if (po->tp_version <= TPACKET_V2) { |
1138 | for (start = h.raw; start < end; start += PAGE_SIZE) | 1896 | end = (u8 *)PAGE_ALIGN((unsigned long)h.raw |
1139 | flush_dcache_page(pgv_to_page(start)); | 1897 | + macoff + snaplen); |
1898 | for (start = h.raw; start < end; start += PAGE_SIZE) | ||
1899 | flush_dcache_page(pgv_to_page(start)); | ||
1900 | } | ||
1140 | smp_wmb(); | 1901 | smp_wmb(); |
1141 | } | 1902 | } |
1142 | #endif | 1903 | #endif |
1143 | __packet_set_status(po, h.raw, status); | 1904 | if (po->tp_version <= TPACKET_V2) |
1905 | __packet_set_status(po, h.raw, status); | ||
1906 | else | ||
1907 | prb_clear_blk_fill_status(&po->rx_ring); | ||
1144 | 1908 | ||
1145 | sk->sk_data_ready(sk, 0); | 1909 | sk->sk_data_ready(sk, 0); |
1146 | 1910 | ||
@@ -1167,8 +1931,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb) | |||
1167 | struct packet_sock *po = pkt_sk(skb->sk); | 1931 | struct packet_sock *po = pkt_sk(skb->sk); |
1168 | void *ph; | 1932 | void *ph; |
1169 | 1933 | ||
1170 | BUG_ON(skb == NULL); | ||
1171 | |||
1172 | if (likely(po->tx_ring.pg_vec)) { | 1934 | if (likely(po->tx_ring.pg_vec)) { |
1173 | ph = skb_shinfo(skb)->destructor_arg; | 1935 | ph = skb_shinfo(skb)->destructor_arg; |
1174 | BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); | 1936 | BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING); |
@@ -1405,10 +2167,10 @@ out: | |||
1405 | return err; | 2167 | return err; |
1406 | } | 2168 | } |
1407 | 2169 | ||
1408 | static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, | 2170 | static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, |
1409 | size_t reserve, size_t len, | 2171 | size_t reserve, size_t len, |
1410 | size_t linear, int noblock, | 2172 | size_t linear, int noblock, |
1411 | int *err) | 2173 | int *err) |
1412 | { | 2174 | { |
1413 | struct sk_buff *skb; | 2175 | struct sk_buff *skb; |
1414 | 2176 | ||
@@ -1631,7 +2393,7 @@ static int packet_release(struct socket *sock) | |||
1631 | struct sock *sk = sock->sk; | 2393 | struct sock *sk = sock->sk; |
1632 | struct packet_sock *po; | 2394 | struct packet_sock *po; |
1633 | struct net *net; | 2395 | struct net *net; |
1634 | struct tpacket_req req; | 2396 | union tpacket_req_u req_u; |
1635 | 2397 | ||
1636 | if (!sk) | 2398 | if (!sk) |
1637 | return 0; | 2399 | return 0; |
@@ -1654,13 +2416,13 @@ static int packet_release(struct socket *sock) | |||
1654 | 2416 | ||
1655 | packet_flush_mclist(sk); | 2417 | packet_flush_mclist(sk); |
1656 | 2418 | ||
1657 | memset(&req, 0, sizeof(req)); | 2419 | memset(&req_u, 0, sizeof(req_u)); |
1658 | 2420 | ||
1659 | if (po->rx_ring.pg_vec) | 2421 | if (po->rx_ring.pg_vec) |
1660 | packet_set_ring(sk, &req, 1, 0); | 2422 | packet_set_ring(sk, &req_u, 1, 0); |
1661 | 2423 | ||
1662 | if (po->tx_ring.pg_vec) | 2424 | if (po->tx_ring.pg_vec) |
1663 | packet_set_ring(sk, &req, 1, 1); | 2425 | packet_set_ring(sk, &req_u, 1, 1); |
1664 | 2426 | ||
1665 | fanout_release(sk); | 2427 | fanout_release(sk); |
1666 | 2428 | ||
@@ -2280,15 +3042,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
2280 | case PACKET_RX_RING: | 3042 | case PACKET_RX_RING: |
2281 | case PACKET_TX_RING: | 3043 | case PACKET_TX_RING: |
2282 | { | 3044 | { |
2283 | struct tpacket_req req; | 3045 | union tpacket_req_u req_u; |
3046 | int len; | ||
2284 | 3047 | ||
2285 | if (optlen < sizeof(req)) | 3048 | switch (po->tp_version) { |
3049 | case TPACKET_V1: | ||
3050 | case TPACKET_V2: | ||
3051 | len = sizeof(req_u.req); | ||
3052 | break; | ||
3053 | case TPACKET_V3: | ||
3054 | default: | ||
3055 | len = sizeof(req_u.req3); | ||
3056 | break; | ||
3057 | } | ||
3058 | if (optlen < len) | ||
2286 | return -EINVAL; | 3059 | return -EINVAL; |
2287 | if (pkt_sk(sk)->has_vnet_hdr) | 3060 | if (pkt_sk(sk)->has_vnet_hdr) |
2288 | return -EINVAL; | 3061 | return -EINVAL; |
2289 | if (copy_from_user(&req, optval, sizeof(req))) | 3062 | if (copy_from_user(&req_u.req, optval, len)) |
2290 | return -EFAULT; | 3063 | return -EFAULT; |
2291 | return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING); | 3064 | return packet_set_ring(sk, &req_u, 0, |
3065 | optname == PACKET_TX_RING); | ||
2292 | } | 3066 | } |
2293 | case PACKET_COPY_THRESH: | 3067 | case PACKET_COPY_THRESH: |
2294 | { | 3068 | { |
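
The PACKET_RX_RING path above is reached from user space roughly as follows; a minimal setup sketch (not part of this patch, error handling trimmed, sizes are example values, and the socket needs CAP_NET_RAW):

/* Sketch, not from the commit: create a TPACKET_V3 RX ring and map it. */
#include <string.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static void *setup_v3_rx_ring(int *out_fd)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int ver = TPACKET_V3;
	struct tpacket_req3 req;
	void *map;

	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 20;	/* 1 MiB blocks, multiple of PAGE_SIZE */
	req.tp_block_nr = 64;
	req.tp_frame_size = 2048;	/* still validated by packet_set_ring() */
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;
	req.tp_retire_blk_tov = 60;	/* ms; 0 falls back to prb_calc_retire_blk_tmo() */
	req.tp_sizeof_priv = 0;
	req.tp_feature_req_word = 0;	/* or TP_FT_REQ_FILL_RXHASH */

	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));

	map = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		   PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	*out_fd = fd;
	return map;
}
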
@@ -2315,6 +3089,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv | |||
2315 | switch (val) { | 3089 | switch (val) { |
2316 | case TPACKET_V1: | 3090 | case TPACKET_V1: |
2317 | case TPACKET_V2: | 3091 | case TPACKET_V2: |
3092 | case TPACKET_V3: | ||
2318 | po->tp_version = val; | 3093 | po->tp_version = val; |
2319 | return 0; | 3094 | return 0; |
2320 | default: | 3095 | default: |
@@ -2424,6 +3199,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
2424 | struct packet_sock *po = pkt_sk(sk); | 3199 | struct packet_sock *po = pkt_sk(sk); |
2425 | void *data; | 3200 | void *data; |
2426 | struct tpacket_stats st; | 3201 | struct tpacket_stats st; |
3202 | union tpacket_stats_u st_u; | ||
2427 | 3203 | ||
2428 | if (level != SOL_PACKET) | 3204 | if (level != SOL_PACKET) |
2429 | return -ENOPROTOOPT; | 3205 | return -ENOPROTOOPT; |
@@ -2436,15 +3212,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
2436 | 3212 | ||
2437 | switch (optname) { | 3213 | switch (optname) { |
2438 | case PACKET_STATISTICS: | 3214 | case PACKET_STATISTICS: |
2439 | if (len > sizeof(struct tpacket_stats)) | 3215 | if (po->tp_version == TPACKET_V3) { |
2440 | len = sizeof(struct tpacket_stats); | 3216 | len = sizeof(struct tpacket_stats_v3); |
3217 | } else { | ||
3218 | if (len > sizeof(struct tpacket_stats)) | ||
3219 | len = sizeof(struct tpacket_stats); | ||
3220 | } | ||
2441 | spin_lock_bh(&sk->sk_receive_queue.lock); | 3221 | spin_lock_bh(&sk->sk_receive_queue.lock); |
2442 | st = po->stats; | 3222 | if (po->tp_version == TPACKET_V3) { |
3223 | memcpy(&st_u.stats3, &po->stats, | ||
3224 | sizeof(struct tpacket_stats)); | ||
3225 | st_u.stats3.tp_freeze_q_cnt = | ||
3226 | po->stats_u.stats3.tp_freeze_q_cnt; | ||
3227 | st_u.stats3.tp_packets += po->stats.tp_drops; | ||
3228 | data = &st_u.stats3; | ||
3229 | } else { | ||
3230 | st = po->stats; | ||
3231 | st.tp_packets += st.tp_drops; | ||
3232 | data = &st; | ||
3233 | } | ||
2443 | memset(&po->stats, 0, sizeof(st)); | 3234 | memset(&po->stats, 0, sizeof(st)); |
2444 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 3235 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
2445 | st.tp_packets += st.tp_drops; | ||
2446 | |||
2447 | data = &st; | ||
2448 | break; | 3236 | break; |
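
A short user-space sketch of reading these V3 statistics (not part of this patch); note the counters are clear-on-read, as the memset() above and the comment in tpacket_rcv() point out:

/* Sketch, not from the commit: fetch tp_packets/tp_drops/tp_freeze_q_cnt. */
#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static void print_v3_stats(int fd)
{
	struct tpacket_stats_v3 st;
	socklen_t len = sizeof(st);

	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
		printf("packets %u, drops %u, queue freezes %u\n",
		       st.tp_packets, st.tp_drops, st.tp_freeze_q_cnt);
}
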
2449 | case PACKET_AUXDATA: | 3237 | case PACKET_AUXDATA: |
2450 | if (len > sizeof(int)) | 3238 | if (len > sizeof(int)) |
@@ -2485,6 +3273,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, | |||
2485 | case TPACKET_V2: | 3273 | case TPACKET_V2: |
2486 | val = sizeof(struct tpacket2_hdr); | 3274 | val = sizeof(struct tpacket2_hdr); |
2487 | break; | 3275 | break; |
3276 | case TPACKET_V3: | ||
3277 | val = sizeof(struct tpacket3_hdr); | ||
3278 | break; | ||
2488 | default: | 3279 | default: |
2489 | return -EINVAL; | 3280 | return -EINVAL; |
2490 | } | 3281 | } |
@@ -2641,7 +3432,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock, | |||
2641 | 3432 | ||
2642 | spin_lock_bh(&sk->sk_receive_queue.lock); | 3433 | spin_lock_bh(&sk->sk_receive_queue.lock); |
2643 | if (po->rx_ring.pg_vec) { | 3434 | if (po->rx_ring.pg_vec) { |
2644 | if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) | 3435 | if (!packet_previous_rx_frame(po, &po->rx_ring, |
3436 | TP_STATUS_KERNEL)) | ||
2645 | mask |= POLLIN | POLLRDNORM; | 3437 | mask |= POLLIN | POLLRDNORM; |
2646 | } | 3438 | } |
2647 | spin_unlock_bh(&sk->sk_receive_queue.lock); | 3439 | spin_unlock_bh(&sk->sk_receive_queue.lock); |
@@ -2702,7 +3494,7 @@ static void free_pg_vec(struct pgv *pg_vec, unsigned int order, | |||
2702 | kfree(pg_vec); | 3494 | kfree(pg_vec); |
2703 | } | 3495 | } |
2704 | 3496 | ||
2705 | static inline char *alloc_one_pg_vec_page(unsigned long order) | 3497 | static char *alloc_one_pg_vec_page(unsigned long order) |
2706 | { | 3498 | { |
2707 | char *buffer = NULL; | 3499 | char *buffer = NULL; |
2708 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | | 3500 | gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | |
@@ -2760,7 +3552,7 @@ out_free_pgvec: | |||
2760 | goto out; | 3552 | goto out; |
2761 | } | 3553 | } |
2762 | 3554 | ||
2763 | static int packet_set_ring(struct sock *sk, struct tpacket_req *req, | 3555 | static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, |
2764 | int closing, int tx_ring) | 3556 | int closing, int tx_ring) |
2765 | { | 3557 | { |
2766 | struct pgv *pg_vec = NULL; | 3558 | struct pgv *pg_vec = NULL; |
@@ -2769,7 +3561,15 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, | |||
2769 | struct packet_ring_buffer *rb; | 3561 | struct packet_ring_buffer *rb; |
2770 | struct sk_buff_head *rb_queue; | 3562 | struct sk_buff_head *rb_queue; |
2771 | __be16 num; | 3563 | __be16 num; |
2772 | int err; | 3564 | int err = -EINVAL; |
3565 | /* Added to avoid minimal code churn */ | ||
3566 | struct tpacket_req *req = &req_u->req; | ||
3567 | |||
3568 | /* Opening a Tx-ring is NOT supported in TPACKET_V3 */ | ||
3569 | if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) { | ||
3570 | WARN(1, "Tx-ring is not supported.\n"); | ||
3571 | goto out; | ||
3572 | } | ||
2773 | 3573 | ||
2774 | rb = tx_ring ? &po->tx_ring : &po->rx_ring; | 3574 | rb = tx_ring ? &po->tx_ring : &po->rx_ring; |
2775 | rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; | 3575 | rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; |
@@ -2795,6 +3595,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, | |||
2795 | case TPACKET_V2: | 3595 | case TPACKET_V2: |
2796 | po->tp_hdrlen = TPACKET2_HDRLEN; | 3596 | po->tp_hdrlen = TPACKET2_HDRLEN; |
2797 | break; | 3597 | break; |
3598 | case TPACKET_V3: | ||
3599 | po->tp_hdrlen = TPACKET3_HDRLEN; | ||
3600 | break; | ||
2798 | } | 3601 | } |
2799 | 3602 | ||
2800 | err = -EINVAL; | 3603 | err = -EINVAL; |
@@ -2820,6 +3623,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, | |||
2820 | pg_vec = alloc_pg_vec(req, order); | 3623 | pg_vec = alloc_pg_vec(req, order); |
2821 | if (unlikely(!pg_vec)) | 3624 | if (unlikely(!pg_vec)) |
2822 | goto out; | 3625 | goto out; |
3626 | switch (po->tp_version) { | ||
3627 | case TPACKET_V3: | ||
3628 | /* Transmit path is not supported. We checked | ||
3629 | * it above but just being paranoid | ||
3630 | */ | ||
3631 | if (!tx_ring) | ||
3632 | init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring); | ||
3633 | break; | ||
3634 | default: | ||
3635 | break; | ||
3636 | } | ||
2823 | } | 3637 | } |
2824 | /* Done */ | 3638 | /* Done */ |
2825 | else { | 3639 | else { |
@@ -2872,7 +3686,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, | |||
2872 | register_prot_hook(sk); | 3686 | register_prot_hook(sk); |
2873 | } | 3687 | } |
2874 | spin_unlock(&po->bind_lock); | 3688 | spin_unlock(&po->bind_lock); |
2875 | 3689 | if (closing && (po->tp_version > TPACKET_V2)) { | |
3690 | /* Because we don't support block-based V3 on tx-ring */ | ||
3691 | if (!tx_ring) | ||
3692 | prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); | ||
3693 | } | ||
2876 | release_sock(sk); | 3694 | release_sock(sk); |
2877 | 3695 | ||
2878 | if (pg_vec) | 3696 | if (pg_vec) |